{ "best_metric": 0.9025216698646545, "best_model_checkpoint": "outputs/checkpoint-7650", "epoch": 4.998087312506292, "eval_steps": 50, "global_step": 9310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005368947350760042, "grad_norm": 1.5859375, "learning_rate": 5e-06, "loss": 1.5327, "step": 1 }, { "epoch": 0.0010737894701520084, "grad_norm": 1.9375, "learning_rate": 1e-05, "loss": 1.4627, "step": 2 }, { "epoch": 0.0016106842052280126, "grad_norm": 2.0, "learning_rate": 1.5e-05, "loss": 1.5503, "step": 3 }, { "epoch": 0.0021475789403040168, "grad_norm": 1.6875, "learning_rate": 2e-05, "loss": 1.4298, "step": 4 }, { "epoch": 0.0026844736753800207, "grad_norm": 1.546875, "learning_rate": 2.5e-05, "loss": 1.302, "step": 5 }, { "epoch": 0.003221368410456025, "grad_norm": 1.6640625, "learning_rate": 3e-05, "loss": 1.6321, "step": 6 }, { "epoch": 0.003758263145532029, "grad_norm": 1.6328125, "learning_rate": 3.5e-05, "loss": 1.4398, "step": 7 }, { "epoch": 0.0042951578806080335, "grad_norm": 1.46875, "learning_rate": 4e-05, "loss": 1.4147, "step": 8 }, { "epoch": 0.0048320526156840375, "grad_norm": 1.484375, "learning_rate": 4.5e-05, "loss": 1.3094, "step": 9 }, { "epoch": 0.0053689473507600415, "grad_norm": 1.40625, "learning_rate": 5e-05, "loss": 1.5951, "step": 10 }, { "epoch": 0.005905842085836045, "grad_norm": 1.3046875, "learning_rate": 4.99999985735917e-05, "loss": 1.6913, "step": 11 }, { "epoch": 0.00644273682091205, "grad_norm": 1.4140625, "learning_rate": 4.999999429436697e-05, "loss": 1.3411, "step": 12 }, { "epoch": 0.006979631555988054, "grad_norm": 1.1796875, "learning_rate": 4.9999987162326275e-05, "loss": 1.225, "step": 13 }, { "epoch": 0.007516526291064058, "grad_norm": 1.484375, "learning_rate": 4.9999977177470465e-05, "loss": 1.1572, "step": 14 }, { "epoch": 0.008053421026140062, "grad_norm": 1.1875, "learning_rate": 4.999996433980065e-05, "loss": 1.4555, "step": 15 }, { "epoch": 0.008590315761216067, "grad_norm": 1.1953125, "learning_rate": 4.999994864931831e-05, "loss": 1.3826, "step": 16 }, { "epoch": 0.00912721049629207, "grad_norm": 1.0625, "learning_rate": 4.999993010602522e-05, "loss": 1.4102, "step": 17 }, { "epoch": 0.009664105231368075, "grad_norm": 1.03125, "learning_rate": 4.999990870992352e-05, "loss": 1.0747, "step": 18 }, { "epoch": 0.01020099996644408, "grad_norm": 1.0859375, "learning_rate": 4.9999884461015623e-05, "loss": 1.3182, "step": 19 }, { "epoch": 0.010737894701520083, "grad_norm": 1.09375, "learning_rate": 4.999985735930432e-05, "loss": 1.3525, "step": 20 }, { "epoch": 0.011274789436596088, "grad_norm": 1.0546875, "learning_rate": 4.999982740479269e-05, "loss": 1.2065, "step": 21 }, { "epoch": 0.01181168417167209, "grad_norm": 0.96875, "learning_rate": 4.9999794597484165e-05, "loss": 1.1728, "step": 22 }, { "epoch": 0.012348578906748096, "grad_norm": 0.96875, "learning_rate": 4.999975893738247e-05, "loss": 1.213, "step": 23 }, { "epoch": 0.0128854736418241, "grad_norm": 1.0703125, "learning_rate": 4.999972042449169e-05, "loss": 1.1957, "step": 24 }, { "epoch": 0.013422368376900104, "grad_norm": 2.0625, "learning_rate": 4.9999679058816204e-05, "loss": 1.4011, "step": 25 }, { "epoch": 0.013959263111976108, "grad_norm": 0.9921875, "learning_rate": 4.9999634840360755e-05, "loss": 1.0791, "step": 26 }, { "epoch": 0.014496157847052112, "grad_norm": 0.91796875, "learning_rate": 4.999958776913037e-05, "loss": 0.921, "step": 27 }, { "epoch": 0.015033052582128116, "grad_norm": 0.9140625, "learning_rate": 4.9999537845130426e-05, "loss": 1.2306, "step": 28 }, { "epoch": 0.015569947317204121, "grad_norm": 1.09375, "learning_rate": 4.9999485068366625e-05, "loss": 1.304, "step": 29 }, { "epoch": 0.016106842052280124, "grad_norm": 1.25, "learning_rate": 4.999942943884498e-05, "loss": 1.34, "step": 30 }, { "epoch": 0.016643736787356127, "grad_norm": 0.94921875, "learning_rate": 4.999937095657184e-05, "loss": 1.2341, "step": 31 }, { "epoch": 0.017180631522432134, "grad_norm": 1.09375, "learning_rate": 4.9999309621553894e-05, "loss": 1.1547, "step": 32 }, { "epoch": 0.017717526257508137, "grad_norm": 0.921875, "learning_rate": 4.999924543379812e-05, "loss": 1.1039, "step": 33 }, { "epoch": 0.01825442099258414, "grad_norm": 1.0625, "learning_rate": 4.9999178393311855e-05, "loss": 1.1743, "step": 34 }, { "epoch": 0.018791315727660147, "grad_norm": 0.9453125, "learning_rate": 4.999910850010275e-05, "loss": 1.2444, "step": 35 }, { "epoch": 0.01932821046273615, "grad_norm": 0.96875, "learning_rate": 4.999903575417877e-05, "loss": 1.2285, "step": 36 }, { "epoch": 0.019865105197812153, "grad_norm": 0.84765625, "learning_rate": 4.9998960155548227e-05, "loss": 0.9883, "step": 37 }, { "epoch": 0.02040199993288816, "grad_norm": 0.91796875, "learning_rate": 4.9998881704219745e-05, "loss": 0.9327, "step": 38 }, { "epoch": 0.020938894667964163, "grad_norm": 0.96875, "learning_rate": 4.999880040020227e-05, "loss": 1.1627, "step": 39 }, { "epoch": 0.021475789403040166, "grad_norm": 1.09375, "learning_rate": 4.9998716243505096e-05, "loss": 1.1357, "step": 40 }, { "epoch": 0.02201268413811617, "grad_norm": 1.265625, "learning_rate": 4.999862923413781e-05, "loss": 1.0989, "step": 41 }, { "epoch": 0.022549578873192175, "grad_norm": 1.0078125, "learning_rate": 4.999853937211034e-05, "loss": 1.2711, "step": 42 }, { "epoch": 0.02308647360826818, "grad_norm": 0.88671875, "learning_rate": 4.999844665743295e-05, "loss": 1.3272, "step": 43 }, { "epoch": 0.02362336834334418, "grad_norm": 0.86328125, "learning_rate": 4.9998351090116226e-05, "loss": 1.114, "step": 44 }, { "epoch": 0.02416026307842019, "grad_norm": 0.83984375, "learning_rate": 4.999825267017105e-05, "loss": 1.1632, "step": 45 }, { "epoch": 0.02469715781349619, "grad_norm": 0.86328125, "learning_rate": 4.9998151397608674e-05, "loss": 1.2763, "step": 46 }, { "epoch": 0.025234052548572194, "grad_norm": 0.96484375, "learning_rate": 4.999804727244065e-05, "loss": 1.223, "step": 47 }, { "epoch": 0.0257709472836482, "grad_norm": 0.98046875, "learning_rate": 4.999794029467886e-05, "loss": 1.1651, "step": 48 }, { "epoch": 0.026307842018724204, "grad_norm": 0.9140625, "learning_rate": 4.99978304643355e-05, "loss": 1.1746, "step": 49 }, { "epoch": 0.026844736753800207, "grad_norm": 0.98046875, "learning_rate": 4.9997717781423114e-05, "loss": 1.3132, "step": 50 }, { "epoch": 0.026844736753800207, "eval_loss": 1.0921833515167236, "eval_model_preparation_time": 0.0202, "eval_runtime": 47.907, "eval_samples_per_second": 20.081, "eval_steps_per_second": 20.081, "step": 50 }, { "epoch": 0.027381631488876214, "grad_norm": 0.90234375, "learning_rate": 4.999760224595456e-05, "loss": 1.1041, "step": 51 }, { "epoch": 0.027918526223952217, "grad_norm": 0.87890625, "learning_rate": 4.999748385794302e-05, "loss": 1.1916, "step": 52 }, { "epoch": 0.02845542095902822, "grad_norm": 1.0546875, "learning_rate": 4.999736261740201e-05, "loss": 1.0385, "step": 53 }, { "epoch": 0.028992315694104223, "grad_norm": 1.2109375, "learning_rate": 4.999723852434535e-05, "loss": 1.293, "step": 54 }, { "epoch": 0.02952921042918023, "grad_norm": 1.0234375, "learning_rate": 4.9997111578787214e-05, "loss": 1.4005, "step": 55 }, { "epoch": 0.030066105164256233, "grad_norm": 0.87109375, "learning_rate": 4.999698178074209e-05, "loss": 1.0834, "step": 56 }, { "epoch": 0.030602999899332236, "grad_norm": 1.0, "learning_rate": 4.999684913022478e-05, "loss": 1.0948, "step": 57 }, { "epoch": 0.031139894634408243, "grad_norm": 0.8125, "learning_rate": 4.9996713627250426e-05, "loss": 1.1725, "step": 58 }, { "epoch": 0.031676789369484246, "grad_norm": 1.1015625, "learning_rate": 4.9996575271834475e-05, "loss": 1.3369, "step": 59 }, { "epoch": 0.03221368410456025, "grad_norm": 0.9140625, "learning_rate": 4.999643406399275e-05, "loss": 1.0575, "step": 60 }, { "epoch": 0.03275057883963625, "grad_norm": 0.84375, "learning_rate": 4.999629000374133e-05, "loss": 1.0591, "step": 61 }, { "epoch": 0.033287473574712255, "grad_norm": 0.86328125, "learning_rate": 4.9996143091096684e-05, "loss": 1.2379, "step": 62 }, { "epoch": 0.033824368309788265, "grad_norm": 0.890625, "learning_rate": 4.999599332607554e-05, "loss": 1.1148, "step": 63 }, { "epoch": 0.03436126304486427, "grad_norm": 0.984375, "learning_rate": 4.999584070869502e-05, "loss": 1.0229, "step": 64 }, { "epoch": 0.03489815777994027, "grad_norm": 0.7890625, "learning_rate": 4.999568523897254e-05, "loss": 1.1811, "step": 65 }, { "epoch": 0.035435052515016274, "grad_norm": 1.0859375, "learning_rate": 4.999552691692581e-05, "loss": 0.9079, "step": 66 }, { "epoch": 0.03597194725009228, "grad_norm": 0.97265625, "learning_rate": 4.999536574257292e-05, "loss": 1.1585, "step": 67 }, { "epoch": 0.03650884198516828, "grad_norm": 0.91796875, "learning_rate": 4.999520171593226e-05, "loss": 1.2089, "step": 68 }, { "epoch": 0.03704573672024429, "grad_norm": 0.94921875, "learning_rate": 4.999503483702255e-05, "loss": 1.0255, "step": 69 }, { "epoch": 0.037582631455320294, "grad_norm": 0.92578125, "learning_rate": 4.999486510586282e-05, "loss": 1.0401, "step": 70 }, { "epoch": 0.0381195261903963, "grad_norm": 1.0390625, "learning_rate": 4.9994692522472455e-05, "loss": 1.2341, "step": 71 }, { "epoch": 0.0386564209254723, "grad_norm": 1.0546875, "learning_rate": 4.999451708687114e-05, "loss": 1.0865, "step": 72 }, { "epoch": 0.0391933156605483, "grad_norm": 1.6484375, "learning_rate": 4.9994338799078896e-05, "loss": 1.3078, "step": 73 }, { "epoch": 0.039730210395624306, "grad_norm": 0.9609375, "learning_rate": 4.999415765911606e-05, "loss": 1.0737, "step": 74 }, { "epoch": 0.04026710513070031, "grad_norm": 0.83203125, "learning_rate": 4.999397366700332e-05, "loss": 1.1142, "step": 75 }, { "epoch": 0.04080399986577632, "grad_norm": 0.79296875, "learning_rate": 4.9993786822761656e-05, "loss": 0.9994, "step": 76 }, { "epoch": 0.04134089460085232, "grad_norm": 0.96875, "learning_rate": 4.99935971264124e-05, "loss": 1.227, "step": 77 }, { "epoch": 0.041877789335928325, "grad_norm": 1.09375, "learning_rate": 4.999340457797718e-05, "loss": 1.2586, "step": 78 }, { "epoch": 0.04241468407100433, "grad_norm": 0.83203125, "learning_rate": 4.999320917747799e-05, "loss": 0.9687, "step": 79 }, { "epoch": 0.04295157880608033, "grad_norm": 1.125, "learning_rate": 4.999301092493712e-05, "loss": 1.115, "step": 80 }, { "epoch": 0.043488473541156335, "grad_norm": 1.0, "learning_rate": 4.99928098203772e-05, "loss": 0.9307, "step": 81 }, { "epoch": 0.04402536827623234, "grad_norm": 0.75390625, "learning_rate": 4.999260586382116e-05, "loss": 0.8806, "step": 82 }, { "epoch": 0.04456226301130835, "grad_norm": 0.9140625, "learning_rate": 4.999239905529229e-05, "loss": 0.947, "step": 83 }, { "epoch": 0.04509915774638435, "grad_norm": 0.796875, "learning_rate": 4.999218939481418e-05, "loss": 1.0248, "step": 84 }, { "epoch": 0.045636052481460354, "grad_norm": 1.40625, "learning_rate": 4.999197688241076e-05, "loss": 1.0739, "step": 85 }, { "epoch": 0.04617294721653636, "grad_norm": 0.9375, "learning_rate": 4.999176151810629e-05, "loss": 1.2311, "step": 86 }, { "epoch": 0.04670984195161236, "grad_norm": 0.83984375, "learning_rate": 4.999154330192532e-05, "loss": 1.0813, "step": 87 }, { "epoch": 0.04724673668668836, "grad_norm": 0.90234375, "learning_rate": 4.9991322233892784e-05, "loss": 1.0484, "step": 88 }, { "epoch": 0.04778363142176437, "grad_norm": 0.81640625, "learning_rate": 4.999109831403388e-05, "loss": 1.2286, "step": 89 }, { "epoch": 0.04832052615684038, "grad_norm": 0.95703125, "learning_rate": 4.999087154237418e-05, "loss": 1.3113, "step": 90 }, { "epoch": 0.04885742089191638, "grad_norm": 0.9375, "learning_rate": 4.999064191893955e-05, "loss": 1.0101, "step": 91 }, { "epoch": 0.04939431562699238, "grad_norm": 0.80859375, "learning_rate": 4.999040944375619e-05, "loss": 0.9629, "step": 92 }, { "epoch": 0.049931210362068386, "grad_norm": 0.83984375, "learning_rate": 4.999017411685064e-05, "loss": 1.4695, "step": 93 }, { "epoch": 0.05046810509714439, "grad_norm": 0.8046875, "learning_rate": 4.998993593824975e-05, "loss": 1.0943, "step": 94 }, { "epoch": 0.05100499983222039, "grad_norm": 0.76953125, "learning_rate": 4.998969490798069e-05, "loss": 1.0775, "step": 95 }, { "epoch": 0.0515418945672964, "grad_norm": 0.8203125, "learning_rate": 4.9989451026070975e-05, "loss": 1.0083, "step": 96 }, { "epoch": 0.052078789302372405, "grad_norm": 1.0234375, "learning_rate": 4.998920429254844e-05, "loss": 1.3772, "step": 97 }, { "epoch": 0.05261568403744841, "grad_norm": 0.9921875, "learning_rate": 4.9988954707441226e-05, "loss": 0.9465, "step": 98 }, { "epoch": 0.05315257877252441, "grad_norm": 0.74609375, "learning_rate": 4.998870227077782e-05, "loss": 1.1598, "step": 99 }, { "epoch": 0.053689473507600415, "grad_norm": 1.15625, "learning_rate": 4.9988446982587035e-05, "loss": 1.3205, "step": 100 }, { "epoch": 0.053689473507600415, "eval_loss": 1.0550328493118286, "eval_runtime": 48.3738, "eval_samples_per_second": 19.887, "eval_steps_per_second": 19.887, "step": 100 }, { "epoch": 0.05422636824267642, "grad_norm": 0.9453125, "learning_rate": 4.998818884289799e-05, "loss": 1.2547, "step": 101 }, { "epoch": 0.05476326297775243, "grad_norm": 1.4921875, "learning_rate": 4.998792785174014e-05, "loss": 1.2115, "step": 102 }, { "epoch": 0.05530015771282843, "grad_norm": 0.83203125, "learning_rate": 4.998766400914329e-05, "loss": 1.2368, "step": 103 }, { "epoch": 0.055837052447904434, "grad_norm": 0.78125, "learning_rate": 4.998739731513753e-05, "loss": 1.0902, "step": 104 }, { "epoch": 0.05637394718298044, "grad_norm": 0.7265625, "learning_rate": 4.9987127769753295e-05, "loss": 1.0537, "step": 105 }, { "epoch": 0.05691084191805644, "grad_norm": 1.0390625, "learning_rate": 4.998685537302135e-05, "loss": 0.9671, "step": 106 }, { "epoch": 0.05744773665313244, "grad_norm": 0.83203125, "learning_rate": 4.998658012497276e-05, "loss": 1.0743, "step": 107 }, { "epoch": 0.057984631388208446, "grad_norm": 0.7890625, "learning_rate": 4.998630202563896e-05, "loss": 1.1052, "step": 108 }, { "epoch": 0.058521526123284456, "grad_norm": 0.83984375, "learning_rate": 4.998602107505167e-05, "loss": 0.8357, "step": 109 }, { "epoch": 0.05905842085836046, "grad_norm": 1.0, "learning_rate": 4.998573727324295e-05, "loss": 1.2236, "step": 110 }, { "epoch": 0.05959531559343646, "grad_norm": 0.8671875, "learning_rate": 4.9985450620245196e-05, "loss": 1.1851, "step": 111 }, { "epoch": 0.060132210328512466, "grad_norm": 0.796875, "learning_rate": 4.998516111609111e-05, "loss": 0.9234, "step": 112 }, { "epoch": 0.06066910506358847, "grad_norm": 0.87109375, "learning_rate": 4.9984868760813717e-05, "loss": 1.1588, "step": 113 }, { "epoch": 0.06120599979866447, "grad_norm": 0.80078125, "learning_rate": 4.9984573554446404e-05, "loss": 1.0522, "step": 114 }, { "epoch": 0.06174289453374048, "grad_norm": 0.875, "learning_rate": 4.998427549702284e-05, "loss": 1.0305, "step": 115 }, { "epoch": 0.062279789268816485, "grad_norm": 0.84375, "learning_rate": 4.998397458857704e-05, "loss": 1.0935, "step": 116 }, { "epoch": 0.06281668400389248, "grad_norm": 0.7890625, "learning_rate": 4.9983670829143346e-05, "loss": 1.1522, "step": 117 }, { "epoch": 0.06335357873896849, "grad_norm": 0.921875, "learning_rate": 4.998336421875641e-05, "loss": 1.4326, "step": 118 }, { "epoch": 0.0638904734740445, "grad_norm": 0.77734375, "learning_rate": 4.9983054757451244e-05, "loss": 1.0302, "step": 119 }, { "epoch": 0.0644273682091205, "grad_norm": 0.76953125, "learning_rate": 4.998274244526313e-05, "loss": 1.2382, "step": 120 }, { "epoch": 0.06496426294419651, "grad_norm": 0.85546875, "learning_rate": 4.998242728222773e-05, "loss": 1.0985, "step": 121 }, { "epoch": 0.0655011576792725, "grad_norm": 0.84375, "learning_rate": 4.9982109268380995e-05, "loss": 1.2041, "step": 122 }, { "epoch": 0.06603805241434851, "grad_norm": 0.8515625, "learning_rate": 4.9981788403759224e-05, "loss": 1.0621, "step": 123 }, { "epoch": 0.06657494714942451, "grad_norm": 0.8515625, "learning_rate": 4.998146468839903e-05, "loss": 1.3032, "step": 124 }, { "epoch": 0.06711184188450052, "grad_norm": 0.8046875, "learning_rate": 4.998113812233735e-05, "loss": 0.9213, "step": 125 }, { "epoch": 0.06764873661957653, "grad_norm": 0.88671875, "learning_rate": 4.9980808705611435e-05, "loss": 1.0401, "step": 126 }, { "epoch": 0.06818563135465253, "grad_norm": 0.90234375, "learning_rate": 4.998047643825891e-05, "loss": 1.2438, "step": 127 }, { "epoch": 0.06872252608972854, "grad_norm": 0.96875, "learning_rate": 4.998014132031766e-05, "loss": 0.9991, "step": 128 }, { "epoch": 0.06925942082480453, "grad_norm": 0.97265625, "learning_rate": 4.9979803351825936e-05, "loss": 1.0276, "step": 129 }, { "epoch": 0.06979631555988054, "grad_norm": 0.7734375, "learning_rate": 4.997946253282231e-05, "loss": 1.194, "step": 130 }, { "epoch": 0.07033321029495654, "grad_norm": 0.89453125, "learning_rate": 4.997911886334567e-05, "loss": 1.1206, "step": 131 }, { "epoch": 0.07087010503003255, "grad_norm": 0.78125, "learning_rate": 4.9978772343435234e-05, "loss": 0.9971, "step": 132 }, { "epoch": 0.07140699976510856, "grad_norm": 0.890625, "learning_rate": 4.997842297313054e-05, "loss": 1.1272, "step": 133 }, { "epoch": 0.07194389450018455, "grad_norm": 0.78515625, "learning_rate": 4.997807075247146e-05, "loss": 0.9863, "step": 134 }, { "epoch": 0.07248078923526056, "grad_norm": 0.8046875, "learning_rate": 4.997771568149818e-05, "loss": 1.2149, "step": 135 }, { "epoch": 0.07301768397033656, "grad_norm": 0.875, "learning_rate": 4.997735776025124e-05, "loss": 1.1231, "step": 136 }, { "epoch": 0.07355457870541257, "grad_norm": 0.76953125, "learning_rate": 4.997699698877145e-05, "loss": 1.0775, "step": 137 }, { "epoch": 0.07409147344048858, "grad_norm": 0.80078125, "learning_rate": 4.99766333671e-05, "loss": 0.9318, "step": 138 }, { "epoch": 0.07462836817556458, "grad_norm": 0.890625, "learning_rate": 4.9976266895278384e-05, "loss": 1.0069, "step": 139 }, { "epoch": 0.07516526291064059, "grad_norm": 0.85546875, "learning_rate": 4.997589757334842e-05, "loss": 0.9013, "step": 140 }, { "epoch": 0.07570215764571658, "grad_norm": 0.953125, "learning_rate": 4.9975525401352235e-05, "loss": 1.0932, "step": 141 }, { "epoch": 0.0762390523807926, "grad_norm": 0.921875, "learning_rate": 4.997515037933232e-05, "loss": 1.1049, "step": 142 }, { "epoch": 0.07677594711586859, "grad_norm": 0.80859375, "learning_rate": 4.997477250733146e-05, "loss": 1.2303, "step": 143 }, { "epoch": 0.0773128418509446, "grad_norm": 0.875, "learning_rate": 4.997439178539278e-05, "loss": 1.0978, "step": 144 }, { "epoch": 0.07784973658602061, "grad_norm": 0.8828125, "learning_rate": 4.997400821355972e-05, "loss": 1.2369, "step": 145 }, { "epoch": 0.0783866313210966, "grad_norm": 0.85546875, "learning_rate": 4.9973621791876055e-05, "loss": 1.2848, "step": 146 }, { "epoch": 0.07892352605617262, "grad_norm": 0.84375, "learning_rate": 4.997323252038588e-05, "loss": 1.1895, "step": 147 }, { "epoch": 0.07946042079124861, "grad_norm": 0.84375, "learning_rate": 4.99728403991336e-05, "loss": 1.024, "step": 148 }, { "epoch": 0.07999731552632462, "grad_norm": 0.7421875, "learning_rate": 4.9972445428163994e-05, "loss": 0.8837, "step": 149 }, { "epoch": 0.08053421026140062, "grad_norm": 0.91796875, "learning_rate": 4.99720476075221e-05, "loss": 1.1442, "step": 150 }, { "epoch": 0.08053421026140062, "eval_loss": 1.036392331123352, "eval_runtime": 48.3992, "eval_samples_per_second": 19.876, "eval_steps_per_second": 19.876, "step": 150 }, { "epoch": 0.08107110499647663, "grad_norm": 0.859375, "learning_rate": 4.9971646937253334e-05, "loss": 1.2851, "step": 151 }, { "epoch": 0.08160799973155264, "grad_norm": 0.88671875, "learning_rate": 4.9971243417403414e-05, "loss": 1.287, "step": 152 }, { "epoch": 0.08214489446662863, "grad_norm": 0.90625, "learning_rate": 4.997083704801838e-05, "loss": 1.2777, "step": 153 }, { "epoch": 0.08268178920170464, "grad_norm": 0.8828125, "learning_rate": 4.997042782914462e-05, "loss": 1.2524, "step": 154 }, { "epoch": 0.08321868393678064, "grad_norm": 0.69921875, "learning_rate": 4.997001576082881e-05, "loss": 1.0275, "step": 155 }, { "epoch": 0.08375557867185665, "grad_norm": 0.84375, "learning_rate": 4.996960084311798e-05, "loss": 1.2872, "step": 156 }, { "epoch": 0.08429247340693265, "grad_norm": 0.9453125, "learning_rate": 4.996918307605949e-05, "loss": 1.1124, "step": 157 }, { "epoch": 0.08482936814200866, "grad_norm": 0.91015625, "learning_rate": 4.9968762459700994e-05, "loss": 1.0677, "step": 158 }, { "epoch": 0.08536626287708467, "grad_norm": 0.8671875, "learning_rate": 4.9968338994090496e-05, "loss": 1.0741, "step": 159 }, { "epoch": 0.08590315761216066, "grad_norm": 0.8125, "learning_rate": 4.9967912679276316e-05, "loss": 1.1287, "step": 160 }, { "epoch": 0.08644005234723667, "grad_norm": 0.81640625, "learning_rate": 4.996748351530712e-05, "loss": 0.9927, "step": 161 }, { "epoch": 0.08697694708231267, "grad_norm": 0.9453125, "learning_rate": 4.996705150223186e-05, "loss": 1.1585, "step": 162 }, { "epoch": 0.08751384181738868, "grad_norm": 0.87890625, "learning_rate": 4.996661664009984e-05, "loss": 1.3801, "step": 163 }, { "epoch": 0.08805073655246468, "grad_norm": 0.75390625, "learning_rate": 4.996617892896069e-05, "loss": 1.0954, "step": 164 }, { "epoch": 0.08858763128754069, "grad_norm": 0.84375, "learning_rate": 4.996573836886435e-05, "loss": 1.1691, "step": 165 }, { "epoch": 0.0891245260226167, "grad_norm": 0.78515625, "learning_rate": 4.9965294959861095e-05, "loss": 1.0086, "step": 166 }, { "epoch": 0.08966142075769269, "grad_norm": 0.82421875, "learning_rate": 4.9964848702001524e-05, "loss": 1.0528, "step": 167 }, { "epoch": 0.0901983154927687, "grad_norm": 0.82421875, "learning_rate": 4.996439959533656e-05, "loss": 1.1207, "step": 168 }, { "epoch": 0.0907352102278447, "grad_norm": 0.70703125, "learning_rate": 4.996394763991746e-05, "loss": 0.85, "step": 169 }, { "epoch": 0.09127210496292071, "grad_norm": 0.83203125, "learning_rate": 4.9963492835795797e-05, "loss": 1.1433, "step": 170 }, { "epoch": 0.09180899969799672, "grad_norm": 0.828125, "learning_rate": 4.9963035183023455e-05, "loss": 1.0846, "step": 171 }, { "epoch": 0.09234589443307271, "grad_norm": 0.9453125, "learning_rate": 4.9962574681652675e-05, "loss": 1.3203, "step": 172 }, { "epoch": 0.09288278916814872, "grad_norm": 0.921875, "learning_rate": 4.996211133173599e-05, "loss": 1.076, "step": 173 }, { "epoch": 0.09341968390322472, "grad_norm": 0.8125, "learning_rate": 4.996164513332628e-05, "loss": 1.1099, "step": 174 }, { "epoch": 0.09395657863830073, "grad_norm": 0.84765625, "learning_rate": 4.996117608647676e-05, "loss": 1.1499, "step": 175 }, { "epoch": 0.09449347337337673, "grad_norm": 0.75390625, "learning_rate": 4.9960704191240926e-05, "loss": 1.0821, "step": 176 }, { "epoch": 0.09503036810845274, "grad_norm": 0.9453125, "learning_rate": 4.996022944767265e-05, "loss": 1.0568, "step": 177 }, { "epoch": 0.09556726284352875, "grad_norm": 0.875, "learning_rate": 4.99597518558261e-05, "loss": 1.0568, "step": 178 }, { "epoch": 0.09610415757860474, "grad_norm": 0.69140625, "learning_rate": 4.995927141575577e-05, "loss": 1.1308, "step": 179 }, { "epoch": 0.09664105231368075, "grad_norm": 0.86328125, "learning_rate": 4.995878812751649e-05, "loss": 0.9887, "step": 180 }, { "epoch": 0.09717794704875675, "grad_norm": 0.90234375, "learning_rate": 4.9958301991163405e-05, "loss": 1.1791, "step": 181 }, { "epoch": 0.09771484178383276, "grad_norm": 0.76953125, "learning_rate": 4.995781300675199e-05, "loss": 1.0797, "step": 182 }, { "epoch": 0.09825173651890876, "grad_norm": 0.71875, "learning_rate": 4.9957321174338043e-05, "loss": 1.0162, "step": 183 }, { "epoch": 0.09878863125398477, "grad_norm": 1.0, "learning_rate": 4.99568264939777e-05, "loss": 1.0634, "step": 184 }, { "epoch": 0.09932552598906078, "grad_norm": 0.7734375, "learning_rate": 4.9956328965727394e-05, "loss": 1.1382, "step": 185 }, { "epoch": 0.09986242072413677, "grad_norm": 0.71875, "learning_rate": 4.995582858964392e-05, "loss": 0.9861, "step": 186 }, { "epoch": 0.10039931545921278, "grad_norm": 0.80078125, "learning_rate": 4.995532536578435e-05, "loss": 1.0947, "step": 187 }, { "epoch": 0.10093621019428878, "grad_norm": 1.109375, "learning_rate": 4.9954819294206124e-05, "loss": 1.2332, "step": 188 }, { "epoch": 0.10147310492936479, "grad_norm": 0.90234375, "learning_rate": 4.9954310374966996e-05, "loss": 1.1089, "step": 189 }, { "epoch": 0.10200999966444078, "grad_norm": 0.875, "learning_rate": 4.9953798608125025e-05, "loss": 1.0227, "step": 190 }, { "epoch": 0.1025468943995168, "grad_norm": 1.421875, "learning_rate": 4.995328399373862e-05, "loss": 0.8639, "step": 191 }, { "epoch": 0.1030837891345928, "grad_norm": 0.72265625, "learning_rate": 4.995276653186651e-05, "loss": 0.9628, "step": 192 }, { "epoch": 0.1036206838696688, "grad_norm": 1.0, "learning_rate": 4.995224622256773e-05, "loss": 1.2933, "step": 193 }, { "epoch": 0.10415757860474481, "grad_norm": 0.859375, "learning_rate": 4.9951723065901665e-05, "loss": 1.0998, "step": 194 }, { "epoch": 0.1046944733398208, "grad_norm": 0.7578125, "learning_rate": 4.995119706192801e-05, "loss": 1.146, "step": 195 }, { "epoch": 0.10523136807489682, "grad_norm": 0.8203125, "learning_rate": 4.995066821070679e-05, "loss": 0.9429, "step": 196 }, { "epoch": 0.10576826280997281, "grad_norm": 0.78515625, "learning_rate": 4.9950136512298354e-05, "loss": 1.1212, "step": 197 }, { "epoch": 0.10630515754504882, "grad_norm": 0.70703125, "learning_rate": 4.994960196676337e-05, "loss": 1.1948, "step": 198 }, { "epoch": 0.10684205228012483, "grad_norm": 0.97265625, "learning_rate": 4.994906457416284e-05, "loss": 1.218, "step": 199 }, { "epoch": 0.10737894701520083, "grad_norm": 0.74609375, "learning_rate": 4.994852433455809e-05, "loss": 0.9155, "step": 200 }, { "epoch": 0.10737894701520083, "eval_loss": 1.021553874015808, "eval_runtime": 45.6574, "eval_samples_per_second": 21.07, "eval_steps_per_second": 21.07, "step": 200 }, { "epoch": 0.10791584175027684, "grad_norm": 1.0625, "learning_rate": 4.9947981248010764e-05, "loss": 1.1516, "step": 201 }, { "epoch": 0.10845273648535284, "grad_norm": 0.796875, "learning_rate": 4.9947435314582844e-05, "loss": 1.075, "step": 202 }, { "epoch": 0.10898963122042885, "grad_norm": 0.77734375, "learning_rate": 4.994688653433661e-05, "loss": 1.1804, "step": 203 }, { "epoch": 0.10952652595550486, "grad_norm": 0.7109375, "learning_rate": 4.99463349073347e-05, "loss": 0.9413, "step": 204 }, { "epoch": 0.11006342069058085, "grad_norm": 0.75, "learning_rate": 4.9945780433640066e-05, "loss": 1.1992, "step": 205 }, { "epoch": 0.11060031542565686, "grad_norm": 0.703125, "learning_rate": 4.9945223113315966e-05, "loss": 1.004, "step": 206 }, { "epoch": 0.11113721016073286, "grad_norm": 0.61328125, "learning_rate": 4.9944662946425995e-05, "loss": 0.8692, "step": 207 }, { "epoch": 0.11167410489580887, "grad_norm": 0.828125, "learning_rate": 4.994409993303409e-05, "loss": 1.3249, "step": 208 }, { "epoch": 0.11221099963088486, "grad_norm": 0.875, "learning_rate": 4.994353407320449e-05, "loss": 0.9644, "step": 209 }, { "epoch": 0.11274789436596087, "grad_norm": 0.9296875, "learning_rate": 4.994296536700177e-05, "loss": 1.1006, "step": 210 }, { "epoch": 0.11328478910103688, "grad_norm": 0.7578125, "learning_rate": 4.994239381449082e-05, "loss": 1.0991, "step": 211 }, { "epoch": 0.11382168383611288, "grad_norm": 0.87109375, "learning_rate": 4.994181941573687e-05, "loss": 1.1392, "step": 212 }, { "epoch": 0.11435857857118889, "grad_norm": 0.68359375, "learning_rate": 4.9941242170805465e-05, "loss": 0.9001, "step": 213 }, { "epoch": 0.11489547330626489, "grad_norm": 0.90625, "learning_rate": 4.994066207976247e-05, "loss": 1.1169, "step": 214 }, { "epoch": 0.1154323680413409, "grad_norm": 0.65234375, "learning_rate": 4.9940079142674076e-05, "loss": 1.0101, "step": 215 }, { "epoch": 0.11596926277641689, "grad_norm": 0.9453125, "learning_rate": 4.993949335960683e-05, "loss": 1.182, "step": 216 }, { "epoch": 0.1165061575114929, "grad_norm": 0.7109375, "learning_rate": 4.993890473062754e-05, "loss": 1.0099, "step": 217 }, { "epoch": 0.11704305224656891, "grad_norm": 0.7265625, "learning_rate": 4.9938313255803406e-05, "loss": 0.9841, "step": 218 }, { "epoch": 0.11757994698164491, "grad_norm": 0.91015625, "learning_rate": 4.9937718935201905e-05, "loss": 1.3115, "step": 219 }, { "epoch": 0.11811684171672092, "grad_norm": 0.7578125, "learning_rate": 4.993712176889086e-05, "loss": 1.0432, "step": 220 }, { "epoch": 0.11865373645179692, "grad_norm": 0.92578125, "learning_rate": 4.9936521756938426e-05, "loss": 1.0891, "step": 221 }, { "epoch": 0.11919063118687293, "grad_norm": 0.7265625, "learning_rate": 4.993591889941306e-05, "loss": 0.9407, "step": 222 }, { "epoch": 0.11972752592194892, "grad_norm": 0.75390625, "learning_rate": 4.993531319638356e-05, "loss": 1.0568, "step": 223 }, { "epoch": 0.12026442065702493, "grad_norm": 0.921875, "learning_rate": 4.993470464791904e-05, "loss": 1.2601, "step": 224 }, { "epoch": 0.12080131539210094, "grad_norm": 0.87890625, "learning_rate": 4.993409325408895e-05, "loss": 1.1111, "step": 225 }, { "epoch": 0.12133821012717694, "grad_norm": 0.80078125, "learning_rate": 4.9933479014963055e-05, "loss": 1.1189, "step": 226 }, { "epoch": 0.12187510486225295, "grad_norm": 0.88671875, "learning_rate": 4.9932861930611454e-05, "loss": 0.8981, "step": 227 }, { "epoch": 0.12241199959732894, "grad_norm": 0.671875, "learning_rate": 4.9932242001104556e-05, "loss": 0.8803, "step": 228 }, { "epoch": 0.12294889433240495, "grad_norm": 0.6640625, "learning_rate": 4.99316192265131e-05, "loss": 0.8567, "step": 229 }, { "epoch": 0.12348578906748096, "grad_norm": 0.73828125, "learning_rate": 4.9930993606908154e-05, "loss": 0.9962, "step": 230 }, { "epoch": 0.12402268380255696, "grad_norm": 0.76953125, "learning_rate": 4.9930365142361115e-05, "loss": 1.0204, "step": 231 }, { "epoch": 0.12455957853763297, "grad_norm": 0.8046875, "learning_rate": 4.99297338329437e-05, "loss": 1.2177, "step": 232 }, { "epoch": 0.12509647327270898, "grad_norm": 1.0, "learning_rate": 4.992909967872794e-05, "loss": 1.0827, "step": 233 }, { "epoch": 0.12563336800778496, "grad_norm": 0.79296875, "learning_rate": 4.992846267978621e-05, "loss": 1.137, "step": 234 }, { "epoch": 0.12617026274286097, "grad_norm": 0.8515625, "learning_rate": 4.992782283619118e-05, "loss": 0.9979, "step": 235 }, { "epoch": 0.12670715747793698, "grad_norm": 0.80859375, "learning_rate": 4.99271801480159e-05, "loss": 0.9574, "step": 236 }, { "epoch": 0.127244052213013, "grad_norm": 0.71875, "learning_rate": 4.9926534615333675e-05, "loss": 1.2623, "step": 237 }, { "epoch": 0.127780946948089, "grad_norm": 0.7734375, "learning_rate": 4.992588623821819e-05, "loss": 0.9104, "step": 238 }, { "epoch": 0.12831784168316498, "grad_norm": 0.93359375, "learning_rate": 4.992523501674342e-05, "loss": 1.0249, "step": 239 }, { "epoch": 0.128854736418241, "grad_norm": 0.734375, "learning_rate": 4.992458095098368e-05, "loss": 0.9328, "step": 240 }, { "epoch": 0.129391631153317, "grad_norm": 0.75390625, "learning_rate": 4.992392404101361e-05, "loss": 1.0394, "step": 241 }, { "epoch": 0.12992852588839301, "grad_norm": 0.8046875, "learning_rate": 4.9923264286908164e-05, "loss": 1.0472, "step": 242 }, { "epoch": 0.130465420623469, "grad_norm": 1.046875, "learning_rate": 4.992260168874265e-05, "loss": 1.0126, "step": 243 }, { "epoch": 0.131002315358545, "grad_norm": 0.83984375, "learning_rate": 4.9921936246592656e-05, "loss": 0.9135, "step": 244 }, { "epoch": 0.13153921009362102, "grad_norm": 0.796875, "learning_rate": 4.9921267960534124e-05, "loss": 1.1838, "step": 245 }, { "epoch": 0.13207610482869703, "grad_norm": 0.7734375, "learning_rate": 4.992059683064332e-05, "loss": 1.0228, "step": 246 }, { "epoch": 0.13261299956377304, "grad_norm": 0.703125, "learning_rate": 4.991992285699682e-05, "loss": 0.9633, "step": 247 }, { "epoch": 0.13314989429884902, "grad_norm": 0.765625, "learning_rate": 4.991924603967154e-05, "loss": 1.0594, "step": 248 }, { "epoch": 0.13368678903392503, "grad_norm": 0.90625, "learning_rate": 4.9918566378744704e-05, "loss": 1.3159, "step": 249 }, { "epoch": 0.13422368376900104, "grad_norm": 1.0625, "learning_rate": 4.991788387429388e-05, "loss": 1.2449, "step": 250 }, { "epoch": 0.13422368376900104, "eval_loss": 1.0114037990570068, "eval_runtime": 45.6876, "eval_samples_per_second": 21.056, "eval_steps_per_second": 21.056, "step": 250 }, { "epoch": 0.13476057850407705, "grad_norm": 0.74609375, "learning_rate": 4.9917198526396945e-05, "loss": 1.0977, "step": 251 }, { "epoch": 0.13529747323915306, "grad_norm": 0.734375, "learning_rate": 4.991651033513212e-05, "loss": 0.9752, "step": 252 }, { "epoch": 0.13583436797422904, "grad_norm": 0.87890625, "learning_rate": 4.99158193005779e-05, "loss": 1.1368, "step": 253 }, { "epoch": 0.13637126270930505, "grad_norm": 0.71875, "learning_rate": 4.9915125422813187e-05, "loss": 1.019, "step": 254 }, { "epoch": 0.13690815744438106, "grad_norm": 0.88671875, "learning_rate": 4.991442870191713e-05, "loss": 1.133, "step": 255 }, { "epoch": 0.13744505217945707, "grad_norm": 0.8125, "learning_rate": 4.991372913796924e-05, "loss": 1.2847, "step": 256 }, { "epoch": 0.13798194691453308, "grad_norm": 0.76171875, "learning_rate": 4.991302673104935e-05, "loss": 0.9674, "step": 257 }, { "epoch": 0.13851884164960906, "grad_norm": 0.8515625, "learning_rate": 4.991232148123761e-05, "loss": 1.2452, "step": 258 }, { "epoch": 0.13905573638468507, "grad_norm": 0.75390625, "learning_rate": 4.99116133886145e-05, "loss": 0.926, "step": 259 }, { "epoch": 0.13959263111976108, "grad_norm": 0.89453125, "learning_rate": 4.9910902453260824e-05, "loss": 1.0061, "step": 260 }, { "epoch": 0.1401295258548371, "grad_norm": 0.9296875, "learning_rate": 4.991018867525771e-05, "loss": 1.2372, "step": 261 }, { "epoch": 0.14066642058991308, "grad_norm": 0.6953125, "learning_rate": 4.99094720546866e-05, "loss": 0.9288, "step": 262 }, { "epoch": 0.1412033153249891, "grad_norm": 0.7734375, "learning_rate": 4.990875259162928e-05, "loss": 1.1232, "step": 263 }, { "epoch": 0.1417402100600651, "grad_norm": 0.83984375, "learning_rate": 4.990803028616785e-05, "loss": 1.0133, "step": 264 }, { "epoch": 0.1422771047951411, "grad_norm": 0.859375, "learning_rate": 4.990730513838472e-05, "loss": 1.1617, "step": 265 }, { "epoch": 0.14281399953021712, "grad_norm": 1.0390625, "learning_rate": 4.990657714836266e-05, "loss": 1.2293, "step": 266 }, { "epoch": 0.1433508942652931, "grad_norm": 0.83984375, "learning_rate": 4.990584631618472e-05, "loss": 1.2229, "step": 267 }, { "epoch": 0.1438877890003691, "grad_norm": 0.71875, "learning_rate": 4.990511264193431e-05, "loss": 1.0121, "step": 268 }, { "epoch": 0.14442468373544512, "grad_norm": 0.75, "learning_rate": 4.990437612569515e-05, "loss": 0.9549, "step": 269 }, { "epoch": 0.14496157847052113, "grad_norm": 0.9765625, "learning_rate": 4.9903636767551285e-05, "loss": 1.0386, "step": 270 }, { "epoch": 0.14549847320559714, "grad_norm": 1.0, "learning_rate": 4.990289456758709e-05, "loss": 1.0869, "step": 271 }, { "epoch": 0.14603536794067312, "grad_norm": 0.703125, "learning_rate": 4.9902149525887255e-05, "loss": 1.0721, "step": 272 }, { "epoch": 0.14657226267574913, "grad_norm": 0.68359375, "learning_rate": 4.990140164253679e-05, "loss": 0.9355, "step": 273 }, { "epoch": 0.14710915741082514, "grad_norm": 0.8671875, "learning_rate": 4.990065091762106e-05, "loss": 1.0889, "step": 274 }, { "epoch": 0.14764605214590115, "grad_norm": 0.73828125, "learning_rate": 4.989989735122571e-05, "loss": 0.8915, "step": 275 }, { "epoch": 0.14818294688097716, "grad_norm": 0.734375, "learning_rate": 4.989914094343675e-05, "loss": 1.046, "step": 276 }, { "epoch": 0.14871984161605314, "grad_norm": 0.828125, "learning_rate": 4.989838169434048e-05, "loss": 1.0452, "step": 277 }, { "epoch": 0.14925673635112915, "grad_norm": 1.0546875, "learning_rate": 4.9897619604023545e-05, "loss": 1.2057, "step": 278 }, { "epoch": 0.14979363108620516, "grad_norm": 0.93359375, "learning_rate": 4.989685467257291e-05, "loss": 1.243, "step": 279 }, { "epoch": 0.15033052582128117, "grad_norm": 0.8125, "learning_rate": 4.9896086900075865e-05, "loss": 1.1677, "step": 280 }, { "epoch": 0.15086742055635716, "grad_norm": 0.96484375, "learning_rate": 4.989531628662002e-05, "loss": 1.1926, "step": 281 }, { "epoch": 0.15140431529143317, "grad_norm": 0.9453125, "learning_rate": 4.989454283229331e-05, "loss": 0.9606, "step": 282 }, { "epoch": 0.15194121002650918, "grad_norm": 0.7578125, "learning_rate": 4.9893766537184006e-05, "loss": 1.0276, "step": 283 }, { "epoch": 0.1524781047615852, "grad_norm": 0.75390625, "learning_rate": 4.9892987401380686e-05, "loss": 1.1926, "step": 284 }, { "epoch": 0.1530149994966612, "grad_norm": 0.97265625, "learning_rate": 4.989220542497226e-05, "loss": 0.974, "step": 285 }, { "epoch": 0.15355189423173718, "grad_norm": 0.8125, "learning_rate": 4.989142060804796e-05, "loss": 1.0926, "step": 286 }, { "epoch": 0.1540887889668132, "grad_norm": 1.0078125, "learning_rate": 4.989063295069733e-05, "loss": 1.2385, "step": 287 }, { "epoch": 0.1546256837018892, "grad_norm": 0.8671875, "learning_rate": 4.988984245301028e-05, "loss": 1.1706, "step": 288 }, { "epoch": 0.1551625784369652, "grad_norm": 0.765625, "learning_rate": 4.9889049115077005e-05, "loss": 0.9743, "step": 289 }, { "epoch": 0.15569947317204122, "grad_norm": 0.8046875, "learning_rate": 4.988825293698802e-05, "loss": 1.1596, "step": 290 }, { "epoch": 0.1562363679071172, "grad_norm": 0.8046875, "learning_rate": 4.988745391883421e-05, "loss": 1.126, "step": 291 }, { "epoch": 0.1567732626421932, "grad_norm": 0.6328125, "learning_rate": 4.988665206070671e-05, "loss": 1.0426, "step": 292 }, { "epoch": 0.15731015737726922, "grad_norm": 0.77734375, "learning_rate": 4.988584736269706e-05, "loss": 1.0523, "step": 293 }, { "epoch": 0.15784705211234523, "grad_norm": 0.74609375, "learning_rate": 4.988503982489707e-05, "loss": 0.923, "step": 294 }, { "epoch": 0.15838394684742121, "grad_norm": 0.85546875, "learning_rate": 4.988422944739889e-05, "loss": 1.1883, "step": 295 }, { "epoch": 0.15892084158249722, "grad_norm": 0.76171875, "learning_rate": 4.988341623029499e-05, "loss": 1.0239, "step": 296 }, { "epoch": 0.15945773631757323, "grad_norm": 1.171875, "learning_rate": 4.9882600173678186e-05, "loss": 0.9064, "step": 297 }, { "epoch": 0.15999463105264924, "grad_norm": 0.73828125, "learning_rate": 4.9881781277641586e-05, "loss": 0.9026, "step": 298 }, { "epoch": 0.16053152578772525, "grad_norm": 0.78515625, "learning_rate": 4.9880959542278645e-05, "loss": 1.0779, "step": 299 }, { "epoch": 0.16106842052280124, "grad_norm": 0.74609375, "learning_rate": 4.9880134967683124e-05, "loss": 0.7856, "step": 300 }, { "epoch": 0.16106842052280124, "eval_loss": 1.0021421909332275, "eval_runtime": 45.8583, "eval_samples_per_second": 20.978, "eval_steps_per_second": 20.978, "step": 300 }, { "epoch": 0.16160531525787725, "grad_norm": 0.796875, "learning_rate": 4.987930755394912e-05, "loss": 0.9734, "step": 301 }, { "epoch": 0.16214220999295326, "grad_norm": 0.890625, "learning_rate": 4.987847730117106e-05, "loss": 1.0082, "step": 302 }, { "epoch": 0.16267910472802927, "grad_norm": 0.80078125, "learning_rate": 4.987764420944367e-05, "loss": 0.9955, "step": 303 }, { "epoch": 0.16321599946310528, "grad_norm": 0.8359375, "learning_rate": 4.987680827886203e-05, "loss": 1.1873, "step": 304 }, { "epoch": 0.16375289419818126, "grad_norm": 0.94140625, "learning_rate": 4.9875969509521526e-05, "loss": 0.9822, "step": 305 }, { "epoch": 0.16428978893325727, "grad_norm": 0.73046875, "learning_rate": 4.987512790151787e-05, "loss": 0.9305, "step": 306 }, { "epoch": 0.16482668366833328, "grad_norm": 0.8671875, "learning_rate": 4.98742834549471e-05, "loss": 1.1409, "step": 307 }, { "epoch": 0.1653635784034093, "grad_norm": 0.765625, "learning_rate": 4.987343616990559e-05, "loss": 1.0024, "step": 308 }, { "epoch": 0.1659004731384853, "grad_norm": 0.83203125, "learning_rate": 4.987258604649001e-05, "loss": 1.0753, "step": 309 }, { "epoch": 0.16643736787356128, "grad_norm": 0.8515625, "learning_rate": 4.987173308479738e-05, "loss": 1.0717, "step": 310 }, { "epoch": 0.1669742626086373, "grad_norm": 0.76953125, "learning_rate": 4.987087728492503e-05, "loss": 1.2467, "step": 311 }, { "epoch": 0.1675111573437133, "grad_norm": 0.67578125, "learning_rate": 4.987001864697062e-05, "loss": 0.8469, "step": 312 }, { "epoch": 0.1680480520787893, "grad_norm": 0.87109375, "learning_rate": 4.986915717103212e-05, "loss": 1.234, "step": 313 }, { "epoch": 0.1685849468138653, "grad_norm": 0.98046875, "learning_rate": 4.986829285720785e-05, "loss": 1.0514, "step": 314 }, { "epoch": 0.1691218415489413, "grad_norm": 0.71875, "learning_rate": 4.9867425705596434e-05, "loss": 1.0181, "step": 315 }, { "epoch": 0.16965873628401731, "grad_norm": 0.8515625, "learning_rate": 4.986655571629682e-05, "loss": 1.0346, "step": 316 }, { "epoch": 0.17019563101909332, "grad_norm": 0.87890625, "learning_rate": 4.9865682889408295e-05, "loss": 1.1612, "step": 317 }, { "epoch": 0.17073252575416933, "grad_norm": 0.8671875, "learning_rate": 4.9864807225030454e-05, "loss": 0.8934, "step": 318 }, { "epoch": 0.17126942048924532, "grad_norm": 0.6640625, "learning_rate": 4.986392872326322e-05, "loss": 0.8901, "step": 319 }, { "epoch": 0.17180631522432133, "grad_norm": 0.7578125, "learning_rate": 4.9863047384206835e-05, "loss": 1.072, "step": 320 }, { "epoch": 0.17234320995939734, "grad_norm": 0.7265625, "learning_rate": 4.9862163207961887e-05, "loss": 1.1071, "step": 321 }, { "epoch": 0.17288010469447335, "grad_norm": 0.734375, "learning_rate": 4.9861276194629256e-05, "loss": 1.0346, "step": 322 }, { "epoch": 0.17341699942954936, "grad_norm": 0.8203125, "learning_rate": 4.9860386344310175e-05, "loss": 1.0508, "step": 323 }, { "epoch": 0.17395389416462534, "grad_norm": 0.67578125, "learning_rate": 4.9859493657106185e-05, "loss": 0.9795, "step": 324 }, { "epoch": 0.17449078889970135, "grad_norm": 0.77734375, "learning_rate": 4.985859813311914e-05, "loss": 1.096, "step": 325 }, { "epoch": 0.17502768363477736, "grad_norm": 0.875, "learning_rate": 4.985769977245124e-05, "loss": 1.2532, "step": 326 }, { "epoch": 0.17556457836985337, "grad_norm": 0.81640625, "learning_rate": 4.9856798575205e-05, "loss": 1.2384, "step": 327 }, { "epoch": 0.17610147310492935, "grad_norm": 0.94140625, "learning_rate": 4.985589454148326e-05, "loss": 1.1486, "step": 328 }, { "epoch": 0.17663836784000536, "grad_norm": 0.70703125, "learning_rate": 4.9854987671389174e-05, "loss": 0.9248, "step": 329 }, { "epoch": 0.17717526257508137, "grad_norm": 0.80859375, "learning_rate": 4.9854077965026234e-05, "loss": 0.9731, "step": 330 }, { "epoch": 0.17771215731015738, "grad_norm": 0.69921875, "learning_rate": 4.985316542249825e-05, "loss": 0.8214, "step": 331 }, { "epoch": 0.1782490520452334, "grad_norm": 0.7890625, "learning_rate": 4.985225004390934e-05, "loss": 0.8996, "step": 332 }, { "epoch": 0.17878594678030937, "grad_norm": 0.98046875, "learning_rate": 4.985133182936399e-05, "loss": 1.2789, "step": 333 }, { "epoch": 0.17932284151538538, "grad_norm": 0.80078125, "learning_rate": 4.985041077896695e-05, "loss": 1.1322, "step": 334 }, { "epoch": 0.1798597362504614, "grad_norm": 0.79296875, "learning_rate": 4.984948689282333e-05, "loss": 1.0258, "step": 335 }, { "epoch": 0.1803966309855374, "grad_norm": 0.7265625, "learning_rate": 4.984856017103857e-05, "loss": 0.8328, "step": 336 }, { "epoch": 0.18093352572061341, "grad_norm": 0.76171875, "learning_rate": 4.984763061371841e-05, "loss": 1.2591, "step": 337 }, { "epoch": 0.1814704204556894, "grad_norm": 0.78515625, "learning_rate": 4.9846698220968934e-05, "loss": 1.0533, "step": 338 }, { "epoch": 0.1820073151907654, "grad_norm": 0.98828125, "learning_rate": 4.9845762992896526e-05, "loss": 1.076, "step": 339 }, { "epoch": 0.18254420992584142, "grad_norm": 0.84375, "learning_rate": 4.984482492960791e-05, "loss": 0.9798, "step": 340 }, { "epoch": 0.18308110466091743, "grad_norm": 0.83203125, "learning_rate": 4.984388403121015e-05, "loss": 1.0483, "step": 341 }, { "epoch": 0.18361799939599344, "grad_norm": 0.78125, "learning_rate": 4.984294029781059e-05, "loss": 1.2484, "step": 342 }, { "epoch": 0.18415489413106942, "grad_norm": 1.125, "learning_rate": 4.9841993729516933e-05, "loss": 1.3144, "step": 343 }, { "epoch": 0.18469178886614543, "grad_norm": 0.66796875, "learning_rate": 4.9841044326437194e-05, "loss": 1.0227, "step": 344 }, { "epoch": 0.18522868360122144, "grad_norm": 0.7421875, "learning_rate": 4.984009208867971e-05, "loss": 0.9629, "step": 345 }, { "epoch": 0.18576557833629745, "grad_norm": 0.86328125, "learning_rate": 4.9839137016353147e-05, "loss": 1.0943, "step": 346 }, { "epoch": 0.18630247307137343, "grad_norm": 0.90234375, "learning_rate": 4.983817910956648e-05, "loss": 1.1398, "step": 347 }, { "epoch": 0.18683936780644944, "grad_norm": 0.90234375, "learning_rate": 4.983721836842903e-05, "loss": 0.9662, "step": 348 }, { "epoch": 0.18737626254152545, "grad_norm": 0.7265625, "learning_rate": 4.983625479305043e-05, "loss": 0.9173, "step": 349 }, { "epoch": 0.18791315727660146, "grad_norm": 0.95703125, "learning_rate": 4.9835288383540626e-05, "loss": 1.1645, "step": 350 }, { "epoch": 0.18791315727660146, "eval_loss": 0.9941990971565247, "eval_runtime": 45.8692, "eval_samples_per_second": 20.973, "eval_steps_per_second": 20.973, "step": 350 }, { "epoch": 0.18845005201167747, "grad_norm": 0.85546875, "learning_rate": 4.983431914000991e-05, "loss": 1.0875, "step": 351 }, { "epoch": 0.18898694674675345, "grad_norm": 0.78125, "learning_rate": 4.983334706256888e-05, "loss": 1.0765, "step": 352 }, { "epoch": 0.18952384148182946, "grad_norm": 0.80078125, "learning_rate": 4.983237215132846e-05, "loss": 1.2291, "step": 353 }, { "epoch": 0.19006073621690547, "grad_norm": 0.78515625, "learning_rate": 4.98313944063999e-05, "loss": 0.8445, "step": 354 }, { "epoch": 0.19059763095198148, "grad_norm": 0.9296875, "learning_rate": 4.983041382789477e-05, "loss": 1.2183, "step": 355 }, { "epoch": 0.1911345256870575, "grad_norm": 0.796875, "learning_rate": 4.9829430415924974e-05, "loss": 1.1945, "step": 356 }, { "epoch": 0.19167142042213348, "grad_norm": 0.80859375, "learning_rate": 4.982844417060273e-05, "loss": 0.9253, "step": 357 }, { "epoch": 0.1922083151572095, "grad_norm": 0.7734375, "learning_rate": 4.982745509204058e-05, "loss": 1.0803, "step": 358 }, { "epoch": 0.1927452098922855, "grad_norm": 0.8515625, "learning_rate": 4.982646318035139e-05, "loss": 0.9815, "step": 359 }, { "epoch": 0.1932821046273615, "grad_norm": 0.7890625, "learning_rate": 4.982546843564834e-05, "loss": 0.8256, "step": 360 }, { "epoch": 0.1938189993624375, "grad_norm": 1.2265625, "learning_rate": 4.982447085804496e-05, "loss": 1.0912, "step": 361 }, { "epoch": 0.1943558940975135, "grad_norm": 0.8828125, "learning_rate": 4.982347044765508e-05, "loss": 1.1494, "step": 362 }, { "epoch": 0.1948927888325895, "grad_norm": 0.71875, "learning_rate": 4.982246720459286e-05, "loss": 1.1104, "step": 363 }, { "epoch": 0.19542968356766552, "grad_norm": 0.73828125, "learning_rate": 4.982146112897277e-05, "loss": 1.0217, "step": 364 }, { "epoch": 0.19596657830274153, "grad_norm": 0.734375, "learning_rate": 4.982045222090964e-05, "loss": 0.9799, "step": 365 }, { "epoch": 0.1965034730378175, "grad_norm": 0.984375, "learning_rate": 4.9819440480518574e-05, "loss": 1.0773, "step": 366 }, { "epoch": 0.19704036777289352, "grad_norm": 0.69921875, "learning_rate": 4.9818425907915045e-05, "loss": 0.9679, "step": 367 }, { "epoch": 0.19757726250796953, "grad_norm": 0.76953125, "learning_rate": 4.981740850321481e-05, "loss": 0.9586, "step": 368 }, { "epoch": 0.19811415724304554, "grad_norm": 0.6875, "learning_rate": 4.9816388266533994e-05, "loss": 0.9833, "step": 369 }, { "epoch": 0.19865105197812155, "grad_norm": 0.9921875, "learning_rate": 4.9815365197988986e-05, "loss": 1.0994, "step": 370 }, { "epoch": 0.19918794671319753, "grad_norm": 0.8359375, "learning_rate": 4.9814339297696555e-05, "loss": 1.0632, "step": 371 }, { "epoch": 0.19972484144827354, "grad_norm": 0.8671875, "learning_rate": 4.981331056577376e-05, "loss": 0.9068, "step": 372 }, { "epoch": 0.20026173618334955, "grad_norm": 0.7265625, "learning_rate": 4.9812279002338e-05, "loss": 1.0384, "step": 373 }, { "epoch": 0.20079863091842556, "grad_norm": 1.0, "learning_rate": 4.981124460750698e-05, "loss": 1.4422, "step": 374 }, { "epoch": 0.20133552565350157, "grad_norm": 0.78515625, "learning_rate": 4.981020738139874e-05, "loss": 0.8491, "step": 375 }, { "epoch": 0.20187242038857756, "grad_norm": 0.78125, "learning_rate": 4.9809167324131645e-05, "loss": 0.9107, "step": 376 }, { "epoch": 0.20240931512365357, "grad_norm": 0.9296875, "learning_rate": 4.980812443582438e-05, "loss": 1.1261, "step": 377 }, { "epoch": 0.20294620985872958, "grad_norm": 0.63671875, "learning_rate": 4.980707871659593e-05, "loss": 0.9291, "step": 378 }, { "epoch": 0.20348310459380559, "grad_norm": 0.8125, "learning_rate": 4.980603016656567e-05, "loss": 1.0956, "step": 379 }, { "epoch": 0.20401999932888157, "grad_norm": 0.80859375, "learning_rate": 4.9804978785853196e-05, "loss": 1.1436, "step": 380 }, { "epoch": 0.20455689406395758, "grad_norm": 0.85546875, "learning_rate": 4.980392457457853e-05, "loss": 1.1608, "step": 381 }, { "epoch": 0.2050937887990336, "grad_norm": 0.703125, "learning_rate": 4.980286753286195e-05, "loss": 1.1159, "step": 382 }, { "epoch": 0.2056306835341096, "grad_norm": 0.85546875, "learning_rate": 4.980180766082408e-05, "loss": 1.1763, "step": 383 }, { "epoch": 0.2061675782691856, "grad_norm": 0.73828125, "learning_rate": 4.9800744958585864e-05, "loss": 1.0288, "step": 384 }, { "epoch": 0.2067044730042616, "grad_norm": 0.78515625, "learning_rate": 4.979967942626858e-05, "loss": 1.1797, "step": 385 }, { "epoch": 0.2072413677393376, "grad_norm": 0.7578125, "learning_rate": 4.9798611063993805e-05, "loss": 0.7979, "step": 386 }, { "epoch": 0.2077782624744136, "grad_norm": 0.72265625, "learning_rate": 4.9797539871883456e-05, "loss": 1.0582, "step": 387 }, { "epoch": 0.20831515720948962, "grad_norm": 0.76953125, "learning_rate": 4.979646585005978e-05, "loss": 1.1325, "step": 388 }, { "epoch": 0.20885205194456563, "grad_norm": 0.9140625, "learning_rate": 4.979538899864532e-05, "loss": 1.0, "step": 389 }, { "epoch": 0.2093889466796416, "grad_norm": 0.6875, "learning_rate": 4.979430931776298e-05, "loss": 0.8141, "step": 390 }, { "epoch": 0.20992584141471762, "grad_norm": 0.7109375, "learning_rate": 4.9793226807535944e-05, "loss": 0.8747, "step": 391 }, { "epoch": 0.21046273614979363, "grad_norm": 0.86328125, "learning_rate": 4.9792141468087746e-05, "loss": 0.9623, "step": 392 }, { "epoch": 0.21099963088486964, "grad_norm": 0.78515625, "learning_rate": 4.979105329954224e-05, "loss": 0.8694, "step": 393 }, { "epoch": 0.21153652561994563, "grad_norm": 0.7734375, "learning_rate": 4.97899623020236e-05, "loss": 0.8215, "step": 394 }, { "epoch": 0.21207342035502164, "grad_norm": 0.98046875, "learning_rate": 4.9788868475656325e-05, "loss": 1.1688, "step": 395 }, { "epoch": 0.21261031509009765, "grad_norm": 0.83984375, "learning_rate": 4.978777182056523e-05, "loss": 1.0416, "step": 396 }, { "epoch": 0.21314720982517366, "grad_norm": 0.671875, "learning_rate": 4.978667233687546e-05, "loss": 1.0144, "step": 397 }, { "epoch": 0.21368410456024967, "grad_norm": 0.78515625, "learning_rate": 4.9785570024712475e-05, "loss": 1.0425, "step": 398 }, { "epoch": 0.21422099929532565, "grad_norm": 0.875, "learning_rate": 4.9784464884202067e-05, "loss": 1.1703, "step": 399 }, { "epoch": 0.21475789403040166, "grad_norm": 0.73828125, "learning_rate": 4.9783356915470344e-05, "loss": 1.168, "step": 400 }, { "epoch": 0.21475789403040166, "eval_loss": 0.9871569871902466, "eval_runtime": 45.8833, "eval_samples_per_second": 20.966, "eval_steps_per_second": 20.966, "step": 400 }, { "epoch": 0.21529478876547767, "grad_norm": 0.75, "learning_rate": 4.978224611864375e-05, "loss": 0.9708, "step": 401 }, { "epoch": 0.21583168350055368, "grad_norm": 0.76953125, "learning_rate": 4.9781132493849025e-05, "loss": 1.0356, "step": 402 }, { "epoch": 0.2163685782356297, "grad_norm": 0.74609375, "learning_rate": 4.9780016041213254e-05, "loss": 1.0487, "step": 403 }, { "epoch": 0.21690547297070567, "grad_norm": 0.875, "learning_rate": 4.977889676086383e-05, "loss": 1.0294, "step": 404 }, { "epoch": 0.21744236770578168, "grad_norm": 0.94140625, "learning_rate": 4.977777465292851e-05, "loss": 1.3656, "step": 405 }, { "epoch": 0.2179792624408577, "grad_norm": 0.8046875, "learning_rate": 4.97766497175353e-05, "loss": 1.1777, "step": 406 }, { "epoch": 0.2185161571759337, "grad_norm": 0.86328125, "learning_rate": 4.977552195481259e-05, "loss": 0.9866, "step": 407 }, { "epoch": 0.2190530519110097, "grad_norm": 0.76171875, "learning_rate": 4.977439136488907e-05, "loss": 1.0517, "step": 408 }, { "epoch": 0.2195899466460857, "grad_norm": 1.015625, "learning_rate": 4.977325794789375e-05, "loss": 1.1073, "step": 409 }, { "epoch": 0.2201268413811617, "grad_norm": 0.8671875, "learning_rate": 4.977212170395598e-05, "loss": 0.8281, "step": 410 }, { "epoch": 0.2206637361162377, "grad_norm": 0.88671875, "learning_rate": 4.97709826332054e-05, "loss": 1.1745, "step": 411 }, { "epoch": 0.22120063085131372, "grad_norm": 0.87890625, "learning_rate": 4.9769840735772e-05, "loss": 1.0383, "step": 412 }, { "epoch": 0.2217375255863897, "grad_norm": 0.6953125, "learning_rate": 4.976869601178609e-05, "loss": 1.0757, "step": 413 }, { "epoch": 0.22227442032146572, "grad_norm": 0.81640625, "learning_rate": 4.9767548461378296e-05, "loss": 1.0685, "step": 414 }, { "epoch": 0.22281131505654173, "grad_norm": 0.83203125, "learning_rate": 4.976639808467957e-05, "loss": 1.0155, "step": 415 }, { "epoch": 0.22334820979161774, "grad_norm": 0.79296875, "learning_rate": 4.976524488182118e-05, "loss": 1.0609, "step": 416 }, { "epoch": 0.22388510452669375, "grad_norm": 0.78515625, "learning_rate": 4.976408885293472e-05, "loss": 1.1626, "step": 417 }, { "epoch": 0.22442199926176973, "grad_norm": 0.6640625, "learning_rate": 4.976292999815211e-05, "loss": 0.9083, "step": 418 }, { "epoch": 0.22495889399684574, "grad_norm": 0.80078125, "learning_rate": 4.9761768317605596e-05, "loss": 1.0945, "step": 419 }, { "epoch": 0.22549578873192175, "grad_norm": 0.859375, "learning_rate": 4.976060381142773e-05, "loss": 0.8575, "step": 420 }, { "epoch": 0.22603268346699776, "grad_norm": 0.71484375, "learning_rate": 4.9759436479751406e-05, "loss": 1.0251, "step": 421 }, { "epoch": 0.22656957820207377, "grad_norm": 0.75390625, "learning_rate": 4.975826632270982e-05, "loss": 0.7913, "step": 422 }, { "epoch": 0.22710647293714975, "grad_norm": 0.76953125, "learning_rate": 4.975709334043651e-05, "loss": 0.9925, "step": 423 }, { "epoch": 0.22764336767222576, "grad_norm": 0.90625, "learning_rate": 4.975591753306533e-05, "loss": 1.0412, "step": 424 }, { "epoch": 0.22818026240730177, "grad_norm": 0.76171875, "learning_rate": 4.975473890073045e-05, "loss": 1.0033, "step": 425 }, { "epoch": 0.22871715714237778, "grad_norm": 0.80078125, "learning_rate": 4.975355744356637e-05, "loss": 0.9722, "step": 426 }, { "epoch": 0.2292540518774538, "grad_norm": 0.734375, "learning_rate": 4.97523731617079e-05, "loss": 0.8067, "step": 427 }, { "epoch": 0.22979094661252977, "grad_norm": 1.0, "learning_rate": 4.975118605529019e-05, "loss": 1.1088, "step": 428 }, { "epoch": 0.23032784134760578, "grad_norm": 0.6953125, "learning_rate": 4.974999612444871e-05, "loss": 0.8336, "step": 429 }, { "epoch": 0.2308647360826818, "grad_norm": 0.94921875, "learning_rate": 4.974880336931923e-05, "loss": 1.0894, "step": 430 }, { "epoch": 0.2314016308177578, "grad_norm": 1.0234375, "learning_rate": 4.974760779003787e-05, "loss": 1.2324, "step": 431 }, { "epoch": 0.23193852555283379, "grad_norm": 0.796875, "learning_rate": 4.974640938674107e-05, "loss": 1.2026, "step": 432 }, { "epoch": 0.2324754202879098, "grad_norm": 0.87890625, "learning_rate": 4.974520815956555e-05, "loss": 1.0034, "step": 433 }, { "epoch": 0.2330123150229858, "grad_norm": 0.76171875, "learning_rate": 4.974400410864842e-05, "loss": 0.9869, "step": 434 }, { "epoch": 0.23354920975806182, "grad_norm": 0.671875, "learning_rate": 4.974279723412706e-05, "loss": 0.9489, "step": 435 }, { "epoch": 0.23408610449313783, "grad_norm": 0.765625, "learning_rate": 4.9741587536139204e-05, "loss": 1.1474, "step": 436 }, { "epoch": 0.2346229992282138, "grad_norm": 0.640625, "learning_rate": 4.974037501482287e-05, "loss": 0.7989, "step": 437 }, { "epoch": 0.23515989396328982, "grad_norm": 0.73046875, "learning_rate": 4.973915967031644e-05, "loss": 0.9636, "step": 438 }, { "epoch": 0.23569678869836583, "grad_norm": 1.1953125, "learning_rate": 4.973794150275859e-05, "loss": 1.1533, "step": 439 }, { "epoch": 0.23623368343344184, "grad_norm": 0.7109375, "learning_rate": 4.9736720512288334e-05, "loss": 0.9366, "step": 440 }, { "epoch": 0.23677057816851785, "grad_norm": 1.2421875, "learning_rate": 4.973549669904501e-05, "loss": 1.1356, "step": 441 }, { "epoch": 0.23730747290359383, "grad_norm": 0.890625, "learning_rate": 4.973427006316826e-05, "loss": 1.1003, "step": 442 }, { "epoch": 0.23784436763866984, "grad_norm": 1.0234375, "learning_rate": 4.973304060479807e-05, "loss": 0.9756, "step": 443 }, { "epoch": 0.23838126237374585, "grad_norm": 0.76953125, "learning_rate": 4.9731808324074717e-05, "loss": 0.936, "step": 444 }, { "epoch": 0.23891815710882186, "grad_norm": 0.69140625, "learning_rate": 4.973057322113883e-05, "loss": 1.0434, "step": 445 }, { "epoch": 0.23945505184389784, "grad_norm": 0.8359375, "learning_rate": 4.972933529613135e-05, "loss": 0.9196, "step": 446 }, { "epoch": 0.23999194657897385, "grad_norm": 0.7578125, "learning_rate": 4.972809454919355e-05, "loss": 0.9158, "step": 447 }, { "epoch": 0.24052884131404986, "grad_norm": 0.890625, "learning_rate": 4.9726850980467e-05, "loss": 1.2073, "step": 448 }, { "epoch": 0.24106573604912587, "grad_norm": 0.7265625, "learning_rate": 4.972560459009361e-05, "loss": 1.0106, "step": 449 }, { "epoch": 0.24160263078420188, "grad_norm": 0.98828125, "learning_rate": 4.972435537821562e-05, "loss": 1.0991, "step": 450 }, { "epoch": 0.24160263078420188, "eval_loss": 0.9799502491950989, "eval_runtime": 46.5737, "eval_samples_per_second": 20.655, "eval_steps_per_second": 20.655, "step": 450 }, { "epoch": 0.24213952551927787, "grad_norm": 0.78515625, "learning_rate": 4.972310334497557e-05, "loss": 1.1096, "step": 451 }, { "epoch": 0.24267642025435388, "grad_norm": 0.84375, "learning_rate": 4.972184849051633e-05, "loss": 0.9439, "step": 452 }, { "epoch": 0.24321331498942989, "grad_norm": 0.69921875, "learning_rate": 4.972059081498111e-05, "loss": 1.169, "step": 453 }, { "epoch": 0.2437502097245059, "grad_norm": 0.671875, "learning_rate": 4.971933031851341e-05, "loss": 0.8649, "step": 454 }, { "epoch": 0.2442871044595819, "grad_norm": 0.90234375, "learning_rate": 4.971806700125708e-05, "loss": 0.9772, "step": 455 }, { "epoch": 0.2448239991946579, "grad_norm": 1.0390625, "learning_rate": 4.971680086335627e-05, "loss": 1.1061, "step": 456 }, { "epoch": 0.2453608939297339, "grad_norm": 0.68359375, "learning_rate": 4.971553190495547e-05, "loss": 0.8425, "step": 457 }, { "epoch": 0.2458977886648099, "grad_norm": 0.83203125, "learning_rate": 4.971426012619949e-05, "loss": 1.1449, "step": 458 }, { "epoch": 0.24643468339988592, "grad_norm": 0.8671875, "learning_rate": 4.9712985527233444e-05, "loss": 1.1962, "step": 459 }, { "epoch": 0.24697157813496193, "grad_norm": 0.83984375, "learning_rate": 4.971170810820279e-05, "loss": 1.0111, "step": 460 }, { "epoch": 0.2475084728700379, "grad_norm": 0.76953125, "learning_rate": 4.971042786925328e-05, "loss": 1.0539, "step": 461 }, { "epoch": 0.24804536760511392, "grad_norm": 0.73828125, "learning_rate": 4.9709144810531026e-05, "loss": 0.9385, "step": 462 }, { "epoch": 0.24858226234018993, "grad_norm": 0.83984375, "learning_rate": 4.9707858932182436e-05, "loss": 1.1144, "step": 463 }, { "epoch": 0.24911915707526594, "grad_norm": 0.6796875, "learning_rate": 4.970657023435424e-05, "loss": 0.8469, "step": 464 }, { "epoch": 0.24965605181034192, "grad_norm": 0.71875, "learning_rate": 4.9705278717193496e-05, "loss": 0.9331, "step": 465 }, { "epoch": 0.25019294654541796, "grad_norm": 0.8359375, "learning_rate": 4.970398438084758e-05, "loss": 0.9805, "step": 466 }, { "epoch": 0.25072984128049397, "grad_norm": 0.72265625, "learning_rate": 4.97026872254642e-05, "loss": 1.0235, "step": 467 }, { "epoch": 0.2512667360155699, "grad_norm": 0.78515625, "learning_rate": 4.9701387251191364e-05, "loss": 1.0945, "step": 468 }, { "epoch": 0.25180363075064593, "grad_norm": 0.578125, "learning_rate": 4.970008445817744e-05, "loss": 0.7344, "step": 469 }, { "epoch": 0.25234052548572194, "grad_norm": 0.78125, "learning_rate": 4.969877884657107e-05, "loss": 0.921, "step": 470 }, { "epoch": 0.25287742022079795, "grad_norm": 0.8203125, "learning_rate": 4.969747041652125e-05, "loss": 0.815, "step": 471 }, { "epoch": 0.25341431495587396, "grad_norm": 0.73046875, "learning_rate": 4.969615916817728e-05, "loss": 1.0928, "step": 472 }, { "epoch": 0.25395120969095, "grad_norm": 0.765625, "learning_rate": 4.9694845101688806e-05, "loss": 0.9634, "step": 473 }, { "epoch": 0.254488104426026, "grad_norm": 0.90625, "learning_rate": 4.969352821720577e-05, "loss": 1.0081, "step": 474 }, { "epoch": 0.255024999161102, "grad_norm": 0.765625, "learning_rate": 4.9692208514878444e-05, "loss": 0.9553, "step": 475 }, { "epoch": 0.255561893896178, "grad_norm": 0.765625, "learning_rate": 4.969088599485743e-05, "loss": 1.0275, "step": 476 }, { "epoch": 0.25609878863125396, "grad_norm": 0.671875, "learning_rate": 4.9689560657293633e-05, "loss": 0.9847, "step": 477 }, { "epoch": 0.25663568336632997, "grad_norm": 0.8671875, "learning_rate": 4.96882325023383e-05, "loss": 1.0905, "step": 478 }, { "epoch": 0.257172578101406, "grad_norm": 0.78515625, "learning_rate": 4.968690153014298e-05, "loss": 0.9663, "step": 479 }, { "epoch": 0.257709472836482, "grad_norm": 0.7890625, "learning_rate": 4.968556774085957e-05, "loss": 1.132, "step": 480 }, { "epoch": 0.258246367571558, "grad_norm": 0.6796875, "learning_rate": 4.9684231134640264e-05, "loss": 0.9002, "step": 481 }, { "epoch": 0.258783262306634, "grad_norm": 0.71484375, "learning_rate": 4.968289171163758e-05, "loss": 1.0553, "step": 482 }, { "epoch": 0.25932015704171, "grad_norm": 0.87109375, "learning_rate": 4.9681549472004376e-05, "loss": 1.1524, "step": 483 }, { "epoch": 0.25985705177678603, "grad_norm": 0.828125, "learning_rate": 4.9680204415893804e-05, "loss": 1.0497, "step": 484 }, { "epoch": 0.26039394651186204, "grad_norm": 0.6328125, "learning_rate": 4.967885654345936e-05, "loss": 0.7001, "step": 485 }, { "epoch": 0.260930841246938, "grad_norm": 0.84375, "learning_rate": 4.967750585485484e-05, "loss": 1.1598, "step": 486 }, { "epoch": 0.261467735982014, "grad_norm": 0.86328125, "learning_rate": 4.967615235023441e-05, "loss": 1.0899, "step": 487 }, { "epoch": 0.26200463071709, "grad_norm": 0.80859375, "learning_rate": 4.967479602975248e-05, "loss": 1.1266, "step": 488 }, { "epoch": 0.262541525452166, "grad_norm": 0.7578125, "learning_rate": 4.967343689356385e-05, "loss": 1.0635, "step": 489 }, { "epoch": 0.26307842018724203, "grad_norm": 0.828125, "learning_rate": 4.967207494182361e-05, "loss": 1.083, "step": 490 }, { "epoch": 0.26361531492231804, "grad_norm": 0.72265625, "learning_rate": 4.967071017468716e-05, "loss": 1.0513, "step": 491 }, { "epoch": 0.26415220965739405, "grad_norm": 0.79296875, "learning_rate": 4.966934259231026e-05, "loss": 0.9036, "step": 492 }, { "epoch": 0.26468910439247006, "grad_norm": 0.6640625, "learning_rate": 4.9667972194848955e-05, "loss": 1.0816, "step": 493 }, { "epoch": 0.2652259991275461, "grad_norm": 1.0078125, "learning_rate": 4.9666598982459635e-05, "loss": 1.1239, "step": 494 }, { "epoch": 0.2657628938626221, "grad_norm": 0.85546875, "learning_rate": 4.9665222955298984e-05, "loss": 1.0615, "step": 495 }, { "epoch": 0.26629978859769804, "grad_norm": 0.75390625, "learning_rate": 4.9663844113524035e-05, "loss": 0.833, "step": 496 }, { "epoch": 0.26683668333277405, "grad_norm": 0.76171875, "learning_rate": 4.966246245729213e-05, "loss": 0.8344, "step": 497 }, { "epoch": 0.26737357806785006, "grad_norm": 0.74609375, "learning_rate": 4.966107798676095e-05, "loss": 1.17, "step": 498 }, { "epoch": 0.26791047280292607, "grad_norm": 0.78125, "learning_rate": 4.9659690702088447e-05, "loss": 0.8675, "step": 499 }, { "epoch": 0.2684473675380021, "grad_norm": 0.859375, "learning_rate": 4.965830060343295e-05, "loss": 0.9243, "step": 500 }, { "epoch": 0.2684473675380021, "eval_loss": 0.9750539660453796, "eval_runtime": 46.2834, "eval_samples_per_second": 20.785, "eval_steps_per_second": 20.785, "step": 500 }, { "epoch": 0.2689842622730781, "grad_norm": 0.74609375, "learning_rate": 4.965690769095308e-05, "loss": 0.8827, "step": 501 }, { "epoch": 0.2695211570081541, "grad_norm": 0.80859375, "learning_rate": 4.9655511964807785e-05, "loss": 1.1728, "step": 502 }, { "epoch": 0.2700580517432301, "grad_norm": 0.84375, "learning_rate": 4.965411342515635e-05, "loss": 1.0448, "step": 503 }, { "epoch": 0.2705949464783061, "grad_norm": 0.74609375, "learning_rate": 4.965271207215835e-05, "loss": 0.959, "step": 504 }, { "epoch": 0.2711318412133821, "grad_norm": 0.7265625, "learning_rate": 4.965130790597369e-05, "loss": 1.2904, "step": 505 }, { "epoch": 0.2716687359484581, "grad_norm": 0.8515625, "learning_rate": 4.964990092676263e-05, "loss": 0.9568, "step": 506 }, { "epoch": 0.2722056306835341, "grad_norm": 0.74609375, "learning_rate": 4.964849113468569e-05, "loss": 1.0171, "step": 507 }, { "epoch": 0.2727425254186101, "grad_norm": 1.125, "learning_rate": 4.964707852990378e-05, "loss": 1.2031, "step": 508 }, { "epoch": 0.2732794201536861, "grad_norm": 0.94140625, "learning_rate": 4.9645663112578075e-05, "loss": 0.9805, "step": 509 }, { "epoch": 0.2738163148887621, "grad_norm": 0.82421875, "learning_rate": 4.964424488287009e-05, "loss": 1.0737, "step": 510 }, { "epoch": 0.27435320962383813, "grad_norm": 0.64453125, "learning_rate": 4.9642823840941684e-05, "loss": 1.0363, "step": 511 }, { "epoch": 0.27489010435891414, "grad_norm": 0.92578125, "learning_rate": 4.9641399986955e-05, "loss": 1.2236, "step": 512 }, { "epoch": 0.27542699909399015, "grad_norm": 0.64453125, "learning_rate": 4.9639973321072506e-05, "loss": 0.8331, "step": 513 }, { "epoch": 0.27596389382906616, "grad_norm": 0.72265625, "learning_rate": 4.963854384345702e-05, "loss": 0.9885, "step": 514 }, { "epoch": 0.2765007885641421, "grad_norm": 0.94140625, "learning_rate": 4.9637111554271665e-05, "loss": 0.9867, "step": 515 }, { "epoch": 0.27703768329921813, "grad_norm": 0.71484375, "learning_rate": 4.963567645367988e-05, "loss": 0.974, "step": 516 }, { "epoch": 0.27757457803429414, "grad_norm": 0.7421875, "learning_rate": 4.9634238541845415e-05, "loss": 1.044, "step": 517 }, { "epoch": 0.27811147276937015, "grad_norm": 1.0, "learning_rate": 4.9632797818932374e-05, "loss": 1.099, "step": 518 }, { "epoch": 0.27864836750444616, "grad_norm": 0.859375, "learning_rate": 4.9631354285105145e-05, "loss": 1.0237, "step": 519 }, { "epoch": 0.27918526223952217, "grad_norm": 0.84765625, "learning_rate": 4.962990794052847e-05, "loss": 1.0559, "step": 520 }, { "epoch": 0.2797221569745982, "grad_norm": 0.74609375, "learning_rate": 4.962845878536738e-05, "loss": 0.9844, "step": 521 }, { "epoch": 0.2802590517096742, "grad_norm": 0.71484375, "learning_rate": 4.962700681978725e-05, "loss": 0.931, "step": 522 }, { "epoch": 0.2807959464447502, "grad_norm": 0.81640625, "learning_rate": 4.962555204395377e-05, "loss": 0.9192, "step": 523 }, { "epoch": 0.28133284117982615, "grad_norm": 0.71484375, "learning_rate": 4.9624094458032946e-05, "loss": 1.1457, "step": 524 }, { "epoch": 0.28186973591490216, "grad_norm": 0.83984375, "learning_rate": 4.96226340621911e-05, "loss": 1.0615, "step": 525 }, { "epoch": 0.2824066306499782, "grad_norm": 0.72265625, "learning_rate": 4.962117085659489e-05, "loss": 0.9441, "step": 526 }, { "epoch": 0.2829435253850542, "grad_norm": 0.77734375, "learning_rate": 4.961970484141128e-05, "loss": 1.1009, "step": 527 }, { "epoch": 0.2834804201201302, "grad_norm": 0.671875, "learning_rate": 4.9618236016807564e-05, "loss": 0.9566, "step": 528 }, { "epoch": 0.2840173148552062, "grad_norm": 0.8203125, "learning_rate": 4.961676438295135e-05, "loss": 0.8927, "step": 529 }, { "epoch": 0.2845542095902822, "grad_norm": 0.8203125, "learning_rate": 4.9615289940010584e-05, "loss": 1.1116, "step": 530 }, { "epoch": 0.2850911043253582, "grad_norm": 0.671875, "learning_rate": 4.96138126881535e-05, "loss": 1.004, "step": 531 }, { "epoch": 0.28562799906043423, "grad_norm": 0.7578125, "learning_rate": 4.9612332627548686e-05, "loss": 1.0446, "step": 532 }, { "epoch": 0.28616489379551024, "grad_norm": 0.71484375, "learning_rate": 4.961084975836503e-05, "loss": 0.9992, "step": 533 }, { "epoch": 0.2867017885305862, "grad_norm": 0.64453125, "learning_rate": 4.9609364080771735e-05, "loss": 0.8808, "step": 534 }, { "epoch": 0.2872386832656622, "grad_norm": 0.84375, "learning_rate": 4.960787559493836e-05, "loss": 1.1595, "step": 535 }, { "epoch": 0.2877755780007382, "grad_norm": 0.890625, "learning_rate": 4.960638430103473e-05, "loss": 1.056, "step": 536 }, { "epoch": 0.28831247273581423, "grad_norm": 0.74609375, "learning_rate": 4.960489019923105e-05, "loss": 0.9335, "step": 537 }, { "epoch": 0.28884936747089024, "grad_norm": 0.63671875, "learning_rate": 4.96033932896978e-05, "loss": 0.8555, "step": 538 }, { "epoch": 0.28938626220596625, "grad_norm": 1.1328125, "learning_rate": 4.9601893572605796e-05, "loss": 1.1849, "step": 539 }, { "epoch": 0.28992315694104226, "grad_norm": 0.76171875, "learning_rate": 4.960039104812618e-05, "loss": 1.1725, "step": 540 }, { "epoch": 0.29046005167611827, "grad_norm": 0.77734375, "learning_rate": 4.9598885716430404e-05, "loss": 0.8767, "step": 541 }, { "epoch": 0.2909969464111943, "grad_norm": 0.84765625, "learning_rate": 4.959737757769025e-05, "loss": 1.11, "step": 542 }, { "epoch": 0.29153384114627023, "grad_norm": 0.82421875, "learning_rate": 4.9595866632077816e-05, "loss": 1.0985, "step": 543 }, { "epoch": 0.29207073588134624, "grad_norm": 0.96875, "learning_rate": 4.959435287976551e-05, "loss": 1.2694, "step": 544 }, { "epoch": 0.29260763061642225, "grad_norm": 0.796875, "learning_rate": 4.959283632092608e-05, "loss": 1.0265, "step": 545 }, { "epoch": 0.29314452535149826, "grad_norm": 0.80078125, "learning_rate": 4.9591316955732595e-05, "loss": 1.0899, "step": 546 }, { "epoch": 0.2936814200865743, "grad_norm": 0.82421875, "learning_rate": 4.9589794784358406e-05, "loss": 1.1073, "step": 547 }, { "epoch": 0.2942183148216503, "grad_norm": 0.6875, "learning_rate": 4.9588269806977236e-05, "loss": 0.8764, "step": 548 }, { "epoch": 0.2947552095567263, "grad_norm": 0.6328125, "learning_rate": 4.958674202376309e-05, "loss": 1.0164, "step": 549 }, { "epoch": 0.2952921042918023, "grad_norm": 0.98828125, "learning_rate": 4.958521143489032e-05, "loss": 1.3314, "step": 550 }, { "epoch": 0.2952921042918023, "eval_loss": 0.9689125418663025, "eval_runtime": 46.1941, "eval_samples_per_second": 20.825, "eval_steps_per_second": 20.825, "step": 550 }, { "epoch": 0.2958289990268783, "grad_norm": 0.73046875, "learning_rate": 4.958367804053357e-05, "loss": 0.9652, "step": 551 }, { "epoch": 0.2963658937619543, "grad_norm": 0.76171875, "learning_rate": 4.9582141840867835e-05, "loss": 0.9144, "step": 552 }, { "epoch": 0.2969027884970303, "grad_norm": 0.71875, "learning_rate": 4.9580602836068414e-05, "loss": 0.7835, "step": 553 }, { "epoch": 0.2974396832321063, "grad_norm": 0.67578125, "learning_rate": 4.957906102631091e-05, "loss": 0.8645, "step": 554 }, { "epoch": 0.2979765779671823, "grad_norm": 0.70703125, "learning_rate": 4.957751641177128e-05, "loss": 0.9647, "step": 555 }, { "epoch": 0.2985134727022583, "grad_norm": 0.79296875, "learning_rate": 4.9575968992625775e-05, "loss": 1.0369, "step": 556 }, { "epoch": 0.2990503674373343, "grad_norm": 0.8984375, "learning_rate": 4.957441876905098e-05, "loss": 1.3041, "step": 557 }, { "epoch": 0.29958726217241033, "grad_norm": 0.734375, "learning_rate": 4.957286574122379e-05, "loss": 1.2297, "step": 558 }, { "epoch": 0.30012415690748634, "grad_norm": 0.77734375, "learning_rate": 4.957130990932144e-05, "loss": 1.1134, "step": 559 }, { "epoch": 0.30066105164256235, "grad_norm": 0.703125, "learning_rate": 4.9569751273521454e-05, "loss": 0.9296, "step": 560 }, { "epoch": 0.30119794637763836, "grad_norm": 0.79296875, "learning_rate": 4.956818983400169e-05, "loss": 0.9917, "step": 561 }, { "epoch": 0.3017348411127143, "grad_norm": 0.6640625, "learning_rate": 4.956662559094034e-05, "loss": 0.9292, "step": 562 }, { "epoch": 0.3022717358477903, "grad_norm": 0.69140625, "learning_rate": 4.9565058544515905e-05, "loss": 0.8894, "step": 563 }, { "epoch": 0.30280863058286633, "grad_norm": 0.7421875, "learning_rate": 4.9563488694907186e-05, "loss": 0.9931, "step": 564 }, { "epoch": 0.30334552531794234, "grad_norm": 0.75, "learning_rate": 4.9561916042293345e-05, "loss": 1.0937, "step": 565 }, { "epoch": 0.30388242005301835, "grad_norm": 0.72265625, "learning_rate": 4.9560340586853825e-05, "loss": 0.9625, "step": 566 }, { "epoch": 0.30441931478809436, "grad_norm": 0.8046875, "learning_rate": 4.955876232876842e-05, "loss": 0.9373, "step": 567 }, { "epoch": 0.3049562095231704, "grad_norm": 0.6953125, "learning_rate": 4.9557181268217227e-05, "loss": 0.949, "step": 568 }, { "epoch": 0.3054931042582464, "grad_norm": 0.7109375, "learning_rate": 4.9555597405380647e-05, "loss": 0.9483, "step": 569 }, { "epoch": 0.3060299989933224, "grad_norm": 0.80078125, "learning_rate": 4.9554010740439435e-05, "loss": 0.9059, "step": 570 }, { "epoch": 0.30656689372839835, "grad_norm": 0.6953125, "learning_rate": 4.955242127357465e-05, "loss": 1.0488, "step": 571 }, { "epoch": 0.30710378846347436, "grad_norm": 0.7578125, "learning_rate": 4.955082900496766e-05, "loss": 0.9612, "step": 572 }, { "epoch": 0.30764068319855037, "grad_norm": 0.90234375, "learning_rate": 4.954923393480017e-05, "loss": 1.2172, "step": 573 }, { "epoch": 0.3081775779336264, "grad_norm": 0.7421875, "learning_rate": 4.9547636063254196e-05, "loss": 1.2283, "step": 574 }, { "epoch": 0.3087144726687024, "grad_norm": 0.9296875, "learning_rate": 4.954603539051208e-05, "loss": 0.9913, "step": 575 }, { "epoch": 0.3092513674037784, "grad_norm": 0.82421875, "learning_rate": 4.954443191675648e-05, "loss": 1.0052, "step": 576 }, { "epoch": 0.3097882621388544, "grad_norm": 0.62890625, "learning_rate": 4.954282564217036e-05, "loss": 0.665, "step": 577 }, { "epoch": 0.3103251568739304, "grad_norm": 0.74609375, "learning_rate": 4.954121656693703e-05, "loss": 0.9044, "step": 578 }, { "epoch": 0.31086205160900643, "grad_norm": 0.92578125, "learning_rate": 4.95396046912401e-05, "loss": 1.3292, "step": 579 }, { "epoch": 0.31139894634408244, "grad_norm": 1.0078125, "learning_rate": 4.9537990015263505e-05, "loss": 0.9049, "step": 580 }, { "epoch": 0.3119358410791584, "grad_norm": 0.6875, "learning_rate": 4.95363725391915e-05, "loss": 0.9415, "step": 581 }, { "epoch": 0.3124727358142344, "grad_norm": 0.9609375, "learning_rate": 4.953475226320866e-05, "loss": 0.9518, "step": 582 }, { "epoch": 0.3130096305493104, "grad_norm": 0.66015625, "learning_rate": 4.9533129187499885e-05, "loss": 0.8416, "step": 583 }, { "epoch": 0.3135465252843864, "grad_norm": 0.7265625, "learning_rate": 4.9531503312250375e-05, "loss": 1.0242, "step": 584 }, { "epoch": 0.31408342001946243, "grad_norm": 0.828125, "learning_rate": 4.9529874637645675e-05, "loss": 1.0651, "step": 585 }, { "epoch": 0.31462031475453844, "grad_norm": 1.046875, "learning_rate": 4.952824316387163e-05, "loss": 0.9305, "step": 586 }, { "epoch": 0.31515720948961445, "grad_norm": 0.92578125, "learning_rate": 4.9526608891114415e-05, "loss": 0.919, "step": 587 }, { "epoch": 0.31569410422469046, "grad_norm": 0.71484375, "learning_rate": 4.952497181956053e-05, "loss": 1.0465, "step": 588 }, { "epoch": 0.3162309989597665, "grad_norm": 0.69921875, "learning_rate": 4.952333194939677e-05, "loss": 0.9995, "step": 589 }, { "epoch": 0.31676789369484243, "grad_norm": 0.85546875, "learning_rate": 4.952168928081027e-05, "loss": 1.1415, "step": 590 }, { "epoch": 0.31730478842991844, "grad_norm": 0.73828125, "learning_rate": 4.952004381398848e-05, "loss": 1.0067, "step": 591 }, { "epoch": 0.31784168316499445, "grad_norm": 0.59765625, "learning_rate": 4.951839554911917e-05, "loss": 0.9907, "step": 592 }, { "epoch": 0.31837857790007046, "grad_norm": 0.8359375, "learning_rate": 4.951674448639043e-05, "loss": 1.0772, "step": 593 }, { "epoch": 0.31891547263514647, "grad_norm": 0.72265625, "learning_rate": 4.951509062599066e-05, "loss": 0.9815, "step": 594 }, { "epoch": 0.3194523673702225, "grad_norm": 0.7734375, "learning_rate": 4.95134339681086e-05, "loss": 1.0787, "step": 595 }, { "epoch": 0.3199892621052985, "grad_norm": 0.72265625, "learning_rate": 4.951177451293328e-05, "loss": 0.9766, "step": 596 }, { "epoch": 0.3205261568403745, "grad_norm": 0.56640625, "learning_rate": 4.951011226065407e-05, "loss": 0.6743, "step": 597 }, { "epoch": 0.3210630515754505, "grad_norm": 0.765625, "learning_rate": 4.950844721146066e-05, "loss": 0.8134, "step": 598 }, { "epoch": 0.3215999463105265, "grad_norm": 0.796875, "learning_rate": 4.9506779365543046e-05, "loss": 1.1082, "step": 599 }, { "epoch": 0.3221368410456025, "grad_norm": 0.7421875, "learning_rate": 4.950510872309155e-05, "loss": 1.0203, "step": 600 }, { "epoch": 0.3221368410456025, "eval_loss": 0.9650207757949829, "eval_runtime": 45.8281, "eval_samples_per_second": 20.991, "eval_steps_per_second": 20.991, "step": 600 }, { "epoch": 0.3226737357806785, "grad_norm": 0.84765625, "learning_rate": 4.950343528429683e-05, "loss": 1.0154, "step": 601 }, { "epoch": 0.3232106305157545, "grad_norm": 0.80078125, "learning_rate": 4.950175904934982e-05, "loss": 1.0035, "step": 602 }, { "epoch": 0.3237475252508305, "grad_norm": 0.8125, "learning_rate": 4.950008001844182e-05, "loss": 0.9076, "step": 603 }, { "epoch": 0.3242844199859065, "grad_norm": 1.671875, "learning_rate": 4.949839819176442e-05, "loss": 1.0813, "step": 604 }, { "epoch": 0.3248213147209825, "grad_norm": 0.703125, "learning_rate": 4.949671356950953e-05, "loss": 0.8759, "step": 605 }, { "epoch": 0.32535820945605853, "grad_norm": 0.859375, "learning_rate": 4.949502615186941e-05, "loss": 0.9611, "step": 606 }, { "epoch": 0.32589510419113454, "grad_norm": 0.79296875, "learning_rate": 4.94933359390366e-05, "loss": 0.8841, "step": 607 }, { "epoch": 0.32643199892621055, "grad_norm": 0.8359375, "learning_rate": 4.949164293120397e-05, "loss": 1.0533, "step": 608 }, { "epoch": 0.3269688936612865, "grad_norm": 0.69921875, "learning_rate": 4.948994712856472e-05, "loss": 1.0198, "step": 609 }, { "epoch": 0.3275057883963625, "grad_norm": 0.74609375, "learning_rate": 4.948824853131236e-05, "loss": 0.9614, "step": 610 }, { "epoch": 0.32804268313143853, "grad_norm": 0.75, "learning_rate": 4.948654713964073e-05, "loss": 1.0106, "step": 611 }, { "epoch": 0.32857957786651454, "grad_norm": 0.875, "learning_rate": 4.948484295374397e-05, "loss": 1.1489, "step": 612 }, { "epoch": 0.32911647260159055, "grad_norm": 0.75, "learning_rate": 4.9483135973816564e-05, "loss": 0.9575, "step": 613 }, { "epoch": 0.32965336733666656, "grad_norm": 0.6640625, "learning_rate": 4.948142620005328e-05, "loss": 0.9872, "step": 614 }, { "epoch": 0.33019026207174257, "grad_norm": 0.6796875, "learning_rate": 4.9479713632649236e-05, "loss": 0.8048, "step": 615 }, { "epoch": 0.3307271568068186, "grad_norm": 0.75390625, "learning_rate": 4.947799827179986e-05, "loss": 0.8788, "step": 616 }, { "epoch": 0.3312640515418946, "grad_norm": 0.6875, "learning_rate": 4.9476280117700884e-05, "loss": 0.9128, "step": 617 }, { "epoch": 0.3318009462769706, "grad_norm": 0.69140625, "learning_rate": 4.9474559170548387e-05, "loss": 0.9896, "step": 618 }, { "epoch": 0.33233784101204655, "grad_norm": 0.7421875, "learning_rate": 4.947283543053874e-05, "loss": 1.0966, "step": 619 }, { "epoch": 0.33287473574712256, "grad_norm": 0.71484375, "learning_rate": 4.947110889786864e-05, "loss": 1.0515, "step": 620 }, { "epoch": 0.3334116304821986, "grad_norm": 0.75, "learning_rate": 4.946937957273513e-05, "loss": 0.9433, "step": 621 }, { "epoch": 0.3339485252172746, "grad_norm": 1.3359375, "learning_rate": 4.946764745533552e-05, "loss": 0.9242, "step": 622 }, { "epoch": 0.3344854199523506, "grad_norm": 0.65625, "learning_rate": 4.946591254586748e-05, "loss": 0.9455, "step": 623 }, { "epoch": 0.3350223146874266, "grad_norm": 0.80859375, "learning_rate": 4.9464174844528984e-05, "loss": 0.8518, "step": 624 }, { "epoch": 0.3355592094225026, "grad_norm": 0.859375, "learning_rate": 4.946243435151832e-05, "loss": 0.9709, "step": 625 }, { "epoch": 0.3360961041575786, "grad_norm": 0.7890625, "learning_rate": 4.946069106703411e-05, "loss": 0.9883, "step": 626 }, { "epoch": 0.33663299889265463, "grad_norm": 0.76953125, "learning_rate": 4.945894499127528e-05, "loss": 1.1203, "step": 627 }, { "epoch": 0.3371698936277306, "grad_norm": 0.80859375, "learning_rate": 4.9457196124441073e-05, "loss": 1.0605, "step": 628 }, { "epoch": 0.3377067883628066, "grad_norm": 0.69140625, "learning_rate": 4.9455444466731066e-05, "loss": 1.022, "step": 629 }, { "epoch": 0.3382436830978826, "grad_norm": 0.75390625, "learning_rate": 4.9453690018345144e-05, "loss": 0.8821, "step": 630 }, { "epoch": 0.3387805778329586, "grad_norm": 0.63671875, "learning_rate": 4.9451932779483514e-05, "loss": 0.6977, "step": 631 }, { "epoch": 0.33931747256803463, "grad_norm": 2.40625, "learning_rate": 4.9450172750346684e-05, "loss": 0.965, "step": 632 }, { "epoch": 0.33985436730311064, "grad_norm": 0.90625, "learning_rate": 4.944840993113551e-05, "loss": 1.0392, "step": 633 }, { "epoch": 0.34039126203818665, "grad_norm": 0.7421875, "learning_rate": 4.944664432205115e-05, "loss": 0.8958, "step": 634 }, { "epoch": 0.34092815677326266, "grad_norm": 0.76953125, "learning_rate": 4.944487592329509e-05, "loss": 0.9605, "step": 635 }, { "epoch": 0.34146505150833867, "grad_norm": 0.8046875, "learning_rate": 4.944310473506911e-05, "loss": 0.9886, "step": 636 }, { "epoch": 0.3420019462434146, "grad_norm": 0.734375, "learning_rate": 4.944133075757533e-05, "loss": 1.1848, "step": 637 }, { "epoch": 0.34253884097849063, "grad_norm": 0.82421875, "learning_rate": 4.9439553991016187e-05, "loss": 0.8984, "step": 638 }, { "epoch": 0.34307573571356664, "grad_norm": 0.82421875, "learning_rate": 4.943777443559444e-05, "loss": 0.9367, "step": 639 }, { "epoch": 0.34361263044864265, "grad_norm": 0.92578125, "learning_rate": 4.943599209151314e-05, "loss": 1.0085, "step": 640 }, { "epoch": 0.34414952518371866, "grad_norm": 0.921875, "learning_rate": 4.9434206958975695e-05, "loss": 1.0865, "step": 641 }, { "epoch": 0.3446864199187947, "grad_norm": 0.82421875, "learning_rate": 4.9432419038185794e-05, "loss": 1.0724, "step": 642 }, { "epoch": 0.3452233146538707, "grad_norm": 0.8828125, "learning_rate": 4.9430628329347475e-05, "loss": 1.0542, "step": 643 }, { "epoch": 0.3457602093889467, "grad_norm": 0.66796875, "learning_rate": 4.942883483266507e-05, "loss": 1.0099, "step": 644 }, { "epoch": 0.3462971041240227, "grad_norm": 0.72265625, "learning_rate": 4.942703854834324e-05, "loss": 0.9407, "step": 645 }, { "epoch": 0.3468339988590987, "grad_norm": 0.6875, "learning_rate": 4.942523947658698e-05, "loss": 0.8455, "step": 646 }, { "epoch": 0.34737089359417467, "grad_norm": 0.8515625, "learning_rate": 4.9423437617601564e-05, "loss": 1.0387, "step": 647 }, { "epoch": 0.3479077883292507, "grad_norm": 0.88671875, "learning_rate": 4.942163297159263e-05, "loss": 0.8568, "step": 648 }, { "epoch": 0.3484446830643267, "grad_norm": 0.71875, "learning_rate": 4.941982553876609e-05, "loss": 1.0762, "step": 649 }, { "epoch": 0.3489815777994027, "grad_norm": 0.76171875, "learning_rate": 4.9418015319328204e-05, "loss": 0.9849, "step": 650 }, { "epoch": 0.3489815777994027, "eval_loss": 0.9629884362220764, "eval_runtime": 48.9415, "eval_samples_per_second": 19.656, "eval_steps_per_second": 19.656, "step": 650 }, { "epoch": 0.3495184725344787, "grad_norm": 0.6875, "learning_rate": 4.941620231348555e-05, "loss": 1.0022, "step": 651 }, { "epoch": 0.3500553672695547, "grad_norm": 0.67578125, "learning_rate": 4.9414386521445e-05, "loss": 0.8935, "step": 652 }, { "epoch": 0.35059226200463073, "grad_norm": 0.67578125, "learning_rate": 4.9412567943413755e-05, "loss": 1.0854, "step": 653 }, { "epoch": 0.35112915673970674, "grad_norm": 0.7421875, "learning_rate": 4.941074657959937e-05, "loss": 1.2573, "step": 654 }, { "epoch": 0.35166605147478275, "grad_norm": 0.78515625, "learning_rate": 4.940892243020964e-05, "loss": 0.7695, "step": 655 }, { "epoch": 0.3522029462098587, "grad_norm": 0.6796875, "learning_rate": 4.940709549545276e-05, "loss": 0.9791, "step": 656 }, { "epoch": 0.3527398409449347, "grad_norm": 0.82421875, "learning_rate": 4.940526577553719e-05, "loss": 0.8833, "step": 657 }, { "epoch": 0.3532767356800107, "grad_norm": 0.72265625, "learning_rate": 4.940343327067172e-05, "loss": 0.9296, "step": 658 }, { "epoch": 0.35381363041508673, "grad_norm": 0.62109375, "learning_rate": 4.9401597981065486e-05, "loss": 0.8494, "step": 659 }, { "epoch": 0.35435052515016274, "grad_norm": 0.82421875, "learning_rate": 4.939975990692789e-05, "loss": 0.9775, "step": 660 }, { "epoch": 0.35488741988523875, "grad_norm": 0.67578125, "learning_rate": 4.939791904846869e-05, "loss": 0.8377, "step": 661 }, { "epoch": 0.35542431462031476, "grad_norm": 0.640625, "learning_rate": 4.939607540589795e-05, "loss": 0.8226, "step": 662 }, { "epoch": 0.3559612093553908, "grad_norm": 0.7890625, "learning_rate": 4.939422897942606e-05, "loss": 1.0506, "step": 663 }, { "epoch": 0.3564981040904668, "grad_norm": 0.79296875, "learning_rate": 4.9392379769263716e-05, "loss": 0.927, "step": 664 }, { "epoch": 0.3570349988255428, "grad_norm": 0.734375, "learning_rate": 4.939052777562193e-05, "loss": 0.9213, "step": 665 }, { "epoch": 0.35757189356061875, "grad_norm": 0.6171875, "learning_rate": 4.9388672998712046e-05, "loss": 0.7729, "step": 666 }, { "epoch": 0.35810878829569476, "grad_norm": 0.73046875, "learning_rate": 4.938681543874572e-05, "loss": 1.0298, "step": 667 }, { "epoch": 0.35864568303077077, "grad_norm": 0.60546875, "learning_rate": 4.938495509593492e-05, "loss": 0.8244, "step": 668 }, { "epoch": 0.3591825777658468, "grad_norm": 0.7734375, "learning_rate": 4.938309197049192e-05, "loss": 0.9015, "step": 669 }, { "epoch": 0.3597194725009228, "grad_norm": 0.8046875, "learning_rate": 4.938122606262936e-05, "loss": 0.9868, "step": 670 }, { "epoch": 0.3602563672359988, "grad_norm": 0.6796875, "learning_rate": 4.937935737256012e-05, "loss": 0.9103, "step": 671 }, { "epoch": 0.3607932619710748, "grad_norm": 0.7734375, "learning_rate": 4.9377485900497476e-05, "loss": 0.9998, "step": 672 }, { "epoch": 0.3613301567061508, "grad_norm": 0.72265625, "learning_rate": 4.9375611646654976e-05, "loss": 0.8885, "step": 673 }, { "epoch": 0.36186705144122683, "grad_norm": 0.77734375, "learning_rate": 4.937373461124649e-05, "loss": 0.9337, "step": 674 }, { "epoch": 0.3624039461763028, "grad_norm": 0.75, "learning_rate": 4.937185479448623e-05, "loss": 1.016, "step": 675 }, { "epoch": 0.3629408409113788, "grad_norm": 0.75, "learning_rate": 4.9369972196588676e-05, "loss": 1.1903, "step": 676 }, { "epoch": 0.3634777356464548, "grad_norm": 1.0546875, "learning_rate": 4.9368086817768685e-05, "loss": 1.165, "step": 677 }, { "epoch": 0.3640146303815308, "grad_norm": 0.921875, "learning_rate": 4.936619865824138e-05, "loss": 1.0212, "step": 678 }, { "epoch": 0.3645515251166068, "grad_norm": 0.796875, "learning_rate": 4.9364307718222245e-05, "loss": 0.9909, "step": 679 }, { "epoch": 0.36508841985168283, "grad_norm": 0.64453125, "learning_rate": 4.936241399792705e-05, "loss": 1.0334, "step": 680 }, { "epoch": 0.36562531458675884, "grad_norm": 0.828125, "learning_rate": 4.936051749757189e-05, "loss": 1.1268, "step": 681 }, { "epoch": 0.36616220932183485, "grad_norm": 0.67578125, "learning_rate": 4.935861821737318e-05, "loss": 0.9732, "step": 682 }, { "epoch": 0.36669910405691086, "grad_norm": 0.79296875, "learning_rate": 4.935671615754765e-05, "loss": 1.0614, "step": 683 }, { "epoch": 0.3672359987919869, "grad_norm": 0.83203125, "learning_rate": 4.9354811318312367e-05, "loss": 1.142, "step": 684 }, { "epoch": 0.3677728935270628, "grad_norm": 0.73828125, "learning_rate": 4.935290369988468e-05, "loss": 1.1117, "step": 685 }, { "epoch": 0.36830978826213884, "grad_norm": 0.64453125, "learning_rate": 4.935099330248227e-05, "loss": 0.9285, "step": 686 }, { "epoch": 0.36884668299721485, "grad_norm": 0.92578125, "learning_rate": 4.934908012632315e-05, "loss": 1.1574, "step": 687 }, { "epoch": 0.36938357773229086, "grad_norm": 0.75, "learning_rate": 4.934716417162563e-05, "loss": 0.979, "step": 688 }, { "epoch": 0.36992047246736687, "grad_norm": 0.7421875, "learning_rate": 4.9345245438608344e-05, "loss": 1.1321, "step": 689 }, { "epoch": 0.3704573672024429, "grad_norm": 0.8515625, "learning_rate": 4.934332392749025e-05, "loss": 0.9974, "step": 690 }, { "epoch": 0.3709942619375189, "grad_norm": 0.81640625, "learning_rate": 4.93413996384906e-05, "loss": 1.122, "step": 691 }, { "epoch": 0.3715311566725949, "grad_norm": 0.80078125, "learning_rate": 4.933947257182901e-05, "loss": 1.0571, "step": 692 }, { "epoch": 0.3720680514076709, "grad_norm": 0.71875, "learning_rate": 4.9337542727725356e-05, "loss": 0.9182, "step": 693 }, { "epoch": 0.37260494614274686, "grad_norm": 0.82421875, "learning_rate": 4.9335610106399864e-05, "loss": 1.0838, "step": 694 }, { "epoch": 0.3731418408778229, "grad_norm": 1.2265625, "learning_rate": 4.933367470807308e-05, "loss": 1.2538, "step": 695 }, { "epoch": 0.3736787356128989, "grad_norm": 0.76953125, "learning_rate": 4.933173653296585e-05, "loss": 0.9813, "step": 696 }, { "epoch": 0.3742156303479749, "grad_norm": 0.74609375, "learning_rate": 4.932979558129934e-05, "loss": 0.946, "step": 697 }, { "epoch": 0.3747525250830509, "grad_norm": 0.75, "learning_rate": 4.932785185329505e-05, "loss": 0.8854, "step": 698 }, { "epoch": 0.3752894198181269, "grad_norm": 0.75, "learning_rate": 4.9325905349174765e-05, "loss": 1.1353, "step": 699 }, { "epoch": 0.3758263145532029, "grad_norm": 0.81640625, "learning_rate": 4.932395606916062e-05, "loss": 1.1588, "step": 700 }, { "epoch": 0.3758263145532029, "eval_loss": 0.9581480026245117, "eval_runtime": 46.2527, "eval_samples_per_second": 20.799, "eval_steps_per_second": 20.799, "step": 700 }, { "epoch": 0.37636320928827893, "grad_norm": 0.734375, "learning_rate": 4.932200401347506e-05, "loss": 0.8196, "step": 701 }, { "epoch": 0.37690010402335494, "grad_norm": 0.87109375, "learning_rate": 4.932004918234082e-05, "loss": 1.1573, "step": 702 }, { "epoch": 0.37743699875843095, "grad_norm": 0.77734375, "learning_rate": 4.931809157598099e-05, "loss": 0.8634, "step": 703 }, { "epoch": 0.3779738934935069, "grad_norm": 0.69140625, "learning_rate": 4.931613119461893e-05, "loss": 0.8944, "step": 704 }, { "epoch": 0.3785107882285829, "grad_norm": 0.78515625, "learning_rate": 4.931416803847837e-05, "loss": 1.0816, "step": 705 }, { "epoch": 0.3790476829636589, "grad_norm": 0.99609375, "learning_rate": 4.931220210778332e-05, "loss": 1.1423, "step": 706 }, { "epoch": 0.37958457769873494, "grad_norm": 0.80078125, "learning_rate": 4.9310233402758133e-05, "loss": 1.0044, "step": 707 }, { "epoch": 0.38012147243381095, "grad_norm": 0.66796875, "learning_rate": 4.930826192362744e-05, "loss": 1.0846, "step": 708 }, { "epoch": 0.38065836716888696, "grad_norm": 0.69140625, "learning_rate": 4.930628767061622e-05, "loss": 0.9084, "step": 709 }, { "epoch": 0.38119526190396297, "grad_norm": 0.94921875, "learning_rate": 4.930431064394977e-05, "loss": 1.0305, "step": 710 }, { "epoch": 0.381732156639039, "grad_norm": 0.69921875, "learning_rate": 4.9302330843853686e-05, "loss": 0.8617, "step": 711 }, { "epoch": 0.382269051374115, "grad_norm": 0.82421875, "learning_rate": 4.930034827055388e-05, "loss": 0.992, "step": 712 }, { "epoch": 0.38280594610919094, "grad_norm": 0.734375, "learning_rate": 4.9298362924276604e-05, "loss": 1.0499, "step": 713 }, { "epoch": 0.38334284084426695, "grad_norm": 0.75, "learning_rate": 4.92963748052484e-05, "loss": 1.0629, "step": 714 }, { "epoch": 0.38387973557934296, "grad_norm": 0.7421875, "learning_rate": 4.929438391369615e-05, "loss": 1.0508, "step": 715 }, { "epoch": 0.384416630314419, "grad_norm": 0.6953125, "learning_rate": 4.929239024984702e-05, "loss": 1.0549, "step": 716 }, { "epoch": 0.384953525049495, "grad_norm": 0.796875, "learning_rate": 4.9290393813928534e-05, "loss": 0.7663, "step": 717 }, { "epoch": 0.385490419784571, "grad_norm": 0.7734375, "learning_rate": 4.9288394606168494e-05, "loss": 0.9906, "step": 718 }, { "epoch": 0.386027314519647, "grad_norm": 0.62890625, "learning_rate": 4.928639262679505e-05, "loss": 0.9696, "step": 719 }, { "epoch": 0.386564209254723, "grad_norm": 0.6328125, "learning_rate": 4.928438787603664e-05, "loss": 0.8313, "step": 720 }, { "epoch": 0.387101103989799, "grad_norm": 0.8984375, "learning_rate": 4.928238035412204e-05, "loss": 1.0274, "step": 721 }, { "epoch": 0.387637998724875, "grad_norm": 0.7109375, "learning_rate": 4.928037006128032e-05, "loss": 0.9403, "step": 722 }, { "epoch": 0.388174893459951, "grad_norm": 0.8046875, "learning_rate": 4.9278356997740904e-05, "loss": 0.9096, "step": 723 }, { "epoch": 0.388711788195027, "grad_norm": 0.640625, "learning_rate": 4.927634116373349e-05, "loss": 0.876, "step": 724 }, { "epoch": 0.389248682930103, "grad_norm": 0.953125, "learning_rate": 4.927432255948811e-05, "loss": 0.9584, "step": 725 }, { "epoch": 0.389785577665179, "grad_norm": 0.70703125, "learning_rate": 4.9272301185235116e-05, "loss": 0.8188, "step": 726 }, { "epoch": 0.390322472400255, "grad_norm": 0.83203125, "learning_rate": 4.927027704120518e-05, "loss": 1.1977, "step": 727 }, { "epoch": 0.39085936713533104, "grad_norm": 0.77734375, "learning_rate": 4.9268250127629265e-05, "loss": 0.9294, "step": 728 }, { "epoch": 0.39139626187040705, "grad_norm": 0.73046875, "learning_rate": 4.926622044473869e-05, "loss": 0.8519, "step": 729 }, { "epoch": 0.39193315660548306, "grad_norm": 0.69140625, "learning_rate": 4.926418799276504e-05, "loss": 0.9133, "step": 730 }, { "epoch": 0.39247005134055907, "grad_norm": 0.8203125, "learning_rate": 4.926215277194027e-05, "loss": 0.8231, "step": 731 }, { "epoch": 0.393006946075635, "grad_norm": 0.8203125, "learning_rate": 4.926011478249661e-05, "loss": 1.044, "step": 732 }, { "epoch": 0.39354384081071103, "grad_norm": 0.80859375, "learning_rate": 4.925807402466663e-05, "loss": 0.9314, "step": 733 }, { "epoch": 0.39408073554578704, "grad_norm": 0.78125, "learning_rate": 4.925603049868319e-05, "loss": 1.1221, "step": 734 }, { "epoch": 0.39461763028086305, "grad_norm": 0.76953125, "learning_rate": 4.92539842047795e-05, "loss": 1.0373, "step": 735 }, { "epoch": 0.39515452501593906, "grad_norm": 0.80859375, "learning_rate": 4.925193514318906e-05, "loss": 0.9045, "step": 736 }, { "epoch": 0.39569141975101507, "grad_norm": 0.875, "learning_rate": 4.924988331414569e-05, "loss": 1.0327, "step": 737 }, { "epoch": 0.3962283144860911, "grad_norm": 0.65234375, "learning_rate": 4.924782871788354e-05, "loss": 0.8908, "step": 738 }, { "epoch": 0.3967652092211671, "grad_norm": 0.953125, "learning_rate": 4.924577135463705e-05, "loss": 1.4129, "step": 739 }, { "epoch": 0.3973021039562431, "grad_norm": 0.68359375, "learning_rate": 4.924371122464101e-05, "loss": 0.9474, "step": 740 }, { "epoch": 0.39783899869131906, "grad_norm": 0.7265625, "learning_rate": 4.92416483281305e-05, "loss": 1.0218, "step": 741 }, { "epoch": 0.39837589342639507, "grad_norm": 0.72265625, "learning_rate": 4.923958266534091e-05, "loss": 0.7931, "step": 742 }, { "epoch": 0.3989127881614711, "grad_norm": 0.703125, "learning_rate": 4.923751423650797e-05, "loss": 0.9955, "step": 743 }, { "epoch": 0.3994496828965471, "grad_norm": 0.70703125, "learning_rate": 4.923544304186771e-05, "loss": 0.9051, "step": 744 }, { "epoch": 0.3999865776316231, "grad_norm": 0.73046875, "learning_rate": 4.923336908165649e-05, "loss": 0.8443, "step": 745 }, { "epoch": 0.4005234723666991, "grad_norm": 0.81640625, "learning_rate": 4.923129235611096e-05, "loss": 0.9819, "step": 746 }, { "epoch": 0.4010603671017751, "grad_norm": 0.921875, "learning_rate": 4.922921286546812e-05, "loss": 1.1488, "step": 747 }, { "epoch": 0.4015972618368511, "grad_norm": 0.69140625, "learning_rate": 4.922713060996524e-05, "loss": 0.8344, "step": 748 }, { "epoch": 0.40213415657192714, "grad_norm": 0.703125, "learning_rate": 4.922504558983995e-05, "loss": 0.9631, "step": 749 }, { "epoch": 0.40267105130700315, "grad_norm": 0.6953125, "learning_rate": 4.922295780533017e-05, "loss": 0.8723, "step": 750 }, { "epoch": 0.40267105130700315, "eval_loss": 0.9560365676879883, "eval_runtime": 48.5323, "eval_samples_per_second": 19.822, "eval_steps_per_second": 19.822, "step": 750 }, { "epoch": 0.4032079460420791, "grad_norm": 0.76171875, "learning_rate": 4.922086725667415e-05, "loss": 0.9373, "step": 751 }, { "epoch": 0.4037448407771551, "grad_norm": 0.77734375, "learning_rate": 4.921877394411045e-05, "loss": 1.0295, "step": 752 }, { "epoch": 0.4042817355122311, "grad_norm": 0.8671875, "learning_rate": 4.921667786787792e-05, "loss": 0.8979, "step": 753 }, { "epoch": 0.40481863024730713, "grad_norm": 0.94921875, "learning_rate": 4.9214579028215776e-05, "loss": 1.1372, "step": 754 }, { "epoch": 0.40535552498238314, "grad_norm": 0.81640625, "learning_rate": 4.9212477425363514e-05, "loss": 1.2777, "step": 755 }, { "epoch": 0.40589241971745915, "grad_norm": 0.6875, "learning_rate": 4.921037305956095e-05, "loss": 0.935, "step": 756 }, { "epoch": 0.40642931445253516, "grad_norm": 0.671875, "learning_rate": 4.920826593104822e-05, "loss": 0.8783, "step": 757 }, { "epoch": 0.40696620918761117, "grad_norm": 0.69921875, "learning_rate": 4.920615604006578e-05, "loss": 0.8403, "step": 758 }, { "epoch": 0.4075031039226872, "grad_norm": 0.8046875, "learning_rate": 4.9204043386854377e-05, "loss": 1.0082, "step": 759 }, { "epoch": 0.40803999865776314, "grad_norm": 0.703125, "learning_rate": 4.920192797165511e-05, "loss": 0.8792, "step": 760 }, { "epoch": 0.40857689339283915, "grad_norm": 0.87109375, "learning_rate": 4.919980979470937e-05, "loss": 1.0941, "step": 761 }, { "epoch": 0.40911378812791516, "grad_norm": 0.765625, "learning_rate": 4.919768885625887e-05, "loss": 1.1464, "step": 762 }, { "epoch": 0.40965068286299117, "grad_norm": 0.65625, "learning_rate": 4.919556515654562e-05, "loss": 0.8907, "step": 763 }, { "epoch": 0.4101875775980672, "grad_norm": 0.859375, "learning_rate": 4.9193438695811985e-05, "loss": 0.9733, "step": 764 }, { "epoch": 0.4107244723331432, "grad_norm": 0.8046875, "learning_rate": 4.919130947430061e-05, "loss": 1.0208, "step": 765 }, { "epoch": 0.4112613670682192, "grad_norm": 0.7890625, "learning_rate": 4.9189177492254455e-05, "loss": 1.1111, "step": 766 }, { "epoch": 0.4117982618032952, "grad_norm": 0.73046875, "learning_rate": 4.918704274991683e-05, "loss": 0.7626, "step": 767 }, { "epoch": 0.4123351565383712, "grad_norm": 0.76171875, "learning_rate": 4.9184905247531316e-05, "loss": 1.104, "step": 768 }, { "epoch": 0.4128720512734472, "grad_norm": 0.7578125, "learning_rate": 4.918276498534184e-05, "loss": 0.9929, "step": 769 }, { "epoch": 0.4134089460085232, "grad_norm": 0.9609375, "learning_rate": 4.918062196359263e-05, "loss": 1.2401, "step": 770 }, { "epoch": 0.4139458407435992, "grad_norm": 0.640625, "learning_rate": 4.917847618252822e-05, "loss": 1.0889, "step": 771 }, { "epoch": 0.4144827354786752, "grad_norm": 0.78515625, "learning_rate": 4.917632764239349e-05, "loss": 0.8042, "step": 772 }, { "epoch": 0.4150196302137512, "grad_norm": 0.88671875, "learning_rate": 4.917417634343361e-05, "loss": 1.1121, "step": 773 }, { "epoch": 0.4155565249488272, "grad_norm": 0.765625, "learning_rate": 4.9172022285894074e-05, "loss": 1.0736, "step": 774 }, { "epoch": 0.41609341968390323, "grad_norm": 0.7421875, "learning_rate": 4.916986547002067e-05, "loss": 1.145, "step": 775 }, { "epoch": 0.41663031441897924, "grad_norm": 0.796875, "learning_rate": 4.9167705896059527e-05, "loss": 1.0813, "step": 776 }, { "epoch": 0.41716720915405525, "grad_norm": 0.85546875, "learning_rate": 4.916554356425709e-05, "loss": 0.9823, "step": 777 }, { "epoch": 0.41770410388913126, "grad_norm": 0.765625, "learning_rate": 4.91633784748601e-05, "loss": 1.1357, "step": 778 }, { "epoch": 0.4182409986242072, "grad_norm": 0.671875, "learning_rate": 4.9161210628115615e-05, "loss": 0.7951, "step": 779 }, { "epoch": 0.4187778933592832, "grad_norm": 0.66015625, "learning_rate": 4.915904002427103e-05, "loss": 0.8144, "step": 780 }, { "epoch": 0.41931478809435924, "grad_norm": 0.703125, "learning_rate": 4.915686666357402e-05, "loss": 0.9838, "step": 781 }, { "epoch": 0.41985168282943525, "grad_norm": 0.83984375, "learning_rate": 4.9154690546272606e-05, "loss": 0.9819, "step": 782 }, { "epoch": 0.42038857756451126, "grad_norm": 0.8203125, "learning_rate": 4.915251167261511e-05, "loss": 1.1217, "step": 783 }, { "epoch": 0.42092547229958727, "grad_norm": 0.66015625, "learning_rate": 4.9150330042850155e-05, "loss": 0.8995, "step": 784 }, { "epoch": 0.4214623670346633, "grad_norm": 0.7421875, "learning_rate": 4.914814565722671e-05, "loss": 0.9941, "step": 785 }, { "epoch": 0.4219992617697393, "grad_norm": 0.75, "learning_rate": 4.9145958515994025e-05, "loss": 0.9457, "step": 786 }, { "epoch": 0.4225361565048153, "grad_norm": 0.8671875, "learning_rate": 4.914376861940171e-05, "loss": 1.0138, "step": 787 }, { "epoch": 0.42307305123989125, "grad_norm": 0.578125, "learning_rate": 4.914157596769962e-05, "loss": 0.7708, "step": 788 }, { "epoch": 0.42360994597496726, "grad_norm": 0.859375, "learning_rate": 4.9139380561137994e-05, "loss": 1.121, "step": 789 }, { "epoch": 0.42414684071004327, "grad_norm": 0.6328125, "learning_rate": 4.9137182399967343e-05, "loss": 0.8208, "step": 790 }, { "epoch": 0.4246837354451193, "grad_norm": 0.73828125, "learning_rate": 4.91349814844385e-05, "loss": 1.0937, "step": 791 }, { "epoch": 0.4252206301801953, "grad_norm": 0.77734375, "learning_rate": 4.9132777814802634e-05, "loss": 0.9219, "step": 792 }, { "epoch": 0.4257575249152713, "grad_norm": 0.921875, "learning_rate": 4.91305713913112e-05, "loss": 1.0178, "step": 793 }, { "epoch": 0.4262944196503473, "grad_norm": 0.7265625, "learning_rate": 4.9128362214215986e-05, "loss": 1.0026, "step": 794 }, { "epoch": 0.4268313143854233, "grad_norm": 0.67578125, "learning_rate": 4.912615028376907e-05, "loss": 1.0, "step": 795 }, { "epoch": 0.42736820912049933, "grad_norm": 0.96875, "learning_rate": 4.912393560022288e-05, "loss": 0.9567, "step": 796 }, { "epoch": 0.42790510385557534, "grad_norm": 0.87890625, "learning_rate": 4.912171816383014e-05, "loss": 1.189, "step": 797 }, { "epoch": 0.4284419985906513, "grad_norm": 0.62890625, "learning_rate": 4.911949797484388e-05, "loss": 0.8717, "step": 798 }, { "epoch": 0.4289788933257273, "grad_norm": 0.74609375, "learning_rate": 4.911727503351744e-05, "loss": 1.0839, "step": 799 }, { "epoch": 0.4295157880608033, "grad_norm": 0.64453125, "learning_rate": 4.9115049340104505e-05, "loss": 0.8312, "step": 800 }, { "epoch": 0.4295157880608033, "eval_loss": 0.9527614712715149, "eval_runtime": 46.4794, "eval_samples_per_second": 20.697, "eval_steps_per_second": 20.697, "step": 800 }, { "epoch": 0.4300526827958793, "grad_norm": 0.80078125, "learning_rate": 4.9112820894859046e-05, "loss": 0.815, "step": 801 }, { "epoch": 0.43058957753095534, "grad_norm": 0.78125, "learning_rate": 4.911058969803536e-05, "loss": 1.0427, "step": 802 }, { "epoch": 0.43112647226603135, "grad_norm": 0.7890625, "learning_rate": 4.9108355749888056e-05, "loss": 0.8677, "step": 803 }, { "epoch": 0.43166336700110736, "grad_norm": 0.69921875, "learning_rate": 4.910611905067205e-05, "loss": 1.048, "step": 804 }, { "epoch": 0.43220026173618337, "grad_norm": 0.78515625, "learning_rate": 4.910387960064259e-05, "loss": 0.8238, "step": 805 }, { "epoch": 0.4327371564712594, "grad_norm": 0.82421875, "learning_rate": 4.91016374000552e-05, "loss": 0.8858, "step": 806 }, { "epoch": 0.43327405120633533, "grad_norm": 0.73828125, "learning_rate": 4.9099392449165774e-05, "loss": 1.0776, "step": 807 }, { "epoch": 0.43381094594141134, "grad_norm": 0.65234375, "learning_rate": 4.909714474823047e-05, "loss": 0.7519, "step": 808 }, { "epoch": 0.43434784067648735, "grad_norm": 0.7265625, "learning_rate": 4.909489429750579e-05, "loss": 0.6753, "step": 809 }, { "epoch": 0.43488473541156336, "grad_norm": 1.0, "learning_rate": 4.909264109724853e-05, "loss": 1.1741, "step": 810 }, { "epoch": 0.43542163014663937, "grad_norm": 0.91796875, "learning_rate": 4.909038514771581e-05, "loss": 1.0277, "step": 811 }, { "epoch": 0.4359585248817154, "grad_norm": 0.7265625, "learning_rate": 4.9088126449165065e-05, "loss": 0.9186, "step": 812 }, { "epoch": 0.4364954196167914, "grad_norm": 0.7109375, "learning_rate": 4.9085865001854035e-05, "loss": 1.105, "step": 813 }, { "epoch": 0.4370323143518674, "grad_norm": 0.67578125, "learning_rate": 4.90836008060408e-05, "loss": 0.7978, "step": 814 }, { "epoch": 0.4375692090869434, "grad_norm": 0.8125, "learning_rate": 4.90813338619837e-05, "loss": 1.0604, "step": 815 }, { "epoch": 0.4381061038220194, "grad_norm": 0.7734375, "learning_rate": 4.907906416994146e-05, "loss": 1.0758, "step": 816 }, { "epoch": 0.4386429985570954, "grad_norm": 0.703125, "learning_rate": 4.907679173017305e-05, "loss": 1.1113, "step": 817 }, { "epoch": 0.4391798932921714, "grad_norm": 0.765625, "learning_rate": 4.9074516542937795e-05, "loss": 0.9405, "step": 818 }, { "epoch": 0.4397167880272474, "grad_norm": 0.73046875, "learning_rate": 4.907223860849533e-05, "loss": 0.9341, "step": 819 }, { "epoch": 0.4402536827623234, "grad_norm": 0.68359375, "learning_rate": 4.9069957927105586e-05, "loss": 0.9659, "step": 820 }, { "epoch": 0.4407905774973994, "grad_norm": 0.74609375, "learning_rate": 4.9067674499028824e-05, "loss": 0.9065, "step": 821 }, { "epoch": 0.4413274722324754, "grad_norm": 0.8046875, "learning_rate": 4.906538832452561e-05, "loss": 0.9159, "step": 822 }, { "epoch": 0.44186436696755144, "grad_norm": 0.7265625, "learning_rate": 4.906309940385682e-05, "loss": 0.7843, "step": 823 }, { "epoch": 0.44240126170262745, "grad_norm": 0.89453125, "learning_rate": 4.9060807737283656e-05, "loss": 0.9702, "step": 824 }, { "epoch": 0.44293815643770346, "grad_norm": 0.94921875, "learning_rate": 4.905851332506762e-05, "loss": 1.0678, "step": 825 }, { "epoch": 0.4434750511727794, "grad_norm": 0.65625, "learning_rate": 4.905621616747054e-05, "loss": 1.0175, "step": 826 }, { "epoch": 0.4440119459078554, "grad_norm": 0.91015625, "learning_rate": 4.905391626475455e-05, "loss": 1.1155, "step": 827 }, { "epoch": 0.44454884064293143, "grad_norm": 0.6796875, "learning_rate": 4.905161361718209e-05, "loss": 1.0649, "step": 828 }, { "epoch": 0.44508573537800744, "grad_norm": 0.8671875, "learning_rate": 4.9049308225015936e-05, "loss": 0.9817, "step": 829 }, { "epoch": 0.44562263011308345, "grad_norm": 0.66796875, "learning_rate": 4.9047000088519144e-05, "loss": 0.9786, "step": 830 }, { "epoch": 0.44615952484815946, "grad_norm": 0.6953125, "learning_rate": 4.904468920795512e-05, "loss": 0.8776, "step": 831 }, { "epoch": 0.44669641958323547, "grad_norm": 0.89453125, "learning_rate": 4.9042375583587555e-05, "loss": 0.9766, "step": 832 }, { "epoch": 0.4472333143183115, "grad_norm": 0.59765625, "learning_rate": 4.904005921568045e-05, "loss": 0.8547, "step": 833 }, { "epoch": 0.4477702090533875, "grad_norm": 0.76171875, "learning_rate": 4.9037740104498166e-05, "loss": 0.8818, "step": 834 }, { "epoch": 0.4483071037884635, "grad_norm": 0.8359375, "learning_rate": 4.9035418250305314e-05, "loss": 1.1426, "step": 835 }, { "epoch": 0.44884399852353946, "grad_norm": 0.84375, "learning_rate": 4.903309365336686e-05, "loss": 1.0365, "step": 836 }, { "epoch": 0.44938089325861547, "grad_norm": 0.8046875, "learning_rate": 4.903076631394806e-05, "loss": 0.9775, "step": 837 }, { "epoch": 0.4499177879936915, "grad_norm": 1.40625, "learning_rate": 4.90284362323145e-05, "loss": 1.3363, "step": 838 }, { "epoch": 0.4504546827287675, "grad_norm": 0.74609375, "learning_rate": 4.9026103408732065e-05, "loss": 1.0259, "step": 839 }, { "epoch": 0.4509915774638435, "grad_norm": 0.67578125, "learning_rate": 4.902376784346697e-05, "loss": 0.9859, "step": 840 }, { "epoch": 0.4515284721989195, "grad_norm": 0.6953125, "learning_rate": 4.902142953678573e-05, "loss": 0.9236, "step": 841 }, { "epoch": 0.4520653669339955, "grad_norm": 0.78125, "learning_rate": 4.901908848895517e-05, "loss": 0.9455, "step": 842 }, { "epoch": 0.4526022616690715, "grad_norm": 0.84765625, "learning_rate": 4.901674470024244e-05, "loss": 1.111, "step": 843 }, { "epoch": 0.45313915640414754, "grad_norm": 0.83203125, "learning_rate": 4.901439817091499e-05, "loss": 0.869, "step": 844 }, { "epoch": 0.4536760511392235, "grad_norm": 0.80078125, "learning_rate": 4.9012048901240585e-05, "loss": 0.8917, "step": 845 }, { "epoch": 0.4542129458742995, "grad_norm": 0.75390625, "learning_rate": 4.9009696891487325e-05, "loss": 1.0983, "step": 846 }, { "epoch": 0.4547498406093755, "grad_norm": 0.8203125, "learning_rate": 4.900734214192358e-05, "loss": 1.0704, "step": 847 }, { "epoch": 0.4552867353444515, "grad_norm": 0.8828125, "learning_rate": 4.9004984652818076e-05, "loss": 0.898, "step": 848 }, { "epoch": 0.45582363007952753, "grad_norm": 0.875, "learning_rate": 4.900262442443981e-05, "loss": 1.1245, "step": 849 }, { "epoch": 0.45636052481460354, "grad_norm": 0.58984375, "learning_rate": 4.900026145705815e-05, "loss": 0.7363, "step": 850 }, { "epoch": 0.45636052481460354, "eval_loss": 0.9519462585449219, "eval_runtime": 46.3396, "eval_samples_per_second": 20.76, "eval_steps_per_second": 20.76, "step": 850 }, { "epoch": 0.45689741954967955, "grad_norm": 0.7421875, "learning_rate": 4.8997895750942704e-05, "loss": 1.0622, "step": 851 }, { "epoch": 0.45743431428475556, "grad_norm": 0.8828125, "learning_rate": 4.899552730636345e-05, "loss": 0.9928, "step": 852 }, { "epoch": 0.45797120901983157, "grad_norm": 0.75390625, "learning_rate": 4.899315612359065e-05, "loss": 0.8667, "step": 853 }, { "epoch": 0.4585081037549076, "grad_norm": 0.74609375, "learning_rate": 4.899078220289489e-05, "loss": 1.0888, "step": 854 }, { "epoch": 0.45904499848998354, "grad_norm": 0.84765625, "learning_rate": 4.8988405544547054e-05, "loss": 1.2631, "step": 855 }, { "epoch": 0.45958189322505955, "grad_norm": 0.97265625, "learning_rate": 4.898602614881836e-05, "loss": 1.1243, "step": 856 }, { "epoch": 0.46011878796013556, "grad_norm": 0.7734375, "learning_rate": 4.898364401598034e-05, "loss": 1.0798, "step": 857 }, { "epoch": 0.46065568269521157, "grad_norm": 0.73828125, "learning_rate": 4.898125914630479e-05, "loss": 1.0191, "step": 858 }, { "epoch": 0.4611925774302876, "grad_norm": 0.7578125, "learning_rate": 4.897887154006388e-05, "loss": 0.9533, "step": 859 }, { "epoch": 0.4617294721653636, "grad_norm": 0.640625, "learning_rate": 4.897648119753006e-05, "loss": 0.9253, "step": 860 }, { "epoch": 0.4622663669004396, "grad_norm": 2.328125, "learning_rate": 4.8974088118976094e-05, "loss": 0.9924, "step": 861 }, { "epoch": 0.4628032616355156, "grad_norm": 0.72265625, "learning_rate": 4.897169230467506e-05, "loss": 0.9716, "step": 862 }, { "epoch": 0.4633401563705916, "grad_norm": 0.64453125, "learning_rate": 4.896929375490037e-05, "loss": 0.9661, "step": 863 }, { "epoch": 0.46387705110566757, "grad_norm": 0.78125, "learning_rate": 4.896689246992572e-05, "loss": 0.9774, "step": 864 }, { "epoch": 0.4644139458407436, "grad_norm": 0.8046875, "learning_rate": 4.896448845002511e-05, "loss": 0.8279, "step": 865 }, { "epoch": 0.4649508405758196, "grad_norm": 0.6484375, "learning_rate": 4.8962081695472886e-05, "loss": 0.8951, "step": 866 }, { "epoch": 0.4654877353108956, "grad_norm": 0.65234375, "learning_rate": 4.895967220654368e-05, "loss": 0.8701, "step": 867 }, { "epoch": 0.4660246300459716, "grad_norm": 0.71875, "learning_rate": 4.895725998351246e-05, "loss": 0.887, "step": 868 }, { "epoch": 0.4665615247810476, "grad_norm": 1.5546875, "learning_rate": 4.8954845026654484e-05, "loss": 1.0683, "step": 869 }, { "epoch": 0.46709841951612363, "grad_norm": 0.76953125, "learning_rate": 4.8952427336245324e-05, "loss": 1.0042, "step": 870 }, { "epoch": 0.46763531425119964, "grad_norm": 0.703125, "learning_rate": 4.8950006912560867e-05, "loss": 0.9935, "step": 871 }, { "epoch": 0.46817220898627565, "grad_norm": 0.77734375, "learning_rate": 4.894758375587733e-05, "loss": 0.8753, "step": 872 }, { "epoch": 0.4687091037213516, "grad_norm": 0.79296875, "learning_rate": 4.89451578664712e-05, "loss": 0.9927, "step": 873 }, { "epoch": 0.4692459984564276, "grad_norm": 0.68359375, "learning_rate": 4.894272924461932e-05, "loss": 0.9619, "step": 874 }, { "epoch": 0.4697828931915036, "grad_norm": 0.703125, "learning_rate": 4.894029789059883e-05, "loss": 1.0445, "step": 875 }, { "epoch": 0.47031978792657964, "grad_norm": 0.71484375, "learning_rate": 4.8937863804687165e-05, "loss": 1.0827, "step": 876 }, { "epoch": 0.47085668266165565, "grad_norm": 0.70703125, "learning_rate": 4.893542698716209e-05, "loss": 0.9637, "step": 877 }, { "epoch": 0.47139357739673166, "grad_norm": 0.64453125, "learning_rate": 4.893298743830168e-05, "loss": 0.8322, "step": 878 }, { "epoch": 0.47193047213180767, "grad_norm": 0.78515625, "learning_rate": 4.8930545158384324e-05, "loss": 1.0453, "step": 879 }, { "epoch": 0.4724673668668837, "grad_norm": 0.8203125, "learning_rate": 4.89281001476887e-05, "loss": 0.9922, "step": 880 }, { "epoch": 0.4730042616019597, "grad_norm": 0.609375, "learning_rate": 4.892565240649383e-05, "loss": 0.863, "step": 881 }, { "epoch": 0.4735411563370357, "grad_norm": 0.78515625, "learning_rate": 4.892320193507902e-05, "loss": 0.8975, "step": 882 }, { "epoch": 0.47407805107211165, "grad_norm": 0.7265625, "learning_rate": 4.892074873372392e-05, "loss": 0.859, "step": 883 }, { "epoch": 0.47461494580718766, "grad_norm": 0.765625, "learning_rate": 4.8918292802708445e-05, "loss": 1.1057, "step": 884 }, { "epoch": 0.47515184054226367, "grad_norm": 0.80078125, "learning_rate": 4.891583414231287e-05, "loss": 0.9268, "step": 885 }, { "epoch": 0.4756887352773397, "grad_norm": 0.66796875, "learning_rate": 4.891337275281774e-05, "loss": 1.0443, "step": 886 }, { "epoch": 0.4762256300124157, "grad_norm": 0.69140625, "learning_rate": 4.8910908634503946e-05, "loss": 0.8716, "step": 887 }, { "epoch": 0.4767625247474917, "grad_norm": 0.68359375, "learning_rate": 4.890844178765267e-05, "loss": 0.7626, "step": 888 }, { "epoch": 0.4772994194825677, "grad_norm": 0.74609375, "learning_rate": 4.89059722125454e-05, "loss": 0.9247, "step": 889 }, { "epoch": 0.4778363142176437, "grad_norm": 0.84765625, "learning_rate": 4.8903499909463966e-05, "loss": 0.9973, "step": 890 }, { "epoch": 0.47837320895271973, "grad_norm": 0.82421875, "learning_rate": 4.890102487869048e-05, "loss": 0.8405, "step": 891 }, { "epoch": 0.4789101036877957, "grad_norm": 1.265625, "learning_rate": 4.889854712050737e-05, "loss": 1.079, "step": 892 }, { "epoch": 0.4794469984228717, "grad_norm": 1.0078125, "learning_rate": 4.889606663519739e-05, "loss": 1.0182, "step": 893 }, { "epoch": 0.4799838931579477, "grad_norm": 1.3203125, "learning_rate": 4.8893583423043574e-05, "loss": 0.9582, "step": 894 }, { "epoch": 0.4805207878930237, "grad_norm": 1.0625, "learning_rate": 4.8891097484329315e-05, "loss": 0.942, "step": 895 }, { "epoch": 0.4810576826280997, "grad_norm": 0.796875, "learning_rate": 4.888860881933826e-05, "loss": 0.9177, "step": 896 }, { "epoch": 0.48159457736317574, "grad_norm": 0.68359375, "learning_rate": 4.8886117428354434e-05, "loss": 0.983, "step": 897 }, { "epoch": 0.48213147209825175, "grad_norm": 0.8203125, "learning_rate": 4.888362331166211e-05, "loss": 0.9985, "step": 898 }, { "epoch": 0.48266836683332776, "grad_norm": 0.73828125, "learning_rate": 4.88811264695459e-05, "loss": 0.9199, "step": 899 }, { "epoch": 0.48320526156840377, "grad_norm": 0.828125, "learning_rate": 4.887862690229073e-05, "loss": 0.9292, "step": 900 }, { "epoch": 0.48320526156840377, "eval_loss": 0.949388325214386, "eval_runtime": 45.824, "eval_samples_per_second": 20.993, "eval_steps_per_second": 20.993, "step": 900 }, { "epoch": 0.4837421563034798, "grad_norm": 0.8046875, "learning_rate": 4.887612461018183e-05, "loss": 0.914, "step": 901 }, { "epoch": 0.48427905103855573, "grad_norm": 0.66796875, "learning_rate": 4.887361959350475e-05, "loss": 0.937, "step": 902 }, { "epoch": 0.48481594577363174, "grad_norm": 0.64453125, "learning_rate": 4.887111185254534e-05, "loss": 0.9092, "step": 903 }, { "epoch": 0.48535284050870775, "grad_norm": 0.69921875, "learning_rate": 4.8868601387589765e-05, "loss": 0.805, "step": 904 }, { "epoch": 0.48588973524378376, "grad_norm": 0.65625, "learning_rate": 4.88660881989245e-05, "loss": 0.8396, "step": 905 }, { "epoch": 0.48642662997885977, "grad_norm": 0.6875, "learning_rate": 4.8863572286836324e-05, "loss": 0.9667, "step": 906 }, { "epoch": 0.4869635247139358, "grad_norm": 0.69140625, "learning_rate": 4.8861053651612347e-05, "loss": 0.8923, "step": 907 }, { "epoch": 0.4875004194490118, "grad_norm": 0.71875, "learning_rate": 4.885853229353998e-05, "loss": 0.7539, "step": 908 }, { "epoch": 0.4880373141840878, "grad_norm": 0.6796875, "learning_rate": 4.8856008212906925e-05, "loss": 0.7695, "step": 909 }, { "epoch": 0.4885742089191638, "grad_norm": 0.7109375, "learning_rate": 4.885348141000122e-05, "loss": 1.1176, "step": 910 }, { "epoch": 0.48911110365423976, "grad_norm": 0.82421875, "learning_rate": 4.8850951885111215e-05, "loss": 0.9238, "step": 911 }, { "epoch": 0.4896479983893158, "grad_norm": 0.65234375, "learning_rate": 4.8848419638525545e-05, "loss": 0.8153, "step": 912 }, { "epoch": 0.4901848931243918, "grad_norm": 0.73828125, "learning_rate": 4.8845884670533184e-05, "loss": 1.0782, "step": 913 }, { "epoch": 0.4907217878594678, "grad_norm": 0.69140625, "learning_rate": 4.884334698142339e-05, "loss": 0.8182, "step": 914 }, { "epoch": 0.4912586825945438, "grad_norm": 0.65625, "learning_rate": 4.884080657148576e-05, "loss": 0.9822, "step": 915 }, { "epoch": 0.4917955773296198, "grad_norm": 0.72265625, "learning_rate": 4.8838263441010186e-05, "loss": 0.8813, "step": 916 }, { "epoch": 0.4923324720646958, "grad_norm": 0.6796875, "learning_rate": 4.8835717590286857e-05, "loss": 0.8641, "step": 917 }, { "epoch": 0.49286936679977184, "grad_norm": 0.74609375, "learning_rate": 4.88331690196063e-05, "loss": 1.0673, "step": 918 }, { "epoch": 0.49340626153484785, "grad_norm": 0.74609375, "learning_rate": 4.883061772925933e-05, "loss": 0.812, "step": 919 }, { "epoch": 0.49394315626992386, "grad_norm": 1.15625, "learning_rate": 4.88280637195371e-05, "loss": 1.0478, "step": 920 }, { "epoch": 0.4944800510049998, "grad_norm": 0.81640625, "learning_rate": 4.882550699073103e-05, "loss": 0.9996, "step": 921 }, { "epoch": 0.4950169457400758, "grad_norm": 0.94921875, "learning_rate": 4.882294754313289e-05, "loss": 1.2174, "step": 922 }, { "epoch": 0.49555384047515183, "grad_norm": 0.796875, "learning_rate": 4.882038537703474e-05, "loss": 1.1008, "step": 923 }, { "epoch": 0.49609073521022784, "grad_norm": 0.84375, "learning_rate": 4.881782049272896e-05, "loss": 1.1056, "step": 924 }, { "epoch": 0.49662762994530385, "grad_norm": 0.78125, "learning_rate": 4.881525289050824e-05, "loss": 1.0885, "step": 925 }, { "epoch": 0.49716452468037986, "grad_norm": 0.81640625, "learning_rate": 4.8812682570665556e-05, "loss": 0.8931, "step": 926 }, { "epoch": 0.49770141941545587, "grad_norm": 0.828125, "learning_rate": 4.881010953349423e-05, "loss": 0.9753, "step": 927 }, { "epoch": 0.4982383141505319, "grad_norm": 0.66015625, "learning_rate": 4.880753377928788e-05, "loss": 0.9947, "step": 928 }, { "epoch": 0.4987752088856079, "grad_norm": 0.80859375, "learning_rate": 4.8804955308340425e-05, "loss": 0.9936, "step": 929 }, { "epoch": 0.49931210362068384, "grad_norm": 0.796875, "learning_rate": 4.880237412094611e-05, "loss": 1.0456, "step": 930 }, { "epoch": 0.49984899835575985, "grad_norm": 0.76953125, "learning_rate": 4.879979021739947e-05, "loss": 0.8974, "step": 931 }, { "epoch": 0.5003858930908359, "grad_norm": 0.78515625, "learning_rate": 4.879720359799537e-05, "loss": 0.9897, "step": 932 }, { "epoch": 0.5009227878259119, "grad_norm": 0.8828125, "learning_rate": 4.879461426302897e-05, "loss": 1.0921, "step": 933 }, { "epoch": 0.5014596825609879, "grad_norm": 0.6875, "learning_rate": 4.879202221279575e-05, "loss": 0.9196, "step": 934 }, { "epoch": 0.5019965772960638, "grad_norm": 0.765625, "learning_rate": 4.8789427447591486e-05, "loss": 1.1765, "step": 935 }, { "epoch": 0.5025334720311398, "grad_norm": 0.76171875, "learning_rate": 4.878682996771229e-05, "loss": 0.9488, "step": 936 }, { "epoch": 0.5030703667662159, "grad_norm": 0.734375, "learning_rate": 4.8784229773454555e-05, "loss": 0.8722, "step": 937 }, { "epoch": 0.5036072615012919, "grad_norm": 1.1015625, "learning_rate": 4.8781626865115005e-05, "loss": 1.0332, "step": 938 }, { "epoch": 0.5041441562363679, "grad_norm": 0.69140625, "learning_rate": 4.877902124299065e-05, "loss": 0.8815, "step": 939 }, { "epoch": 0.5046810509714439, "grad_norm": 0.625, "learning_rate": 4.877641290737884e-05, "loss": 0.9193, "step": 940 }, { "epoch": 0.5052179457065199, "grad_norm": 1.4609375, "learning_rate": 4.877380185857722e-05, "loss": 1.0847, "step": 941 }, { "epoch": 0.5057548404415959, "grad_norm": 0.73828125, "learning_rate": 4.877118809688372e-05, "loss": 0.8385, "step": 942 }, { "epoch": 0.5062917351766719, "grad_norm": 0.7734375, "learning_rate": 4.876857162259664e-05, "loss": 1.125, "step": 943 }, { "epoch": 0.5068286299117479, "grad_norm": 0.734375, "learning_rate": 4.8765952436014515e-05, "loss": 0.9923, "step": 944 }, { "epoch": 0.5073655246468239, "grad_norm": 0.72265625, "learning_rate": 4.8763330537436255e-05, "loss": 0.8513, "step": 945 }, { "epoch": 0.5079024193819, "grad_norm": 1.03125, "learning_rate": 4.876070592716105e-05, "loss": 1.0621, "step": 946 }, { "epoch": 0.508439314116976, "grad_norm": 0.8125, "learning_rate": 4.875807860548838e-05, "loss": 1.0328, "step": 947 }, { "epoch": 0.508976208852052, "grad_norm": 0.64453125, "learning_rate": 4.875544857271808e-05, "loss": 0.9213, "step": 948 }, { "epoch": 0.509513103587128, "grad_norm": 0.67578125, "learning_rate": 4.875281582915026e-05, "loss": 0.883, "step": 949 }, { "epoch": 0.510049998322204, "grad_norm": 0.65234375, "learning_rate": 4.8750180375085344e-05, "loss": 0.9326, "step": 950 }, { "epoch": 0.510049998322204, "eval_loss": 0.9448692202568054, "eval_runtime": 46.4895, "eval_samples_per_second": 20.693, "eval_steps_per_second": 20.693, "step": 950 }, { "epoch": 0.51058689305728, "grad_norm": 0.80078125, "learning_rate": 4.874754221082407e-05, "loss": 1.0214, "step": 951 }, { "epoch": 0.511123787792356, "grad_norm": 0.85546875, "learning_rate": 4.874490133666749e-05, "loss": 0.8779, "step": 952 }, { "epoch": 0.511660682527432, "grad_norm": 0.8203125, "learning_rate": 4.874225775291697e-05, "loss": 1.1105, "step": 953 }, { "epoch": 0.5121975772625079, "grad_norm": 0.9296875, "learning_rate": 4.873961145987417e-05, "loss": 1.1597, "step": 954 }, { "epoch": 0.5127344719975839, "grad_norm": 0.84375, "learning_rate": 4.873696245784106e-05, "loss": 0.9641, "step": 955 }, { "epoch": 0.5132713667326599, "grad_norm": 0.75, "learning_rate": 4.8734310747119935e-05, "loss": 1.0023, "step": 956 }, { "epoch": 0.513808261467736, "grad_norm": 0.68359375, "learning_rate": 4.873165632801338e-05, "loss": 0.9229, "step": 957 }, { "epoch": 0.514345156202812, "grad_norm": 0.6796875, "learning_rate": 4.87289992008243e-05, "loss": 0.8132, "step": 958 }, { "epoch": 0.514882050937888, "grad_norm": 0.7421875, "learning_rate": 4.872633936585591e-05, "loss": 1.0589, "step": 959 }, { "epoch": 0.515418945672964, "grad_norm": 0.66796875, "learning_rate": 4.872367682341173e-05, "loss": 0.9771, "step": 960 }, { "epoch": 0.51595584040804, "grad_norm": 0.7265625, "learning_rate": 4.872101157379558e-05, "loss": 1.1707, "step": 961 }, { "epoch": 0.516492735143116, "grad_norm": 0.7109375, "learning_rate": 4.871834361731162e-05, "loss": 0.9431, "step": 962 }, { "epoch": 0.517029629878192, "grad_norm": 0.71875, "learning_rate": 4.871567295426427e-05, "loss": 0.894, "step": 963 }, { "epoch": 0.517566524613268, "grad_norm": 0.8359375, "learning_rate": 4.8712999584958314e-05, "loss": 1.1675, "step": 964 }, { "epoch": 0.518103419348344, "grad_norm": 0.6484375, "learning_rate": 4.87103235096988e-05, "loss": 0.8444, "step": 965 }, { "epoch": 0.51864031408342, "grad_norm": 0.6640625, "learning_rate": 4.87076447287911e-05, "loss": 0.7976, "step": 966 }, { "epoch": 0.519177208818496, "grad_norm": 0.68359375, "learning_rate": 4.870496324254091e-05, "loss": 0.9452, "step": 967 }, { "epoch": 0.5197141035535721, "grad_norm": 0.7578125, "learning_rate": 4.870227905125422e-05, "loss": 0.9365, "step": 968 }, { "epoch": 0.5202509982886481, "grad_norm": 0.71875, "learning_rate": 4.8699592155237316e-05, "loss": 0.9319, "step": 969 }, { "epoch": 0.5207878930237241, "grad_norm": 0.796875, "learning_rate": 4.869690255479682e-05, "loss": 0.9302, "step": 970 }, { "epoch": 0.5213247877588001, "grad_norm": 0.63671875, "learning_rate": 4.869421025023965e-05, "loss": 0.8786, "step": 971 }, { "epoch": 0.521861682493876, "grad_norm": 1.203125, "learning_rate": 4.8691515241873023e-05, "loss": 0.8714, "step": 972 }, { "epoch": 0.522398577228952, "grad_norm": 0.6328125, "learning_rate": 4.8688817530004473e-05, "loss": 0.9017, "step": 973 }, { "epoch": 0.522935471964028, "grad_norm": 0.94140625, "learning_rate": 4.868611711494186e-05, "loss": 1.1617, "step": 974 }, { "epoch": 0.523472366699104, "grad_norm": 0.71484375, "learning_rate": 4.868341399699331e-05, "loss": 0.9164, "step": 975 }, { "epoch": 0.52400926143418, "grad_norm": 0.71875, "learning_rate": 4.8680708176467305e-05, "loss": 0.9218, "step": 976 }, { "epoch": 0.524546156169256, "grad_norm": 0.890625, "learning_rate": 4.86779996536726e-05, "loss": 1.0642, "step": 977 }, { "epoch": 0.525083050904332, "grad_norm": 0.6328125, "learning_rate": 4.867528842891828e-05, "loss": 0.9238, "step": 978 }, { "epoch": 0.5256199456394081, "grad_norm": 0.65625, "learning_rate": 4.867257450251373e-05, "loss": 0.9341, "step": 979 }, { "epoch": 0.5261568403744841, "grad_norm": 1.0078125, "learning_rate": 4.866985787476863e-05, "loss": 1.1306, "step": 980 }, { "epoch": 0.5266937351095601, "grad_norm": 0.77734375, "learning_rate": 4.866713854599301e-05, "loss": 0.9636, "step": 981 }, { "epoch": 0.5272306298446361, "grad_norm": 0.7890625, "learning_rate": 4.866441651649715e-05, "loss": 1.0581, "step": 982 }, { "epoch": 0.5277675245797121, "grad_norm": 0.96875, "learning_rate": 4.866169178659168e-05, "loss": 0.9646, "step": 983 }, { "epoch": 0.5283044193147881, "grad_norm": 0.7421875, "learning_rate": 4.865896435658752e-05, "loss": 0.9056, "step": 984 }, { "epoch": 0.5288413140498641, "grad_norm": 0.80078125, "learning_rate": 4.865623422679593e-05, "loss": 1.0613, "step": 985 }, { "epoch": 0.5293782087849401, "grad_norm": 0.78125, "learning_rate": 4.865350139752841e-05, "loss": 0.8856, "step": 986 }, { "epoch": 0.5299151035200161, "grad_norm": 0.84765625, "learning_rate": 4.865076586909685e-05, "loss": 1.0529, "step": 987 }, { "epoch": 0.5304519982550921, "grad_norm": 0.6875, "learning_rate": 4.8648027641813384e-05, "loss": 1.0098, "step": 988 }, { "epoch": 0.5309888929901682, "grad_norm": 0.77734375, "learning_rate": 4.864528671599049e-05, "loss": 1.0291, "step": 989 }, { "epoch": 0.5315257877252442, "grad_norm": 0.65625, "learning_rate": 4.864254309194093e-05, "loss": 0.7555, "step": 990 }, { "epoch": 0.5320626824603201, "grad_norm": 0.71875, "learning_rate": 4.86397967699778e-05, "loss": 1.0176, "step": 991 }, { "epoch": 0.5325995771953961, "grad_norm": 0.66015625, "learning_rate": 4.863704775041449e-05, "loss": 0.9002, "step": 992 }, { "epoch": 0.5331364719304721, "grad_norm": 0.6640625, "learning_rate": 4.863429603356469e-05, "loss": 0.9122, "step": 993 }, { "epoch": 0.5336733666655481, "grad_norm": 0.765625, "learning_rate": 4.8631541619742405e-05, "loss": 0.9334, "step": 994 }, { "epoch": 0.5342102614006241, "grad_norm": 0.71875, "learning_rate": 4.862878450926195e-05, "loss": 0.7841, "step": 995 }, { "epoch": 0.5347471561357001, "grad_norm": 0.70703125, "learning_rate": 4.8626024702437956e-05, "loss": 0.958, "step": 996 }, { "epoch": 0.5352840508707761, "grad_norm": 0.76171875, "learning_rate": 4.862326219958533e-05, "loss": 0.9678, "step": 997 }, { "epoch": 0.5358209456058521, "grad_norm": 0.71484375, "learning_rate": 4.862049700101934e-05, "loss": 0.894, "step": 998 }, { "epoch": 0.5363578403409281, "grad_norm": 0.5625, "learning_rate": 4.86177291070555e-05, "loss": 0.771, "step": 999 }, { "epoch": 0.5368947350760042, "grad_norm": 0.71484375, "learning_rate": 4.861495851800968e-05, "loss": 0.8994, "step": 1000 }, { "epoch": 0.5368947350760042, "eval_loss": 0.942378580570221, "eval_runtime": 46.24, "eval_samples_per_second": 20.804, "eval_steps_per_second": 20.804, "step": 1000 }, { "epoch": 0.5374316298110802, "grad_norm": 0.7109375, "learning_rate": 4.861218523419804e-05, "loss": 0.9404, "step": 1001 }, { "epoch": 0.5379685245461562, "grad_norm": 0.66015625, "learning_rate": 4.860940925593703e-05, "loss": 0.7577, "step": 1002 }, { "epoch": 0.5385054192812322, "grad_norm": 0.7265625, "learning_rate": 4.860663058354344e-05, "loss": 0.9183, "step": 1003 }, { "epoch": 0.5390423140163082, "grad_norm": 0.96875, "learning_rate": 4.860384921733434e-05, "loss": 1.0715, "step": 1004 }, { "epoch": 0.5395792087513842, "grad_norm": 0.6484375, "learning_rate": 4.8601065157627135e-05, "loss": 0.904, "step": 1005 }, { "epoch": 0.5401161034864602, "grad_norm": 0.8984375, "learning_rate": 4.8598278404739507e-05, "loss": 1.0282, "step": 1006 }, { "epoch": 0.5406529982215362, "grad_norm": 0.671875, "learning_rate": 4.8595488958989466e-05, "loss": 0.8043, "step": 1007 }, { "epoch": 0.5411898929566122, "grad_norm": 0.9609375, "learning_rate": 4.859269682069533e-05, "loss": 0.8917, "step": 1008 }, { "epoch": 0.5417267876916882, "grad_norm": 0.828125, "learning_rate": 4.85899019901757e-05, "loss": 1.1733, "step": 1009 }, { "epoch": 0.5422636824267641, "grad_norm": 1.015625, "learning_rate": 4.858710446774951e-05, "loss": 0.9036, "step": 1010 }, { "epoch": 0.5428005771618402, "grad_norm": 0.71484375, "learning_rate": 4.858430425373599e-05, "loss": 0.9303, "step": 1011 }, { "epoch": 0.5433374718969162, "grad_norm": 0.7265625, "learning_rate": 4.8581501348454696e-05, "loss": 0.924, "step": 1012 }, { "epoch": 0.5438743666319922, "grad_norm": 0.73828125, "learning_rate": 4.857869575222545e-05, "loss": 0.8699, "step": 1013 }, { "epoch": 0.5444112613670682, "grad_norm": 0.6171875, "learning_rate": 4.857588746536843e-05, "loss": 0.8638, "step": 1014 }, { "epoch": 0.5449481561021442, "grad_norm": 0.75, "learning_rate": 4.857307648820408e-05, "loss": 0.9615, "step": 1015 }, { "epoch": 0.5454850508372202, "grad_norm": 0.80078125, "learning_rate": 4.857026282105318e-05, "loss": 0.9276, "step": 1016 }, { "epoch": 0.5460219455722962, "grad_norm": 0.9375, "learning_rate": 4.856744646423679e-05, "loss": 1.3419, "step": 1017 }, { "epoch": 0.5465588403073722, "grad_norm": 0.75390625, "learning_rate": 4.8564627418076306e-05, "loss": 0.997, "step": 1018 }, { "epoch": 0.5470957350424482, "grad_norm": 0.78515625, "learning_rate": 4.856180568289341e-05, "loss": 1.0873, "step": 1019 }, { "epoch": 0.5476326297775242, "grad_norm": 0.67578125, "learning_rate": 4.85589812590101e-05, "loss": 0.9318, "step": 1020 }, { "epoch": 0.5481695245126003, "grad_norm": 0.8671875, "learning_rate": 4.855615414674868e-05, "loss": 0.9326, "step": 1021 }, { "epoch": 0.5487064192476763, "grad_norm": 0.96875, "learning_rate": 4.855332434643175e-05, "loss": 1.1083, "step": 1022 }, { "epoch": 0.5492433139827523, "grad_norm": 0.7890625, "learning_rate": 4.8550491858382244e-05, "loss": 1.0586, "step": 1023 }, { "epoch": 0.5497802087178283, "grad_norm": 0.640625, "learning_rate": 4.854765668292337e-05, "loss": 0.8821, "step": 1024 }, { "epoch": 0.5503171034529043, "grad_norm": 0.671875, "learning_rate": 4.8544818820378656e-05, "loss": 1.0563, "step": 1025 }, { "epoch": 0.5508539981879803, "grad_norm": 0.82421875, "learning_rate": 4.8541978271071946e-05, "loss": 0.8077, "step": 1026 }, { "epoch": 0.5513908929230563, "grad_norm": 0.77734375, "learning_rate": 4.8539135035327386e-05, "loss": 1.1503, "step": 1027 }, { "epoch": 0.5519277876581323, "grad_norm": 0.91796875, "learning_rate": 4.8536289113469416e-05, "loss": 0.9103, "step": 1028 }, { "epoch": 0.5524646823932082, "grad_norm": 0.65625, "learning_rate": 4.853344050582279e-05, "loss": 0.9062, "step": 1029 }, { "epoch": 0.5530015771282842, "grad_norm": 0.8828125, "learning_rate": 4.853058921271259e-05, "loss": 1.0392, "step": 1030 }, { "epoch": 0.5535384718633602, "grad_norm": 0.77734375, "learning_rate": 4.8527735234464156e-05, "loss": 1.0447, "step": 1031 }, { "epoch": 0.5540753665984363, "grad_norm": 0.78125, "learning_rate": 4.8524878571403175e-05, "loss": 0.7923, "step": 1032 }, { "epoch": 0.5546122613335123, "grad_norm": 0.796875, "learning_rate": 4.852201922385564e-05, "loss": 0.9418, "step": 1033 }, { "epoch": 0.5551491560685883, "grad_norm": 0.78515625, "learning_rate": 4.851915719214782e-05, "loss": 0.8726, "step": 1034 }, { "epoch": 0.5556860508036643, "grad_norm": 1.109375, "learning_rate": 4.851629247660633e-05, "loss": 1.1544, "step": 1035 }, { "epoch": 0.5562229455387403, "grad_norm": 0.8984375, "learning_rate": 4.851342507755805e-05, "loss": 1.1065, "step": 1036 }, { "epoch": 0.5567598402738163, "grad_norm": 0.7265625, "learning_rate": 4.8510554995330205e-05, "loss": 1.0341, "step": 1037 }, { "epoch": 0.5572967350088923, "grad_norm": 0.69921875, "learning_rate": 4.850768223025028e-05, "loss": 0.9686, "step": 1038 }, { "epoch": 0.5578336297439683, "grad_norm": 0.87109375, "learning_rate": 4.850480678264613e-05, "loss": 1.014, "step": 1039 }, { "epoch": 0.5583705244790443, "grad_norm": 0.7109375, "learning_rate": 4.8501928652845854e-05, "loss": 0.9053, "step": 1040 }, { "epoch": 0.5589074192141203, "grad_norm": 0.875, "learning_rate": 4.849904784117789e-05, "loss": 1.1054, "step": 1041 }, { "epoch": 0.5594443139491964, "grad_norm": 0.6796875, "learning_rate": 4.849616434797098e-05, "loss": 0.861, "step": 1042 }, { "epoch": 0.5599812086842724, "grad_norm": 0.83984375, "learning_rate": 4.8493278173554156e-05, "loss": 0.9483, "step": 1043 }, { "epoch": 0.5605181034193484, "grad_norm": 0.7421875, "learning_rate": 4.8490389318256776e-05, "loss": 0.7822, "step": 1044 }, { "epoch": 0.5610549981544244, "grad_norm": 0.79296875, "learning_rate": 4.8487497782408506e-05, "loss": 1.0347, "step": 1045 }, { "epoch": 0.5615918928895004, "grad_norm": 0.58984375, "learning_rate": 4.848460356633928e-05, "loss": 0.9687, "step": 1046 }, { "epoch": 0.5621287876245764, "grad_norm": 0.73828125, "learning_rate": 4.848170667037938e-05, "loss": 1.0925, "step": 1047 }, { "epoch": 0.5626656823596523, "grad_norm": 0.8515625, "learning_rate": 4.847880709485938e-05, "loss": 0.9552, "step": 1048 }, { "epoch": 0.5632025770947283, "grad_norm": 0.69921875, "learning_rate": 4.847590484011015e-05, "loss": 1.0899, "step": 1049 }, { "epoch": 0.5637394718298043, "grad_norm": 0.89453125, "learning_rate": 4.847299990646289e-05, "loss": 1.0149, "step": 1050 }, { "epoch": 0.5637394718298043, "eval_loss": 0.9401158690452576, "eval_runtime": 46.0879, "eval_samples_per_second": 20.873, "eval_steps_per_second": 20.873, "step": 1050 }, { "epoch": 0.5642763665648803, "grad_norm": 0.82421875, "learning_rate": 4.8470092294249075e-05, "loss": 0.9582, "step": 1051 }, { "epoch": 0.5648132612999563, "grad_norm": 0.83984375, "learning_rate": 4.84671820038005e-05, "loss": 0.7745, "step": 1052 }, { "epoch": 0.5653501560350324, "grad_norm": 0.73828125, "learning_rate": 4.8464269035449275e-05, "loss": 1.0579, "step": 1053 }, { "epoch": 0.5658870507701084, "grad_norm": 0.59375, "learning_rate": 4.84613533895278e-05, "loss": 0.893, "step": 1054 }, { "epoch": 0.5664239455051844, "grad_norm": 1.0625, "learning_rate": 4.845843506636879e-05, "loss": 1.0196, "step": 1055 }, { "epoch": 0.5669608402402604, "grad_norm": 0.73828125, "learning_rate": 4.845551406630526e-05, "loss": 0.968, "step": 1056 }, { "epoch": 0.5674977349753364, "grad_norm": 0.6328125, "learning_rate": 4.845259038967054e-05, "loss": 0.9198, "step": 1057 }, { "epoch": 0.5680346297104124, "grad_norm": 0.70703125, "learning_rate": 4.844966403679825e-05, "loss": 0.9292, "step": 1058 }, { "epoch": 0.5685715244454884, "grad_norm": 0.703125, "learning_rate": 4.8446735008022324e-05, "loss": 0.9598, "step": 1059 }, { "epoch": 0.5691084191805644, "grad_norm": 0.875, "learning_rate": 4.844380330367701e-05, "loss": 1.2437, "step": 1060 }, { "epoch": 0.5696453139156404, "grad_norm": 0.7265625, "learning_rate": 4.844086892409685e-05, "loss": 0.8978, "step": 1061 }, { "epoch": 0.5701822086507164, "grad_norm": 1.5546875, "learning_rate": 4.8437931869616686e-05, "loss": 0.9245, "step": 1062 }, { "epoch": 0.5707191033857925, "grad_norm": 0.64453125, "learning_rate": 4.8434992140571675e-05, "loss": 0.8795, "step": 1063 }, { "epoch": 0.5712559981208685, "grad_norm": 0.86328125, "learning_rate": 4.843204973729729e-05, "loss": 1.1411, "step": 1064 }, { "epoch": 0.5717928928559445, "grad_norm": 0.76171875, "learning_rate": 4.8429104660129284e-05, "loss": 1.0165, "step": 1065 }, { "epoch": 0.5723297875910205, "grad_norm": 0.77734375, "learning_rate": 4.842615690940373e-05, "loss": 0.9937, "step": 1066 }, { "epoch": 0.5728666823260964, "grad_norm": 0.67578125, "learning_rate": 4.8423206485456995e-05, "loss": 0.9006, "step": 1067 }, { "epoch": 0.5734035770611724, "grad_norm": 0.953125, "learning_rate": 4.842025338862578e-05, "loss": 1.0157, "step": 1068 }, { "epoch": 0.5739404717962484, "grad_norm": 0.828125, "learning_rate": 4.841729761924706e-05, "loss": 0.8855, "step": 1069 }, { "epoch": 0.5744773665313244, "grad_norm": 0.9296875, "learning_rate": 4.841433917765812e-05, "loss": 0.9902, "step": 1070 }, { "epoch": 0.5750142612664004, "grad_norm": 0.78125, "learning_rate": 4.841137806419657e-05, "loss": 0.9989, "step": 1071 }, { "epoch": 0.5755511560014764, "grad_norm": 0.85546875, "learning_rate": 4.840841427920029e-05, "loss": 1.1161, "step": 1072 }, { "epoch": 0.5760880507365524, "grad_norm": 0.703125, "learning_rate": 4.84054478230075e-05, "loss": 0.852, "step": 1073 }, { "epoch": 0.5766249454716285, "grad_norm": 0.7578125, "learning_rate": 4.840247869595671e-05, "loss": 1.0496, "step": 1074 }, { "epoch": 0.5771618402067045, "grad_norm": 0.8125, "learning_rate": 4.839950689838674e-05, "loss": 1.0162, "step": 1075 }, { "epoch": 0.5776987349417805, "grad_norm": 0.8046875, "learning_rate": 4.8396532430636684e-05, "loss": 0.757, "step": 1076 }, { "epoch": 0.5782356296768565, "grad_norm": 0.82421875, "learning_rate": 4.8393555293046e-05, "loss": 0.9283, "step": 1077 }, { "epoch": 0.5787725244119325, "grad_norm": 0.7421875, "learning_rate": 4.83905754859544e-05, "loss": 1.0616, "step": 1078 }, { "epoch": 0.5793094191470085, "grad_norm": 0.82421875, "learning_rate": 4.8387593009701914e-05, "loss": 1.1749, "step": 1079 }, { "epoch": 0.5798463138820845, "grad_norm": 0.84765625, "learning_rate": 4.838460786462888e-05, "loss": 1.1078, "step": 1080 }, { "epoch": 0.5803832086171605, "grad_norm": 0.76171875, "learning_rate": 4.838162005107596e-05, "loss": 0.7911, "step": 1081 }, { "epoch": 0.5809201033522365, "grad_norm": 0.609375, "learning_rate": 4.837862956938408e-05, "loss": 0.8446, "step": 1082 }, { "epoch": 0.5814569980873125, "grad_norm": 0.6875, "learning_rate": 4.837563641989451e-05, "loss": 1.1396, "step": 1083 }, { "epoch": 0.5819938928223886, "grad_norm": 0.73046875, "learning_rate": 4.837264060294878e-05, "loss": 1.0937, "step": 1084 }, { "epoch": 0.5825307875574646, "grad_norm": 1.0546875, "learning_rate": 4.836964211888878e-05, "loss": 1.1845, "step": 1085 }, { "epoch": 0.5830676822925405, "grad_norm": 0.6015625, "learning_rate": 4.836664096805665e-05, "loss": 0.9594, "step": 1086 }, { "epoch": 0.5836045770276165, "grad_norm": 0.7265625, "learning_rate": 4.8363637150794884e-05, "loss": 1.0269, "step": 1087 }, { "epoch": 0.5841414717626925, "grad_norm": 0.71484375, "learning_rate": 4.8360630667446236e-05, "loss": 0.9155, "step": 1088 }, { "epoch": 0.5846783664977685, "grad_norm": 0.70703125, "learning_rate": 4.835762151835379e-05, "loss": 0.8554, "step": 1089 }, { "epoch": 0.5852152612328445, "grad_norm": 0.74609375, "learning_rate": 4.835460970386093e-05, "loss": 0.856, "step": 1090 }, { "epoch": 0.5857521559679205, "grad_norm": 0.7265625, "learning_rate": 4.835159522431134e-05, "loss": 0.8173, "step": 1091 }, { "epoch": 0.5862890507029965, "grad_norm": 0.66796875, "learning_rate": 4.8348578080049004e-05, "loss": 1.0669, "step": 1092 }, { "epoch": 0.5868259454380725, "grad_norm": 0.75390625, "learning_rate": 4.8345558271418235e-05, "loss": 0.987, "step": 1093 }, { "epoch": 0.5873628401731485, "grad_norm": 0.68359375, "learning_rate": 4.834253579876361e-05, "loss": 0.8267, "step": 1094 }, { "epoch": 0.5878997349082246, "grad_norm": 1.0390625, "learning_rate": 4.8339510662430046e-05, "loss": 0.9502, "step": 1095 }, { "epoch": 0.5884366296433006, "grad_norm": 0.671875, "learning_rate": 4.833648286276275e-05, "loss": 0.9575, "step": 1096 }, { "epoch": 0.5889735243783766, "grad_norm": 0.81640625, "learning_rate": 4.8333452400107217e-05, "loss": 1.1301, "step": 1097 }, { "epoch": 0.5895104191134526, "grad_norm": 0.83203125, "learning_rate": 4.833041927480927e-05, "loss": 1.1176, "step": 1098 }, { "epoch": 0.5900473138485286, "grad_norm": 0.73046875, "learning_rate": 4.832738348721503e-05, "loss": 0.8473, "step": 1099 }, { "epoch": 0.5905842085836046, "grad_norm": 0.76953125, "learning_rate": 4.832434503767092e-05, "loss": 0.9467, "step": 1100 }, { "epoch": 0.5905842085836046, "eval_loss": 0.9381348490715027, "eval_runtime": 46.4802, "eval_samples_per_second": 20.697, "eval_steps_per_second": 20.697, "step": 1100 }, { "epoch": 0.5911211033186806, "grad_norm": 0.6640625, "learning_rate": 4.8321303926523665e-05, "loss": 1.0117, "step": 1101 }, { "epoch": 0.5916579980537566, "grad_norm": 0.75, "learning_rate": 4.8318260154120286e-05, "loss": 1.2182, "step": 1102 }, { "epoch": 0.5921948927888326, "grad_norm": 0.765625, "learning_rate": 4.831521372080812e-05, "loss": 1.0126, "step": 1103 }, { "epoch": 0.5927317875239086, "grad_norm": 0.70703125, "learning_rate": 4.831216462693481e-05, "loss": 0.8073, "step": 1104 }, { "epoch": 0.5932686822589845, "grad_norm": 0.91796875, "learning_rate": 4.8309112872848296e-05, "loss": 0.9932, "step": 1105 }, { "epoch": 0.5938055769940606, "grad_norm": 0.8515625, "learning_rate": 4.830605845889681e-05, "loss": 1.0643, "step": 1106 }, { "epoch": 0.5943424717291366, "grad_norm": 0.5859375, "learning_rate": 4.83030013854289e-05, "loss": 0.8294, "step": 1107 }, { "epoch": 0.5948793664642126, "grad_norm": 0.703125, "learning_rate": 4.829994165279343e-05, "loss": 0.8283, "step": 1108 }, { "epoch": 0.5954162611992886, "grad_norm": 0.63671875, "learning_rate": 4.829687926133955e-05, "loss": 0.9631, "step": 1109 }, { "epoch": 0.5959531559343646, "grad_norm": 0.6875, "learning_rate": 4.829381421141671e-05, "loss": 0.8741, "step": 1110 }, { "epoch": 0.5964900506694406, "grad_norm": 0.83203125, "learning_rate": 4.8290746503374687e-05, "loss": 1.0303, "step": 1111 }, { "epoch": 0.5970269454045166, "grad_norm": 0.76171875, "learning_rate": 4.828767613756352e-05, "loss": 0.9702, "step": 1112 }, { "epoch": 0.5975638401395926, "grad_norm": 0.84375, "learning_rate": 4.82846031143336e-05, "loss": 0.9661, "step": 1113 }, { "epoch": 0.5981007348746686, "grad_norm": 0.859375, "learning_rate": 4.828152743403559e-05, "loss": 1.0958, "step": 1114 }, { "epoch": 0.5986376296097446, "grad_norm": 0.82421875, "learning_rate": 4.8278449097020464e-05, "loss": 1.0832, "step": 1115 }, { "epoch": 0.5991745243448207, "grad_norm": 0.7421875, "learning_rate": 4.827536810363949e-05, "loss": 1.1014, "step": 1116 }, { "epoch": 0.5997114190798967, "grad_norm": 0.65234375, "learning_rate": 4.827228445424426e-05, "loss": 0.9986, "step": 1117 }, { "epoch": 0.6002483138149727, "grad_norm": 0.765625, "learning_rate": 4.8269198149186656e-05, "loss": 1.0819, "step": 1118 }, { "epoch": 0.6007852085500487, "grad_norm": 0.6875, "learning_rate": 4.826610918881886e-05, "loss": 0.9433, "step": 1119 }, { "epoch": 0.6013221032851247, "grad_norm": 0.6484375, "learning_rate": 4.826301757349337e-05, "loss": 1.0211, "step": 1120 }, { "epoch": 0.6018589980202007, "grad_norm": 0.6875, "learning_rate": 4.825992330356297e-05, "loss": 0.9769, "step": 1121 }, { "epoch": 0.6023958927552767, "grad_norm": 0.6484375, "learning_rate": 4.825682637938075e-05, "loss": 0.8121, "step": 1122 }, { "epoch": 0.6029327874903526, "grad_norm": 0.984375, "learning_rate": 4.825372680130013e-05, "loss": 1.0143, "step": 1123 }, { "epoch": 0.6034696822254286, "grad_norm": 0.765625, "learning_rate": 4.825062456967479e-05, "loss": 1.0155, "step": 1124 }, { "epoch": 0.6040065769605046, "grad_norm": 0.7734375, "learning_rate": 4.824751968485874e-05, "loss": 0.937, "step": 1125 }, { "epoch": 0.6045434716955806, "grad_norm": 0.6953125, "learning_rate": 4.8244412147206284e-05, "loss": 0.928, "step": 1126 }, { "epoch": 0.6050803664306567, "grad_norm": 0.65625, "learning_rate": 4.8241301957072046e-05, "loss": 0.8057, "step": 1127 }, { "epoch": 0.6056172611657327, "grad_norm": 0.83984375, "learning_rate": 4.8238189114810916e-05, "loss": 0.8886, "step": 1128 }, { "epoch": 0.6061541559008087, "grad_norm": 0.66015625, "learning_rate": 4.823507362077813e-05, "loss": 0.7433, "step": 1129 }, { "epoch": 0.6066910506358847, "grad_norm": 0.76953125, "learning_rate": 4.823195547532919e-05, "loss": 0.9903, "step": 1130 }, { "epoch": 0.6072279453709607, "grad_norm": 0.5859375, "learning_rate": 4.822883467881992e-05, "loss": 0.9196, "step": 1131 }, { "epoch": 0.6077648401060367, "grad_norm": 0.82421875, "learning_rate": 4.822571123160645e-05, "loss": 1.0092, "step": 1132 }, { "epoch": 0.6083017348411127, "grad_norm": 0.71484375, "learning_rate": 4.8222585134045194e-05, "loss": 0.9917, "step": 1133 }, { "epoch": 0.6088386295761887, "grad_norm": 0.921875, "learning_rate": 4.821945638649289e-05, "loss": 0.942, "step": 1134 }, { "epoch": 0.6093755243112647, "grad_norm": 0.8359375, "learning_rate": 4.821632498930656e-05, "loss": 0.9521, "step": 1135 }, { "epoch": 0.6099124190463407, "grad_norm": 0.95703125, "learning_rate": 4.8213190942843535e-05, "loss": 1.0247, "step": 1136 }, { "epoch": 0.6104493137814168, "grad_norm": 0.671875, "learning_rate": 4.821005424746145e-05, "loss": 0.8676, "step": 1137 }, { "epoch": 0.6109862085164928, "grad_norm": 0.8203125, "learning_rate": 4.820691490351824e-05, "loss": 1.0077, "step": 1138 }, { "epoch": 0.6115231032515688, "grad_norm": 0.78515625, "learning_rate": 4.820377291137216e-05, "loss": 0.8987, "step": 1139 }, { "epoch": 0.6120599979866448, "grad_norm": 0.93359375, "learning_rate": 4.820062827138173e-05, "loss": 1.058, "step": 1140 }, { "epoch": 0.6125968927217208, "grad_norm": 0.71875, "learning_rate": 4.819748098390581e-05, "loss": 1.0804, "step": 1141 }, { "epoch": 0.6131337874567967, "grad_norm": 0.5859375, "learning_rate": 4.819433104930352e-05, "loss": 0.7626, "step": 1142 }, { "epoch": 0.6136706821918727, "grad_norm": 0.7421875, "learning_rate": 4.8191178467934346e-05, "loss": 1.0532, "step": 1143 }, { "epoch": 0.6142075769269487, "grad_norm": 0.82421875, "learning_rate": 4.818802324015801e-05, "loss": 0.9695, "step": 1144 }, { "epoch": 0.6147444716620247, "grad_norm": 0.69921875, "learning_rate": 4.8184865366334564e-05, "loss": 0.8658, "step": 1145 }, { "epoch": 0.6152813663971007, "grad_norm": 0.765625, "learning_rate": 4.818170484682437e-05, "loss": 0.826, "step": 1146 }, { "epoch": 0.6158182611321767, "grad_norm": 0.609375, "learning_rate": 4.8178541681988074e-05, "loss": 0.7409, "step": 1147 }, { "epoch": 0.6163551558672528, "grad_norm": 0.66796875, "learning_rate": 4.817537587218665e-05, "loss": 0.9064, "step": 1148 }, { "epoch": 0.6168920506023288, "grad_norm": 0.7734375, "learning_rate": 4.817220741778135e-05, "loss": 0.8841, "step": 1149 }, { "epoch": 0.6174289453374048, "grad_norm": 0.71875, "learning_rate": 4.816903631913372e-05, "loss": 1.0278, "step": 1150 }, { "epoch": 0.6174289453374048, "eval_loss": 0.9374992251396179, "eval_runtime": 46.3897, "eval_samples_per_second": 20.737, "eval_steps_per_second": 20.737, "step": 1150 }, { "epoch": 0.6179658400724808, "grad_norm": 0.9140625, "learning_rate": 4.816586257660564e-05, "loss": 0.963, "step": 1151 }, { "epoch": 0.6185027348075568, "grad_norm": 0.90234375, "learning_rate": 4.816268619055926e-05, "loss": 1.2144, "step": 1152 }, { "epoch": 0.6190396295426328, "grad_norm": 0.734375, "learning_rate": 4.815950716135706e-05, "loss": 1.0242, "step": 1153 }, { "epoch": 0.6195765242777088, "grad_norm": 0.75390625, "learning_rate": 4.81563254893618e-05, "loss": 0.9588, "step": 1154 }, { "epoch": 0.6201134190127848, "grad_norm": 0.8046875, "learning_rate": 4.815314117493655e-05, "loss": 0.6064, "step": 1155 }, { "epoch": 0.6206503137478608, "grad_norm": 0.8828125, "learning_rate": 4.814995421844468e-05, "loss": 1.1407, "step": 1156 }, { "epoch": 0.6211872084829368, "grad_norm": 0.8046875, "learning_rate": 4.814676462024988e-05, "loss": 0.9002, "step": 1157 }, { "epoch": 0.6217241032180129, "grad_norm": 0.73046875, "learning_rate": 4.814357238071608e-05, "loss": 0.9564, "step": 1158 }, { "epoch": 0.6222609979530889, "grad_norm": 0.64453125, "learning_rate": 4.8140377500207604e-05, "loss": 0.693, "step": 1159 }, { "epoch": 0.6227978926881649, "grad_norm": 0.9140625, "learning_rate": 4.8137179979088995e-05, "loss": 0.941, "step": 1160 }, { "epoch": 0.6233347874232408, "grad_norm": 0.73828125, "learning_rate": 4.813397981772514e-05, "loss": 0.9724, "step": 1161 }, { "epoch": 0.6238716821583168, "grad_norm": 0.734375, "learning_rate": 4.813077701648122e-05, "loss": 0.961, "step": 1162 }, { "epoch": 0.6244085768933928, "grad_norm": 0.62890625, "learning_rate": 4.812757157572272e-05, "loss": 0.815, "step": 1163 }, { "epoch": 0.6249454716284688, "grad_norm": 0.72265625, "learning_rate": 4.812436349581542e-05, "loss": 0.9533, "step": 1164 }, { "epoch": 0.6254823663635448, "grad_norm": 0.74609375, "learning_rate": 4.812115277712539e-05, "loss": 1.0351, "step": 1165 }, { "epoch": 0.6260192610986208, "grad_norm": 0.76953125, "learning_rate": 4.811793942001902e-05, "loss": 0.9545, "step": 1166 }, { "epoch": 0.6265561558336968, "grad_norm": 0.60546875, "learning_rate": 4.8114723424863003e-05, "loss": 0.8702, "step": 1167 }, { "epoch": 0.6270930505687728, "grad_norm": 0.80078125, "learning_rate": 4.811150479202432e-05, "loss": 1.1377, "step": 1168 }, { "epoch": 0.6276299453038489, "grad_norm": 0.76953125, "learning_rate": 4.8108283521870255e-05, "loss": 0.9175, "step": 1169 }, { "epoch": 0.6281668400389249, "grad_norm": 0.7578125, "learning_rate": 4.81050596147684e-05, "loss": 0.8968, "step": 1170 }, { "epoch": 0.6287037347740009, "grad_norm": 0.74609375, "learning_rate": 4.8101833071086635e-05, "loss": 1.1216, "step": 1171 }, { "epoch": 0.6292406295090769, "grad_norm": 0.859375, "learning_rate": 4.809860389119316e-05, "loss": 0.8454, "step": 1172 }, { "epoch": 0.6297775242441529, "grad_norm": 0.7734375, "learning_rate": 4.809537207545646e-05, "loss": 1.0855, "step": 1173 }, { "epoch": 0.6303144189792289, "grad_norm": 0.71875, "learning_rate": 4.8092137624245326e-05, "loss": 0.8737, "step": 1174 }, { "epoch": 0.6308513137143049, "grad_norm": 0.625, "learning_rate": 4.808890053792885e-05, "loss": 0.9678, "step": 1175 }, { "epoch": 0.6313882084493809, "grad_norm": 0.8125, "learning_rate": 4.808566081687643e-05, "loss": 0.9752, "step": 1176 }, { "epoch": 0.6319251031844569, "grad_norm": 0.80859375, "learning_rate": 4.808241846145775e-05, "loss": 1.0641, "step": 1177 }, { "epoch": 0.632461997919533, "grad_norm": 0.6640625, "learning_rate": 4.807917347204281e-05, "loss": 0.9785, "step": 1178 }, { "epoch": 0.632998892654609, "grad_norm": 0.7265625, "learning_rate": 4.80759258490019e-05, "loss": 0.9183, "step": 1179 }, { "epoch": 0.6335357873896849, "grad_norm": 0.703125, "learning_rate": 4.807267559270563e-05, "loss": 0.9138, "step": 1180 }, { "epoch": 0.6340726821247609, "grad_norm": 0.7265625, "learning_rate": 4.806942270352487e-05, "loss": 0.8897, "step": 1181 }, { "epoch": 0.6346095768598369, "grad_norm": 0.703125, "learning_rate": 4.8066167181830835e-05, "loss": 1.0239, "step": 1182 }, { "epoch": 0.6351464715949129, "grad_norm": 0.75, "learning_rate": 4.806290902799501e-05, "loss": 0.9348, "step": 1183 }, { "epoch": 0.6356833663299889, "grad_norm": 0.73046875, "learning_rate": 4.80596482423892e-05, "loss": 0.786, "step": 1184 }, { "epoch": 0.6362202610650649, "grad_norm": 0.609375, "learning_rate": 4.8056384825385495e-05, "loss": 0.7737, "step": 1185 }, { "epoch": 0.6367571558001409, "grad_norm": 0.66015625, "learning_rate": 4.80531187773563e-05, "loss": 0.8627, "step": 1186 }, { "epoch": 0.6372940505352169, "grad_norm": 0.66796875, "learning_rate": 4.804985009867431e-05, "loss": 0.9286, "step": 1187 }, { "epoch": 0.6378309452702929, "grad_norm": 0.6875, "learning_rate": 4.8046578789712515e-05, "loss": 0.9034, "step": 1188 }, { "epoch": 0.638367840005369, "grad_norm": 0.64453125, "learning_rate": 4.804330485084422e-05, "loss": 1.03, "step": 1189 }, { "epoch": 0.638904734740445, "grad_norm": 0.6875, "learning_rate": 4.8040028282443025e-05, "loss": 0.9326, "step": 1190 }, { "epoch": 0.639441629475521, "grad_norm": 0.82421875, "learning_rate": 4.8036749084882826e-05, "loss": 0.9014, "step": 1191 }, { "epoch": 0.639978524210597, "grad_norm": 0.6328125, "learning_rate": 4.8033467258537817e-05, "loss": 0.8208, "step": 1192 }, { "epoch": 0.640515418945673, "grad_norm": 0.734375, "learning_rate": 4.80301828037825e-05, "loss": 0.9107, "step": 1193 }, { "epoch": 0.641052313680749, "grad_norm": 0.7265625, "learning_rate": 4.802689572099167e-05, "loss": 1.1873, "step": 1194 }, { "epoch": 0.641589208415825, "grad_norm": 0.65234375, "learning_rate": 4.8023606010540426e-05, "loss": 1.0099, "step": 1195 }, { "epoch": 0.642126103150901, "grad_norm": 0.88671875, "learning_rate": 4.802031367280416e-05, "loss": 0.9254, "step": 1196 }, { "epoch": 0.642662997885977, "grad_norm": 0.7890625, "learning_rate": 4.801701870815859e-05, "loss": 1.2033, "step": 1197 }, { "epoch": 0.643199892621053, "grad_norm": 0.7421875, "learning_rate": 4.801372111697968e-05, "loss": 0.9532, "step": 1198 }, { "epoch": 0.6437367873561289, "grad_norm": 0.703125, "learning_rate": 4.801042089964376e-05, "loss": 0.8776, "step": 1199 }, { "epoch": 0.644273682091205, "grad_norm": 0.828125, "learning_rate": 4.800711805652741e-05, "loss": 0.9975, "step": 1200 }, { "epoch": 0.644273682091205, "eval_loss": 0.9368125200271606, "eval_runtime": 46.1689, "eval_samples_per_second": 20.837, "eval_steps_per_second": 20.837, "step": 1200 }, { "epoch": 0.644810576826281, "grad_norm": 0.859375, "learning_rate": 4.800381258800752e-05, "loss": 0.9638, "step": 1201 }, { "epoch": 0.645347471561357, "grad_norm": 0.7578125, "learning_rate": 4.8000504494461304e-05, "loss": 0.9991, "step": 1202 }, { "epoch": 0.645884366296433, "grad_norm": 0.64453125, "learning_rate": 4.799719377626624e-05, "loss": 0.8162, "step": 1203 }, { "epoch": 0.646421261031509, "grad_norm": 0.703125, "learning_rate": 4.799388043380013e-05, "loss": 0.9042, "step": 1204 }, { "epoch": 0.646958155766585, "grad_norm": 0.81640625, "learning_rate": 4.799056446744107e-05, "loss": 1.1001, "step": 1205 }, { "epoch": 0.647495050501661, "grad_norm": 0.765625, "learning_rate": 4.798724587756745e-05, "loss": 0.8964, "step": 1206 }, { "epoch": 0.648031945236737, "grad_norm": 0.73046875, "learning_rate": 4.798392466455798e-05, "loss": 0.8578, "step": 1207 }, { "epoch": 0.648568839971813, "grad_norm": 0.7265625, "learning_rate": 4.7980600828791625e-05, "loss": 0.9347, "step": 1208 }, { "epoch": 0.649105734706889, "grad_norm": 0.66796875, "learning_rate": 4.797727437064769e-05, "loss": 0.9061, "step": 1209 }, { "epoch": 0.649642629441965, "grad_norm": 0.78125, "learning_rate": 4.7973945290505766e-05, "loss": 1.0133, "step": 1210 }, { "epoch": 0.6501795241770411, "grad_norm": 0.6953125, "learning_rate": 4.797061358874575e-05, "loss": 1.0449, "step": 1211 }, { "epoch": 0.6507164189121171, "grad_norm": 0.67578125, "learning_rate": 4.796727926574781e-05, "loss": 0.803, "step": 1212 }, { "epoch": 0.6512533136471931, "grad_norm": 0.8046875, "learning_rate": 4.796394232189246e-05, "loss": 0.8309, "step": 1213 }, { "epoch": 0.6517902083822691, "grad_norm": 0.8046875, "learning_rate": 4.796060275756047e-05, "loss": 0.9637, "step": 1214 }, { "epoch": 0.6523271031173451, "grad_norm": 0.69921875, "learning_rate": 4.7957260573132944e-05, "loss": 0.8338, "step": 1215 }, { "epoch": 0.6528639978524211, "grad_norm": 0.73828125, "learning_rate": 4.795391576899125e-05, "loss": 1.0785, "step": 1216 }, { "epoch": 0.6534008925874971, "grad_norm": 0.91015625, "learning_rate": 4.795056834551708e-05, "loss": 1.2755, "step": 1217 }, { "epoch": 0.653937787322573, "grad_norm": 0.8828125, "learning_rate": 4.7947218303092414e-05, "loss": 1.132, "step": 1218 }, { "epoch": 0.654474682057649, "grad_norm": 0.7265625, "learning_rate": 4.794386564209953e-05, "loss": 0.9399, "step": 1219 }, { "epoch": 0.655011576792725, "grad_norm": 0.67578125, "learning_rate": 4.794051036292102e-05, "loss": 0.9379, "step": 1220 }, { "epoch": 0.655548471527801, "grad_norm": 0.88671875, "learning_rate": 4.793715246593976e-05, "loss": 1.0229, "step": 1221 }, { "epoch": 0.6560853662628771, "grad_norm": 0.83203125, "learning_rate": 4.793379195153894e-05, "loss": 0.8744, "step": 1222 }, { "epoch": 0.6566222609979531, "grad_norm": 0.7578125, "learning_rate": 4.793042882010202e-05, "loss": 1.0583, "step": 1223 }, { "epoch": 0.6571591557330291, "grad_norm": 0.87109375, "learning_rate": 4.7927063072012765e-05, "loss": 1.09, "step": 1224 }, { "epoch": 0.6576960504681051, "grad_norm": 0.7734375, "learning_rate": 4.7923694707655275e-05, "loss": 0.8834, "step": 1225 }, { "epoch": 0.6582329452031811, "grad_norm": 0.76171875, "learning_rate": 4.7920323727413917e-05, "loss": 0.9068, "step": 1226 }, { "epoch": 0.6587698399382571, "grad_norm": 0.734375, "learning_rate": 4.791695013167336e-05, "loss": 1.0008, "step": 1227 }, { "epoch": 0.6593067346733331, "grad_norm": 0.62890625, "learning_rate": 4.791357392081857e-05, "loss": 0.8071, "step": 1228 }, { "epoch": 0.6598436294084091, "grad_norm": 0.69140625, "learning_rate": 4.791019509523482e-05, "loss": 0.8902, "step": 1229 }, { "epoch": 0.6603805241434851, "grad_norm": 0.734375, "learning_rate": 4.790681365530768e-05, "loss": 0.9749, "step": 1230 }, { "epoch": 0.6609174188785611, "grad_norm": 0.6796875, "learning_rate": 4.7903429601423004e-05, "loss": 0.8686, "step": 1231 }, { "epoch": 0.6614543136136372, "grad_norm": 0.6953125, "learning_rate": 4.790004293396697e-05, "loss": 1.219, "step": 1232 }, { "epoch": 0.6619912083487132, "grad_norm": 0.86328125, "learning_rate": 4.789665365332602e-05, "loss": 1.2762, "step": 1233 }, { "epoch": 0.6625281030837892, "grad_norm": 0.79296875, "learning_rate": 4.789326175988693e-05, "loss": 1.0241, "step": 1234 }, { "epoch": 0.6630649978188652, "grad_norm": 0.6484375, "learning_rate": 4.7889867254036755e-05, "loss": 0.8839, "step": 1235 }, { "epoch": 0.6636018925539412, "grad_norm": 0.74609375, "learning_rate": 4.788647013616285e-05, "loss": 0.9357, "step": 1236 }, { "epoch": 0.6641387872890171, "grad_norm": 0.6875, "learning_rate": 4.7883070406652875e-05, "loss": 0.7911, "step": 1237 }, { "epoch": 0.6646756820240931, "grad_norm": 0.75390625, "learning_rate": 4.7879668065894766e-05, "loss": 0.8811, "step": 1238 }, { "epoch": 0.6652125767591691, "grad_norm": 0.80078125, "learning_rate": 4.7876263114276786e-05, "loss": 1.2227, "step": 1239 }, { "epoch": 0.6657494714942451, "grad_norm": 0.7109375, "learning_rate": 4.787285555218748e-05, "loss": 0.8563, "step": 1240 }, { "epoch": 0.6662863662293211, "grad_norm": 0.73828125, "learning_rate": 4.7869445380015697e-05, "loss": 1.0592, "step": 1241 }, { "epoch": 0.6668232609643971, "grad_norm": 0.77734375, "learning_rate": 4.786603259815058e-05, "loss": 0.9877, "step": 1242 }, { "epoch": 0.6673601556994732, "grad_norm": 0.7734375, "learning_rate": 4.7862617206981565e-05, "loss": 1.1321, "step": 1243 }, { "epoch": 0.6678970504345492, "grad_norm": 0.80078125, "learning_rate": 4.785919920689838e-05, "loss": 0.8493, "step": 1244 }, { "epoch": 0.6684339451696252, "grad_norm": 0.9921875, "learning_rate": 4.78557785982911e-05, "loss": 1.1701, "step": 1245 }, { "epoch": 0.6689708399047012, "grad_norm": 0.74609375, "learning_rate": 4.785235538155003e-05, "loss": 1.1437, "step": 1246 }, { "epoch": 0.6695077346397772, "grad_norm": 1.0390625, "learning_rate": 4.784892955706581e-05, "loss": 1.0322, "step": 1247 }, { "epoch": 0.6700446293748532, "grad_norm": 0.76171875, "learning_rate": 4.7845501125229367e-05, "loss": 0.9838, "step": 1248 }, { "epoch": 0.6705815241099292, "grad_norm": 0.6640625, "learning_rate": 4.784207008643193e-05, "loss": 0.9461, "step": 1249 }, { "epoch": 0.6711184188450052, "grad_norm": 0.77734375, "learning_rate": 4.783863644106502e-05, "loss": 1.0143, "step": 1250 }, { "epoch": 0.6711184188450052, "eval_loss": 0.9342340230941772, "eval_runtime": 46.1501, "eval_samples_per_second": 20.845, "eval_steps_per_second": 20.845, "step": 1250 }, { "epoch": 0.6716553135800812, "grad_norm": 0.9375, "learning_rate": 4.783520018952048e-05, "loss": 1.2811, "step": 1251 }, { "epoch": 0.6721922083151572, "grad_norm": 0.68359375, "learning_rate": 4.78317613321904e-05, "loss": 0.9107, "step": 1252 }, { "epoch": 0.6727291030502333, "grad_norm": 0.734375, "learning_rate": 4.7828319869467214e-05, "loss": 1.0983, "step": 1253 }, { "epoch": 0.6732659977853093, "grad_norm": 0.69140625, "learning_rate": 4.7824875801743636e-05, "loss": 0.8884, "step": 1254 }, { "epoch": 0.6738028925203853, "grad_norm": 0.734375, "learning_rate": 4.782142912941268e-05, "loss": 0.9643, "step": 1255 }, { "epoch": 0.6743397872554612, "grad_norm": 0.71875, "learning_rate": 4.781797985286763e-05, "loss": 0.9157, "step": 1256 }, { "epoch": 0.6748766819905372, "grad_norm": 0.640625, "learning_rate": 4.781452797250213e-05, "loss": 0.884, "step": 1257 }, { "epoch": 0.6754135767256132, "grad_norm": 0.83203125, "learning_rate": 4.7811073488710065e-05, "loss": 0.8141, "step": 1258 }, { "epoch": 0.6759504714606892, "grad_norm": 0.7109375, "learning_rate": 4.7807616401885634e-05, "loss": 0.7466, "step": 1259 }, { "epoch": 0.6764873661957652, "grad_norm": 0.625, "learning_rate": 4.780415671242334e-05, "loss": 0.8875, "step": 1260 }, { "epoch": 0.6770242609308412, "grad_norm": 0.60546875, "learning_rate": 4.7800694420717966e-05, "loss": 0.9136, "step": 1261 }, { "epoch": 0.6775611556659172, "grad_norm": 0.66015625, "learning_rate": 4.779722952716462e-05, "loss": 0.7917, "step": 1262 }, { "epoch": 0.6780980504009932, "grad_norm": 0.75390625, "learning_rate": 4.7793762032158676e-05, "loss": 1.0076, "step": 1263 }, { "epoch": 0.6786349451360693, "grad_norm": 0.57421875, "learning_rate": 4.779029193609582e-05, "loss": 0.8647, "step": 1264 }, { "epoch": 0.6791718398711453, "grad_norm": 0.6875, "learning_rate": 4.7786819239372046e-05, "loss": 0.8765, "step": 1265 }, { "epoch": 0.6797087346062213, "grad_norm": 0.6875, "learning_rate": 4.778334394238362e-05, "loss": 1.0059, "step": 1266 }, { "epoch": 0.6802456293412973, "grad_norm": 0.578125, "learning_rate": 4.777986604552712e-05, "loss": 0.9935, "step": 1267 }, { "epoch": 0.6807825240763733, "grad_norm": 0.671875, "learning_rate": 4.777638554919943e-05, "loss": 0.8443, "step": 1268 }, { "epoch": 0.6813194188114493, "grad_norm": 0.7421875, "learning_rate": 4.777290245379771e-05, "loss": 1.0868, "step": 1269 }, { "epoch": 0.6818563135465253, "grad_norm": 0.64453125, "learning_rate": 4.776941675971941e-05, "loss": 0.7599, "step": 1270 }, { "epoch": 0.6823932082816013, "grad_norm": 0.84375, "learning_rate": 4.776592846736232e-05, "loss": 1.1578, "step": 1271 }, { "epoch": 0.6829301030166773, "grad_norm": 0.76171875, "learning_rate": 4.776243757712448e-05, "loss": 0.928, "step": 1272 }, { "epoch": 0.6834669977517533, "grad_norm": 0.75390625, "learning_rate": 4.775894408940425e-05, "loss": 0.9463, "step": 1273 }, { "epoch": 0.6840038924868292, "grad_norm": 0.640625, "learning_rate": 4.775544800460028e-05, "loss": 0.8065, "step": 1274 }, { "epoch": 0.6845407872219053, "grad_norm": 0.73046875, "learning_rate": 4.775194932311152e-05, "loss": 0.9241, "step": 1275 }, { "epoch": 0.6850776819569813, "grad_norm": 0.8125, "learning_rate": 4.774844804533721e-05, "loss": 1.1012, "step": 1276 }, { "epoch": 0.6856145766920573, "grad_norm": 0.77734375, "learning_rate": 4.774494417167689e-05, "loss": 1.1509, "step": 1277 }, { "epoch": 0.6861514714271333, "grad_norm": 0.6484375, "learning_rate": 4.77414377025304e-05, "loss": 0.7877, "step": 1278 }, { "epoch": 0.6866883661622093, "grad_norm": 0.84375, "learning_rate": 4.773792863829788e-05, "loss": 0.9988, "step": 1279 }, { "epoch": 0.6872252608972853, "grad_norm": 0.828125, "learning_rate": 4.7734416979379736e-05, "loss": 0.9574, "step": 1280 }, { "epoch": 0.6877621556323613, "grad_norm": 0.64453125, "learning_rate": 4.773090272617672e-05, "loss": 0.9265, "step": 1281 }, { "epoch": 0.6882990503674373, "grad_norm": 0.71484375, "learning_rate": 4.772738587908982e-05, "loss": 0.821, "step": 1282 }, { "epoch": 0.6888359451025133, "grad_norm": 0.7578125, "learning_rate": 4.7723866438520394e-05, "loss": 0.8526, "step": 1283 }, { "epoch": 0.6893728398375893, "grad_norm": 0.62109375, "learning_rate": 4.7720344404870024e-05, "loss": 0.8551, "step": 1284 }, { "epoch": 0.6899097345726654, "grad_norm": 0.6796875, "learning_rate": 4.7716819778540625e-05, "loss": 1.032, "step": 1285 }, { "epoch": 0.6904466293077414, "grad_norm": 0.78515625, "learning_rate": 4.771329255993441e-05, "loss": 0.8922, "step": 1286 }, { "epoch": 0.6909835240428174, "grad_norm": 0.703125, "learning_rate": 4.770976274945387e-05, "loss": 1.0449, "step": 1287 }, { "epoch": 0.6915204187778934, "grad_norm": 0.85546875, "learning_rate": 4.77062303475018e-05, "loss": 0.8638, "step": 1288 }, { "epoch": 0.6920573135129694, "grad_norm": 0.6875, "learning_rate": 4.7702695354481296e-05, "loss": 0.8569, "step": 1289 }, { "epoch": 0.6925942082480454, "grad_norm": 0.7734375, "learning_rate": 4.7699157770795755e-05, "loss": 1.0324, "step": 1290 }, { "epoch": 0.6931311029831214, "grad_norm": 0.65625, "learning_rate": 4.769561759684884e-05, "loss": 0.8717, "step": 1291 }, { "epoch": 0.6936679977181974, "grad_norm": 0.80859375, "learning_rate": 4.7692074833044556e-05, "loss": 0.9943, "step": 1292 }, { "epoch": 0.6942048924532733, "grad_norm": 0.9375, "learning_rate": 4.7688529479787144e-05, "loss": 0.8082, "step": 1293 }, { "epoch": 0.6947417871883493, "grad_norm": 0.7578125, "learning_rate": 4.76849815374812e-05, "loss": 0.8062, "step": 1294 }, { "epoch": 0.6952786819234253, "grad_norm": 0.76171875, "learning_rate": 4.768143100653158e-05, "loss": 1.2008, "step": 1295 }, { "epoch": 0.6958155766585014, "grad_norm": 0.88671875, "learning_rate": 4.7677877887343445e-05, "loss": 0.9233, "step": 1296 }, { "epoch": 0.6963524713935774, "grad_norm": 0.68359375, "learning_rate": 4.767432218032225e-05, "loss": 0.9462, "step": 1297 }, { "epoch": 0.6968893661286534, "grad_norm": 0.7578125, "learning_rate": 4.767076388587375e-05, "loss": 0.9398, "step": 1298 }, { "epoch": 0.6974262608637294, "grad_norm": 0.73828125, "learning_rate": 4.766720300440398e-05, "loss": 0.9359, "step": 1299 }, { "epoch": 0.6979631555988054, "grad_norm": 0.69921875, "learning_rate": 4.7663639536319296e-05, "loss": 1.1696, "step": 1300 }, { "epoch": 0.6979631555988054, "eval_loss": 0.9328204393386841, "eval_runtime": 46.1153, "eval_samples_per_second": 20.861, "eval_steps_per_second": 20.861, "step": 1300 }, { "epoch": 0.6985000503338814, "grad_norm": 0.80859375, "learning_rate": 4.766007348202633e-05, "loss": 1.034, "step": 1301 }, { "epoch": 0.6990369450689574, "grad_norm": 0.71875, "learning_rate": 4.765650484193202e-05, "loss": 0.8461, "step": 1302 }, { "epoch": 0.6995738398040334, "grad_norm": 0.74609375, "learning_rate": 4.765293361644357e-05, "loss": 1.0729, "step": 1303 }, { "epoch": 0.7001107345391094, "grad_norm": 0.6328125, "learning_rate": 4.764935980596853e-05, "loss": 0.7903, "step": 1304 }, { "epoch": 0.7006476292741854, "grad_norm": 1.40625, "learning_rate": 4.76457834109147e-05, "loss": 0.8027, "step": 1305 }, { "epoch": 0.7011845240092615, "grad_norm": 0.75, "learning_rate": 4.7642204431690206e-05, "loss": 0.8505, "step": 1306 }, { "epoch": 0.7017214187443375, "grad_norm": 0.73828125, "learning_rate": 4.763862286870343e-05, "loss": 0.943, "step": 1307 }, { "epoch": 0.7022583134794135, "grad_norm": 0.7109375, "learning_rate": 4.7635038722363105e-05, "loss": 1.0102, "step": 1308 }, { "epoch": 0.7027952082144895, "grad_norm": 0.734375, "learning_rate": 4.763145199307821e-05, "loss": 0.9867, "step": 1309 }, { "epoch": 0.7033321029495655, "grad_norm": 0.6953125, "learning_rate": 4.7627862681258037e-05, "loss": 1.0336, "step": 1310 }, { "epoch": 0.7038689976846415, "grad_norm": 0.69921875, "learning_rate": 4.7624270787312166e-05, "loss": 0.8188, "step": 1311 }, { "epoch": 0.7044058924197174, "grad_norm": 0.90625, "learning_rate": 4.762067631165049e-05, "loss": 0.8758, "step": 1312 }, { "epoch": 0.7049427871547934, "grad_norm": 0.8359375, "learning_rate": 4.761707925468318e-05, "loss": 1.4241, "step": 1313 }, { "epoch": 0.7054796818898694, "grad_norm": 0.8671875, "learning_rate": 4.7613479616820705e-05, "loss": 1.0613, "step": 1314 }, { "epoch": 0.7060165766249454, "grad_norm": 0.7578125, "learning_rate": 4.7609877398473826e-05, "loss": 0.8751, "step": 1315 }, { "epoch": 0.7065534713600214, "grad_norm": 0.6796875, "learning_rate": 4.7606272600053605e-05, "loss": 1.0087, "step": 1316 }, { "epoch": 0.7070903660950975, "grad_norm": 0.8046875, "learning_rate": 4.7602665221971396e-05, "loss": 1.0294, "step": 1317 }, { "epoch": 0.7076272608301735, "grad_norm": 0.65625, "learning_rate": 4.7599055264638856e-05, "loss": 0.8163, "step": 1318 }, { "epoch": 0.7081641555652495, "grad_norm": 0.73046875, "learning_rate": 4.75954427284679e-05, "loss": 0.7875, "step": 1319 }, { "epoch": 0.7087010503003255, "grad_norm": 0.63671875, "learning_rate": 4.7591827613870785e-05, "loss": 0.7542, "step": 1320 }, { "epoch": 0.7092379450354015, "grad_norm": 0.71484375, "learning_rate": 4.758820992126005e-05, "loss": 0.9595, "step": 1321 }, { "epoch": 0.7097748397704775, "grad_norm": 0.80078125, "learning_rate": 4.758458965104849e-05, "loss": 1.2754, "step": 1322 }, { "epoch": 0.7103117345055535, "grad_norm": 0.6640625, "learning_rate": 4.758096680364925e-05, "loss": 0.9228, "step": 1323 }, { "epoch": 0.7108486292406295, "grad_norm": 0.77734375, "learning_rate": 4.757734137947573e-05, "loss": 1.001, "step": 1324 }, { "epoch": 0.7113855239757055, "grad_norm": 0.78515625, "learning_rate": 4.757371337894164e-05, "loss": 0.9495, "step": 1325 }, { "epoch": 0.7119224187107815, "grad_norm": 0.7265625, "learning_rate": 4.757008280246098e-05, "loss": 1.0422, "step": 1326 }, { "epoch": 0.7124593134458576, "grad_norm": 0.7421875, "learning_rate": 4.756644965044804e-05, "loss": 1.0319, "step": 1327 }, { "epoch": 0.7129962081809336, "grad_norm": 0.7421875, "learning_rate": 4.756281392331742e-05, "loss": 1.2311, "step": 1328 }, { "epoch": 0.7135331029160096, "grad_norm": 0.76171875, "learning_rate": 4.7559175621484e-05, "loss": 0.9433, "step": 1329 }, { "epoch": 0.7140699976510856, "grad_norm": 0.7109375, "learning_rate": 4.755553474536294e-05, "loss": 0.938, "step": 1330 }, { "epoch": 0.7146068923861615, "grad_norm": 0.73828125, "learning_rate": 4.755189129536973e-05, "loss": 0.8623, "step": 1331 }, { "epoch": 0.7151437871212375, "grad_norm": 0.66015625, "learning_rate": 4.754824527192013e-05, "loss": 0.9582, "step": 1332 }, { "epoch": 0.7156806818563135, "grad_norm": 0.6796875, "learning_rate": 4.754459667543019e-05, "loss": 1.0101, "step": 1333 }, { "epoch": 0.7162175765913895, "grad_norm": 0.7890625, "learning_rate": 4.754094550631627e-05, "loss": 0.8064, "step": 1334 }, { "epoch": 0.7167544713264655, "grad_norm": 0.67578125, "learning_rate": 4.7537291764995006e-05, "loss": 0.8832, "step": 1335 }, { "epoch": 0.7172913660615415, "grad_norm": 0.640625, "learning_rate": 4.753363545188334e-05, "loss": 0.9058, "step": 1336 }, { "epoch": 0.7178282607966175, "grad_norm": 0.72265625, "learning_rate": 4.7529976567398506e-05, "loss": 1.0384, "step": 1337 }, { "epoch": 0.7183651555316936, "grad_norm": 0.78515625, "learning_rate": 4.752631511195802e-05, "loss": 1.0729, "step": 1338 }, { "epoch": 0.7189020502667696, "grad_norm": 0.6484375, "learning_rate": 4.752265108597971e-05, "loss": 0.7538, "step": 1339 }, { "epoch": 0.7194389450018456, "grad_norm": 0.74609375, "learning_rate": 4.751898448988169e-05, "loss": 0.9167, "step": 1340 }, { "epoch": 0.7199758397369216, "grad_norm": 0.81640625, "learning_rate": 4.751531532408236e-05, "loss": 0.9067, "step": 1341 }, { "epoch": 0.7205127344719976, "grad_norm": 0.828125, "learning_rate": 4.751164358900041e-05, "loss": 0.9941, "step": 1342 }, { "epoch": 0.7210496292070736, "grad_norm": 0.66796875, "learning_rate": 4.7507969285054845e-05, "loss": 0.8699, "step": 1343 }, { "epoch": 0.7215865239421496, "grad_norm": 0.6640625, "learning_rate": 4.750429241266494e-05, "loss": 0.9196, "step": 1344 }, { "epoch": 0.7221234186772256, "grad_norm": 0.67578125, "learning_rate": 4.750061297225028e-05, "loss": 0.896, "step": 1345 }, { "epoch": 0.7226603134123016, "grad_norm": 0.8515625, "learning_rate": 4.749693096423073e-05, "loss": 0.7731, "step": 1346 }, { "epoch": 0.7231972081473776, "grad_norm": 0.82421875, "learning_rate": 4.749324638902646e-05, "loss": 0.8657, "step": 1347 }, { "epoch": 0.7237341028824537, "grad_norm": 0.77734375, "learning_rate": 4.748955924705792e-05, "loss": 1.0607, "step": 1348 }, { "epoch": 0.7242709976175297, "grad_norm": 0.7734375, "learning_rate": 4.748586953874587e-05, "loss": 0.9679, "step": 1349 }, { "epoch": 0.7248078923526056, "grad_norm": 0.7265625, "learning_rate": 4.7482177264511335e-05, "loss": 0.8223, "step": 1350 }, { "epoch": 0.7248078923526056, "eval_loss": 0.9316281676292419, "eval_runtime": 45.924, "eval_samples_per_second": 20.948, "eval_steps_per_second": 20.948, "step": 1350 }, { "epoch": 0.7253447870876816, "grad_norm": 1.3515625, "learning_rate": 4.747848242477566e-05, "loss": 1.1274, "step": 1351 }, { "epoch": 0.7258816818227576, "grad_norm": 0.69140625, "learning_rate": 4.747478501996048e-05, "loss": 0.7566, "step": 1352 }, { "epoch": 0.7264185765578336, "grad_norm": 0.59765625, "learning_rate": 4.747108505048771e-05, "loss": 0.7889, "step": 1353 }, { "epoch": 0.7269554712929096, "grad_norm": 0.83984375, "learning_rate": 4.7467382516779554e-05, "loss": 0.8987, "step": 1354 }, { "epoch": 0.7274923660279856, "grad_norm": 0.71484375, "learning_rate": 4.746367741925854e-05, "loss": 0.9739, "step": 1355 }, { "epoch": 0.7280292607630616, "grad_norm": 0.78125, "learning_rate": 4.7459969758347444e-05, "loss": 0.8851, "step": 1356 }, { "epoch": 0.7285661554981376, "grad_norm": 0.66015625, "learning_rate": 4.745625953446936e-05, "loss": 0.9751, "step": 1357 }, { "epoch": 0.7291030502332136, "grad_norm": 0.90234375, "learning_rate": 4.745254674804769e-05, "loss": 0.8199, "step": 1358 }, { "epoch": 0.7296399449682897, "grad_norm": 0.80078125, "learning_rate": 4.74488313995061e-05, "loss": 1.0326, "step": 1359 }, { "epoch": 0.7301768397033657, "grad_norm": 0.76171875, "learning_rate": 4.7445113489268544e-05, "loss": 0.9174, "step": 1360 }, { "epoch": 0.7307137344384417, "grad_norm": 0.70703125, "learning_rate": 4.74413930177593e-05, "loss": 0.8355, "step": 1361 }, { "epoch": 0.7312506291735177, "grad_norm": 0.94921875, "learning_rate": 4.743766998540291e-05, "loss": 1.1939, "step": 1362 }, { "epoch": 0.7317875239085937, "grad_norm": 0.75, "learning_rate": 4.743394439262423e-05, "loss": 0.725, "step": 1363 }, { "epoch": 0.7323244186436697, "grad_norm": 0.70703125, "learning_rate": 4.743021623984839e-05, "loss": 1.0349, "step": 1364 }, { "epoch": 0.7328613133787457, "grad_norm": 0.8671875, "learning_rate": 4.742648552750082e-05, "loss": 0.8164, "step": 1365 }, { "epoch": 0.7333982081138217, "grad_norm": 0.65625, "learning_rate": 4.742275225600725e-05, "loss": 0.8941, "step": 1366 }, { "epoch": 0.7339351028488977, "grad_norm": 0.71875, "learning_rate": 4.741901642579368e-05, "loss": 0.9404, "step": 1367 }, { "epoch": 0.7344719975839737, "grad_norm": 0.73046875, "learning_rate": 4.7415278037286424e-05, "loss": 1.0523, "step": 1368 }, { "epoch": 0.7350088923190496, "grad_norm": 0.703125, "learning_rate": 4.741153709091208e-05, "loss": 1.0108, "step": 1369 }, { "epoch": 0.7355457870541257, "grad_norm": 0.94921875, "learning_rate": 4.740779358709753e-05, "loss": 1.1026, "step": 1370 }, { "epoch": 0.7360826817892017, "grad_norm": 0.66796875, "learning_rate": 4.740404752626996e-05, "loss": 0.7894, "step": 1371 }, { "epoch": 0.7366195765242777, "grad_norm": 0.7734375, "learning_rate": 4.740029890885684e-05, "loss": 1.0289, "step": 1372 }, { "epoch": 0.7371564712593537, "grad_norm": 0.80859375, "learning_rate": 4.7396547735285945e-05, "loss": 1.0261, "step": 1373 }, { "epoch": 0.7376933659944297, "grad_norm": 0.65625, "learning_rate": 4.7392794005985326e-05, "loss": 0.8073, "step": 1374 }, { "epoch": 0.7382302607295057, "grad_norm": 0.84375, "learning_rate": 4.7389037721383325e-05, "loss": 0.8464, "step": 1375 }, { "epoch": 0.7387671554645817, "grad_norm": 0.671875, "learning_rate": 4.738527888190858e-05, "loss": 0.9254, "step": 1376 }, { "epoch": 0.7393040501996577, "grad_norm": 0.66015625, "learning_rate": 4.738151748799003e-05, "loss": 0.8076, "step": 1377 }, { "epoch": 0.7398409449347337, "grad_norm": 0.69921875, "learning_rate": 4.73777535400569e-05, "loss": 0.9767, "step": 1378 }, { "epoch": 0.7403778396698097, "grad_norm": 0.59765625, "learning_rate": 4.73739870385387e-05, "loss": 0.9295, "step": 1379 }, { "epoch": 0.7409147344048858, "grad_norm": 0.8046875, "learning_rate": 4.737021798386524e-05, "loss": 0.938, "step": 1380 }, { "epoch": 0.7414516291399618, "grad_norm": 0.66015625, "learning_rate": 4.73664463764666e-05, "loss": 0.9509, "step": 1381 }, { "epoch": 0.7419885238750378, "grad_norm": 0.59375, "learning_rate": 4.736267221677318e-05, "loss": 0.8211, "step": 1382 }, { "epoch": 0.7425254186101138, "grad_norm": 0.6328125, "learning_rate": 4.735889550521567e-05, "loss": 0.9719, "step": 1383 }, { "epoch": 0.7430623133451898, "grad_norm": 0.828125, "learning_rate": 4.735511624222503e-05, "loss": 0.9421, "step": 1384 }, { "epoch": 0.7435992080802658, "grad_norm": 0.6640625, "learning_rate": 4.735133442823252e-05, "loss": 0.801, "step": 1385 }, { "epoch": 0.7441361028153418, "grad_norm": 0.8359375, "learning_rate": 4.7347550063669695e-05, "loss": 1.1049, "step": 1386 }, { "epoch": 0.7446729975504178, "grad_norm": 0.671875, "learning_rate": 4.7343763148968394e-05, "loss": 0.876, "step": 1387 }, { "epoch": 0.7452098922854937, "grad_norm": 0.76171875, "learning_rate": 4.733997368456076e-05, "loss": 0.9858, "step": 1388 }, { "epoch": 0.7457467870205697, "grad_norm": 0.62109375, "learning_rate": 4.733618167087922e-05, "loss": 0.8927, "step": 1389 }, { "epoch": 0.7462836817556457, "grad_norm": 0.67578125, "learning_rate": 4.733238710835648e-05, "loss": 1.0067, "step": 1390 }, { "epoch": 0.7468205764907218, "grad_norm": 0.7421875, "learning_rate": 4.7328589997425554e-05, "loss": 0.9566, "step": 1391 }, { "epoch": 0.7473574712257978, "grad_norm": 0.68359375, "learning_rate": 4.7324790338519744e-05, "loss": 0.8136, "step": 1392 }, { "epoch": 0.7478943659608738, "grad_norm": 0.75, "learning_rate": 4.7320988132072633e-05, "loss": 0.887, "step": 1393 }, { "epoch": 0.7484312606959498, "grad_norm": 0.671875, "learning_rate": 4.731718337851811e-05, "loss": 0.8988, "step": 1394 }, { "epoch": 0.7489681554310258, "grad_norm": 0.76171875, "learning_rate": 4.7313376078290335e-05, "loss": 1.0616, "step": 1395 }, { "epoch": 0.7495050501661018, "grad_norm": 0.7421875, "learning_rate": 4.730956623182377e-05, "loss": 1.0494, "step": 1396 }, { "epoch": 0.7500419449011778, "grad_norm": 0.59765625, "learning_rate": 4.7305753839553174e-05, "loss": 0.802, "step": 1397 }, { "epoch": 0.7505788396362538, "grad_norm": 0.75, "learning_rate": 4.730193890191359e-05, "loss": 0.9595, "step": 1398 }, { "epoch": 0.7511157343713298, "grad_norm": 0.67578125, "learning_rate": 4.7298121419340335e-05, "loss": 0.9304, "step": 1399 }, { "epoch": 0.7516526291064058, "grad_norm": 0.70703125, "learning_rate": 4.729430139226905e-05, "loss": 0.9384, "step": 1400 }, { "epoch": 0.7516526291064058, "eval_loss": 0.9312136173248291, "eval_runtime": 46.4614, "eval_samples_per_second": 20.705, "eval_steps_per_second": 20.705, "step": 1400 }, { "epoch": 0.7521895238414819, "grad_norm": 0.66015625, "learning_rate": 4.7290478821135644e-05, "loss": 0.8603, "step": 1401 }, { "epoch": 0.7527264185765579, "grad_norm": 0.6875, "learning_rate": 4.728665370637631e-05, "loss": 0.871, "step": 1402 }, { "epoch": 0.7532633133116339, "grad_norm": 0.7734375, "learning_rate": 4.7282826048427555e-05, "loss": 1.0149, "step": 1403 }, { "epoch": 0.7538002080467099, "grad_norm": 0.734375, "learning_rate": 4.727899584772616e-05, "loss": 0.722, "step": 1404 }, { "epoch": 0.7543371027817859, "grad_norm": 0.828125, "learning_rate": 4.72751631047092e-05, "loss": 0.9323, "step": 1405 }, { "epoch": 0.7548739975168619, "grad_norm": 0.66796875, "learning_rate": 4.7271327819814035e-05, "loss": 0.9867, "step": 1406 }, { "epoch": 0.7554108922519378, "grad_norm": 0.70703125, "learning_rate": 4.726748999347832e-05, "loss": 0.7637, "step": 1407 }, { "epoch": 0.7559477869870138, "grad_norm": 0.6484375, "learning_rate": 4.726364962614e-05, "loss": 1.0353, "step": 1408 }, { "epoch": 0.7564846817220898, "grad_norm": 0.7578125, "learning_rate": 4.725980671823732e-05, "loss": 0.9195, "step": 1409 }, { "epoch": 0.7570215764571658, "grad_norm": 0.94921875, "learning_rate": 4.725596127020879e-05, "loss": 1.3082, "step": 1410 }, { "epoch": 0.7575584711922418, "grad_norm": 0.7890625, "learning_rate": 4.725211328249323e-05, "loss": 1.0194, "step": 1411 }, { "epoch": 0.7580953659273179, "grad_norm": 0.78125, "learning_rate": 4.724826275552975e-05, "loss": 0.9548, "step": 1412 }, { "epoch": 0.7586322606623939, "grad_norm": 0.76171875, "learning_rate": 4.724440968975773e-05, "loss": 1.0079, "step": 1413 }, { "epoch": 0.7591691553974699, "grad_norm": 0.7734375, "learning_rate": 4.724055408561687e-05, "loss": 1.1499, "step": 1414 }, { "epoch": 0.7597060501325459, "grad_norm": 0.69140625, "learning_rate": 4.723669594354713e-05, "loss": 0.826, "step": 1415 }, { "epoch": 0.7602429448676219, "grad_norm": 0.73828125, "learning_rate": 4.723283526398877e-05, "loss": 1.047, "step": 1416 }, { "epoch": 0.7607798396026979, "grad_norm": 0.65625, "learning_rate": 4.722897204738237e-05, "loss": 1.0241, "step": 1417 }, { "epoch": 0.7613167343377739, "grad_norm": 0.7109375, "learning_rate": 4.7225106294168735e-05, "loss": 0.8048, "step": 1418 }, { "epoch": 0.7618536290728499, "grad_norm": 0.84375, "learning_rate": 4.722123800478902e-05, "loss": 0.8397, "step": 1419 }, { "epoch": 0.7623905238079259, "grad_norm": 0.6796875, "learning_rate": 4.721736717968464e-05, "loss": 0.854, "step": 1420 }, { "epoch": 0.762927418543002, "grad_norm": 0.69921875, "learning_rate": 4.7213493819297306e-05, "loss": 0.9567, "step": 1421 }, { "epoch": 0.763464313278078, "grad_norm": 0.765625, "learning_rate": 4.720961792406901e-05, "loss": 1.0173, "step": 1422 }, { "epoch": 0.764001208013154, "grad_norm": 1.0625, "learning_rate": 4.720573949444205e-05, "loss": 0.9527, "step": 1423 }, { "epoch": 0.76453810274823, "grad_norm": 0.6953125, "learning_rate": 4.7201858530859e-05, "loss": 0.9913, "step": 1424 }, { "epoch": 0.7650749974833059, "grad_norm": 1.09375, "learning_rate": 4.719797503376273e-05, "loss": 0.9947, "step": 1425 }, { "epoch": 0.7656118922183819, "grad_norm": 0.80859375, "learning_rate": 4.71940890035964e-05, "loss": 0.9944, "step": 1426 }, { "epoch": 0.7661487869534579, "grad_norm": 0.76953125, "learning_rate": 4.7190200440803434e-05, "loss": 0.9377, "step": 1427 }, { "epoch": 0.7666856816885339, "grad_norm": 0.7734375, "learning_rate": 4.71863093458276e-05, "loss": 1.0204, "step": 1428 }, { "epoch": 0.7672225764236099, "grad_norm": 0.6171875, "learning_rate": 4.7182415719112885e-05, "loss": 0.9268, "step": 1429 }, { "epoch": 0.7677594711586859, "grad_norm": 0.6640625, "learning_rate": 4.717851956110363e-05, "loss": 0.7247, "step": 1430 }, { "epoch": 0.7682963658937619, "grad_norm": 0.87890625, "learning_rate": 4.717462087224442e-05, "loss": 0.9339, "step": 1431 }, { "epoch": 0.768833260628838, "grad_norm": 0.67578125, "learning_rate": 4.717071965298015e-05, "loss": 0.8147, "step": 1432 }, { "epoch": 0.769370155363914, "grad_norm": 0.7578125, "learning_rate": 4.7166815903756004e-05, "loss": 0.842, "step": 1433 }, { "epoch": 0.76990705009899, "grad_norm": 0.71484375, "learning_rate": 4.716290962501744e-05, "loss": 0.8458, "step": 1434 }, { "epoch": 0.770443944834066, "grad_norm": 0.91015625, "learning_rate": 4.7159000817210205e-05, "loss": 0.9644, "step": 1435 }, { "epoch": 0.770980839569142, "grad_norm": 0.69921875, "learning_rate": 4.715508948078037e-05, "loss": 0.9705, "step": 1436 }, { "epoch": 0.771517734304218, "grad_norm": 0.671875, "learning_rate": 4.715117561617425e-05, "loss": 0.8341, "step": 1437 }, { "epoch": 0.772054629039294, "grad_norm": 0.80859375, "learning_rate": 4.714725922383847e-05, "loss": 0.8103, "step": 1438 }, { "epoch": 0.77259152377437, "grad_norm": 0.8125, "learning_rate": 4.714334030421994e-05, "loss": 0.9949, "step": 1439 }, { "epoch": 0.773128418509446, "grad_norm": 0.828125, "learning_rate": 4.713941885776586e-05, "loss": 0.8243, "step": 1440 }, { "epoch": 0.773665313244522, "grad_norm": 0.97265625, "learning_rate": 4.713549488492371e-05, "loss": 1.1497, "step": 1441 }, { "epoch": 0.774202207979598, "grad_norm": 0.97265625, "learning_rate": 4.713156838614128e-05, "loss": 0.8016, "step": 1442 }, { "epoch": 0.774739102714674, "grad_norm": 0.71484375, "learning_rate": 4.7127639361866616e-05, "loss": 0.9364, "step": 1443 }, { "epoch": 0.77527599744975, "grad_norm": 0.8125, "learning_rate": 4.712370781254808e-05, "loss": 1.0553, "step": 1444 }, { "epoch": 0.775812892184826, "grad_norm": 0.62109375, "learning_rate": 4.711977373863431e-05, "loss": 1.0205, "step": 1445 }, { "epoch": 0.776349786919902, "grad_norm": 0.734375, "learning_rate": 4.7115837140574227e-05, "loss": 0.9484, "step": 1446 }, { "epoch": 0.776886681654978, "grad_norm": 1.0234375, "learning_rate": 4.711189801881706e-05, "loss": 1.2984, "step": 1447 }, { "epoch": 0.777423576390054, "grad_norm": 0.74609375, "learning_rate": 4.710795637381229e-05, "loss": 0.8438, "step": 1448 }, { "epoch": 0.77796047112513, "grad_norm": 0.73828125, "learning_rate": 4.710401220600974e-05, "loss": 0.9522, "step": 1449 }, { "epoch": 0.778497365860206, "grad_norm": 0.73046875, "learning_rate": 4.710006551585946e-05, "loss": 0.9466, "step": 1450 }, { "epoch": 0.778497365860206, "eval_loss": 0.9296481609344482, "eval_runtime": 46.0144, "eval_samples_per_second": 20.906, "eval_steps_per_second": 20.906, "step": 1450 }, { "epoch": 0.779034260595282, "grad_norm": 0.76953125, "learning_rate": 4.709611630381184e-05, "loss": 1.1011, "step": 1451 }, { "epoch": 0.779571155330358, "grad_norm": 0.7421875, "learning_rate": 4.709216457031752e-05, "loss": 1.0408, "step": 1452 }, { "epoch": 0.780108050065434, "grad_norm": 0.83984375, "learning_rate": 4.7088210315827444e-05, "loss": 0.8894, "step": 1453 }, { "epoch": 0.78064494480051, "grad_norm": 0.75, "learning_rate": 4.708425354079286e-05, "loss": 1.0962, "step": 1454 }, { "epoch": 0.7811818395355861, "grad_norm": 0.9296875, "learning_rate": 4.708029424566526e-05, "loss": 1.088, "step": 1455 }, { "epoch": 0.7817187342706621, "grad_norm": 0.77734375, "learning_rate": 4.7076332430896464e-05, "loss": 0.989, "step": 1456 }, { "epoch": 0.7822556290057381, "grad_norm": 1.03125, "learning_rate": 4.707236809693856e-05, "loss": 1.1858, "step": 1457 }, { "epoch": 0.7827925237408141, "grad_norm": 0.5625, "learning_rate": 4.706840124424394e-05, "loss": 0.8159, "step": 1458 }, { "epoch": 0.7833294184758901, "grad_norm": 0.7265625, "learning_rate": 4.706443187326526e-05, "loss": 0.8357, "step": 1459 }, { "epoch": 0.7838663132109661, "grad_norm": 0.9609375, "learning_rate": 4.706045998445548e-05, "loss": 1.0639, "step": 1460 }, { "epoch": 0.7844032079460421, "grad_norm": 0.8046875, "learning_rate": 4.7056485578267846e-05, "loss": 1.1264, "step": 1461 }, { "epoch": 0.7849401026811181, "grad_norm": 0.7265625, "learning_rate": 4.705250865515588e-05, "loss": 1.141, "step": 1462 }, { "epoch": 0.785476997416194, "grad_norm": 0.81640625, "learning_rate": 4.70485292155734e-05, "loss": 1.1089, "step": 1463 }, { "epoch": 0.78601389215127, "grad_norm": 0.78515625, "learning_rate": 4.7044547259974525e-05, "loss": 1.002, "step": 1464 }, { "epoch": 0.786550786886346, "grad_norm": 0.6875, "learning_rate": 4.7040562788813636e-05, "loss": 1.015, "step": 1465 }, { "epoch": 0.7870876816214221, "grad_norm": 0.85546875, "learning_rate": 4.703657580254541e-05, "loss": 0.9949, "step": 1466 }, { "epoch": 0.7876245763564981, "grad_norm": 0.765625, "learning_rate": 4.70325863016248e-05, "loss": 1.0667, "step": 1467 }, { "epoch": 0.7881614710915741, "grad_norm": 0.81640625, "learning_rate": 4.7028594286507095e-05, "loss": 0.9384, "step": 1468 }, { "epoch": 0.7886983658266501, "grad_norm": 0.75390625, "learning_rate": 4.7024599757647804e-05, "loss": 1.0406, "step": 1469 }, { "epoch": 0.7892352605617261, "grad_norm": 0.6953125, "learning_rate": 4.702060271550276e-05, "loss": 0.8912, "step": 1470 }, { "epoch": 0.7897721552968021, "grad_norm": 1.09375, "learning_rate": 4.701660316052807e-05, "loss": 1.0978, "step": 1471 }, { "epoch": 0.7903090500318781, "grad_norm": 0.86328125, "learning_rate": 4.701260109318015e-05, "loss": 0.9843, "step": 1472 }, { "epoch": 0.7908459447669541, "grad_norm": 0.87109375, "learning_rate": 4.7008596513915684e-05, "loss": 0.9987, "step": 1473 }, { "epoch": 0.7913828395020301, "grad_norm": 0.64453125, "learning_rate": 4.7004589423191623e-05, "loss": 0.8369, "step": 1474 }, { "epoch": 0.7919197342371062, "grad_norm": 0.75, "learning_rate": 4.700057982146526e-05, "loss": 1.1304, "step": 1475 }, { "epoch": 0.7924566289721822, "grad_norm": 0.62109375, "learning_rate": 4.699656770919412e-05, "loss": 0.8327, "step": 1476 }, { "epoch": 0.7929935237072582, "grad_norm": 0.70703125, "learning_rate": 4.6992553086836035e-05, "loss": 0.9318, "step": 1477 }, { "epoch": 0.7935304184423342, "grad_norm": 0.7265625, "learning_rate": 4.698853595484913e-05, "loss": 0.977, "step": 1478 }, { "epoch": 0.7940673131774102, "grad_norm": 0.703125, "learning_rate": 4.6984516313691816e-05, "loss": 0.8479, "step": 1479 }, { "epoch": 0.7946042079124862, "grad_norm": 0.69921875, "learning_rate": 4.698049416382278e-05, "loss": 1.0746, "step": 1480 }, { "epoch": 0.7951411026475622, "grad_norm": 0.671875, "learning_rate": 4.697646950570099e-05, "loss": 0.8582, "step": 1481 }, { "epoch": 0.7956779973826381, "grad_norm": 0.95703125, "learning_rate": 4.697244233978573e-05, "loss": 0.9033, "step": 1482 }, { "epoch": 0.7962148921177141, "grad_norm": 0.703125, "learning_rate": 4.696841266653653e-05, "loss": 0.9216, "step": 1483 }, { "epoch": 0.7967517868527901, "grad_norm": 0.73046875, "learning_rate": 4.696438048641324e-05, "loss": 1.0413, "step": 1484 }, { "epoch": 0.7972886815878661, "grad_norm": 0.640625, "learning_rate": 4.6960345799875995e-05, "loss": 1.1256, "step": 1485 }, { "epoch": 0.7978255763229422, "grad_norm": 0.69921875, "learning_rate": 4.6956308607385166e-05, "loss": 0.9003, "step": 1486 }, { "epoch": 0.7983624710580182, "grad_norm": 0.6953125, "learning_rate": 4.695226890940149e-05, "loss": 0.9033, "step": 1487 }, { "epoch": 0.7988993657930942, "grad_norm": 0.6953125, "learning_rate": 4.6948226706385915e-05, "loss": 1.1069, "step": 1488 }, { "epoch": 0.7994362605281702, "grad_norm": 0.65625, "learning_rate": 4.694418199879973e-05, "loss": 0.895, "step": 1489 }, { "epoch": 0.7999731552632462, "grad_norm": 0.66015625, "learning_rate": 4.694013478710447e-05, "loss": 0.8756, "step": 1490 }, { "epoch": 0.8005100499983222, "grad_norm": 0.56640625, "learning_rate": 4.693608507176199e-05, "loss": 0.8967, "step": 1491 }, { "epoch": 0.8010469447333982, "grad_norm": 0.73828125, "learning_rate": 4.6932032853234406e-05, "loss": 0.973, "step": 1492 }, { "epoch": 0.8015838394684742, "grad_norm": 0.76171875, "learning_rate": 4.692797813198412e-05, "loss": 0.8524, "step": 1493 }, { "epoch": 0.8021207342035502, "grad_norm": 0.88671875, "learning_rate": 4.692392090847383e-05, "loss": 1.0615, "step": 1494 }, { "epoch": 0.8026576289386262, "grad_norm": 0.64453125, "learning_rate": 4.6919861183166536e-05, "loss": 0.7777, "step": 1495 }, { "epoch": 0.8031945236737023, "grad_norm": 0.94921875, "learning_rate": 4.691579895652548e-05, "loss": 1.1297, "step": 1496 }, { "epoch": 0.8037314184087783, "grad_norm": 0.703125, "learning_rate": 4.6911734229014215e-05, "loss": 0.8149, "step": 1497 }, { "epoch": 0.8042683131438543, "grad_norm": 0.70703125, "learning_rate": 4.690766700109659e-05, "loss": 0.966, "step": 1498 }, { "epoch": 0.8048052078789303, "grad_norm": 0.94921875, "learning_rate": 4.690359727323672e-05, "loss": 0.9832, "step": 1499 }, { "epoch": 0.8053421026140063, "grad_norm": 0.65625, "learning_rate": 4.6899525045899024e-05, "loss": 0.8791, "step": 1500 }, { "epoch": 0.8053421026140063, "eval_loss": 0.928687572479248, "eval_runtime": 45.9461, "eval_samples_per_second": 20.938, "eval_steps_per_second": 20.938, "step": 1500 }, { "epoch": 0.8058789973490822, "grad_norm": 0.74609375, "learning_rate": 4.6895450319548175e-05, "loss": 0.9375, "step": 1501 }, { "epoch": 0.8064158920841582, "grad_norm": 0.73828125, "learning_rate": 4.6891373094649165e-05, "loss": 0.9404, "step": 1502 }, { "epoch": 0.8069527868192342, "grad_norm": 0.71875, "learning_rate": 4.688729337166725e-05, "loss": 0.8475, "step": 1503 }, { "epoch": 0.8074896815543102, "grad_norm": 0.6015625, "learning_rate": 4.688321115106798e-05, "loss": 0.8304, "step": 1504 }, { "epoch": 0.8080265762893862, "grad_norm": 0.7421875, "learning_rate": 4.68791264333172e-05, "loss": 1.1578, "step": 1505 }, { "epoch": 0.8085634710244622, "grad_norm": 0.59765625, "learning_rate": 4.687503921888101e-05, "loss": 0.7527, "step": 1506 }, { "epoch": 0.8091003657595383, "grad_norm": 0.734375, "learning_rate": 4.6870949508225826e-05, "loss": 1.0027, "step": 1507 }, { "epoch": 0.8096372604946143, "grad_norm": 0.71875, "learning_rate": 4.686685730181832e-05, "loss": 0.8635, "step": 1508 }, { "epoch": 0.8101741552296903, "grad_norm": 0.7734375, "learning_rate": 4.6862762600125476e-05, "loss": 0.9531, "step": 1509 }, { "epoch": 0.8107110499647663, "grad_norm": 0.6875, "learning_rate": 4.685866540361456e-05, "loss": 0.8614, "step": 1510 }, { "epoch": 0.8112479446998423, "grad_norm": 0.6328125, "learning_rate": 4.685456571275309e-05, "loss": 0.8557, "step": 1511 }, { "epoch": 0.8117848394349183, "grad_norm": 0.79296875, "learning_rate": 4.685046352800891e-05, "loss": 0.941, "step": 1512 }, { "epoch": 0.8123217341699943, "grad_norm": 0.6796875, "learning_rate": 4.6846358849850136e-05, "loss": 0.8133, "step": 1513 }, { "epoch": 0.8128586289050703, "grad_norm": 0.671875, "learning_rate": 4.6842251678745156e-05, "loss": 0.8278, "step": 1514 }, { "epoch": 0.8133955236401463, "grad_norm": 0.71875, "learning_rate": 4.683814201516264e-05, "loss": 1.0006, "step": 1515 }, { "epoch": 0.8139324183752223, "grad_norm": 0.65625, "learning_rate": 4.683402985957157e-05, "loss": 0.7521, "step": 1516 }, { "epoch": 0.8144693131102984, "grad_norm": 0.7578125, "learning_rate": 4.682991521244118e-05, "loss": 0.9986, "step": 1517 }, { "epoch": 0.8150062078453744, "grad_norm": 0.76171875, "learning_rate": 4.682579807424101e-05, "loss": 0.9481, "step": 1518 }, { "epoch": 0.8155431025804504, "grad_norm": 0.91015625, "learning_rate": 4.6821678445440884e-05, "loss": 0.9817, "step": 1519 }, { "epoch": 0.8160799973155263, "grad_norm": 0.67578125, "learning_rate": 4.68175563265109e-05, "loss": 0.8421, "step": 1520 }, { "epoch": 0.8166168920506023, "grad_norm": 0.734375, "learning_rate": 4.681343171792144e-05, "loss": 0.7721, "step": 1521 }, { "epoch": 0.8171537867856783, "grad_norm": 0.7578125, "learning_rate": 4.6809304620143175e-05, "loss": 1.0361, "step": 1522 }, { "epoch": 0.8176906815207543, "grad_norm": 0.6796875, "learning_rate": 4.680517503364706e-05, "loss": 0.947, "step": 1523 }, { "epoch": 0.8182275762558303, "grad_norm": 1.0859375, "learning_rate": 4.680104295890433e-05, "loss": 0.9522, "step": 1524 }, { "epoch": 0.8187644709909063, "grad_norm": 0.68359375, "learning_rate": 4.679690839638652e-05, "loss": 1.1382, "step": 1525 }, { "epoch": 0.8193013657259823, "grad_norm": 0.6484375, "learning_rate": 4.6792771346565425e-05, "loss": 0.7148, "step": 1526 }, { "epoch": 0.8198382604610583, "grad_norm": 0.92578125, "learning_rate": 4.678863180991313e-05, "loss": 1.0069, "step": 1527 }, { "epoch": 0.8203751551961344, "grad_norm": 0.81640625, "learning_rate": 4.678448978690201e-05, "loss": 0.8415, "step": 1528 }, { "epoch": 0.8209120499312104, "grad_norm": 0.73828125, "learning_rate": 4.678034527800474e-05, "loss": 1.0156, "step": 1529 }, { "epoch": 0.8214489446662864, "grad_norm": 0.90234375, "learning_rate": 4.677619828369424e-05, "loss": 1.2303, "step": 1530 }, { "epoch": 0.8219858394013624, "grad_norm": 0.8359375, "learning_rate": 4.677204880444375e-05, "loss": 1.1202, "step": 1531 }, { "epoch": 0.8225227341364384, "grad_norm": 0.63671875, "learning_rate": 4.676789684072676e-05, "loss": 0.9635, "step": 1532 }, { "epoch": 0.8230596288715144, "grad_norm": 0.8359375, "learning_rate": 4.6763742393017076e-05, "loss": 1.0879, "step": 1533 }, { "epoch": 0.8235965236065904, "grad_norm": 0.73828125, "learning_rate": 4.6759585461788774e-05, "loss": 0.8324, "step": 1534 }, { "epoch": 0.8241334183416664, "grad_norm": 0.6875, "learning_rate": 4.6755426047516205e-05, "loss": 1.0856, "step": 1535 }, { "epoch": 0.8246703130767424, "grad_norm": 0.68359375, "learning_rate": 4.675126415067401e-05, "loss": 0.967, "step": 1536 }, { "epoch": 0.8252072078118184, "grad_norm": 0.63671875, "learning_rate": 4.674709977173712e-05, "loss": 0.7723, "step": 1537 }, { "epoch": 0.8257441025468945, "grad_norm": 0.73046875, "learning_rate": 4.674293291118075e-05, "loss": 0.8389, "step": 1538 }, { "epoch": 0.8262809972819704, "grad_norm": 0.859375, "learning_rate": 4.6738763569480375e-05, "loss": 1.1869, "step": 1539 }, { "epoch": 0.8268178920170464, "grad_norm": 0.6796875, "learning_rate": 4.673459174711178e-05, "loss": 0.9137, "step": 1540 }, { "epoch": 0.8273547867521224, "grad_norm": 0.82421875, "learning_rate": 4.673041744455102e-05, "loss": 1.1577, "step": 1541 }, { "epoch": 0.8278916814871984, "grad_norm": 0.703125, "learning_rate": 4.672624066227444e-05, "loss": 0.8871, "step": 1542 }, { "epoch": 0.8284285762222744, "grad_norm": 0.6328125, "learning_rate": 4.6722061400758654e-05, "loss": 0.7516, "step": 1543 }, { "epoch": 0.8289654709573504, "grad_norm": 0.6171875, "learning_rate": 4.671787966048057e-05, "loss": 0.763, "step": 1544 }, { "epoch": 0.8295023656924264, "grad_norm": 0.78125, "learning_rate": 4.6713695441917396e-05, "loss": 0.9525, "step": 1545 }, { "epoch": 0.8300392604275024, "grad_norm": 0.70703125, "learning_rate": 4.6709508745546575e-05, "loss": 0.9975, "step": 1546 }, { "epoch": 0.8305761551625784, "grad_norm": 0.765625, "learning_rate": 4.670531957184588e-05, "loss": 0.7822, "step": 1547 }, { "epoch": 0.8311130498976544, "grad_norm": 0.8671875, "learning_rate": 4.6701127921293355e-05, "loss": 1.0512, "step": 1548 }, { "epoch": 0.8316499446327305, "grad_norm": 0.7578125, "learning_rate": 4.669693379436731e-05, "loss": 0.8699, "step": 1549 }, { "epoch": 0.8321868393678065, "grad_norm": 0.76171875, "learning_rate": 4.6692737191546346e-05, "loss": 1.1393, "step": 1550 }, { "epoch": 0.8321868393678065, "eval_loss": 0.9287117719650269, "eval_runtime": 46.0676, "eval_samples_per_second": 20.882, "eval_steps_per_second": 20.882, "step": 1550 }, { "epoch": 0.8327237341028825, "grad_norm": 0.7109375, "learning_rate": 4.6688538113309355e-05, "loss": 0.7913, "step": 1551 }, { "epoch": 0.8332606288379585, "grad_norm": 0.6796875, "learning_rate": 4.6684336560135497e-05, "loss": 0.8743, "step": 1552 }, { "epoch": 0.8337975235730345, "grad_norm": 0.66796875, "learning_rate": 4.668013253250423e-05, "loss": 0.913, "step": 1553 }, { "epoch": 0.8343344183081105, "grad_norm": 0.64453125, "learning_rate": 4.667592603089529e-05, "loss": 0.7931, "step": 1554 }, { "epoch": 0.8348713130431865, "grad_norm": 0.71875, "learning_rate": 4.6671717055788675e-05, "loss": 0.9818, "step": 1555 }, { "epoch": 0.8354082077782625, "grad_norm": 0.9921875, "learning_rate": 4.666750560766471e-05, "loss": 0.995, "step": 1556 }, { "epoch": 0.8359451025133385, "grad_norm": 0.74609375, "learning_rate": 4.666329168700394e-05, "loss": 0.8833, "step": 1557 }, { "epoch": 0.8364819972484144, "grad_norm": 0.765625, "learning_rate": 4.665907529428726e-05, "loss": 0.7022, "step": 1558 }, { "epoch": 0.8370188919834904, "grad_norm": 0.7109375, "learning_rate": 4.6654856429995794e-05, "loss": 0.8494, "step": 1559 }, { "epoch": 0.8375557867185665, "grad_norm": 0.64453125, "learning_rate": 4.665063509461097e-05, "loss": 0.8747, "step": 1560 }, { "epoch": 0.8380926814536425, "grad_norm": 0.8515625, "learning_rate": 4.66464112886145e-05, "loss": 1.0861, "step": 1561 }, { "epoch": 0.8386295761887185, "grad_norm": 0.60546875, "learning_rate": 4.6642185012488374e-05, "loss": 0.8035, "step": 1562 }, { "epoch": 0.8391664709237945, "grad_norm": 0.75, "learning_rate": 4.6637956266714855e-05, "loss": 0.9639, "step": 1563 }, { "epoch": 0.8397033656588705, "grad_norm": 0.69140625, "learning_rate": 4.663372505177651e-05, "loss": 0.9753, "step": 1564 }, { "epoch": 0.8402402603939465, "grad_norm": 0.7421875, "learning_rate": 4.662949136815617e-05, "loss": 0.9643, "step": 1565 }, { "epoch": 0.8407771551290225, "grad_norm": 0.77734375, "learning_rate": 4.662525521633695e-05, "loss": 1.007, "step": 1566 }, { "epoch": 0.8413140498640985, "grad_norm": 0.76171875, "learning_rate": 4.662101659680224e-05, "loss": 1.0901, "step": 1567 }, { "epoch": 0.8418509445991745, "grad_norm": 0.734375, "learning_rate": 4.6616775510035735e-05, "loss": 1.0253, "step": 1568 }, { "epoch": 0.8423878393342505, "grad_norm": 0.73046875, "learning_rate": 4.661253195652139e-05, "loss": 0.8747, "step": 1569 }, { "epoch": 0.8429247340693266, "grad_norm": 0.6953125, "learning_rate": 4.6608285936743445e-05, "loss": 0.8479, "step": 1570 }, { "epoch": 0.8434616288044026, "grad_norm": 0.70703125, "learning_rate": 4.660403745118642e-05, "loss": 0.9357, "step": 1571 }, { "epoch": 0.8439985235394786, "grad_norm": 0.828125, "learning_rate": 4.659978650033514e-05, "loss": 0.7952, "step": 1572 }, { "epoch": 0.8445354182745546, "grad_norm": 0.76171875, "learning_rate": 4.659553308467468e-05, "loss": 1.1169, "step": 1573 }, { "epoch": 0.8450723130096306, "grad_norm": 0.828125, "learning_rate": 4.659127720469041e-05, "loss": 0.9751, "step": 1574 }, { "epoch": 0.8456092077447066, "grad_norm": 1.1640625, "learning_rate": 4.658701886086798e-05, "loss": 0.8086, "step": 1575 }, { "epoch": 0.8461461024797825, "grad_norm": 0.59765625, "learning_rate": 4.658275805369331e-05, "loss": 0.8188, "step": 1576 }, { "epoch": 0.8466829972148585, "grad_norm": 0.73828125, "learning_rate": 4.6578494783652635e-05, "loss": 0.8419, "step": 1577 }, { "epoch": 0.8472198919499345, "grad_norm": 0.6015625, "learning_rate": 4.6574229051232433e-05, "loss": 0.8852, "step": 1578 }, { "epoch": 0.8477567866850105, "grad_norm": 0.75, "learning_rate": 4.656996085691948e-05, "loss": 0.9037, "step": 1579 }, { "epoch": 0.8482936814200865, "grad_norm": 0.72265625, "learning_rate": 4.656569020120083e-05, "loss": 0.9597, "step": 1580 }, { "epoch": 0.8488305761551626, "grad_norm": 0.7890625, "learning_rate": 4.656141708456382e-05, "loss": 1.1091, "step": 1581 }, { "epoch": 0.8493674708902386, "grad_norm": 0.7421875, "learning_rate": 4.655714150749607e-05, "loss": 0.9231, "step": 1582 }, { "epoch": 0.8499043656253146, "grad_norm": 0.71484375, "learning_rate": 4.655286347048547e-05, "loss": 1.011, "step": 1583 }, { "epoch": 0.8504412603603906, "grad_norm": 0.640625, "learning_rate": 4.6548582974020216e-05, "loss": 0.9487, "step": 1584 }, { "epoch": 0.8509781550954666, "grad_norm": 0.64453125, "learning_rate": 4.654430001858874e-05, "loss": 0.7564, "step": 1585 }, { "epoch": 0.8515150498305426, "grad_norm": 0.62109375, "learning_rate": 4.65400146046798e-05, "loss": 0.7795, "step": 1586 }, { "epoch": 0.8520519445656186, "grad_norm": 0.87109375, "learning_rate": 4.653572673278241e-05, "loss": 0.8144, "step": 1587 }, { "epoch": 0.8525888393006946, "grad_norm": 0.7109375, "learning_rate": 4.653143640338588e-05, "loss": 0.8644, "step": 1588 }, { "epoch": 0.8531257340357706, "grad_norm": 0.90234375, "learning_rate": 4.6527143616979776e-05, "loss": 1.0435, "step": 1589 }, { "epoch": 0.8536626287708466, "grad_norm": 0.70703125, "learning_rate": 4.652284837405397e-05, "loss": 0.8945, "step": 1590 }, { "epoch": 0.8541995235059227, "grad_norm": 0.83203125, "learning_rate": 4.65185506750986e-05, "loss": 0.7452, "step": 1591 }, { "epoch": 0.8547364182409987, "grad_norm": 0.7109375, "learning_rate": 4.6514250520604074e-05, "loss": 1.0434, "step": 1592 }, { "epoch": 0.8552733129760747, "grad_norm": 0.76171875, "learning_rate": 4.6509947911061125e-05, "loss": 1.001, "step": 1593 }, { "epoch": 0.8558102077111507, "grad_norm": 0.75, "learning_rate": 4.6505642846960704e-05, "loss": 1.0016, "step": 1594 }, { "epoch": 0.8563471024462266, "grad_norm": 0.98046875, "learning_rate": 4.650133532879409e-05, "loss": 1.067, "step": 1595 }, { "epoch": 0.8568839971813026, "grad_norm": 0.61328125, "learning_rate": 4.6497025357052835e-05, "loss": 0.8001, "step": 1596 }, { "epoch": 0.8574208919163786, "grad_norm": 0.86328125, "learning_rate": 4.6492712932228736e-05, "loss": 0.9735, "step": 1597 }, { "epoch": 0.8579577866514546, "grad_norm": 0.71875, "learning_rate": 4.6488398054813916e-05, "loss": 0.8591, "step": 1598 }, { "epoch": 0.8584946813865306, "grad_norm": 0.640625, "learning_rate": 4.648408072530074e-05, "loss": 1.0515, "step": 1599 }, { "epoch": 0.8590315761216066, "grad_norm": 0.703125, "learning_rate": 4.647976094418189e-05, "loss": 0.9288, "step": 1600 }, { "epoch": 0.8590315761216066, "eval_loss": 0.927656888961792, "eval_runtime": 46.3378, "eval_samples_per_second": 20.761, "eval_steps_per_second": 20.761, "step": 1600 }, { "epoch": 0.8595684708566826, "grad_norm": 0.64453125, "learning_rate": 4.647543871195029e-05, "loss": 0.9534, "step": 1601 }, { "epoch": 0.8601053655917587, "grad_norm": 0.69921875, "learning_rate": 4.6471114029099174e-05, "loss": 0.9108, "step": 1602 }, { "epoch": 0.8606422603268347, "grad_norm": 0.765625, "learning_rate": 4.646678689612203e-05, "loss": 0.8877, "step": 1603 }, { "epoch": 0.8611791550619107, "grad_norm": 0.8125, "learning_rate": 4.646245731351265e-05, "loss": 1.1448, "step": 1604 }, { "epoch": 0.8617160497969867, "grad_norm": 0.78125, "learning_rate": 4.64581252817651e-05, "loss": 0.913, "step": 1605 }, { "epoch": 0.8622529445320627, "grad_norm": 0.7265625, "learning_rate": 4.6453790801373695e-05, "loss": 0.9422, "step": 1606 }, { "epoch": 0.8627898392671387, "grad_norm": 0.7578125, "learning_rate": 4.6449453872833084e-05, "loss": 0.982, "step": 1607 }, { "epoch": 0.8633267340022147, "grad_norm": 0.76953125, "learning_rate": 4.6445114496638136e-05, "loss": 0.8221, "step": 1608 }, { "epoch": 0.8638636287372907, "grad_norm": 0.7109375, "learning_rate": 4.644077267328405e-05, "loss": 1.0404, "step": 1609 }, { "epoch": 0.8644005234723667, "grad_norm": 0.67578125, "learning_rate": 4.643642840326627e-05, "loss": 0.7026, "step": 1610 }, { "epoch": 0.8649374182074427, "grad_norm": 0.8203125, "learning_rate": 4.6432081687080554e-05, "loss": 1.0056, "step": 1611 }, { "epoch": 0.8654743129425188, "grad_norm": 0.8203125, "learning_rate": 4.642773252522289e-05, "loss": 0.7586, "step": 1612 }, { "epoch": 0.8660112076775948, "grad_norm": 0.84765625, "learning_rate": 4.642338091818959e-05, "loss": 0.8835, "step": 1613 }, { "epoch": 0.8665481024126707, "grad_norm": 0.6484375, "learning_rate": 4.641902686647722e-05, "loss": 0.9294, "step": 1614 }, { "epoch": 0.8670849971477467, "grad_norm": 0.66015625, "learning_rate": 4.641467037058263e-05, "loss": 0.9119, "step": 1615 }, { "epoch": 0.8676218918828227, "grad_norm": 1.25, "learning_rate": 4.6410311431002966e-05, "loss": 1.1248, "step": 1616 }, { "epoch": 0.8681587866178987, "grad_norm": 0.6640625, "learning_rate": 4.6405950048235625e-05, "loss": 1.0666, "step": 1617 }, { "epoch": 0.8686956813529747, "grad_norm": 0.81640625, "learning_rate": 4.640158622277829e-05, "loss": 1.2896, "step": 1618 }, { "epoch": 0.8692325760880507, "grad_norm": 0.6640625, "learning_rate": 4.639721995512895e-05, "loss": 1.175, "step": 1619 }, { "epoch": 0.8697694708231267, "grad_norm": 0.7890625, "learning_rate": 4.6392851245785834e-05, "loss": 1.0522, "step": 1620 }, { "epoch": 0.8703063655582027, "grad_norm": 0.69140625, "learning_rate": 4.638848009524747e-05, "loss": 0.8324, "step": 1621 }, { "epoch": 0.8708432602932787, "grad_norm": 0.70703125, "learning_rate": 4.638410650401267e-05, "loss": 0.9249, "step": 1622 }, { "epoch": 0.8713801550283548, "grad_norm": 0.72265625, "learning_rate": 4.6379730472580506e-05, "loss": 0.831, "step": 1623 }, { "epoch": 0.8719170497634308, "grad_norm": 0.625, "learning_rate": 4.637535200145034e-05, "loss": 0.8674, "step": 1624 }, { "epoch": 0.8724539444985068, "grad_norm": 0.83984375, "learning_rate": 4.637097109112182e-05, "loss": 1.0633, "step": 1625 }, { "epoch": 0.8729908392335828, "grad_norm": 0.6171875, "learning_rate": 4.636658774209485e-05, "loss": 1.041, "step": 1626 }, { "epoch": 0.8735277339686588, "grad_norm": 0.62109375, "learning_rate": 4.636220195486964e-05, "loss": 0.9218, "step": 1627 }, { "epoch": 0.8740646287037348, "grad_norm": 0.64453125, "learning_rate": 4.6357813729946655e-05, "loss": 0.9361, "step": 1628 }, { "epoch": 0.8746015234388108, "grad_norm": 0.8359375, "learning_rate": 4.635342306782664e-05, "loss": 1.0006, "step": 1629 }, { "epoch": 0.8751384181738868, "grad_norm": 0.78125, "learning_rate": 4.6349029969010644e-05, "loss": 0.7847, "step": 1630 }, { "epoch": 0.8756753129089628, "grad_norm": 0.76953125, "learning_rate": 4.634463443399995e-05, "loss": 1.0114, "step": 1631 }, { "epoch": 0.8762122076440388, "grad_norm": 0.703125, "learning_rate": 4.634023646329618e-05, "loss": 0.9952, "step": 1632 }, { "epoch": 0.8767491023791147, "grad_norm": 0.72265625, "learning_rate": 4.633583605740117e-05, "loss": 0.8418, "step": 1633 }, { "epoch": 0.8772859971141908, "grad_norm": 0.70703125, "learning_rate": 4.633143321681707e-05, "loss": 0.9023, "step": 1634 }, { "epoch": 0.8778228918492668, "grad_norm": 0.7734375, "learning_rate": 4.6327027942046286e-05, "loss": 0.9766, "step": 1635 }, { "epoch": 0.8783597865843428, "grad_norm": 0.7734375, "learning_rate": 4.632262023359154e-05, "loss": 1.0, "step": 1636 }, { "epoch": 0.8788966813194188, "grad_norm": 0.73828125, "learning_rate": 4.63182100919558e-05, "loss": 1.0702, "step": 1637 }, { "epoch": 0.8794335760544948, "grad_norm": 0.66015625, "learning_rate": 4.631379751764231e-05, "loss": 1.0212, "step": 1638 }, { "epoch": 0.8799704707895708, "grad_norm": 0.6796875, "learning_rate": 4.630938251115461e-05, "loss": 0.8073, "step": 1639 }, { "epoch": 0.8805073655246468, "grad_norm": 1.515625, "learning_rate": 4.6304965072996495e-05, "loss": 0.9944, "step": 1640 }, { "epoch": 0.8810442602597228, "grad_norm": 0.9375, "learning_rate": 4.6300545203672065e-05, "loss": 0.9397, "step": 1641 }, { "epoch": 0.8815811549947988, "grad_norm": 0.68359375, "learning_rate": 4.629612290368568e-05, "loss": 0.9176, "step": 1642 }, { "epoch": 0.8821180497298748, "grad_norm": 0.71484375, "learning_rate": 4.629169817354198e-05, "loss": 0.8976, "step": 1643 }, { "epoch": 0.8826549444649509, "grad_norm": 0.6640625, "learning_rate": 4.628727101374587e-05, "loss": 0.9841, "step": 1644 }, { "epoch": 0.8831918392000269, "grad_norm": 0.73046875, "learning_rate": 4.6282841424802557e-05, "loss": 0.9706, "step": 1645 }, { "epoch": 0.8837287339351029, "grad_norm": 1.3515625, "learning_rate": 4.6278409407217525e-05, "loss": 0.8614, "step": 1646 }, { "epoch": 0.8842656286701789, "grad_norm": 0.89453125, "learning_rate": 4.62739749614965e-05, "loss": 0.9592, "step": 1647 }, { "epoch": 0.8848025234052549, "grad_norm": 0.734375, "learning_rate": 4.626953808814553e-05, "loss": 0.9304, "step": 1648 }, { "epoch": 0.8853394181403309, "grad_norm": 0.796875, "learning_rate": 4.626509878767089e-05, "loss": 1.0085, "step": 1649 }, { "epoch": 0.8858763128754069, "grad_norm": 0.75, "learning_rate": 4.6260657060579185e-05, "loss": 0.9249, "step": 1650 }, { "epoch": 0.8858763128754069, "eval_loss": 0.9260422587394714, "eval_runtime": 45.8059, "eval_samples_per_second": 21.002, "eval_steps_per_second": 21.002, "step": 1650 }, { "epoch": 0.8864132076104829, "grad_norm": 0.71484375, "learning_rate": 4.6256212907377274e-05, "loss": 0.961, "step": 1651 }, { "epoch": 0.8869501023455588, "grad_norm": 0.6796875, "learning_rate": 4.625176632857227e-05, "loss": 0.8949, "step": 1652 }, { "epoch": 0.8874869970806348, "grad_norm": 0.58203125, "learning_rate": 4.6247317324671605e-05, "loss": 0.8514, "step": 1653 }, { "epoch": 0.8880238918157108, "grad_norm": 0.765625, "learning_rate": 4.624286589618295e-05, "loss": 0.8828, "step": 1654 }, { "epoch": 0.8885607865507869, "grad_norm": 0.765625, "learning_rate": 4.6238412043614276e-05, "loss": 0.9396, "step": 1655 }, { "epoch": 0.8890976812858629, "grad_norm": 0.66015625, "learning_rate": 4.6233955767473834e-05, "loss": 0.9512, "step": 1656 }, { "epoch": 0.8896345760209389, "grad_norm": 0.7265625, "learning_rate": 4.622949706827013e-05, "loss": 1.0584, "step": 1657 }, { "epoch": 0.8901714707560149, "grad_norm": 0.70703125, "learning_rate": 4.622503594651196e-05, "loss": 0.8989, "step": 1658 }, { "epoch": 0.8907083654910909, "grad_norm": 0.76953125, "learning_rate": 4.622057240270839e-05, "loss": 1.1702, "step": 1659 }, { "epoch": 0.8912452602261669, "grad_norm": 0.73046875, "learning_rate": 4.621610643736878e-05, "loss": 0.9822, "step": 1660 }, { "epoch": 0.8917821549612429, "grad_norm": 0.68359375, "learning_rate": 4.6211638051002734e-05, "loss": 1.1196, "step": 1661 }, { "epoch": 0.8923190496963189, "grad_norm": 0.7890625, "learning_rate": 4.620716724412018e-05, "loss": 0.8581, "step": 1662 }, { "epoch": 0.8928559444313949, "grad_norm": 0.62890625, "learning_rate": 4.620269401723126e-05, "loss": 0.8999, "step": 1663 }, { "epoch": 0.8933928391664709, "grad_norm": 0.7109375, "learning_rate": 4.6198218370846444e-05, "loss": 0.9555, "step": 1664 }, { "epoch": 0.893929733901547, "grad_norm": 0.7109375, "learning_rate": 4.619374030547647e-05, "loss": 0.9766, "step": 1665 }, { "epoch": 0.894466628636623, "grad_norm": 0.8359375, "learning_rate": 4.618925982163232e-05, "loss": 1.0356, "step": 1666 }, { "epoch": 0.895003523371699, "grad_norm": 0.9296875, "learning_rate": 4.618477691982529e-05, "loss": 0.9794, "step": 1667 }, { "epoch": 0.895540418106775, "grad_norm": 0.7734375, "learning_rate": 4.618029160056693e-05, "loss": 1.2178, "step": 1668 }, { "epoch": 0.896077312841851, "grad_norm": 0.79296875, "learning_rate": 4.617580386436906e-05, "loss": 0.958, "step": 1669 }, { "epoch": 0.896614207576927, "grad_norm": 0.73046875, "learning_rate": 4.617131371174381e-05, "loss": 0.8052, "step": 1670 }, { "epoch": 0.8971511023120029, "grad_norm": 0.7734375, "learning_rate": 4.6166821143203555e-05, "loss": 1.1036, "step": 1671 }, { "epoch": 0.8976879970470789, "grad_norm": 0.76953125, "learning_rate": 4.6162326159260945e-05, "loss": 1.0919, "step": 1672 }, { "epoch": 0.8982248917821549, "grad_norm": 0.9765625, "learning_rate": 4.615782876042892e-05, "loss": 1.0033, "step": 1673 }, { "epoch": 0.8987617865172309, "grad_norm": 0.64453125, "learning_rate": 4.6153328947220696e-05, "loss": 0.8411, "step": 1674 }, { "epoch": 0.8992986812523069, "grad_norm": 0.84375, "learning_rate": 4.614882672014975e-05, "loss": 0.8503, "step": 1675 }, { "epoch": 0.899835575987383, "grad_norm": 0.84765625, "learning_rate": 4.6144322079729854e-05, "loss": 0.9053, "step": 1676 }, { "epoch": 0.900372470722459, "grad_norm": 0.7109375, "learning_rate": 4.6139815026475034e-05, "loss": 1.1372, "step": 1677 }, { "epoch": 0.900909365457535, "grad_norm": 0.6015625, "learning_rate": 4.61353055608996e-05, "loss": 0.7013, "step": 1678 }, { "epoch": 0.901446260192611, "grad_norm": 0.82421875, "learning_rate": 4.6130793683518146e-05, "loss": 0.8741, "step": 1679 }, { "epoch": 0.901983154927687, "grad_norm": 0.859375, "learning_rate": 4.612627939484554e-05, "loss": 1.1555, "step": 1680 }, { "epoch": 0.902520049662763, "grad_norm": 0.6875, "learning_rate": 4.612176269539691e-05, "loss": 0.9153, "step": 1681 }, { "epoch": 0.903056944397839, "grad_norm": 0.66015625, "learning_rate": 4.6117243585687664e-05, "loss": 1.0942, "step": 1682 }, { "epoch": 0.903593839132915, "grad_norm": 0.8515625, "learning_rate": 4.61127220662335e-05, "loss": 0.9599, "step": 1683 }, { "epoch": 0.904130733867991, "grad_norm": 0.80078125, "learning_rate": 4.610819813755038e-05, "loss": 0.9821, "step": 1684 }, { "epoch": 0.904667628603067, "grad_norm": 0.77734375, "learning_rate": 4.610367180015454e-05, "loss": 0.9118, "step": 1685 }, { "epoch": 0.905204523338143, "grad_norm": 0.75390625, "learning_rate": 4.6099143054562486e-05, "loss": 1.0199, "step": 1686 }, { "epoch": 0.9057414180732191, "grad_norm": 0.921875, "learning_rate": 4.609461190129101e-05, "loss": 1.2331, "step": 1687 }, { "epoch": 0.9062783128082951, "grad_norm": 0.62890625, "learning_rate": 4.609007834085718e-05, "loss": 0.7347, "step": 1688 }, { "epoch": 0.9068152075433711, "grad_norm": 0.796875, "learning_rate": 4.6085542373778326e-05, "loss": 0.8719, "step": 1689 }, { "epoch": 0.907352102278447, "grad_norm": 0.859375, "learning_rate": 4.608100400057206e-05, "loss": 0.942, "step": 1690 }, { "epoch": 0.907888997013523, "grad_norm": 0.76953125, "learning_rate": 4.607646322175626e-05, "loss": 0.8428, "step": 1691 }, { "epoch": 0.908425891748599, "grad_norm": 0.80859375, "learning_rate": 4.60719200378491e-05, "loss": 0.7774, "step": 1692 }, { "epoch": 0.908962786483675, "grad_norm": 0.62109375, "learning_rate": 4.606737444936902e-05, "loss": 0.7665, "step": 1693 }, { "epoch": 0.909499681218751, "grad_norm": 0.8359375, "learning_rate": 4.6062826456834704e-05, "loss": 0.855, "step": 1694 }, { "epoch": 0.910036575953827, "grad_norm": 0.6796875, "learning_rate": 4.605827606076516e-05, "loss": 0.9617, "step": 1695 }, { "epoch": 0.910573470688903, "grad_norm": 0.77734375, "learning_rate": 4.605372326167963e-05, "loss": 1.0468, "step": 1696 }, { "epoch": 0.911110365423979, "grad_norm": 0.67578125, "learning_rate": 4.604916806009766e-05, "loss": 0.8403, "step": 1697 }, { "epoch": 0.9116472601590551, "grad_norm": 0.66796875, "learning_rate": 4.604461045653904e-05, "loss": 0.8779, "step": 1698 }, { "epoch": 0.9121841548941311, "grad_norm": 0.9296875, "learning_rate": 4.604005045152385e-05, "loss": 0.8258, "step": 1699 }, { "epoch": 0.9127210496292071, "grad_norm": 0.7265625, "learning_rate": 4.603548804557247e-05, "loss": 1.0053, "step": 1700 }, { "epoch": 0.9127210496292071, "eval_loss": 0.9246048331260681, "eval_runtime": 46.856, "eval_samples_per_second": 20.531, "eval_steps_per_second": 20.531, "step": 1700 }, { "epoch": 0.9132579443642831, "grad_norm": 0.75, "learning_rate": 4.60309232392055e-05, "loss": 1.033, "step": 1701 }, { "epoch": 0.9137948390993591, "grad_norm": 0.72265625, "learning_rate": 4.6026356032943866e-05, "loss": 0.8617, "step": 1702 }, { "epoch": 0.9143317338344351, "grad_norm": 0.7109375, "learning_rate": 4.6021786427308724e-05, "loss": 1.1039, "step": 1703 }, { "epoch": 0.9148686285695111, "grad_norm": 0.67578125, "learning_rate": 4.601721442282152e-05, "loss": 0.9135, "step": 1704 }, { "epoch": 0.9154055233045871, "grad_norm": 0.62890625, "learning_rate": 4.601264002000401e-05, "loss": 1.0555, "step": 1705 }, { "epoch": 0.9159424180396631, "grad_norm": 0.7265625, "learning_rate": 4.600806321937815e-05, "loss": 0.9806, "step": 1706 }, { "epoch": 0.9164793127747392, "grad_norm": 0.80078125, "learning_rate": 4.6003484021466245e-05, "loss": 0.9285, "step": 1707 }, { "epoch": 0.9170162075098152, "grad_norm": 0.6640625, "learning_rate": 4.599890242679082e-05, "loss": 0.9098, "step": 1708 }, { "epoch": 0.9175531022448911, "grad_norm": 0.67578125, "learning_rate": 4.59943184358747e-05, "loss": 0.9752, "step": 1709 }, { "epoch": 0.9180899969799671, "grad_norm": 0.78125, "learning_rate": 4.598973204924097e-05, "loss": 0.9775, "step": 1710 }, { "epoch": 0.9186268917150431, "grad_norm": 0.7265625, "learning_rate": 4.598514326741301e-05, "loss": 0.8949, "step": 1711 }, { "epoch": 0.9191637864501191, "grad_norm": 0.86328125, "learning_rate": 4.598055209091444e-05, "loss": 0.9869, "step": 1712 }, { "epoch": 0.9197006811851951, "grad_norm": 0.6484375, "learning_rate": 4.597595852026918e-05, "loss": 0.8093, "step": 1713 }, { "epoch": 0.9202375759202711, "grad_norm": 0.77734375, "learning_rate": 4.597136255600142e-05, "loss": 0.9374, "step": 1714 }, { "epoch": 0.9207744706553471, "grad_norm": 0.7578125, "learning_rate": 4.5966764198635606e-05, "loss": 1.1599, "step": 1715 }, { "epoch": 0.9213113653904231, "grad_norm": 0.7265625, "learning_rate": 4.596216344869647e-05, "loss": 0.9477, "step": 1716 }, { "epoch": 0.9218482601254991, "grad_norm": 0.984375, "learning_rate": 4.595756030670902e-05, "loss": 1.0141, "step": 1717 }, { "epoch": 0.9223851548605752, "grad_norm": 0.76953125, "learning_rate": 4.595295477319854e-05, "loss": 1.0733, "step": 1718 }, { "epoch": 0.9229220495956512, "grad_norm": 0.82421875, "learning_rate": 4.594834684869058e-05, "loss": 1.1318, "step": 1719 }, { "epoch": 0.9234589443307272, "grad_norm": 0.69140625, "learning_rate": 4.5943736533710944e-05, "loss": 1.0442, "step": 1720 }, { "epoch": 0.9239958390658032, "grad_norm": 0.671875, "learning_rate": 4.5939123828785735e-05, "loss": 1.0249, "step": 1721 }, { "epoch": 0.9245327338008792, "grad_norm": 0.8203125, "learning_rate": 4.593450873444133e-05, "loss": 1.0056, "step": 1722 }, { "epoch": 0.9250696285359552, "grad_norm": 0.7109375, "learning_rate": 4.592989125120437e-05, "loss": 0.8301, "step": 1723 }, { "epoch": 0.9256065232710312, "grad_norm": 0.76953125, "learning_rate": 4.592527137960175e-05, "loss": 1.1348, "step": 1724 }, { "epoch": 0.9261434180061072, "grad_norm": 0.8203125, "learning_rate": 4.592064912016068e-05, "loss": 0.8146, "step": 1725 }, { "epoch": 0.9266803127411832, "grad_norm": 0.66015625, "learning_rate": 4.59160244734086e-05, "loss": 0.845, "step": 1726 }, { "epoch": 0.9272172074762591, "grad_norm": 0.87109375, "learning_rate": 4.591139743987325e-05, "loss": 1.0645, "step": 1727 }, { "epoch": 0.9277541022113351, "grad_norm": 0.80078125, "learning_rate": 4.5906768020082634e-05, "loss": 0.9206, "step": 1728 }, { "epoch": 0.9282909969464112, "grad_norm": 0.7421875, "learning_rate": 4.590213621456502e-05, "loss": 1.0432, "step": 1729 }, { "epoch": 0.9288278916814872, "grad_norm": 0.86328125, "learning_rate": 4.589750202384896e-05, "loss": 0.8983, "step": 1730 }, { "epoch": 0.9293647864165632, "grad_norm": 0.7578125, "learning_rate": 4.5892865448463274e-05, "loss": 0.8719, "step": 1731 }, { "epoch": 0.9299016811516392, "grad_norm": 0.69140625, "learning_rate": 4.588822648893706e-05, "loss": 1.1086, "step": 1732 }, { "epoch": 0.9304385758867152, "grad_norm": 0.59375, "learning_rate": 4.588358514579967e-05, "loss": 0.7372, "step": 1733 }, { "epoch": 0.9309754706217912, "grad_norm": 0.6875, "learning_rate": 4.587894141958074e-05, "loss": 0.9003, "step": 1734 }, { "epoch": 0.9315123653568672, "grad_norm": 0.7578125, "learning_rate": 4.587429531081019e-05, "loss": 0.9172, "step": 1735 }, { "epoch": 0.9320492600919432, "grad_norm": 0.73828125, "learning_rate": 4.5869646820018185e-05, "loss": 0.9648, "step": 1736 }, { "epoch": 0.9325861548270192, "grad_norm": 0.79296875, "learning_rate": 4.58649959477352e-05, "loss": 0.8828, "step": 1737 }, { "epoch": 0.9331230495620952, "grad_norm": 0.80859375, "learning_rate": 4.586034269449193e-05, "loss": 0.9438, "step": 1738 }, { "epoch": 0.9336599442971713, "grad_norm": 0.64453125, "learning_rate": 4.585568706081939e-05, "loss": 0.9909, "step": 1739 }, { "epoch": 0.9341968390322473, "grad_norm": 0.734375, "learning_rate": 4.585102904724884e-05, "loss": 0.8296, "step": 1740 }, { "epoch": 0.9347337337673233, "grad_norm": 0.78125, "learning_rate": 4.5846368654311825e-05, "loss": 0.8979, "step": 1741 }, { "epoch": 0.9352706285023993, "grad_norm": 0.64453125, "learning_rate": 4.5841705882540134e-05, "loss": 0.9257, "step": 1742 }, { "epoch": 0.9358075232374753, "grad_norm": 0.69140625, "learning_rate": 4.583704073246588e-05, "loss": 0.8802, "step": 1743 }, { "epoch": 0.9363444179725513, "grad_norm": 0.921875, "learning_rate": 4.583237320462138e-05, "loss": 1.0676, "step": 1744 }, { "epoch": 0.9368813127076273, "grad_norm": 0.78515625, "learning_rate": 4.5827703299539293e-05, "loss": 0.9623, "step": 1745 }, { "epoch": 0.9374182074427032, "grad_norm": 0.67578125, "learning_rate": 4.5823031017752485e-05, "loss": 0.9597, "step": 1746 }, { "epoch": 0.9379551021777792, "grad_norm": 0.7734375, "learning_rate": 4.5818356359794146e-05, "loss": 1.0, "step": 1747 }, { "epoch": 0.9384919969128552, "grad_norm": 0.5546875, "learning_rate": 4.58136793261977e-05, "loss": 0.8805, "step": 1748 }, { "epoch": 0.9390288916479312, "grad_norm": 0.71875, "learning_rate": 4.580899991749686e-05, "loss": 0.9251, "step": 1749 }, { "epoch": 0.9395657863830073, "grad_norm": 0.60546875, "learning_rate": 4.58043181342256e-05, "loss": 0.7675, "step": 1750 }, { "epoch": 0.9395657863830073, "eval_loss": 0.9235332012176514, "eval_runtime": 45.9473, "eval_samples_per_second": 20.937, "eval_steps_per_second": 20.937, "step": 1750 }, { "epoch": 0.9401026811180833, "grad_norm": 0.9140625, "learning_rate": 4.579963397691818e-05, "loss": 1.0145, "step": 1751 }, { "epoch": 0.9406395758531593, "grad_norm": 0.6796875, "learning_rate": 4.579494744610912e-05, "loss": 0.9521, "step": 1752 }, { "epoch": 0.9411764705882353, "grad_norm": 0.66796875, "learning_rate": 4.57902585423332e-05, "loss": 0.7413, "step": 1753 }, { "epoch": 0.9417133653233113, "grad_norm": 0.66015625, "learning_rate": 4.5785567266125494e-05, "loss": 0.8167, "step": 1754 }, { "epoch": 0.9422502600583873, "grad_norm": 0.6640625, "learning_rate": 4.578087361802135e-05, "loss": 0.8401, "step": 1755 }, { "epoch": 0.9427871547934633, "grad_norm": 1.0625, "learning_rate": 4.577617759855634e-05, "loss": 1.123, "step": 1756 }, { "epoch": 0.9433240495285393, "grad_norm": 0.73046875, "learning_rate": 4.5771479208266363e-05, "loss": 1.0633, "step": 1757 }, { "epoch": 0.9438609442636153, "grad_norm": 1.0234375, "learning_rate": 4.5766778447687565e-05, "loss": 0.9038, "step": 1758 }, { "epoch": 0.9443978389986913, "grad_norm": 0.68359375, "learning_rate": 4.576207531735635e-05, "loss": 0.9805, "step": 1759 }, { "epoch": 0.9449347337337674, "grad_norm": 0.73046875, "learning_rate": 4.5757369817809415e-05, "loss": 0.9875, "step": 1760 }, { "epoch": 0.9454716284688434, "grad_norm": 0.75390625, "learning_rate": 4.575266194958371e-05, "loss": 0.95, "step": 1761 }, { "epoch": 0.9460085232039194, "grad_norm": 0.65625, "learning_rate": 4.574795171321646e-05, "loss": 0.7474, "step": 1762 }, { "epoch": 0.9465454179389954, "grad_norm": 0.6953125, "learning_rate": 4.574323910924517e-05, "loss": 0.8633, "step": 1763 }, { "epoch": 0.9470823126740714, "grad_norm": 0.7578125, "learning_rate": 4.5738524138207616e-05, "loss": 0.936, "step": 1764 }, { "epoch": 0.9476192074091473, "grad_norm": 0.6640625, "learning_rate": 4.5733806800641816e-05, "loss": 0.9208, "step": 1765 }, { "epoch": 0.9481561021442233, "grad_norm": 0.7265625, "learning_rate": 4.572908709708609e-05, "loss": 0.8539, "step": 1766 }, { "epoch": 0.9486929968792993, "grad_norm": 0.6875, "learning_rate": 4.5724365028079017e-05, "loss": 0.8122, "step": 1767 }, { "epoch": 0.9492298916143753, "grad_norm": 0.93359375, "learning_rate": 4.571964059415944e-05, "loss": 0.9129, "step": 1768 }, { "epoch": 0.9497667863494513, "grad_norm": 0.82421875, "learning_rate": 4.571491379586647e-05, "loss": 0.7864, "step": 1769 }, { "epoch": 0.9503036810845273, "grad_norm": 0.72265625, "learning_rate": 4.571018463373952e-05, "loss": 0.9741, "step": 1770 }, { "epoch": 0.9508405758196034, "grad_norm": 0.67578125, "learning_rate": 4.570545310831821e-05, "loss": 0.8631, "step": 1771 }, { "epoch": 0.9513774705546794, "grad_norm": 0.66796875, "learning_rate": 4.5700719220142494e-05, "loss": 0.8401, "step": 1772 }, { "epoch": 0.9519143652897554, "grad_norm": 0.7734375, "learning_rate": 4.5695982969752566e-05, "loss": 0.7391, "step": 1773 }, { "epoch": 0.9524512600248314, "grad_norm": 0.6484375, "learning_rate": 4.569124435768889e-05, "loss": 0.9252, "step": 1774 }, { "epoch": 0.9529881547599074, "grad_norm": 0.59375, "learning_rate": 4.5686503384492194e-05, "loss": 0.8185, "step": 1775 }, { "epoch": 0.9535250494949834, "grad_norm": 0.67578125, "learning_rate": 4.5681760050703485e-05, "loss": 0.7863, "step": 1776 }, { "epoch": 0.9540619442300594, "grad_norm": 0.87109375, "learning_rate": 4.567701435686404e-05, "loss": 0.8573, "step": 1777 }, { "epoch": 0.9545988389651354, "grad_norm": 0.765625, "learning_rate": 4.567226630351542e-05, "loss": 1.1293, "step": 1778 }, { "epoch": 0.9551357337002114, "grad_norm": 0.7109375, "learning_rate": 4.566751589119941e-05, "loss": 0.9446, "step": 1779 }, { "epoch": 0.9556726284352874, "grad_norm": 1.3828125, "learning_rate": 4.5662763120458105e-05, "loss": 0.9117, "step": 1780 }, { "epoch": 0.9562095231703635, "grad_norm": 0.71484375, "learning_rate": 4.565800799183385e-05, "loss": 1.1525, "step": 1781 }, { "epoch": 0.9567464179054395, "grad_norm": 0.67578125, "learning_rate": 4.5653250505869274e-05, "loss": 1.1099, "step": 1782 }, { "epoch": 0.9572833126405155, "grad_norm": 0.73046875, "learning_rate": 4.5648490663107266e-05, "loss": 1.0494, "step": 1783 }, { "epoch": 0.9578202073755914, "grad_norm": 0.703125, "learning_rate": 4.5643728464090986e-05, "loss": 0.8424, "step": 1784 }, { "epoch": 0.9583571021106674, "grad_norm": 0.63671875, "learning_rate": 4.5638963909363845e-05, "loss": 0.8713, "step": 1785 }, { "epoch": 0.9588939968457434, "grad_norm": 0.78125, "learning_rate": 4.563419699946956e-05, "loss": 0.7595, "step": 1786 }, { "epoch": 0.9594308915808194, "grad_norm": 0.65625, "learning_rate": 4.562942773495208e-05, "loss": 0.8329, "step": 1787 }, { "epoch": 0.9599677863158954, "grad_norm": 0.8515625, "learning_rate": 4.5624656116355645e-05, "loss": 1.1087, "step": 1788 }, { "epoch": 0.9605046810509714, "grad_norm": 0.89453125, "learning_rate": 4.5619882144224765e-05, "loss": 1.0542, "step": 1789 }, { "epoch": 0.9610415757860474, "grad_norm": 0.69140625, "learning_rate": 4.5615105819104194e-05, "loss": 0.8634, "step": 1790 }, { "epoch": 0.9615784705211234, "grad_norm": 0.7890625, "learning_rate": 4.561032714153898e-05, "loss": 1.0115, "step": 1791 }, { "epoch": 0.9621153652561995, "grad_norm": 0.765625, "learning_rate": 4.560554611207444e-05, "loss": 0.9457, "step": 1792 }, { "epoch": 0.9626522599912755, "grad_norm": 0.64453125, "learning_rate": 4.560076273125613e-05, "loss": 0.8585, "step": 1793 }, { "epoch": 0.9631891547263515, "grad_norm": 0.8046875, "learning_rate": 4.5595976999629906e-05, "loss": 1.007, "step": 1794 }, { "epoch": 0.9637260494614275, "grad_norm": 0.75390625, "learning_rate": 4.559118891774188e-05, "loss": 1.0299, "step": 1795 }, { "epoch": 0.9642629441965035, "grad_norm": 0.76171875, "learning_rate": 4.5586398486138435e-05, "loss": 1.0001, "step": 1796 }, { "epoch": 0.9647998389315795, "grad_norm": 0.8984375, "learning_rate": 4.5581605705366224e-05, "loss": 1.1626, "step": 1797 }, { "epoch": 0.9653367336666555, "grad_norm": 0.72265625, "learning_rate": 4.5576810575972146e-05, "loss": 0.8722, "step": 1798 }, { "epoch": 0.9658736284017315, "grad_norm": 0.85546875, "learning_rate": 4.55720130985034e-05, "loss": 0.9075, "step": 1799 }, { "epoch": 0.9664105231368075, "grad_norm": 0.77734375, "learning_rate": 4.5567213273507435e-05, "loss": 1.0448, "step": 1800 }, { "epoch": 0.9664105231368075, "eval_loss": 0.9234041571617126, "eval_runtime": 46.0021, "eval_samples_per_second": 20.912, "eval_steps_per_second": 20.912, "step": 1800 }, { "epoch": 0.9669474178718835, "grad_norm": 0.76953125, "learning_rate": 4.5562411101531975e-05, "loss": 0.8338, "step": 1801 }, { "epoch": 0.9674843126069596, "grad_norm": 0.703125, "learning_rate": 4.555760658312501e-05, "loss": 0.8609, "step": 1802 }, { "epoch": 0.9680212073420355, "grad_norm": 0.80859375, "learning_rate": 4.555279971883478e-05, "loss": 0.8369, "step": 1803 }, { "epoch": 0.9685581020771115, "grad_norm": 0.76171875, "learning_rate": 4.554799050920984e-05, "loss": 0.871, "step": 1804 }, { "epoch": 0.9690949968121875, "grad_norm": 0.71875, "learning_rate": 4.554317895479895e-05, "loss": 1.0161, "step": 1805 }, { "epoch": 0.9696318915472635, "grad_norm": 0.625, "learning_rate": 4.553836505615119e-05, "loss": 0.8452, "step": 1806 }, { "epoch": 0.9701687862823395, "grad_norm": 0.93359375, "learning_rate": 4.553354881381587e-05, "loss": 1.1926, "step": 1807 }, { "epoch": 0.9707056810174155, "grad_norm": 0.7421875, "learning_rate": 4.5528730228342605e-05, "loss": 0.8598, "step": 1808 }, { "epoch": 0.9712425757524915, "grad_norm": 1.0234375, "learning_rate": 4.552390930028124e-05, "loss": 1.0521, "step": 1809 }, { "epoch": 0.9717794704875675, "grad_norm": 0.734375, "learning_rate": 4.551908603018191e-05, "loss": 1.0746, "step": 1810 }, { "epoch": 0.9723163652226435, "grad_norm": 1.0546875, "learning_rate": 4.551426041859501e-05, "loss": 1.1368, "step": 1811 }, { "epoch": 0.9728532599577195, "grad_norm": 0.65234375, "learning_rate": 4.550943246607121e-05, "loss": 0.942, "step": 1812 }, { "epoch": 0.9733901546927956, "grad_norm": 0.65234375, "learning_rate": 4.550460217316143e-05, "loss": 0.8976, "step": 1813 }, { "epoch": 0.9739270494278716, "grad_norm": 0.59375, "learning_rate": 4.549976954041687e-05, "loss": 0.7012, "step": 1814 }, { "epoch": 0.9744639441629476, "grad_norm": 0.74609375, "learning_rate": 4.5494934568389e-05, "loss": 0.7313, "step": 1815 }, { "epoch": 0.9750008388980236, "grad_norm": 0.70703125, "learning_rate": 4.549009725762955e-05, "loss": 0.8401, "step": 1816 }, { "epoch": 0.9755377336330996, "grad_norm": 0.67578125, "learning_rate": 4.548525760869051e-05, "loss": 0.9733, "step": 1817 }, { "epoch": 0.9760746283681756, "grad_norm": 0.6484375, "learning_rate": 4.5480415622124165e-05, "loss": 0.898, "step": 1818 }, { "epoch": 0.9766115231032516, "grad_norm": 0.75, "learning_rate": 4.547557129848302e-05, "loss": 0.7641, "step": 1819 }, { "epoch": 0.9771484178383276, "grad_norm": 0.6875, "learning_rate": 4.547072463831989e-05, "loss": 0.8549, "step": 1820 }, { "epoch": 0.9776853125734036, "grad_norm": 0.765625, "learning_rate": 4.546587564218785e-05, "loss": 0.9291, "step": 1821 }, { "epoch": 0.9782222073084795, "grad_norm": 0.58203125, "learning_rate": 4.546102431064021e-05, "loss": 0.9247, "step": 1822 }, { "epoch": 0.9787591020435555, "grad_norm": 0.78515625, "learning_rate": 4.545617064423058e-05, "loss": 0.8991, "step": 1823 }, { "epoch": 0.9792959967786316, "grad_norm": 0.7265625, "learning_rate": 4.545131464351282e-05, "loss": 1.0273, "step": 1824 }, { "epoch": 0.9798328915137076, "grad_norm": 0.91015625, "learning_rate": 4.544645630904106e-05, "loss": 0.7825, "step": 1825 }, { "epoch": 0.9803697862487836, "grad_norm": 0.671875, "learning_rate": 4.544159564136972e-05, "loss": 1.179, "step": 1826 }, { "epoch": 0.9809066809838596, "grad_norm": 0.80078125, "learning_rate": 4.543673264105343e-05, "loss": 0.8172, "step": 1827 }, { "epoch": 0.9814435757189356, "grad_norm": 0.6484375, "learning_rate": 4.5431867308647135e-05, "loss": 0.8477, "step": 1828 }, { "epoch": 0.9819804704540116, "grad_norm": 0.671875, "learning_rate": 4.5426999644706044e-05, "loss": 0.9885, "step": 1829 }, { "epoch": 0.9825173651890876, "grad_norm": 0.6328125, "learning_rate": 4.54221296497856e-05, "loss": 0.9193, "step": 1830 }, { "epoch": 0.9830542599241636, "grad_norm": 0.6796875, "learning_rate": 4.541725732444153e-05, "loss": 0.9058, "step": 1831 }, { "epoch": 0.9835911546592396, "grad_norm": 0.61328125, "learning_rate": 4.541238266922984e-05, "loss": 0.8006, "step": 1832 }, { "epoch": 0.9841280493943156, "grad_norm": 0.66796875, "learning_rate": 4.540750568470679e-05, "loss": 0.9181, "step": 1833 }, { "epoch": 0.9846649441293917, "grad_norm": 0.75, "learning_rate": 4.54026263714289e-05, "loss": 0.9358, "step": 1834 }, { "epoch": 0.9852018388644677, "grad_norm": 0.67578125, "learning_rate": 4.5397744729952964e-05, "loss": 0.8277, "step": 1835 }, { "epoch": 0.9857387335995437, "grad_norm": 0.765625, "learning_rate": 4.539286076083603e-05, "loss": 0.8649, "step": 1836 }, { "epoch": 0.9862756283346197, "grad_norm": 0.76171875, "learning_rate": 4.538797446463543e-05, "loss": 1.0171, "step": 1837 }, { "epoch": 0.9868125230696957, "grad_norm": 0.640625, "learning_rate": 4.538308584190876e-05, "loss": 0.9048, "step": 1838 }, { "epoch": 0.9873494178047717, "grad_norm": 0.796875, "learning_rate": 4.537819489321386e-05, "loss": 0.9797, "step": 1839 }, { "epoch": 0.9878863125398477, "grad_norm": 0.66015625, "learning_rate": 4.5373301619108854e-05, "loss": 0.7941, "step": 1840 }, { "epoch": 0.9884232072749236, "grad_norm": 0.76953125, "learning_rate": 4.536840602015212e-05, "loss": 0.8707, "step": 1841 }, { "epoch": 0.9889601020099996, "grad_norm": 0.7578125, "learning_rate": 4.536350809690232e-05, "loss": 1.0523, "step": 1842 }, { "epoch": 0.9894969967450756, "grad_norm": 0.62109375, "learning_rate": 4.5358607849918364e-05, "loss": 0.7851, "step": 1843 }, { "epoch": 0.9900338914801516, "grad_norm": 0.6875, "learning_rate": 4.535370527975943e-05, "loss": 0.9351, "step": 1844 }, { "epoch": 0.9905707862152276, "grad_norm": 0.66015625, "learning_rate": 4.5348800386984954e-05, "loss": 0.7522, "step": 1845 }, { "epoch": 0.9911076809503037, "grad_norm": 0.8828125, "learning_rate": 4.5343893172154675e-05, "loss": 1.1531, "step": 1846 }, { "epoch": 0.9916445756853797, "grad_norm": 0.6953125, "learning_rate": 4.533898363582854e-05, "loss": 0.9912, "step": 1847 }, { "epoch": 0.9921814704204557, "grad_norm": 0.80078125, "learning_rate": 4.5334071778566804e-05, "loss": 0.9189, "step": 1848 }, { "epoch": 0.9927183651555317, "grad_norm": 0.93359375, "learning_rate": 4.5329157600929966e-05, "loss": 1.0364, "step": 1849 }, { "epoch": 0.9932552598906077, "grad_norm": 0.63671875, "learning_rate": 4.5324241103478804e-05, "loss": 0.8382, "step": 1850 }, { "epoch": 0.9932552598906077, "eval_loss": 0.9214339256286621, "eval_runtime": 45.6257, "eval_samples_per_second": 21.085, "eval_steps_per_second": 21.085, "step": 1850 }, { "epoch": 0.9937921546256837, "grad_norm": 0.69140625, "learning_rate": 4.531932228677433e-05, "loss": 1.0326, "step": 1851 }, { "epoch": 0.9943290493607597, "grad_norm": 0.73828125, "learning_rate": 4.5314401151377876e-05, "loss": 0.9046, "step": 1852 }, { "epoch": 0.9948659440958357, "grad_norm": 0.6328125, "learning_rate": 4.530947769785099e-05, "loss": 0.9507, "step": 1853 }, { "epoch": 0.9954028388309117, "grad_norm": 0.7890625, "learning_rate": 4.530455192675549e-05, "loss": 1.0793, "step": 1854 }, { "epoch": 0.9959397335659878, "grad_norm": 0.8125, "learning_rate": 4.529962383865348e-05, "loss": 1.0272, "step": 1855 }, { "epoch": 0.9964766283010638, "grad_norm": 0.78125, "learning_rate": 4.529469343410732e-05, "loss": 1.0442, "step": 1856 }, { "epoch": 0.9970135230361398, "grad_norm": 0.734375, "learning_rate": 4.528976071367963e-05, "loss": 0.8845, "step": 1857 }, { "epoch": 0.9975504177712158, "grad_norm": 0.6875, "learning_rate": 4.528482567793329e-05, "loss": 0.9361, "step": 1858 }, { "epoch": 0.9980873125062918, "grad_norm": 0.71484375, "learning_rate": 4.5279888327431455e-05, "loss": 0.9168, "step": 1859 }, { "epoch": 0.9986242072413677, "grad_norm": 0.69921875, "learning_rate": 4.527494866273753e-05, "loss": 0.8991, "step": 1860 }, { "epoch": 0.9991611019764437, "grad_norm": 0.73828125, "learning_rate": 4.527000668441521e-05, "loss": 0.8749, "step": 1861 }, { "epoch": 0.9996979967115197, "grad_norm": 0.69921875, "learning_rate": 4.526506239302842e-05, "loss": 0.8481, "step": 1862 }, { "epoch": 1.000536894735076, "grad_norm": 0.80859375, "learning_rate": 4.526011578914139e-05, "loss": 1.45, "step": 1863 }, { "epoch": 1.001073789470152, "grad_norm": 0.68359375, "learning_rate": 4.525516687331857e-05, "loss": 0.8831, "step": 1864 }, { "epoch": 1.001610684205228, "grad_norm": 0.65234375, "learning_rate": 4.5250215646124684e-05, "loss": 0.9929, "step": 1865 }, { "epoch": 1.002147578940304, "grad_norm": 0.78515625, "learning_rate": 4.524526210812475e-05, "loss": 0.9428, "step": 1866 }, { "epoch": 1.00268447367538, "grad_norm": 0.83203125, "learning_rate": 4.524030625988404e-05, "loss": 0.9506, "step": 1867 }, { "epoch": 1.003221368410456, "grad_norm": 0.69921875, "learning_rate": 4.5235348101968044e-05, "loss": 1.1667, "step": 1868 }, { "epoch": 1.003758263145532, "grad_norm": 0.68359375, "learning_rate": 4.523038763494258e-05, "loss": 0.9324, "step": 1869 }, { "epoch": 1.004295157880608, "grad_norm": 0.734375, "learning_rate": 4.522542485937369e-05, "loss": 0.8907, "step": 1870 }, { "epoch": 1.004832052615684, "grad_norm": 0.6953125, "learning_rate": 4.5220459775827685e-05, "loss": 0.9064, "step": 1871 }, { "epoch": 1.00536894735076, "grad_norm": 0.69921875, "learning_rate": 4.521549238487115e-05, "loss": 0.8501, "step": 1872 }, { "epoch": 1.005905842085836, "grad_norm": 0.69140625, "learning_rate": 4.521052268707092e-05, "loss": 0.8544, "step": 1873 }, { "epoch": 1.0064427368209121, "grad_norm": 0.6484375, "learning_rate": 4.520555068299411e-05, "loss": 0.8082, "step": 1874 }, { "epoch": 1.006979631555988, "grad_norm": 0.56640625, "learning_rate": 4.5200576373208093e-05, "loss": 0.6351, "step": 1875 }, { "epoch": 1.0075165262910641, "grad_norm": 0.73828125, "learning_rate": 4.5195599758280485e-05, "loss": 1.1155, "step": 1876 }, { "epoch": 1.00805342102614, "grad_norm": 0.703125, "learning_rate": 4.519062083877918e-05, "loss": 0.9415, "step": 1877 }, { "epoch": 1.0085903157612162, "grad_norm": 0.64453125, "learning_rate": 4.5185639615272356e-05, "loss": 0.7706, "step": 1878 }, { "epoch": 1.009127210496292, "grad_norm": 0.75390625, "learning_rate": 4.518065608832842e-05, "loss": 0.8954, "step": 1879 }, { "epoch": 1.0096641052313682, "grad_norm": 0.80859375, "learning_rate": 4.517567025851605e-05, "loss": 0.985, "step": 1880 }, { "epoch": 1.010200999966444, "grad_norm": 0.58203125, "learning_rate": 4.5170682126404206e-05, "loss": 0.8717, "step": 1881 }, { "epoch": 1.01073789470152, "grad_norm": 0.73046875, "learning_rate": 4.5165691692562085e-05, "loss": 0.8408, "step": 1882 }, { "epoch": 1.011274789436596, "grad_norm": 0.78125, "learning_rate": 4.5160698957559164e-05, "loss": 0.9268, "step": 1883 }, { "epoch": 1.011811684171672, "grad_norm": 0.63671875, "learning_rate": 4.515570392196518e-05, "loss": 0.9071, "step": 1884 }, { "epoch": 1.0123485789067481, "grad_norm": 0.87109375, "learning_rate": 4.515070658635013e-05, "loss": 1.1588, "step": 1885 }, { "epoch": 1.012885473641824, "grad_norm": 0.6953125, "learning_rate": 4.5145706951284265e-05, "loss": 0.9855, "step": 1886 }, { "epoch": 1.0134223683769001, "grad_norm": 0.6484375, "learning_rate": 4.5140705017338114e-05, "loss": 0.8237, "step": 1887 }, { "epoch": 1.013959263111976, "grad_norm": 0.8203125, "learning_rate": 4.513570078508246e-05, "loss": 0.9666, "step": 1888 }, { "epoch": 1.0144961578470522, "grad_norm": 0.76953125, "learning_rate": 4.5130694255088344e-05, "loss": 0.911, "step": 1889 }, { "epoch": 1.015033052582128, "grad_norm": 0.58984375, "learning_rate": 4.512568542792709e-05, "loss": 0.7198, "step": 1890 }, { "epoch": 1.0155699473172042, "grad_norm": 0.89453125, "learning_rate": 4.5120674304170255e-05, "loss": 0.9902, "step": 1891 }, { "epoch": 1.01610684205228, "grad_norm": 0.73828125, "learning_rate": 4.5115660884389665e-05, "loss": 1.0889, "step": 1892 }, { "epoch": 1.0166437367873562, "grad_norm": 0.71484375, "learning_rate": 4.511064516915743e-05, "loss": 1.0408, "step": 1893 }, { "epoch": 1.017180631522432, "grad_norm": 0.71875, "learning_rate": 4.510562715904589e-05, "loss": 0.8262, "step": 1894 }, { "epoch": 1.0177175262575082, "grad_norm": 0.63671875, "learning_rate": 4.510060685462769e-05, "loss": 0.8045, "step": 1895 }, { "epoch": 1.0182544209925841, "grad_norm": 0.734375, "learning_rate": 4.5095584256475686e-05, "loss": 0.9082, "step": 1896 }, { "epoch": 1.0187913157276602, "grad_norm": 0.890625, "learning_rate": 4.509055936516303e-05, "loss": 0.9124, "step": 1897 }, { "epoch": 1.0193282104627361, "grad_norm": 0.78515625, "learning_rate": 4.508553218126312e-05, "loss": 0.9423, "step": 1898 }, { "epoch": 1.0198651051978123, "grad_norm": 0.76953125, "learning_rate": 4.508050270534963e-05, "loss": 1.0588, "step": 1899 }, { "epoch": 1.0204019999328882, "grad_norm": 0.7421875, "learning_rate": 4.507547093799648e-05, "loss": 0.7418, "step": 1900 }, { "epoch": 1.0204019999328882, "eval_loss": 0.9220516085624695, "eval_runtime": 46.2004, "eval_samples_per_second": 20.822, "eval_steps_per_second": 20.822, "step": 1900 }, { "epoch": 1.020938894667964, "grad_norm": 0.73046875, "learning_rate": 4.5070436879777865e-05, "loss": 0.6648, "step": 1901 }, { "epoch": 1.0214757894030402, "grad_norm": 0.71484375, "learning_rate": 4.506540053126823e-05, "loss": 0.8462, "step": 1902 }, { "epoch": 1.022012684138116, "grad_norm": 0.6640625, "learning_rate": 4.506036189304228e-05, "loss": 0.7025, "step": 1903 }, { "epoch": 1.0225495788731922, "grad_norm": 0.75390625, "learning_rate": 4.5055320965675e-05, "loss": 0.8031, "step": 1904 }, { "epoch": 1.023086473608268, "grad_norm": 0.640625, "learning_rate": 4.505027774974162e-05, "loss": 0.7238, "step": 1905 }, { "epoch": 1.0236233683433442, "grad_norm": 0.76171875, "learning_rate": 4.504523224581762e-05, "loss": 0.8339, "step": 1906 }, { "epoch": 1.0241602630784201, "grad_norm": 0.6953125, "learning_rate": 4.5040184454478786e-05, "loss": 0.812, "step": 1907 }, { "epoch": 1.0246971578134962, "grad_norm": 1.0234375, "learning_rate": 4.50351343763011e-05, "loss": 0.9015, "step": 1908 }, { "epoch": 1.0252340525485721, "grad_norm": 0.7578125, "learning_rate": 4.503008201186087e-05, "loss": 0.8438, "step": 1909 }, { "epoch": 1.0257709472836483, "grad_norm": 0.734375, "learning_rate": 4.502502736173462e-05, "loss": 1.1629, "step": 1910 }, { "epoch": 1.0263078420187242, "grad_norm": 0.62109375, "learning_rate": 4.5019970426499145e-05, "loss": 0.8084, "step": 1911 }, { "epoch": 1.0268447367538003, "grad_norm": 0.671875, "learning_rate": 4.5014911206731515e-05, "loss": 0.8435, "step": 1912 }, { "epoch": 1.0273816314888762, "grad_norm": 0.67578125, "learning_rate": 4.500984970300905e-05, "loss": 0.9267, "step": 1913 }, { "epoch": 1.0279185262239523, "grad_norm": 0.859375, "learning_rate": 4.500478591590933e-05, "loss": 0.8635, "step": 1914 }, { "epoch": 1.0284554209590282, "grad_norm": 0.7578125, "learning_rate": 4.49997198460102e-05, "loss": 0.9659, "step": 1915 }, { "epoch": 1.0289923156941043, "grad_norm": 0.7109375, "learning_rate": 4.499465149388975e-05, "loss": 0.9466, "step": 1916 }, { "epoch": 1.0295292104291802, "grad_norm": 0.62109375, "learning_rate": 4.4989580860126354e-05, "loss": 0.9998, "step": 1917 }, { "epoch": 1.0300661051642563, "grad_norm": 0.6640625, "learning_rate": 4.4984507945298644e-05, "loss": 0.8697, "step": 1918 }, { "epoch": 1.0306029998993322, "grad_norm": 0.671875, "learning_rate": 4.497943274998549e-05, "loss": 0.8562, "step": 1919 }, { "epoch": 1.0311398946344081, "grad_norm": 0.7421875, "learning_rate": 4.4974355274766034e-05, "loss": 0.8562, "step": 1920 }, { "epoch": 1.0316767893694843, "grad_norm": 0.6875, "learning_rate": 4.496927552021969e-05, "loss": 0.8777, "step": 1921 }, { "epoch": 1.0322136841045602, "grad_norm": 0.765625, "learning_rate": 4.496419348692612e-05, "loss": 0.8184, "step": 1922 }, { "epoch": 1.0327505788396363, "grad_norm": 0.76953125, "learning_rate": 4.495910917546525e-05, "loss": 0.9376, "step": 1923 }, { "epoch": 1.0332874735747122, "grad_norm": 0.88671875, "learning_rate": 4.495402258641726e-05, "loss": 0.8625, "step": 1924 }, { "epoch": 1.0338243683097883, "grad_norm": 0.703125, "learning_rate": 4.494893372036259e-05, "loss": 0.8018, "step": 1925 }, { "epoch": 1.0343612630448642, "grad_norm": 0.76171875, "learning_rate": 4.4943842577881957e-05, "loss": 0.8103, "step": 1926 }, { "epoch": 1.0348981577799403, "grad_norm": 1.0, "learning_rate": 4.493874915955631e-05, "loss": 0.7037, "step": 1927 }, { "epoch": 1.0354350525150162, "grad_norm": 0.66796875, "learning_rate": 4.493365346596688e-05, "loss": 0.7914, "step": 1928 }, { "epoch": 1.0359719472500923, "grad_norm": 0.71875, "learning_rate": 4.492855549769516e-05, "loss": 0.8135, "step": 1929 }, { "epoch": 1.0365088419851682, "grad_norm": 0.6796875, "learning_rate": 4.492345525532288e-05, "loss": 0.7385, "step": 1930 }, { "epoch": 1.0370457367202444, "grad_norm": 0.65234375, "learning_rate": 4.4918352739432043e-05, "loss": 0.8067, "step": 1931 }, { "epoch": 1.0375826314553203, "grad_norm": 0.62890625, "learning_rate": 4.491324795060491e-05, "loss": 0.8124, "step": 1932 }, { "epoch": 1.0381195261903964, "grad_norm": 0.63671875, "learning_rate": 4.490814088942401e-05, "loss": 0.7838, "step": 1933 }, { "epoch": 1.0386564209254723, "grad_norm": 0.703125, "learning_rate": 4.4903031556472117e-05, "loss": 0.7945, "step": 1934 }, { "epoch": 1.0391933156605484, "grad_norm": 0.80078125, "learning_rate": 4.489791995233227e-05, "loss": 1.0002, "step": 1935 }, { "epoch": 1.0397302103956243, "grad_norm": 0.67578125, "learning_rate": 4.4892806077587765e-05, "loss": 0.8675, "step": 1936 }, { "epoch": 1.0402671051307002, "grad_norm": 0.703125, "learning_rate": 4.4887689932822176e-05, "loss": 0.9476, "step": 1937 }, { "epoch": 1.0408039998657763, "grad_norm": 0.6171875, "learning_rate": 4.48825715186193e-05, "loss": 0.81, "step": 1938 }, { "epoch": 1.0413408946008522, "grad_norm": 0.6953125, "learning_rate": 4.4877450835563215e-05, "loss": 0.8288, "step": 1939 }, { "epoch": 1.0418777893359283, "grad_norm": 0.84765625, "learning_rate": 4.4872327884238274e-05, "loss": 1.0518, "step": 1940 }, { "epoch": 1.0424146840710042, "grad_norm": 0.69140625, "learning_rate": 4.486720266522906e-05, "loss": 0.7991, "step": 1941 }, { "epoch": 1.0429515788060804, "grad_norm": 0.73828125, "learning_rate": 4.486207517912042e-05, "loss": 0.9036, "step": 1942 }, { "epoch": 1.0434884735411563, "grad_norm": 0.76953125, "learning_rate": 4.4856945426497464e-05, "loss": 0.9259, "step": 1943 }, { "epoch": 1.0440253682762324, "grad_norm": 0.68359375, "learning_rate": 4.4851813407945574e-05, "loss": 0.8469, "step": 1944 }, { "epoch": 1.0445622630113083, "grad_norm": 0.67578125, "learning_rate": 4.4846679124050375e-05, "loss": 0.9435, "step": 1945 }, { "epoch": 1.0450991577463844, "grad_norm": 0.69921875, "learning_rate": 4.484154257539774e-05, "loss": 0.8936, "step": 1946 }, { "epoch": 1.0456360524814603, "grad_norm": 0.578125, "learning_rate": 4.4836403762573826e-05, "loss": 0.779, "step": 1947 }, { "epoch": 1.0461729472165364, "grad_norm": 0.7109375, "learning_rate": 4.483126268616504e-05, "loss": 1.0763, "step": 1948 }, { "epoch": 1.0467098419516123, "grad_norm": 0.6640625, "learning_rate": 4.4826119346758036e-05, "loss": 1.0075, "step": 1949 }, { "epoch": 1.0472467366866884, "grad_norm": 0.73046875, "learning_rate": 4.482097374493974e-05, "loss": 0.8152, "step": 1950 }, { "epoch": 1.0472467366866884, "eval_loss": 0.9233726263046265, "eval_runtime": 45.8573, "eval_samples_per_second": 20.978, "eval_steps_per_second": 20.978, "step": 1950 }, { "epoch": 1.0477836314217643, "grad_norm": 0.87890625, "learning_rate": 4.481582588129732e-05, "loss": 0.9823, "step": 1951 }, { "epoch": 1.0483205261568405, "grad_norm": 0.6953125, "learning_rate": 4.481067575641822e-05, "loss": 0.9266, "step": 1952 }, { "epoch": 1.0488574208919164, "grad_norm": 0.71484375, "learning_rate": 4.480552337089015e-05, "loss": 0.7821, "step": 1953 }, { "epoch": 1.0493943156269925, "grad_norm": 0.64453125, "learning_rate": 4.480036872530103e-05, "loss": 0.9949, "step": 1954 }, { "epoch": 1.0499312103620684, "grad_norm": 0.76171875, "learning_rate": 4.479521182023909e-05, "loss": 1.0427, "step": 1955 }, { "epoch": 1.0504681050971443, "grad_norm": 0.86328125, "learning_rate": 4.47900526562928e-05, "loss": 0.8266, "step": 1956 }, { "epoch": 1.0510049998322204, "grad_norm": 0.703125, "learning_rate": 4.478489123405088e-05, "loss": 0.9953, "step": 1957 }, { "epoch": 1.0515418945672963, "grad_norm": 0.93359375, "learning_rate": 4.477972755410231e-05, "loss": 0.8112, "step": 1958 }, { "epoch": 1.0520787893023724, "grad_norm": 0.64453125, "learning_rate": 4.477456161703634e-05, "loss": 0.7595, "step": 1959 }, { "epoch": 1.0526156840374483, "grad_norm": 0.65625, "learning_rate": 4.476939342344246e-05, "loss": 0.9713, "step": 1960 }, { "epoch": 1.0531525787725244, "grad_norm": 0.6875, "learning_rate": 4.476422297391044e-05, "loss": 0.864, "step": 1961 }, { "epoch": 1.0536894735076003, "grad_norm": 0.578125, "learning_rate": 4.475905026903028e-05, "loss": 0.7046, "step": 1962 }, { "epoch": 1.0542263682426765, "grad_norm": 0.68359375, "learning_rate": 4.4753875309392266e-05, "loss": 1.0367, "step": 1963 }, { "epoch": 1.0547632629777524, "grad_norm": 0.66015625, "learning_rate": 4.4748698095586904e-05, "loss": 0.8485, "step": 1964 }, { "epoch": 1.0553001577128285, "grad_norm": 0.625, "learning_rate": 4.474351862820499e-05, "loss": 0.7381, "step": 1965 }, { "epoch": 1.0558370524479044, "grad_norm": 0.80859375, "learning_rate": 4.4738336907837584e-05, "loss": 1.0235, "step": 1966 }, { "epoch": 1.0563739471829805, "grad_norm": 0.6875, "learning_rate": 4.473315293507596e-05, "loss": 0.886, "step": 1967 }, { "epoch": 1.0569108419180564, "grad_norm": 0.7109375, "learning_rate": 4.472796671051169e-05, "loss": 0.9203, "step": 1968 }, { "epoch": 1.0574477366531325, "grad_norm": 0.671875, "learning_rate": 4.472277823473658e-05, "loss": 0.8866, "step": 1969 }, { "epoch": 1.0579846313882084, "grad_norm": 0.68359375, "learning_rate": 4.471758750834271e-05, "loss": 0.8197, "step": 1970 }, { "epoch": 1.0585215261232845, "grad_norm": 0.71484375, "learning_rate": 4.47123945319224e-05, "loss": 0.805, "step": 1971 }, { "epoch": 1.0590584208583604, "grad_norm": 0.5859375, "learning_rate": 4.470719930606823e-05, "loss": 0.6962, "step": 1972 }, { "epoch": 1.0595953155934366, "grad_norm": 0.69140625, "learning_rate": 4.470200183137305e-05, "loss": 0.9446, "step": 1973 }, { "epoch": 1.0601322103285125, "grad_norm": 0.796875, "learning_rate": 4.469680210842996e-05, "loss": 0.7789, "step": 1974 }, { "epoch": 1.0606691050635884, "grad_norm": 0.65625, "learning_rate": 4.4691600137832316e-05, "loss": 0.8901, "step": 1975 }, { "epoch": 1.0612059997986645, "grad_norm": 0.62109375, "learning_rate": 4.4686395920173706e-05, "loss": 0.8904, "step": 1976 }, { "epoch": 1.0617428945337404, "grad_norm": 0.703125, "learning_rate": 4.4681189456048026e-05, "loss": 0.9188, "step": 1977 }, { "epoch": 1.0622797892688165, "grad_norm": 0.69921875, "learning_rate": 4.4675980746049384e-05, "loss": 1.0461, "step": 1978 }, { "epoch": 1.0628166840038924, "grad_norm": 0.76953125, "learning_rate": 4.467076979077216e-05, "loss": 1.0348, "step": 1979 }, { "epoch": 1.0633535787389685, "grad_norm": 0.61328125, "learning_rate": 4.4665556590811e-05, "loss": 0.9061, "step": 1980 }, { "epoch": 1.0638904734740444, "grad_norm": 0.71875, "learning_rate": 4.466034114676077e-05, "loss": 0.7636, "step": 1981 }, { "epoch": 1.0644273682091205, "grad_norm": 0.6640625, "learning_rate": 4.4655123459216664e-05, "loss": 0.9381, "step": 1982 }, { "epoch": 1.0649642629441964, "grad_norm": 0.60546875, "learning_rate": 4.464990352877404e-05, "loss": 0.7239, "step": 1983 }, { "epoch": 1.0655011576792726, "grad_norm": 0.7421875, "learning_rate": 4.4644681356028586e-05, "loss": 0.9538, "step": 1984 }, { "epoch": 1.0660380524143485, "grad_norm": 0.71484375, "learning_rate": 4.463945694157621e-05, "loss": 0.9682, "step": 1985 }, { "epoch": 1.0665749471494246, "grad_norm": 0.765625, "learning_rate": 4.463423028601308e-05, "loss": 0.8489, "step": 1986 }, { "epoch": 1.0671118418845005, "grad_norm": 0.69921875, "learning_rate": 4.462900138993562e-05, "loss": 0.9274, "step": 1987 }, { "epoch": 1.0676487366195766, "grad_norm": 0.765625, "learning_rate": 4.462377025394053e-05, "loss": 0.848, "step": 1988 }, { "epoch": 1.0681856313546525, "grad_norm": 0.77734375, "learning_rate": 4.461853687862474e-05, "loss": 1.1715, "step": 1989 }, { "epoch": 1.0687225260897286, "grad_norm": 0.69140625, "learning_rate": 4.461330126458544e-05, "loss": 0.898, "step": 1990 }, { "epoch": 1.0692594208248045, "grad_norm": 1.0859375, "learning_rate": 4.4608063412420074e-05, "loss": 0.8517, "step": 1991 }, { "epoch": 1.0697963155598806, "grad_norm": 0.671875, "learning_rate": 4.460282332272637e-05, "loss": 0.7949, "step": 1992 }, { "epoch": 1.0703332102949565, "grad_norm": 0.87890625, "learning_rate": 4.4597580996102276e-05, "loss": 1.0467, "step": 1993 }, { "epoch": 1.0708701050300324, "grad_norm": 0.7578125, "learning_rate": 4.4592336433146e-05, "loss": 1.0597, "step": 1994 }, { "epoch": 1.0714069997651086, "grad_norm": 0.66015625, "learning_rate": 4.458708963445602e-05, "loss": 0.8162, "step": 1995 }, { "epoch": 1.0719438945001845, "grad_norm": 0.7421875, "learning_rate": 4.4581840600631066e-05, "loss": 0.7769, "step": 1996 }, { "epoch": 1.0724807892352606, "grad_norm": 0.5703125, "learning_rate": 4.457658933227011e-05, "loss": 0.8472, "step": 1997 }, { "epoch": 1.0730176839703365, "grad_norm": 0.859375, "learning_rate": 4.4571335829972396e-05, "loss": 0.8712, "step": 1998 }, { "epoch": 1.0735545787054126, "grad_norm": 0.69921875, "learning_rate": 4.456608009433741e-05, "loss": 0.9897, "step": 1999 }, { "epoch": 1.0740914734404885, "grad_norm": 0.71484375, "learning_rate": 4.4560822125964915e-05, "loss": 0.888, "step": 2000 }, { "epoch": 1.0740914734404885, "eval_loss": 0.9224899411201477, "eval_runtime": 46.5225, "eval_samples_per_second": 20.678, "eval_steps_per_second": 20.678, "step": 2000 }, { "epoch": 1.0746283681755646, "grad_norm": 0.66015625, "learning_rate": 4.455556192545489e-05, "loss": 0.7956, "step": 2001 }, { "epoch": 1.0751652629106405, "grad_norm": 0.7890625, "learning_rate": 4.455029949340759e-05, "loss": 1.0419, "step": 2002 }, { "epoch": 1.0757021576457166, "grad_norm": 0.7734375, "learning_rate": 4.454503483042354e-05, "loss": 1.0803, "step": 2003 }, { "epoch": 1.0762390523807925, "grad_norm": 0.6953125, "learning_rate": 4.4539767937103496e-05, "loss": 0.7567, "step": 2004 }, { "epoch": 1.0767759471158687, "grad_norm": 0.625, "learning_rate": 4.453449881404849e-05, "loss": 0.829, "step": 2005 }, { "epoch": 1.0773128418509446, "grad_norm": 0.59765625, "learning_rate": 4.4529227461859766e-05, "loss": 0.8505, "step": 2006 }, { "epoch": 1.0778497365860207, "grad_norm": 0.85546875, "learning_rate": 4.4523953881138885e-05, "loss": 0.8171, "step": 2007 }, { "epoch": 1.0783866313210966, "grad_norm": 0.640625, "learning_rate": 4.451867807248761e-05, "loss": 0.8978, "step": 2008 }, { "epoch": 1.0789235260561727, "grad_norm": 0.609375, "learning_rate": 4.451340003650799e-05, "loss": 0.8383, "step": 2009 }, { "epoch": 1.0794604207912486, "grad_norm": 0.7109375, "learning_rate": 4.45081197738023e-05, "loss": 0.8685, "step": 2010 }, { "epoch": 1.0799973155263247, "grad_norm": 0.69921875, "learning_rate": 4.45028372849731e-05, "loss": 0.8838, "step": 2011 }, { "epoch": 1.0805342102614006, "grad_norm": 0.76953125, "learning_rate": 4.449755257062317e-05, "loss": 0.7649, "step": 2012 }, { "epoch": 1.0810711049964765, "grad_norm": 0.671875, "learning_rate": 4.449226563135559e-05, "loss": 0.9033, "step": 2013 }, { "epoch": 1.0816079997315526, "grad_norm": 0.765625, "learning_rate": 4.448697646777364e-05, "loss": 1.1965, "step": 2014 }, { "epoch": 1.0821448944666285, "grad_norm": 0.71875, "learning_rate": 4.44816850804809e-05, "loss": 0.7399, "step": 2015 }, { "epoch": 1.0826817892017047, "grad_norm": 0.76171875, "learning_rate": 4.447639147008117e-05, "loss": 0.8932, "step": 2016 }, { "epoch": 1.0832186839367806, "grad_norm": 0.67578125, "learning_rate": 4.447109563717853e-05, "loss": 0.8015, "step": 2017 }, { "epoch": 1.0837555786718567, "grad_norm": 0.76953125, "learning_rate": 4.44657975823773e-05, "loss": 0.801, "step": 2018 }, { "epoch": 1.0842924734069326, "grad_norm": 0.85546875, "learning_rate": 4.446049730628204e-05, "loss": 0.9231, "step": 2019 }, { "epoch": 1.0848293681420087, "grad_norm": 0.69140625, "learning_rate": 4.44551948094976e-05, "loss": 0.816, "step": 2020 }, { "epoch": 1.0853662628770846, "grad_norm": 0.70703125, "learning_rate": 4.4449890092629045e-05, "loss": 0.8183, "step": 2021 }, { "epoch": 1.0859031576121607, "grad_norm": 0.73828125, "learning_rate": 4.4444583156281724e-05, "loss": 0.9396, "step": 2022 }, { "epoch": 1.0864400523472366, "grad_norm": 0.76171875, "learning_rate": 4.443927400106122e-05, "loss": 0.9589, "step": 2023 }, { "epoch": 1.0869769470823127, "grad_norm": 0.65234375, "learning_rate": 4.443396262757337e-05, "loss": 1.0206, "step": 2024 }, { "epoch": 1.0875138418173886, "grad_norm": 0.73828125, "learning_rate": 4.442864903642428e-05, "loss": 0.9336, "step": 2025 }, { "epoch": 1.0880507365524648, "grad_norm": 0.67578125, "learning_rate": 4.442333322822028e-05, "loss": 0.956, "step": 2026 }, { "epoch": 1.0885876312875407, "grad_norm": 0.70703125, "learning_rate": 4.4418015203567986e-05, "loss": 0.9395, "step": 2027 }, { "epoch": 1.0891245260226168, "grad_norm": 0.7265625, "learning_rate": 4.441269496307425e-05, "loss": 0.9294, "step": 2028 }, { "epoch": 1.0896614207576927, "grad_norm": 0.7734375, "learning_rate": 4.440737250734618e-05, "loss": 0.957, "step": 2029 }, { "epoch": 1.0901983154927688, "grad_norm": 0.71875, "learning_rate": 4.440204783699112e-05, "loss": 0.9635, "step": 2030 }, { "epoch": 1.0907352102278447, "grad_norm": 1.3046875, "learning_rate": 4.4396720952616705e-05, "loss": 0.9689, "step": 2031 }, { "epoch": 1.0912721049629206, "grad_norm": 0.7890625, "learning_rate": 4.4391391854830785e-05, "loss": 0.834, "step": 2032 }, { "epoch": 1.0918089996979967, "grad_norm": 0.73828125, "learning_rate": 4.4386060544241493e-05, "loss": 1.1758, "step": 2033 }, { "epoch": 1.0923458944330726, "grad_norm": 0.7421875, "learning_rate": 4.438072702145718e-05, "loss": 0.9533, "step": 2034 }, { "epoch": 1.0928827891681487, "grad_norm": 0.7890625, "learning_rate": 4.437539128708647e-05, "loss": 0.7748, "step": 2035 }, { "epoch": 1.0934196839032246, "grad_norm": 0.72265625, "learning_rate": 4.437005334173826e-05, "loss": 0.9824, "step": 2036 }, { "epoch": 1.0939565786383008, "grad_norm": 0.73046875, "learning_rate": 4.436471318602164e-05, "loss": 0.9566, "step": 2037 }, { "epoch": 1.0944934733733767, "grad_norm": 0.75390625, "learning_rate": 4.435937082054603e-05, "loss": 0.9278, "step": 2038 }, { "epoch": 1.0950303681084528, "grad_norm": 0.74609375, "learning_rate": 4.4354026245921034e-05, "loss": 0.8977, "step": 2039 }, { "epoch": 1.0955672628435287, "grad_norm": 0.97265625, "learning_rate": 4.4348679462756556e-05, "loss": 0.9137, "step": 2040 }, { "epoch": 1.0961041575786048, "grad_norm": 0.8515625, "learning_rate": 4.434333047166271e-05, "loss": 0.9121, "step": 2041 }, { "epoch": 1.0966410523136807, "grad_norm": 0.65234375, "learning_rate": 4.43379792732499e-05, "loss": 0.931, "step": 2042 }, { "epoch": 1.0971779470487568, "grad_norm": 0.68359375, "learning_rate": 4.4332625868128755e-05, "loss": 0.8208, "step": 2043 }, { "epoch": 1.0977148417838327, "grad_norm": 0.671875, "learning_rate": 4.4327270256910164e-05, "loss": 0.9443, "step": 2044 }, { "epoch": 1.0982517365189088, "grad_norm": 0.7421875, "learning_rate": 4.4321912440205285e-05, "loss": 0.9097, "step": 2045 }, { "epoch": 1.0987886312539847, "grad_norm": 0.72265625, "learning_rate": 4.43165524186255e-05, "loss": 0.9636, "step": 2046 }, { "epoch": 1.0993255259890609, "grad_norm": 0.7734375, "learning_rate": 4.431119019278246e-05, "loss": 1.0085, "step": 2047 }, { "epoch": 1.0998624207241368, "grad_norm": 0.640625, "learning_rate": 4.430582576328807e-05, "loss": 0.7751, "step": 2048 }, { "epoch": 1.1003993154592129, "grad_norm": 0.6875, "learning_rate": 4.4300459130754465e-05, "loss": 0.7163, "step": 2049 }, { "epoch": 1.1009362101942888, "grad_norm": 0.6328125, "learning_rate": 4.429509029579405e-05, "loss": 0.762, "step": 2050 }, { "epoch": 1.1009362101942888, "eval_loss": 0.9212960004806519, "eval_runtime": 46.7543, "eval_samples_per_second": 20.576, "eval_steps_per_second": 20.576, "step": 2050 }, { "epoch": 1.1014731049293647, "grad_norm": 0.67578125, "learning_rate": 4.428971925901949e-05, "loss": 0.9842, "step": 2051 }, { "epoch": 1.1020099996644408, "grad_norm": 0.6953125, "learning_rate": 4.4284346021043664e-05, "loss": 0.8665, "step": 2052 }, { "epoch": 1.1025468943995167, "grad_norm": 0.60546875, "learning_rate": 4.427897058247975e-05, "loss": 0.6928, "step": 2053 }, { "epoch": 1.1030837891345928, "grad_norm": 0.9609375, "learning_rate": 4.427359294394115e-05, "loss": 0.8716, "step": 2054 }, { "epoch": 1.1036206838696687, "grad_norm": 0.703125, "learning_rate": 4.42682131060415e-05, "loss": 1.015, "step": 2055 }, { "epoch": 1.1041575786047448, "grad_norm": 0.65234375, "learning_rate": 4.426283106939474e-05, "loss": 0.8309, "step": 2056 }, { "epoch": 1.1046944733398207, "grad_norm": 0.69921875, "learning_rate": 4.4257446834615e-05, "loss": 0.8908, "step": 2057 }, { "epoch": 1.1052313680748969, "grad_norm": 0.80078125, "learning_rate": 4.4252060402316696e-05, "loss": 1.2144, "step": 2058 }, { "epoch": 1.1057682628099728, "grad_norm": 0.6640625, "learning_rate": 4.4246671773114495e-05, "loss": 0.7307, "step": 2059 }, { "epoch": 1.1063051575450489, "grad_norm": 0.79296875, "learning_rate": 4.424128094762331e-05, "loss": 1.0203, "step": 2060 }, { "epoch": 1.1068420522801248, "grad_norm": 0.76953125, "learning_rate": 4.42358879264583e-05, "loss": 0.9436, "step": 2061 }, { "epoch": 1.107378947015201, "grad_norm": 0.73046875, "learning_rate": 4.4230492710234864e-05, "loss": 0.8664, "step": 2062 }, { "epoch": 1.1079158417502768, "grad_norm": 0.734375, "learning_rate": 4.422509529956868e-05, "loss": 0.8325, "step": 2063 }, { "epoch": 1.108452736485353, "grad_norm": 0.765625, "learning_rate": 4.4219695695075656e-05, "loss": 0.9997, "step": 2064 }, { "epoch": 1.1089896312204288, "grad_norm": 0.65625, "learning_rate": 4.421429389737196e-05, "loss": 0.8273, "step": 2065 }, { "epoch": 1.109526525955505, "grad_norm": 0.6796875, "learning_rate": 4.4208889907074e-05, "loss": 0.7933, "step": 2066 }, { "epoch": 1.1100634206905808, "grad_norm": 0.671875, "learning_rate": 4.420348372479844e-05, "loss": 0.8302, "step": 2067 }, { "epoch": 1.110600315425657, "grad_norm": 0.60546875, "learning_rate": 4.419807535116219e-05, "loss": 0.8421, "step": 2068 }, { "epoch": 1.1111372101607329, "grad_norm": 0.640625, "learning_rate": 4.4192664786782426e-05, "loss": 0.8186, "step": 2069 }, { "epoch": 1.1116741048958088, "grad_norm": 0.75, "learning_rate": 4.418725203227655e-05, "loss": 1.0375, "step": 2070 }, { "epoch": 1.1122109996308849, "grad_norm": 0.703125, "learning_rate": 4.418183708826223e-05, "loss": 0.7499, "step": 2071 }, { "epoch": 1.1127478943659608, "grad_norm": 0.74609375, "learning_rate": 4.417641995535739e-05, "loss": 1.141, "step": 2072 }, { "epoch": 1.113284789101037, "grad_norm": 0.7109375, "learning_rate": 4.417100063418016e-05, "loss": 0.779, "step": 2073 }, { "epoch": 1.1138216838361128, "grad_norm": 0.64453125, "learning_rate": 4.4165579125348996e-05, "loss": 0.862, "step": 2074 }, { "epoch": 1.114358578571189, "grad_norm": 0.9375, "learning_rate": 4.416015542948253e-05, "loss": 0.9814, "step": 2075 }, { "epoch": 1.1148954733062648, "grad_norm": 0.73046875, "learning_rate": 4.4154729547199695e-05, "loss": 0.996, "step": 2076 }, { "epoch": 1.115432368041341, "grad_norm": 0.7421875, "learning_rate": 4.4149301479119643e-05, "loss": 1.0847, "step": 2077 }, { "epoch": 1.1159692627764168, "grad_norm": 0.73046875, "learning_rate": 4.414387122586178e-05, "loss": 0.891, "step": 2078 }, { "epoch": 1.116506157511493, "grad_norm": 0.65234375, "learning_rate": 4.413843878804578e-05, "loss": 0.8898, "step": 2079 }, { "epoch": 1.1170430522465689, "grad_norm": 0.74609375, "learning_rate": 4.413300416629154e-05, "loss": 1.0265, "step": 2080 }, { "epoch": 1.117579946981645, "grad_norm": 0.765625, "learning_rate": 4.4127567361219224e-05, "loss": 0.9772, "step": 2081 }, { "epoch": 1.1181168417167209, "grad_norm": 0.7890625, "learning_rate": 4.412212837344924e-05, "loss": 0.9532, "step": 2082 }, { "epoch": 1.118653736451797, "grad_norm": 0.74609375, "learning_rate": 4.411668720360225e-05, "loss": 0.9018, "step": 2083 }, { "epoch": 1.119190631186873, "grad_norm": 0.78125, "learning_rate": 4.411124385229916e-05, "loss": 0.8392, "step": 2084 }, { "epoch": 1.119727525921949, "grad_norm": 0.83984375, "learning_rate": 4.4105798320161115e-05, "loss": 0.9466, "step": 2085 }, { "epoch": 1.120264420657025, "grad_norm": 0.64453125, "learning_rate": 4.410035060780953e-05, "loss": 0.8188, "step": 2086 }, { "epoch": 1.120801315392101, "grad_norm": 0.74609375, "learning_rate": 4.4094900715866064e-05, "loss": 0.8948, "step": 2087 }, { "epoch": 1.121338210127177, "grad_norm": 0.65625, "learning_rate": 4.4089448644952593e-05, "loss": 0.9638, "step": 2088 }, { "epoch": 1.1218751048622528, "grad_norm": 0.6484375, "learning_rate": 4.408399439569129e-05, "loss": 0.9848, "step": 2089 }, { "epoch": 1.122411999597329, "grad_norm": 0.6640625, "learning_rate": 4.407853796870455e-05, "loss": 0.9337, "step": 2090 }, { "epoch": 1.1229488943324049, "grad_norm": 0.6953125, "learning_rate": 4.407307936461501e-05, "loss": 0.8408, "step": 2091 }, { "epoch": 1.123485789067481, "grad_norm": 0.73046875, "learning_rate": 4.4067618584045576e-05, "loss": 0.8009, "step": 2092 }, { "epoch": 1.1240226838025569, "grad_norm": 0.62109375, "learning_rate": 4.406215562761939e-05, "loss": 0.8012, "step": 2093 }, { "epoch": 1.124559578537633, "grad_norm": 0.60546875, "learning_rate": 4.4056690495959855e-05, "loss": 0.6881, "step": 2094 }, { "epoch": 1.125096473272709, "grad_norm": 0.703125, "learning_rate": 4.405122318969058e-05, "loss": 0.6926, "step": 2095 }, { "epoch": 1.125633368007785, "grad_norm": 0.71875, "learning_rate": 4.404575370943549e-05, "loss": 0.9392, "step": 2096 }, { "epoch": 1.126170262742861, "grad_norm": 0.62890625, "learning_rate": 4.40402820558187e-05, "loss": 0.7936, "step": 2097 }, { "epoch": 1.126707157477937, "grad_norm": 0.59765625, "learning_rate": 4.403480822946461e-05, "loss": 0.7749, "step": 2098 }, { "epoch": 1.127244052213013, "grad_norm": 0.72265625, "learning_rate": 4.402933223099784e-05, "loss": 1.0051, "step": 2099 }, { "epoch": 1.127780946948089, "grad_norm": 0.6328125, "learning_rate": 4.402385406104327e-05, "loss": 0.8621, "step": 2100 }, { "epoch": 1.127780946948089, "eval_loss": 0.9210036396980286, "eval_runtime": 47.3848, "eval_samples_per_second": 20.302, "eval_steps_per_second": 20.302, "step": 2100 }, { "epoch": 1.128317841683165, "grad_norm": 0.72265625, "learning_rate": 4.401837372022604e-05, "loss": 0.8931, "step": 2101 }, { "epoch": 1.128854736418241, "grad_norm": 0.62109375, "learning_rate": 4.401289120917152e-05, "loss": 0.7536, "step": 2102 }, { "epoch": 1.129391631153317, "grad_norm": 0.765625, "learning_rate": 4.400740652850534e-05, "loss": 1.0879, "step": 2103 }, { "epoch": 1.129928525888393, "grad_norm": 0.70703125, "learning_rate": 4.400191967885335e-05, "loss": 1.0175, "step": 2104 }, { "epoch": 1.130465420623469, "grad_norm": 0.765625, "learning_rate": 4.3996430660841695e-05, "loss": 0.8159, "step": 2105 }, { "epoch": 1.1310023153585451, "grad_norm": 0.921875, "learning_rate": 4.399093947509674e-05, "loss": 0.8421, "step": 2106 }, { "epoch": 1.131539210093621, "grad_norm": 0.75390625, "learning_rate": 4.3985446122245076e-05, "loss": 0.9083, "step": 2107 }, { "epoch": 1.132076104828697, "grad_norm": 0.64453125, "learning_rate": 4.3979950602913585e-05, "loss": 0.8521, "step": 2108 }, { "epoch": 1.132612999563773, "grad_norm": 0.73046875, "learning_rate": 4.3974452917729366e-05, "loss": 0.86, "step": 2109 }, { "epoch": 1.133149894298849, "grad_norm": 0.66015625, "learning_rate": 4.3968953067319777e-05, "loss": 1.0138, "step": 2110 }, { "epoch": 1.133686789033925, "grad_norm": 0.74609375, "learning_rate": 4.396345105231242e-05, "loss": 0.9405, "step": 2111 }, { "epoch": 1.134223683769001, "grad_norm": 0.64453125, "learning_rate": 4.3957946873335144e-05, "loss": 0.7999, "step": 2112 }, { "epoch": 1.134760578504077, "grad_norm": 0.76953125, "learning_rate": 4.3952440531016046e-05, "loss": 0.7411, "step": 2113 }, { "epoch": 1.135297473239153, "grad_norm": 0.71484375, "learning_rate": 4.394693202598348e-05, "loss": 1.024, "step": 2114 }, { "epoch": 1.135834367974229, "grad_norm": 0.609375, "learning_rate": 4.3941421358866016e-05, "loss": 0.8786, "step": 2115 }, { "epoch": 1.136371262709305, "grad_norm": 0.76171875, "learning_rate": 4.393590853029251e-05, "loss": 0.9797, "step": 2116 }, { "epoch": 1.1369081574443811, "grad_norm": 0.71875, "learning_rate": 4.393039354089203e-05, "loss": 0.7747, "step": 2117 }, { "epoch": 1.137445052179457, "grad_norm": 0.5546875, "learning_rate": 4.3924876391293915e-05, "loss": 0.7416, "step": 2118 }, { "epoch": 1.1379819469145331, "grad_norm": 0.65625, "learning_rate": 4.391935708212774e-05, "loss": 0.9829, "step": 2119 }, { "epoch": 1.138518841649609, "grad_norm": 0.8515625, "learning_rate": 4.391383561402333e-05, "loss": 0.8999, "step": 2120 }, { "epoch": 1.1390557363846852, "grad_norm": 0.71875, "learning_rate": 4.390831198761074e-05, "loss": 0.936, "step": 2121 }, { "epoch": 1.139592631119761, "grad_norm": 0.6953125, "learning_rate": 4.390278620352031e-05, "loss": 0.8469, "step": 2122 }, { "epoch": 1.1401295258548372, "grad_norm": 0.74609375, "learning_rate": 4.389725826238259e-05, "loss": 1.0217, "step": 2123 }, { "epoch": 1.140666420589913, "grad_norm": 0.6796875, "learning_rate": 4.3891728164828385e-05, "loss": 0.8167, "step": 2124 }, { "epoch": 1.1412033153249892, "grad_norm": 0.75, "learning_rate": 4.3886195911488746e-05, "loss": 0.9418, "step": 2125 }, { "epoch": 1.141740210060065, "grad_norm": 0.69921875, "learning_rate": 4.3880661502994986e-05, "loss": 0.8184, "step": 2126 }, { "epoch": 1.142277104795141, "grad_norm": 0.75390625, "learning_rate": 4.387512493997864e-05, "loss": 0.9536, "step": 2127 }, { "epoch": 1.1428139995302171, "grad_norm": 0.68359375, "learning_rate": 4.386958622307151e-05, "loss": 0.8705, "step": 2128 }, { "epoch": 1.143350894265293, "grad_norm": 0.734375, "learning_rate": 4.386404535290563e-05, "loss": 1.0281, "step": 2129 }, { "epoch": 1.1438877890003691, "grad_norm": 0.703125, "learning_rate": 4.385850233011327e-05, "loss": 0.942, "step": 2130 }, { "epoch": 1.144424683735445, "grad_norm": 0.70703125, "learning_rate": 4.385295715532698e-05, "loss": 0.922, "step": 2131 }, { "epoch": 1.1449615784705212, "grad_norm": 0.80859375, "learning_rate": 4.384740982917952e-05, "loss": 0.9384, "step": 2132 }, { "epoch": 1.145498473205597, "grad_norm": 0.66015625, "learning_rate": 4.384186035230392e-05, "loss": 1.0251, "step": 2133 }, { "epoch": 1.1460353679406732, "grad_norm": 0.79296875, "learning_rate": 4.383630872533344e-05, "loss": 1.1454, "step": 2134 }, { "epoch": 1.146572262675749, "grad_norm": 0.98828125, "learning_rate": 4.383075494890159e-05, "loss": 1.096, "step": 2135 }, { "epoch": 1.1471091574108252, "grad_norm": 0.828125, "learning_rate": 4.3825199023642125e-05, "loss": 0.8909, "step": 2136 }, { "epoch": 1.147646052145901, "grad_norm": 0.65625, "learning_rate": 4.381964095018906e-05, "loss": 0.8381, "step": 2137 }, { "epoch": 1.1481829468809772, "grad_norm": 0.71484375, "learning_rate": 4.3814080729176624e-05, "loss": 0.8422, "step": 2138 }, { "epoch": 1.1487198416160531, "grad_norm": 0.6640625, "learning_rate": 4.380851836123931e-05, "loss": 0.8447, "step": 2139 }, { "epoch": 1.1492567363511292, "grad_norm": 0.83984375, "learning_rate": 4.3802953847011865e-05, "loss": 0.9898, "step": 2140 }, { "epoch": 1.1497936310862051, "grad_norm": 0.671875, "learning_rate": 4.379738718712927e-05, "loss": 0.9763, "step": 2141 }, { "epoch": 1.1503305258212813, "grad_norm": 0.65234375, "learning_rate": 4.379181838222675e-05, "loss": 0.919, "step": 2142 }, { "epoch": 1.1508674205563572, "grad_norm": 0.6640625, "learning_rate": 4.378624743293976e-05, "loss": 0.7736, "step": 2143 }, { "epoch": 1.1514043152914333, "grad_norm": 0.640625, "learning_rate": 4.378067433990404e-05, "loss": 0.9296, "step": 2144 }, { "epoch": 1.1519412100265092, "grad_norm": 0.8125, "learning_rate": 4.377509910375554e-05, "loss": 0.8183, "step": 2145 }, { "epoch": 1.152478104761585, "grad_norm": 0.70703125, "learning_rate": 4.376952172513046e-05, "loss": 0.9463, "step": 2146 }, { "epoch": 1.1530149994966612, "grad_norm": 0.63671875, "learning_rate": 4.3763942204665255e-05, "loss": 0.8834, "step": 2147 }, { "epoch": 1.153551894231737, "grad_norm": 0.7890625, "learning_rate": 4.3758360542996625e-05, "loss": 0.9232, "step": 2148 }, { "epoch": 1.1540887889668132, "grad_norm": 1.0390625, "learning_rate": 4.375277674076149e-05, "loss": 1.0551, "step": 2149 }, { "epoch": 1.1546256837018891, "grad_norm": 0.89453125, "learning_rate": 4.374719079859705e-05, "loss": 0.9048, "step": 2150 }, { "epoch": 1.1546256837018891, "eval_loss": 0.9199674129486084, "eval_runtime": 46.1504, "eval_samples_per_second": 20.845, "eval_steps_per_second": 20.845, "step": 2150 }, { "epoch": 1.1551625784369652, "grad_norm": 0.7890625, "learning_rate": 4.374160271714073e-05, "loss": 1.0633, "step": 2151 }, { "epoch": 1.1556994731720411, "grad_norm": 0.66015625, "learning_rate": 4.3736012497030194e-05, "loss": 0.8048, "step": 2152 }, { "epoch": 1.1562363679071173, "grad_norm": 1.078125, "learning_rate": 4.373042013890335e-05, "loss": 1.0484, "step": 2153 }, { "epoch": 1.1567732626421932, "grad_norm": 0.74609375, "learning_rate": 4.372482564339838e-05, "loss": 0.8295, "step": 2154 }, { "epoch": 1.1573101573772693, "grad_norm": 0.7109375, "learning_rate": 4.3719229011153675e-05, "loss": 0.8643, "step": 2155 }, { "epoch": 1.1578470521123452, "grad_norm": 0.82421875, "learning_rate": 4.3713630242807866e-05, "loss": 1.0369, "step": 2156 }, { "epoch": 1.1583839468474213, "grad_norm": 0.69140625, "learning_rate": 4.3708029338999867e-05, "loss": 0.8695, "step": 2157 }, { "epoch": 1.1589208415824972, "grad_norm": 0.88671875, "learning_rate": 4.37024263003688e-05, "loss": 0.8015, "step": 2158 }, { "epoch": 1.1594577363175733, "grad_norm": 0.5546875, "learning_rate": 4.369682112755405e-05, "loss": 0.8047, "step": 2159 }, { "epoch": 1.1599946310526492, "grad_norm": 0.76171875, "learning_rate": 4.369121382119523e-05, "loss": 0.7613, "step": 2160 }, { "epoch": 1.1605315257877253, "grad_norm": 0.59765625, "learning_rate": 4.36856043819322e-05, "loss": 0.72, "step": 2161 }, { "epoch": 1.1610684205228012, "grad_norm": 0.7109375, "learning_rate": 4.367999281040509e-05, "loss": 1.0082, "step": 2162 }, { "epoch": 1.1616053152578774, "grad_norm": 0.671875, "learning_rate": 4.3674379107254225e-05, "loss": 0.9587, "step": 2163 }, { "epoch": 1.1621422099929533, "grad_norm": 0.64453125, "learning_rate": 4.3668763273120214e-05, "loss": 1.1291, "step": 2164 }, { "epoch": 1.1626791047280292, "grad_norm": 0.68359375, "learning_rate": 4.366314530864391e-05, "loss": 0.9716, "step": 2165 }, { "epoch": 1.1632159994631053, "grad_norm": 0.83984375, "learning_rate": 4.365752521446635e-05, "loss": 1.017, "step": 2166 }, { "epoch": 1.1637528941981812, "grad_norm": 0.8515625, "learning_rate": 4.36519029912289e-05, "loss": 0.7967, "step": 2167 }, { "epoch": 1.1642897889332573, "grad_norm": 0.78515625, "learning_rate": 4.364627863957311e-05, "loss": 0.8033, "step": 2168 }, { "epoch": 1.1648266836683332, "grad_norm": 0.7578125, "learning_rate": 4.364065216014079e-05, "loss": 0.9961, "step": 2169 }, { "epoch": 1.1653635784034093, "grad_norm": 0.69921875, "learning_rate": 4.363502355357399e-05, "loss": 0.9891, "step": 2170 }, { "epoch": 1.1659004731384852, "grad_norm": 0.9140625, "learning_rate": 4.362939282051501e-05, "loss": 0.9474, "step": 2171 }, { "epoch": 1.1664373678735613, "grad_norm": 0.765625, "learning_rate": 4.362375996160639e-05, "loss": 0.9568, "step": 2172 }, { "epoch": 1.1669742626086372, "grad_norm": 0.71484375, "learning_rate": 4.3618124977490916e-05, "loss": 0.8744, "step": 2173 }, { "epoch": 1.1675111573437134, "grad_norm": 0.6796875, "learning_rate": 4.361248786881159e-05, "loss": 0.9549, "step": 2174 }, { "epoch": 1.1680480520787893, "grad_norm": 0.70703125, "learning_rate": 4.3606848636211693e-05, "loss": 0.79, "step": 2175 }, { "epoch": 1.1685849468138654, "grad_norm": 0.65625, "learning_rate": 4.3601207280334733e-05, "loss": 0.9945, "step": 2176 }, { "epoch": 1.1691218415489413, "grad_norm": 0.6875, "learning_rate": 4.359556380182446e-05, "loss": 0.8591, "step": 2177 }, { "epoch": 1.1696587362840174, "grad_norm": 0.65625, "learning_rate": 4.3589918201324854e-05, "loss": 0.807, "step": 2178 }, { "epoch": 1.1701956310190933, "grad_norm": 0.5859375, "learning_rate": 4.358427047948016e-05, "loss": 0.807, "step": 2179 }, { "epoch": 1.1707325257541694, "grad_norm": 0.6875, "learning_rate": 4.357862063693486e-05, "loss": 0.8825, "step": 2180 }, { "epoch": 1.1712694204892453, "grad_norm": 0.69140625, "learning_rate": 4.357296867433366e-05, "loss": 0.9605, "step": 2181 }, { "epoch": 1.1718063152243214, "grad_norm": 0.671875, "learning_rate": 4.356731459232153e-05, "loss": 0.7999, "step": 2182 }, { "epoch": 1.1723432099593973, "grad_norm": 0.8828125, "learning_rate": 4.356165839154367e-05, "loss": 0.9473, "step": 2183 }, { "epoch": 1.1728801046944732, "grad_norm": 0.70703125, "learning_rate": 4.355600007264552e-05, "loss": 0.9375, "step": 2184 }, { "epoch": 1.1734169994295494, "grad_norm": 0.62109375, "learning_rate": 4.3550339636272775e-05, "loss": 0.8811, "step": 2185 }, { "epoch": 1.1739538941646253, "grad_norm": 0.6171875, "learning_rate": 4.354467708307135e-05, "loss": 0.8012, "step": 2186 }, { "epoch": 1.1744907888997014, "grad_norm": 0.8359375, "learning_rate": 4.353901241368742e-05, "loss": 1.0285, "step": 2187 }, { "epoch": 1.1750276836347773, "grad_norm": 0.93359375, "learning_rate": 4.35333456287674e-05, "loss": 1.191, "step": 2188 }, { "epoch": 1.1755645783698534, "grad_norm": 0.83984375, "learning_rate": 4.352767672895793e-05, "loss": 0.9894, "step": 2189 }, { "epoch": 1.1761014731049293, "grad_norm": 0.703125, "learning_rate": 4.352200571490592e-05, "loss": 0.8701, "step": 2190 }, { "epoch": 1.1766383678400054, "grad_norm": 0.69921875, "learning_rate": 4.35163325872585e-05, "loss": 0.9078, "step": 2191 }, { "epoch": 1.1771752625750813, "grad_norm": 0.66796875, "learning_rate": 4.351065734666303e-05, "loss": 0.8073, "step": 2192 }, { "epoch": 1.1777121573101574, "grad_norm": 0.70703125, "learning_rate": 4.350497999376715e-05, "loss": 0.8385, "step": 2193 }, { "epoch": 1.1782490520452333, "grad_norm": 0.7578125, "learning_rate": 4.34993005292187e-05, "loss": 0.9369, "step": 2194 }, { "epoch": 1.1787859467803095, "grad_norm": 0.6875, "learning_rate": 4.3493618953665785e-05, "loss": 0.8755, "step": 2195 }, { "epoch": 1.1793228415153854, "grad_norm": 0.8515625, "learning_rate": 4.348793526775674e-05, "loss": 0.7655, "step": 2196 }, { "epoch": 1.1798597362504615, "grad_norm": 0.7265625, "learning_rate": 4.348224947214016e-05, "loss": 1.0039, "step": 2197 }, { "epoch": 1.1803966309855374, "grad_norm": 0.6875, "learning_rate": 4.347656156746485e-05, "loss": 0.7767, "step": 2198 }, { "epoch": 1.1809335257206135, "grad_norm": 0.80859375, "learning_rate": 4.347087155437987e-05, "loss": 1.0435, "step": 2199 }, { "epoch": 1.1814704204556894, "grad_norm": 0.65625, "learning_rate": 4.3465179433534543e-05, "loss": 0.7952, "step": 2200 }, { "epoch": 1.1814704204556894, "eval_loss": 0.9191593527793884, "eval_runtime": 46.0425, "eval_samples_per_second": 20.894, "eval_steps_per_second": 20.894, "step": 2200 }, { "epoch": 1.1820073151907655, "grad_norm": 0.70703125, "learning_rate": 4.34594852055784e-05, "loss": 0.8467, "step": 2201 }, { "epoch": 1.1825442099258414, "grad_norm": 0.7109375, "learning_rate": 4.345378887116122e-05, "loss": 0.8265, "step": 2202 }, { "epoch": 1.1830811046609173, "grad_norm": 0.69140625, "learning_rate": 4.344809043093303e-05, "loss": 1.019, "step": 2203 }, { "epoch": 1.1836179993959934, "grad_norm": 0.734375, "learning_rate": 4.34423898855441e-05, "loss": 0.7523, "step": 2204 }, { "epoch": 1.1841548941310693, "grad_norm": 0.8125, "learning_rate": 4.3436687235644927e-05, "loss": 0.9662, "step": 2205 }, { "epoch": 1.1846917888661455, "grad_norm": 0.78125, "learning_rate": 4.343098248188626e-05, "loss": 1.0414, "step": 2206 }, { "epoch": 1.1852286836012214, "grad_norm": 0.625, "learning_rate": 4.342527562491908e-05, "loss": 0.9376, "step": 2207 }, { "epoch": 1.1857655783362975, "grad_norm": 0.69921875, "learning_rate": 4.341956666539462e-05, "loss": 0.9927, "step": 2208 }, { "epoch": 1.1863024730713734, "grad_norm": 0.70703125, "learning_rate": 4.341385560396434e-05, "loss": 0.993, "step": 2209 }, { "epoch": 1.1868393678064495, "grad_norm": 0.77734375, "learning_rate": 4.340814244127993e-05, "loss": 1.0194, "step": 2210 }, { "epoch": 1.1873762625415254, "grad_norm": 0.51953125, "learning_rate": 4.3402427177993366e-05, "loss": 0.6954, "step": 2211 }, { "epoch": 1.1879131572766015, "grad_norm": 0.68359375, "learning_rate": 4.33967098147568e-05, "loss": 0.8069, "step": 2212 }, { "epoch": 1.1884500520116774, "grad_norm": 0.80078125, "learning_rate": 4.339099035222267e-05, "loss": 0.9309, "step": 2213 }, { "epoch": 1.1889869467467535, "grad_norm": 0.59375, "learning_rate": 4.3385268791043634e-05, "loss": 0.7615, "step": 2214 }, { "epoch": 1.1895238414818294, "grad_norm": 0.72265625, "learning_rate": 4.3379545131872605e-05, "loss": 0.8976, "step": 2215 }, { "epoch": 1.1900607362169056, "grad_norm": 0.671875, "learning_rate": 4.337381937536272e-05, "loss": 0.8906, "step": 2216 }, { "epoch": 1.1905976309519815, "grad_norm": 0.80859375, "learning_rate": 4.3368091522167356e-05, "loss": 1.0466, "step": 2217 }, { "epoch": 1.1911345256870576, "grad_norm": 0.7734375, "learning_rate": 4.3362361572940136e-05, "loss": 0.948, "step": 2218 }, { "epoch": 1.1916714204221335, "grad_norm": 0.77734375, "learning_rate": 4.335662952833492e-05, "loss": 1.0474, "step": 2219 }, { "epoch": 1.1922083151572096, "grad_norm": 0.625, "learning_rate": 4.33508953890058e-05, "loss": 0.9169, "step": 2220 }, { "epoch": 1.1927452098922855, "grad_norm": 0.7109375, "learning_rate": 4.334515915560713e-05, "loss": 0.96, "step": 2221 }, { "epoch": 1.1932821046273614, "grad_norm": 0.65234375, "learning_rate": 4.333942082879348e-05, "loss": 0.8072, "step": 2222 }, { "epoch": 1.1938189993624375, "grad_norm": 0.6640625, "learning_rate": 4.3333680409219655e-05, "loss": 0.8069, "step": 2223 }, { "epoch": 1.1943558940975134, "grad_norm": 0.6171875, "learning_rate": 4.332793789754072e-05, "loss": 0.8196, "step": 2224 }, { "epoch": 1.1948927888325895, "grad_norm": 0.6796875, "learning_rate": 4.3322193294411974e-05, "loss": 0.8526, "step": 2225 }, { "epoch": 1.1954296835676654, "grad_norm": 0.71875, "learning_rate": 4.331644660048893e-05, "loss": 0.8704, "step": 2226 }, { "epoch": 1.1959665783027416, "grad_norm": 0.60546875, "learning_rate": 4.3310697816427374e-05, "loss": 0.9687, "step": 2227 }, { "epoch": 1.1965034730378175, "grad_norm": 0.62890625, "learning_rate": 4.330494694288331e-05, "loss": 0.863, "step": 2228 }, { "epoch": 1.1970403677728936, "grad_norm": 0.66015625, "learning_rate": 4.3299193980512995e-05, "loss": 0.8124, "step": 2229 }, { "epoch": 1.1975772625079695, "grad_norm": 0.65625, "learning_rate": 4.3293438929972894e-05, "loss": 0.8963, "step": 2230 }, { "epoch": 1.1981141572430456, "grad_norm": 0.640625, "learning_rate": 4.328768179191975e-05, "loss": 1.0648, "step": 2231 }, { "epoch": 1.1986510519781215, "grad_norm": 0.59765625, "learning_rate": 4.328192256701052e-05, "loss": 0.7347, "step": 2232 }, { "epoch": 1.1991879467131976, "grad_norm": 0.66015625, "learning_rate": 4.3276161255902407e-05, "loss": 0.8145, "step": 2233 }, { "epoch": 1.1997248414482735, "grad_norm": 0.8125, "learning_rate": 4.327039785925283e-05, "loss": 0.811, "step": 2234 }, { "epoch": 1.2002617361833496, "grad_norm": 0.7109375, "learning_rate": 4.3264632377719496e-05, "loss": 1.0834, "step": 2235 }, { "epoch": 1.2007986309184255, "grad_norm": 0.66015625, "learning_rate": 4.3258864811960296e-05, "loss": 0.9029, "step": 2236 }, { "epoch": 1.2013355256535017, "grad_norm": 0.6796875, "learning_rate": 4.3253095162633396e-05, "loss": 1.0308, "step": 2237 }, { "epoch": 1.2018724203885776, "grad_norm": 0.74609375, "learning_rate": 4.324732343039718e-05, "loss": 0.9287, "step": 2238 }, { "epoch": 1.2024093151236537, "grad_norm": 0.73046875, "learning_rate": 4.324154961591028e-05, "loss": 0.9113, "step": 2239 }, { "epoch": 1.2029462098587296, "grad_norm": 0.64453125, "learning_rate": 4.323577371983155e-05, "loss": 0.7997, "step": 2240 }, { "epoch": 1.2034831045938055, "grad_norm": 0.77734375, "learning_rate": 4.322999574282011e-05, "loss": 0.9625, "step": 2241 }, { "epoch": 1.2040199993288816, "grad_norm": 0.8125, "learning_rate": 4.3224215685535294e-05, "loss": 0.8329, "step": 2242 }, { "epoch": 1.2045568940639575, "grad_norm": 0.65625, "learning_rate": 4.321843354863667e-05, "loss": 0.841, "step": 2243 }, { "epoch": 1.2050937887990336, "grad_norm": 0.71875, "learning_rate": 4.321264933278406e-05, "loss": 0.8994, "step": 2244 }, { "epoch": 1.2056306835341095, "grad_norm": 0.72265625, "learning_rate": 4.3206863038637516e-05, "loss": 1.1515, "step": 2245 }, { "epoch": 1.2061675782691856, "grad_norm": 0.70703125, "learning_rate": 4.3201074666857334e-05, "loss": 0.8285, "step": 2246 }, { "epoch": 1.2067044730042615, "grad_norm": 0.609375, "learning_rate": 4.3195284218104024e-05, "loss": 0.6893, "step": 2247 }, { "epoch": 1.2072413677393377, "grad_norm": 0.8046875, "learning_rate": 4.318949169303837e-05, "loss": 0.8844, "step": 2248 }, { "epoch": 1.2077782624744136, "grad_norm": 0.83984375, "learning_rate": 4.318369709232136e-05, "loss": 1.1114, "step": 2249 }, { "epoch": 1.2083151572094897, "grad_norm": 0.86328125, "learning_rate": 4.317790041661424e-05, "loss": 0.9558, "step": 2250 }, { "epoch": 1.2083151572094897, "eval_loss": 0.9185804724693298, "eval_runtime": 45.9295, "eval_samples_per_second": 20.945, "eval_steps_per_second": 20.945, "step": 2250 }, { "epoch": 1.2088520519445656, "grad_norm": 0.74609375, "learning_rate": 4.317210166657847e-05, "loss": 0.7758, "step": 2251 }, { "epoch": 1.2093889466796417, "grad_norm": 0.8515625, "learning_rate": 4.316630084287577e-05, "loss": 0.8318, "step": 2252 }, { "epoch": 1.2099258414147176, "grad_norm": 0.7109375, "learning_rate": 4.316049794616809e-05, "loss": 0.866, "step": 2253 }, { "epoch": 1.2104627361497937, "grad_norm": 0.80859375, "learning_rate": 4.3154692977117604e-05, "loss": 0.9432, "step": 2254 }, { "epoch": 1.2109996308848696, "grad_norm": 0.8359375, "learning_rate": 4.314888593638674e-05, "loss": 0.9956, "step": 2255 }, { "epoch": 1.2115365256199455, "grad_norm": 0.7421875, "learning_rate": 4.314307682463816e-05, "loss": 1.0258, "step": 2256 }, { "epoch": 1.2120734203550216, "grad_norm": 0.60546875, "learning_rate": 4.313726564253474e-05, "loss": 0.7928, "step": 2257 }, { "epoch": 1.2126103150900978, "grad_norm": 0.62890625, "learning_rate": 4.3131452390739626e-05, "loss": 0.772, "step": 2258 }, { "epoch": 1.2131472098251737, "grad_norm": 0.8125, "learning_rate": 4.312563706991618e-05, "loss": 0.8915, "step": 2259 }, { "epoch": 1.2136841045602496, "grad_norm": 0.71484375, "learning_rate": 4.3119819680728e-05, "loss": 0.8551, "step": 2260 }, { "epoch": 1.2142209992953257, "grad_norm": 0.87890625, "learning_rate": 4.3114000223838927e-05, "loss": 1.0303, "step": 2261 }, { "epoch": 1.2147578940304016, "grad_norm": 0.7734375, "learning_rate": 4.310817869991303e-05, "loss": 1.0987, "step": 2262 }, { "epoch": 1.2152947887654777, "grad_norm": 0.78515625, "learning_rate": 4.310235510961462e-05, "loss": 0.7569, "step": 2263 }, { "epoch": 1.2158316835005536, "grad_norm": 0.76171875, "learning_rate": 4.309652945360825e-05, "loss": 0.8945, "step": 2264 }, { "epoch": 1.2163685782356297, "grad_norm": 0.6875, "learning_rate": 4.3090701732558694e-05, "loss": 0.8193, "step": 2265 }, { "epoch": 1.2169054729707056, "grad_norm": 0.79296875, "learning_rate": 4.308487194713097e-05, "loss": 0.9874, "step": 2266 }, { "epoch": 1.2174423677057817, "grad_norm": 0.7734375, "learning_rate": 4.307904009799033e-05, "loss": 0.9096, "step": 2267 }, { "epoch": 1.2179792624408576, "grad_norm": 0.796875, "learning_rate": 4.307320618580226e-05, "loss": 1.0407, "step": 2268 }, { "epoch": 1.2185161571759338, "grad_norm": 0.76953125, "learning_rate": 4.306737021123249e-05, "loss": 0.9619, "step": 2269 }, { "epoch": 1.2190530519110097, "grad_norm": 0.66015625, "learning_rate": 4.306153217494697e-05, "loss": 0.8103, "step": 2270 }, { "epoch": 1.2195899466460858, "grad_norm": 0.78515625, "learning_rate": 4.30556920776119e-05, "loss": 0.8183, "step": 2271 }, { "epoch": 1.2201268413811617, "grad_norm": 0.98828125, "learning_rate": 4.304984991989371e-05, "loss": 1.0441, "step": 2272 }, { "epoch": 1.2206637361162378, "grad_norm": 1.0, "learning_rate": 4.304400570245906e-05, "loss": 0.7898, "step": 2273 }, { "epoch": 1.2212006308513137, "grad_norm": 0.6875, "learning_rate": 4.3038159425974846e-05, "loss": 0.9046, "step": 2274 }, { "epoch": 1.2217375255863896, "grad_norm": 0.8203125, "learning_rate": 4.3032311091108214e-05, "loss": 0.8699, "step": 2275 }, { "epoch": 1.2222744203214657, "grad_norm": 0.796875, "learning_rate": 4.302646069852652e-05, "loss": 0.7927, "step": 2276 }, { "epoch": 1.2228113150565418, "grad_norm": 0.7734375, "learning_rate": 4.3020608248897373e-05, "loss": 0.879, "step": 2277 }, { "epoch": 1.2233482097916177, "grad_norm": 0.74609375, "learning_rate": 4.301475374288861e-05, "loss": 0.9867, "step": 2278 }, { "epoch": 1.2238851045266936, "grad_norm": 0.67578125, "learning_rate": 4.300889718116831e-05, "loss": 0.8386, "step": 2279 }, { "epoch": 1.2244219992617698, "grad_norm": 0.78125, "learning_rate": 4.300303856440478e-05, "loss": 0.9709, "step": 2280 }, { "epoch": 1.2249588939968457, "grad_norm": 0.58984375, "learning_rate": 4.299717789326656e-05, "loss": 0.7167, "step": 2281 }, { "epoch": 1.2254957887319218, "grad_norm": 0.734375, "learning_rate": 4.299131516842242e-05, "loss": 0.9082, "step": 2282 }, { "epoch": 1.2260326834669977, "grad_norm": 0.63671875, "learning_rate": 4.298545039054138e-05, "loss": 0.7985, "step": 2283 }, { "epoch": 1.2265695782020738, "grad_norm": 0.6171875, "learning_rate": 4.297958356029269e-05, "loss": 0.6688, "step": 2284 }, { "epoch": 1.2271064729371497, "grad_norm": 0.67578125, "learning_rate": 4.2973714678345824e-05, "loss": 0.7596, "step": 2285 }, { "epoch": 1.2276433676722258, "grad_norm": 0.6171875, "learning_rate": 4.2967843745370484e-05, "loss": 0.8478, "step": 2286 }, { "epoch": 1.2281802624073017, "grad_norm": 0.734375, "learning_rate": 4.296197076203663e-05, "loss": 0.8947, "step": 2287 }, { "epoch": 1.2287171571423778, "grad_norm": 0.73046875, "learning_rate": 4.2956095729014444e-05, "loss": 1.0706, "step": 2288 }, { "epoch": 1.2292540518774537, "grad_norm": 0.69921875, "learning_rate": 4.2950218646974346e-05, "loss": 0.7206, "step": 2289 }, { "epoch": 1.2297909466125299, "grad_norm": 0.62890625, "learning_rate": 4.294433951658697e-05, "loss": 0.6709, "step": 2290 }, { "epoch": 1.2303278413476058, "grad_norm": 0.69140625, "learning_rate": 4.293845833852321e-05, "loss": 1.0079, "step": 2291 }, { "epoch": 1.2308647360826819, "grad_norm": 0.703125, "learning_rate": 4.293257511345419e-05, "loss": 0.8778, "step": 2292 }, { "epoch": 1.2314016308177578, "grad_norm": 0.6796875, "learning_rate": 4.292668984205124e-05, "loss": 1.0551, "step": 2293 }, { "epoch": 1.2319385255528337, "grad_norm": 0.76953125, "learning_rate": 4.292080252498596e-05, "loss": 0.9318, "step": 2294 }, { "epoch": 1.2324754202879098, "grad_norm": 0.80859375, "learning_rate": 4.291491316293017e-05, "loss": 0.8448, "step": 2295 }, { "epoch": 1.233012315022986, "grad_norm": 0.703125, "learning_rate": 4.2909021756555906e-05, "loss": 0.9453, "step": 2296 }, { "epoch": 1.2335492097580618, "grad_norm": 0.8515625, "learning_rate": 4.2903128306535466e-05, "loss": 0.9568, "step": 2297 }, { "epoch": 1.2340861044931377, "grad_norm": 0.69921875, "learning_rate": 4.289723281354135e-05, "loss": 0.9516, "step": 2298 }, { "epoch": 1.2346229992282138, "grad_norm": 0.69921875, "learning_rate": 4.2891335278246335e-05, "loss": 0.8181, "step": 2299 }, { "epoch": 1.2351598939632897, "grad_norm": 0.69140625, "learning_rate": 4.288543570132338e-05, "loss": 1.0422, "step": 2300 }, { "epoch": 1.2351598939632897, "eval_loss": 0.919032633304596, "eval_runtime": 46.5908, "eval_samples_per_second": 20.648, "eval_steps_per_second": 20.648, "step": 2300 }, { "epoch": 1.2356967886983659, "grad_norm": 0.6484375, "learning_rate": 4.2879534083445716e-05, "loss": 0.8635, "step": 2301 }, { "epoch": 1.2362336834334418, "grad_norm": 0.87890625, "learning_rate": 4.2873630425286784e-05, "loss": 0.872, "step": 2302 }, { "epoch": 1.2367705781685179, "grad_norm": 0.71875, "learning_rate": 4.286772472752027e-05, "loss": 0.8259, "step": 2303 }, { "epoch": 1.2373074729035938, "grad_norm": 0.7109375, "learning_rate": 4.2861816990820084e-05, "loss": 0.9112, "step": 2304 }, { "epoch": 1.23784436763867, "grad_norm": 0.76953125, "learning_rate": 4.285590721586039e-05, "loss": 0.8183, "step": 2305 }, { "epoch": 1.2383812623737458, "grad_norm": 0.8125, "learning_rate": 4.284999540331554e-05, "loss": 1.0442, "step": 2306 }, { "epoch": 1.238918157108822, "grad_norm": 0.734375, "learning_rate": 4.2844081553860174e-05, "loss": 1.0191, "step": 2307 }, { "epoch": 1.2394550518438978, "grad_norm": 0.78515625, "learning_rate": 4.2838165668169117e-05, "loss": 0.9365, "step": 2308 }, { "epoch": 1.239991946578974, "grad_norm": 0.60546875, "learning_rate": 4.283224774691746e-05, "loss": 0.7694, "step": 2309 }, { "epoch": 1.2405288413140498, "grad_norm": 0.68359375, "learning_rate": 4.282632779078051e-05, "loss": 0.8036, "step": 2310 }, { "epoch": 1.241065736049126, "grad_norm": 0.796875, "learning_rate": 4.2820405800433804e-05, "loss": 1.0061, "step": 2311 }, { "epoch": 1.2416026307842019, "grad_norm": 0.7578125, "learning_rate": 4.281448177655312e-05, "loss": 0.8638, "step": 2312 }, { "epoch": 1.2421395255192778, "grad_norm": 0.73046875, "learning_rate": 4.2808555719814466e-05, "loss": 0.7865, "step": 2313 }, { "epoch": 1.2426764202543539, "grad_norm": 0.6640625, "learning_rate": 4.280262763089407e-05, "loss": 0.9393, "step": 2314 }, { "epoch": 1.24321331498943, "grad_norm": 0.71484375, "learning_rate": 4.279669751046841e-05, "loss": 0.9858, "step": 2315 }, { "epoch": 1.243750209724506, "grad_norm": 0.74609375, "learning_rate": 4.27907653592142e-05, "loss": 0.993, "step": 2316 }, { "epoch": 1.2442871044595818, "grad_norm": 0.5703125, "learning_rate": 4.278483117780835e-05, "loss": 0.7039, "step": 2317 }, { "epoch": 1.244823999194658, "grad_norm": 0.66015625, "learning_rate": 4.277889496692804e-05, "loss": 0.8444, "step": 2318 }, { "epoch": 1.2453608939297338, "grad_norm": 0.58203125, "learning_rate": 4.2772956727250655e-05, "loss": 0.7538, "step": 2319 }, { "epoch": 1.24589778866481, "grad_norm": 0.7578125, "learning_rate": 4.2767016459453835e-05, "loss": 0.9166, "step": 2320 }, { "epoch": 1.2464346833998858, "grad_norm": 0.59765625, "learning_rate": 4.276107416421544e-05, "loss": 0.7658, "step": 2321 }, { "epoch": 1.246971578134962, "grad_norm": 0.58203125, "learning_rate": 4.2755129842213546e-05, "loss": 0.8692, "step": 2322 }, { "epoch": 1.2475084728700379, "grad_norm": 0.7421875, "learning_rate": 4.2749183494126496e-05, "loss": 0.8463, "step": 2323 }, { "epoch": 1.248045367605114, "grad_norm": 0.78515625, "learning_rate": 4.274323512063284e-05, "loss": 0.9421, "step": 2324 }, { "epoch": 1.2485822623401899, "grad_norm": 0.59765625, "learning_rate": 4.273728472241134e-05, "loss": 0.7189, "step": 2325 }, { "epoch": 1.249119157075266, "grad_norm": 0.69140625, "learning_rate": 4.2731332300141046e-05, "loss": 0.7346, "step": 2326 }, { "epoch": 1.249656051810342, "grad_norm": 0.625, "learning_rate": 4.272537785450118e-05, "loss": 0.9027, "step": 2327 }, { "epoch": 1.250192946545418, "grad_norm": 0.7578125, "learning_rate": 4.271942138617122e-05, "loss": 1.0147, "step": 2328 }, { "epoch": 1.250729841280494, "grad_norm": 0.76171875, "learning_rate": 4.2713462895830894e-05, "loss": 0.8928, "step": 2329 }, { "epoch": 1.25126673601557, "grad_norm": 0.8359375, "learning_rate": 4.270750238416013e-05, "loss": 0.8658, "step": 2330 }, { "epoch": 1.251803630750646, "grad_norm": 0.8984375, "learning_rate": 4.270153985183908e-05, "loss": 0.7505, "step": 2331 }, { "epoch": 1.2523405254857218, "grad_norm": 0.671875, "learning_rate": 4.269557529954818e-05, "loss": 0.8029, "step": 2332 }, { "epoch": 1.252877420220798, "grad_norm": 0.890625, "learning_rate": 4.268960872796802e-05, "loss": 0.9767, "step": 2333 }, { "epoch": 1.253414314955874, "grad_norm": 0.921875, "learning_rate": 4.26836401377795e-05, "loss": 0.9238, "step": 2334 }, { "epoch": 1.25395120969095, "grad_norm": 0.66796875, "learning_rate": 4.267766952966369e-05, "loss": 0.9478, "step": 2335 }, { "epoch": 1.2544881044260259, "grad_norm": 0.8828125, "learning_rate": 4.267169690430192e-05, "loss": 0.939, "step": 2336 }, { "epoch": 1.255024999161102, "grad_norm": 0.78515625, "learning_rate": 4.2665722262375734e-05, "loss": 0.8077, "step": 2337 }, { "epoch": 1.2555618938961781, "grad_norm": 0.73046875, "learning_rate": 4.2659745604566924e-05, "loss": 0.9556, "step": 2338 }, { "epoch": 1.256098788631254, "grad_norm": 0.62109375, "learning_rate": 4.26537669315575e-05, "loss": 0.7619, "step": 2339 }, { "epoch": 1.25663568336633, "grad_norm": 0.796875, "learning_rate": 4.2647786244029694e-05, "loss": 0.9437, "step": 2340 }, { "epoch": 1.257172578101406, "grad_norm": 0.71875, "learning_rate": 4.264180354266599e-05, "loss": 0.9256, "step": 2341 }, { "epoch": 1.257709472836482, "grad_norm": 1.0625, "learning_rate": 4.263581882814909e-05, "loss": 0.9118, "step": 2342 }, { "epoch": 1.258246367571558, "grad_norm": 0.8203125, "learning_rate": 4.262983210116193e-05, "loss": 1.0294, "step": 2343 }, { "epoch": 1.258783262306634, "grad_norm": 0.72265625, "learning_rate": 4.2623843362387645e-05, "loss": 1.1032, "step": 2344 }, { "epoch": 1.25932015704171, "grad_norm": 0.64453125, "learning_rate": 4.2617852612509656e-05, "loss": 0.8864, "step": 2345 }, { "epoch": 1.259857051776786, "grad_norm": 0.67578125, "learning_rate": 4.261185985221156e-05, "loss": 0.8555, "step": 2346 }, { "epoch": 1.260393946511862, "grad_norm": 0.6875, "learning_rate": 4.2605865082177235e-05, "loss": 0.7918, "step": 2347 }, { "epoch": 1.260930841246938, "grad_norm": 0.7109375, "learning_rate": 4.259986830309074e-05, "loss": 0.7688, "step": 2348 }, { "epoch": 1.2614677359820141, "grad_norm": 0.78125, "learning_rate": 4.2593869515636376e-05, "loss": 0.8333, "step": 2349 }, { "epoch": 1.26200463071709, "grad_norm": 0.80078125, "learning_rate": 4.2587868720498705e-05, "loss": 0.7353, "step": 2350 }, { "epoch": 1.26200463071709, "eval_loss": 0.9166867136955261, "eval_runtime": 45.6547, "eval_samples_per_second": 21.071, "eval_steps_per_second": 21.071, "step": 2350 }, { "epoch": 1.262541525452166, "grad_norm": 0.640625, "learning_rate": 4.258186591836248e-05, "loss": 0.773, "step": 2351 }, { "epoch": 1.263078420187242, "grad_norm": 0.64453125, "learning_rate": 4.257586110991268e-05, "loss": 0.7677, "step": 2352 }, { "epoch": 1.2636153149223182, "grad_norm": 0.65625, "learning_rate": 4.2569854295834566e-05, "loss": 0.9747, "step": 2353 }, { "epoch": 1.264152209657394, "grad_norm": 0.83203125, "learning_rate": 4.2563845476813566e-05, "loss": 0.8526, "step": 2354 }, { "epoch": 1.26468910439247, "grad_norm": 0.65625, "learning_rate": 4.2557834653535366e-05, "loss": 0.7428, "step": 2355 }, { "epoch": 1.265225999127546, "grad_norm": 0.68359375, "learning_rate": 4.2551821826685887e-05, "loss": 0.8741, "step": 2356 }, { "epoch": 1.2657628938626222, "grad_norm": 0.70703125, "learning_rate": 4.254580699695125e-05, "loss": 0.7939, "step": 2357 }, { "epoch": 1.266299788597698, "grad_norm": 0.75, "learning_rate": 4.253979016501784e-05, "loss": 0.9722, "step": 2358 }, { "epoch": 1.266836683332774, "grad_norm": 0.71484375, "learning_rate": 4.2533771331572263e-05, "loss": 1.0446, "step": 2359 }, { "epoch": 1.2673735780678501, "grad_norm": 0.6953125, "learning_rate": 4.2527750497301323e-05, "loss": 1.0864, "step": 2360 }, { "epoch": 1.267910472802926, "grad_norm": 0.69921875, "learning_rate": 4.2521727662892074e-05, "loss": 0.8485, "step": 2361 }, { "epoch": 1.2684473675380021, "grad_norm": 0.8046875, "learning_rate": 4.25157028290318e-05, "loss": 1.0127, "step": 2362 }, { "epoch": 1.268984262273078, "grad_norm": 0.8203125, "learning_rate": 4.250967599640803e-05, "loss": 0.8196, "step": 2363 }, { "epoch": 1.2695211570081542, "grad_norm": 0.7265625, "learning_rate": 4.250364716570847e-05, "loss": 0.8567, "step": 2364 }, { "epoch": 1.27005805174323, "grad_norm": 0.7265625, "learning_rate": 4.249761633762111e-05, "loss": 0.8167, "step": 2365 }, { "epoch": 1.2705949464783062, "grad_norm": 0.8125, "learning_rate": 4.249158351283414e-05, "loss": 0.9388, "step": 2366 }, { "epoch": 1.271131841213382, "grad_norm": 0.66015625, "learning_rate": 4.248554869203597e-05, "loss": 1.0163, "step": 2367 }, { "epoch": 1.2716687359484582, "grad_norm": 0.671875, "learning_rate": 4.247951187591527e-05, "loss": 0.8415, "step": 2368 }, { "epoch": 1.272205630683534, "grad_norm": 0.875, "learning_rate": 4.247347306516089e-05, "loss": 0.8779, "step": 2369 }, { "epoch": 1.27274252541861, "grad_norm": 0.7734375, "learning_rate": 4.246743226046196e-05, "loss": 0.8726, "step": 2370 }, { "epoch": 1.2732794201536861, "grad_norm": 0.72265625, "learning_rate": 4.24613894625078e-05, "loss": 0.8935, "step": 2371 }, { "epoch": 1.2738163148887622, "grad_norm": 0.6953125, "learning_rate": 4.245534467198797e-05, "loss": 0.8479, "step": 2372 }, { "epoch": 1.2743532096238381, "grad_norm": 1.0546875, "learning_rate": 4.244929788959225e-05, "loss": 0.9463, "step": 2373 }, { "epoch": 1.274890104358914, "grad_norm": 0.625, "learning_rate": 4.244324911601068e-05, "loss": 0.8583, "step": 2374 }, { "epoch": 1.2754269990939902, "grad_norm": 0.70703125, "learning_rate": 4.2437198351933484e-05, "loss": 0.9005, "step": 2375 }, { "epoch": 1.2759638938290663, "grad_norm": 0.69140625, "learning_rate": 4.243114559805112e-05, "loss": 0.7934, "step": 2376 }, { "epoch": 1.2765007885641422, "grad_norm": 0.65234375, "learning_rate": 4.24250908550543e-05, "loss": 0.7214, "step": 2377 }, { "epoch": 1.277037683299218, "grad_norm": 0.65234375, "learning_rate": 4.241903412363395e-05, "loss": 0.7903, "step": 2378 }, { "epoch": 1.2775745780342942, "grad_norm": 0.85546875, "learning_rate": 4.24129754044812e-05, "loss": 1.0774, "step": 2379 }, { "epoch": 1.27811147276937, "grad_norm": 0.640625, "learning_rate": 4.240691469828745e-05, "loss": 0.8575, "step": 2380 }, { "epoch": 1.2786483675044462, "grad_norm": 0.69140625, "learning_rate": 4.2400852005744284e-05, "loss": 1.0707, "step": 2381 }, { "epoch": 1.2791852622395221, "grad_norm": 0.6015625, "learning_rate": 4.239478732754354e-05, "loss": 0.7319, "step": 2382 }, { "epoch": 1.2797221569745982, "grad_norm": 0.90625, "learning_rate": 4.2388720664377276e-05, "loss": 1.0774, "step": 2383 }, { "epoch": 1.2802590517096741, "grad_norm": 0.703125, "learning_rate": 4.238265201693778e-05, "loss": 0.9633, "step": 2384 }, { "epoch": 1.2807959464447503, "grad_norm": 0.9375, "learning_rate": 4.2376581385917547e-05, "loss": 1.0594, "step": 2385 }, { "epoch": 1.2813328411798262, "grad_norm": 0.6640625, "learning_rate": 4.2370508772009334e-05, "loss": 0.9085, "step": 2386 }, { "epoch": 1.2818697359149023, "grad_norm": 0.61328125, "learning_rate": 4.236443417590607e-05, "loss": 0.7452, "step": 2387 }, { "epoch": 1.2824066306499782, "grad_norm": 0.6640625, "learning_rate": 4.235835759830098e-05, "loss": 0.7606, "step": 2388 }, { "epoch": 1.282943525385054, "grad_norm": 0.72265625, "learning_rate": 4.235227903988746e-05, "loss": 0.9656, "step": 2389 }, { "epoch": 1.2834804201201302, "grad_norm": 0.58203125, "learning_rate": 4.2346198501359146e-05, "loss": 0.8026, "step": 2390 }, { "epoch": 1.2840173148552063, "grad_norm": 0.75390625, "learning_rate": 4.234011598340992e-05, "loss": 0.9217, "step": 2391 }, { "epoch": 1.2845542095902822, "grad_norm": 0.76171875, "learning_rate": 4.233403148673386e-05, "loss": 0.8261, "step": 2392 }, { "epoch": 1.2850911043253581, "grad_norm": 0.69140625, "learning_rate": 4.2327945012025294e-05, "loss": 0.897, "step": 2393 }, { "epoch": 1.2856279990604342, "grad_norm": 0.828125, "learning_rate": 4.232185655997876e-05, "loss": 0.9855, "step": 2394 }, { "epoch": 1.2861648937955104, "grad_norm": 0.72265625, "learning_rate": 4.231576613128902e-05, "loss": 0.8463, "step": 2395 }, { "epoch": 1.2867017885305863, "grad_norm": 0.80078125, "learning_rate": 4.230967372665109e-05, "loss": 0.9179, "step": 2396 }, { "epoch": 1.2872386832656622, "grad_norm": 0.77734375, "learning_rate": 4.230357934676017e-05, "loss": 0.8223, "step": 2397 }, { "epoch": 1.2877755780007383, "grad_norm": 0.640625, "learning_rate": 4.229748299231172e-05, "loss": 0.6675, "step": 2398 }, { "epoch": 1.2883124727358142, "grad_norm": 0.7421875, "learning_rate": 4.22913846640014e-05, "loss": 0.8385, "step": 2399 }, { "epoch": 1.2888493674708903, "grad_norm": 0.69921875, "learning_rate": 4.228528436252512e-05, "loss": 0.924, "step": 2400 }, { "epoch": 1.2888493674708903, "eval_loss": 0.9168675541877747, "eval_runtime": 46.1564, "eval_samples_per_second": 20.842, "eval_steps_per_second": 20.842, "step": 2400 }, { "epoch": 1.2893862622059662, "grad_norm": 0.87109375, "learning_rate": 4.2279182088578996e-05, "loss": 0.922, "step": 2401 }, { "epoch": 1.2899231569410423, "grad_norm": 0.65234375, "learning_rate": 4.227307784285936e-05, "loss": 1.067, "step": 2402 }, { "epoch": 1.2904600516761182, "grad_norm": 0.73828125, "learning_rate": 4.22669716260628e-05, "loss": 0.8357, "step": 2403 }, { "epoch": 1.2909969464111943, "grad_norm": 0.68359375, "learning_rate": 4.226086343888611e-05, "loss": 0.7451, "step": 2404 }, { "epoch": 1.2915338411462702, "grad_norm": 0.69921875, "learning_rate": 4.2254753282026314e-05, "loss": 0.8641, "step": 2405 }, { "epoch": 1.2920707358813464, "grad_norm": 0.640625, "learning_rate": 4.224864115618066e-05, "loss": 0.8504, "step": 2406 }, { "epoch": 1.2926076306164223, "grad_norm": 0.69140625, "learning_rate": 4.22425270620466e-05, "loss": 0.8427, "step": 2407 }, { "epoch": 1.2931445253514982, "grad_norm": 0.71484375, "learning_rate": 4.223641100032185e-05, "loss": 0.93, "step": 2408 }, { "epoch": 1.2936814200865743, "grad_norm": 0.72265625, "learning_rate": 4.2230292971704324e-05, "loss": 0.9213, "step": 2409 }, { "epoch": 1.2942183148216504, "grad_norm": 1.1484375, "learning_rate": 4.222417297689217e-05, "loss": 0.915, "step": 2410 }, { "epoch": 1.2947552095567263, "grad_norm": 0.78515625, "learning_rate": 4.221805101658375e-05, "loss": 1.1016, "step": 2411 }, { "epoch": 1.2952921042918022, "grad_norm": 0.765625, "learning_rate": 4.2211927091477656e-05, "loss": 0.8728, "step": 2412 }, { "epoch": 1.2958289990268783, "grad_norm": 0.671875, "learning_rate": 4.220580120227271e-05, "loss": 1.0869, "step": 2413 }, { "epoch": 1.2963658937619544, "grad_norm": 0.9140625, "learning_rate": 4.2199673349667955e-05, "loss": 0.9189, "step": 2414 }, { "epoch": 1.2969027884970303, "grad_norm": 0.83984375, "learning_rate": 4.2193543534362654e-05, "loss": 1.0031, "step": 2415 }, { "epoch": 1.2974396832321062, "grad_norm": 0.7109375, "learning_rate": 4.2187411757056295e-05, "loss": 0.9205, "step": 2416 }, { "epoch": 1.2979765779671824, "grad_norm": 0.69921875, "learning_rate": 4.21812780184486e-05, "loss": 0.8935, "step": 2417 }, { "epoch": 1.2985134727022583, "grad_norm": 0.84375, "learning_rate": 4.217514231923949e-05, "loss": 1.1609, "step": 2418 }, { "epoch": 1.2990503674373344, "grad_norm": 0.74609375, "learning_rate": 4.2169004660129144e-05, "loss": 0.9533, "step": 2419 }, { "epoch": 1.2995872621724103, "grad_norm": 0.734375, "learning_rate": 4.2162865041817935e-05, "loss": 1.014, "step": 2420 }, { "epoch": 1.3001241569074864, "grad_norm": 0.77734375, "learning_rate": 4.215672346500648e-05, "loss": 0.9725, "step": 2421 }, { "epoch": 1.3006610516425623, "grad_norm": 0.69140625, "learning_rate": 4.21505799303956e-05, "loss": 0.8765, "step": 2422 }, { "epoch": 1.3011979463776384, "grad_norm": 0.6171875, "learning_rate": 4.2144434438686355e-05, "loss": 0.884, "step": 2423 }, { "epoch": 1.3017348411127143, "grad_norm": 0.74609375, "learning_rate": 4.2138286990580025e-05, "loss": 0.8088, "step": 2424 }, { "epoch": 1.3022717358477904, "grad_norm": 0.765625, "learning_rate": 4.21321375867781e-05, "loss": 0.9909, "step": 2425 }, { "epoch": 1.3028086305828663, "grad_norm": 0.68359375, "learning_rate": 4.212598622798233e-05, "loss": 0.9059, "step": 2426 }, { "epoch": 1.3033455253179422, "grad_norm": 0.6328125, "learning_rate": 4.211983291489465e-05, "loss": 0.7257, "step": 2427 }, { "epoch": 1.3038824200530184, "grad_norm": 0.6484375, "learning_rate": 4.211367764821722e-05, "loss": 0.7937, "step": 2428 }, { "epoch": 1.3044193147880945, "grad_norm": 0.640625, "learning_rate": 4.2107520428652445e-05, "loss": 0.7595, "step": 2429 }, { "epoch": 1.3049562095231704, "grad_norm": 0.66796875, "learning_rate": 4.210136125690294e-05, "loss": 0.8931, "step": 2430 }, { "epoch": 1.3054931042582463, "grad_norm": 0.86328125, "learning_rate": 4.209520013367155e-05, "loss": 0.8578, "step": 2431 }, { "epoch": 1.3060299989933224, "grad_norm": 0.6953125, "learning_rate": 4.2089037059661326e-05, "loss": 0.8389, "step": 2432 }, { "epoch": 1.3065668937283983, "grad_norm": 0.88671875, "learning_rate": 4.208287203557556e-05, "loss": 0.838, "step": 2433 }, { "epoch": 1.3071037884634744, "grad_norm": 0.8203125, "learning_rate": 4.207670506211776e-05, "loss": 1.0361, "step": 2434 }, { "epoch": 1.3076406831985503, "grad_norm": 0.68359375, "learning_rate": 4.2070536139991644e-05, "loss": 0.8776, "step": 2435 }, { "epoch": 1.3081775779336264, "grad_norm": 0.7734375, "learning_rate": 4.206436526990119e-05, "loss": 1.0119, "step": 2436 }, { "epoch": 1.3087144726687023, "grad_norm": 0.64453125, "learning_rate": 4.2058192452550535e-05, "loss": 0.837, "step": 2437 }, { "epoch": 1.3092513674037785, "grad_norm": 0.69921875, "learning_rate": 4.205201768864412e-05, "loss": 1.0506, "step": 2438 }, { "epoch": 1.3097882621388544, "grad_norm": 0.859375, "learning_rate": 4.204584097888652e-05, "loss": 0.9861, "step": 2439 }, { "epoch": 1.3103251568739305, "grad_norm": 0.71484375, "learning_rate": 4.203966232398261e-05, "loss": 0.8611, "step": 2440 }, { "epoch": 1.3108620516090064, "grad_norm": 0.61328125, "learning_rate": 4.203348172463743e-05, "loss": 0.8617, "step": 2441 }, { "epoch": 1.3113989463440825, "grad_norm": 0.73828125, "learning_rate": 4.202729918155628e-05, "loss": 0.8722, "step": 2442 }, { "epoch": 1.3119358410791584, "grad_norm": 0.671875, "learning_rate": 4.202111469544465e-05, "loss": 0.949, "step": 2443 }, { "epoch": 1.3124727358142345, "grad_norm": 0.6640625, "learning_rate": 4.201492826700829e-05, "loss": 0.9245, "step": 2444 }, { "epoch": 1.3130096305493104, "grad_norm": 0.71484375, "learning_rate": 4.200873989695313e-05, "loss": 0.7698, "step": 2445 }, { "epoch": 1.3135465252843863, "grad_norm": 0.72265625, "learning_rate": 4.200254958598535e-05, "loss": 0.8766, "step": 2446 }, { "epoch": 1.3140834200194624, "grad_norm": 0.85546875, "learning_rate": 4.1996357334811354e-05, "loss": 0.9836, "step": 2447 }, { "epoch": 1.3146203147545386, "grad_norm": 0.81640625, "learning_rate": 4.1990163144137735e-05, "loss": 0.9956, "step": 2448 }, { "epoch": 1.3151572094896145, "grad_norm": 0.5703125, "learning_rate": 4.1983967014671335e-05, "loss": 0.6642, "step": 2449 }, { "epoch": 1.3156941042246904, "grad_norm": 0.6875, "learning_rate": 4.197776894711922e-05, "loss": 0.8895, "step": 2450 }, { "epoch": 1.3156941042246904, "eval_loss": 0.915617823600769, "eval_runtime": 46.2183, "eval_samples_per_second": 20.814, "eval_steps_per_second": 20.814, "step": 2450 }, { "epoch": 1.3162309989597665, "grad_norm": 0.8984375, "learning_rate": 4.1971568942188665e-05, "loss": 0.7344, "step": 2451 }, { "epoch": 1.3167678936948424, "grad_norm": 0.78125, "learning_rate": 4.196536700058716e-05, "loss": 0.9511, "step": 2452 }, { "epoch": 1.3173047884299185, "grad_norm": 0.62890625, "learning_rate": 4.195916312302243e-05, "loss": 0.7909, "step": 2453 }, { "epoch": 1.3178416831649944, "grad_norm": 0.765625, "learning_rate": 4.1952957310202424e-05, "loss": 1.0219, "step": 2454 }, { "epoch": 1.3183785779000705, "grad_norm": 0.66015625, "learning_rate": 4.194674956283529e-05, "loss": 0.747, "step": 2455 }, { "epoch": 1.3189154726351464, "grad_norm": 0.69140625, "learning_rate": 4.194053988162943e-05, "loss": 0.9002, "step": 2456 }, { "epoch": 1.3194523673702225, "grad_norm": 0.6328125, "learning_rate": 4.193432826729342e-05, "loss": 0.7652, "step": 2457 }, { "epoch": 1.3199892621052984, "grad_norm": 0.66015625, "learning_rate": 4.1928114720536106e-05, "loss": 0.9451, "step": 2458 }, { "epoch": 1.3205261568403746, "grad_norm": 0.73828125, "learning_rate": 4.192189924206652e-05, "loss": 0.8087, "step": 2459 }, { "epoch": 1.3210630515754505, "grad_norm": 0.6796875, "learning_rate": 4.191568183259394e-05, "loss": 0.8087, "step": 2460 }, { "epoch": 1.3215999463105266, "grad_norm": 0.671875, "learning_rate": 4.1909462492827837e-05, "loss": 0.7977, "step": 2461 }, { "epoch": 1.3221368410456025, "grad_norm": 0.7265625, "learning_rate": 4.190324122347792e-05, "loss": 0.8794, "step": 2462 }, { "epoch": 1.3226737357806786, "grad_norm": 0.828125, "learning_rate": 4.189701802525412e-05, "loss": 0.9191, "step": 2463 }, { "epoch": 1.3232106305157545, "grad_norm": 0.5703125, "learning_rate": 4.189079289886659e-05, "loss": 0.7624, "step": 2464 }, { "epoch": 1.3237475252508304, "grad_norm": 0.71875, "learning_rate": 4.188456584502567e-05, "loss": 0.8994, "step": 2465 }, { "epoch": 1.3242844199859065, "grad_norm": 0.73828125, "learning_rate": 4.187833686444197e-05, "loss": 1.0305, "step": 2466 }, { "epoch": 1.3248213147209826, "grad_norm": 0.6171875, "learning_rate": 4.187210595782628e-05, "loss": 0.8002, "step": 2467 }, { "epoch": 1.3253582094560585, "grad_norm": 0.65234375, "learning_rate": 4.186587312588964e-05, "loss": 0.9054, "step": 2468 }, { "epoch": 1.3258951041911344, "grad_norm": 0.62890625, "learning_rate": 4.185963836934329e-05, "loss": 0.9195, "step": 2469 }, { "epoch": 1.3264319989262106, "grad_norm": 0.71484375, "learning_rate": 4.185340168889868e-05, "loss": 0.9654, "step": 2470 }, { "epoch": 1.3269688936612865, "grad_norm": 0.65234375, "learning_rate": 4.1847163085267514e-05, "loss": 0.9004, "step": 2471 }, { "epoch": 1.3275057883963626, "grad_norm": 0.87109375, "learning_rate": 4.18409225591617e-05, "loss": 0.7906, "step": 2472 }, { "epoch": 1.3280426831314385, "grad_norm": 0.6640625, "learning_rate": 4.183468011129333e-05, "loss": 0.8954, "step": 2473 }, { "epoch": 1.3285795778665146, "grad_norm": 0.6875, "learning_rate": 4.182843574237477e-05, "loss": 0.9238, "step": 2474 }, { "epoch": 1.3291164726015905, "grad_norm": 0.70703125, "learning_rate": 4.182218945311858e-05, "loss": 0.8575, "step": 2475 }, { "epoch": 1.3296533673366666, "grad_norm": 0.6328125, "learning_rate": 4.181594124423754e-05, "loss": 0.8888, "step": 2476 }, { "epoch": 1.3301902620717425, "grad_norm": 0.828125, "learning_rate": 4.180969111644464e-05, "loss": 1.0395, "step": 2477 }, { "epoch": 1.3307271568068186, "grad_norm": 0.6640625, "learning_rate": 4.180343907045311e-05, "loss": 0.8024, "step": 2478 }, { "epoch": 1.3312640515418945, "grad_norm": 0.6953125, "learning_rate": 4.179718510697638e-05, "loss": 0.8117, "step": 2479 }, { "epoch": 1.3318009462769707, "grad_norm": 0.953125, "learning_rate": 4.1790929226728114e-05, "loss": 0.9042, "step": 2480 }, { "epoch": 1.3323378410120466, "grad_norm": 0.7578125, "learning_rate": 4.1784671430422185e-05, "loss": 0.975, "step": 2481 }, { "epoch": 1.3328747357471227, "grad_norm": 0.60546875, "learning_rate": 4.177841171877268e-05, "loss": 1.008, "step": 2482 }, { "epoch": 1.3334116304821986, "grad_norm": 0.6328125, "learning_rate": 4.177215009249392e-05, "loss": 0.9612, "step": 2483 }, { "epoch": 1.3339485252172745, "grad_norm": 0.875, "learning_rate": 4.1765886552300435e-05, "loss": 1.0852, "step": 2484 }, { "epoch": 1.3344854199523506, "grad_norm": 0.75, "learning_rate": 4.175962109890696e-05, "loss": 1.0471, "step": 2485 }, { "epoch": 1.3350223146874267, "grad_norm": 0.78125, "learning_rate": 4.175335373302849e-05, "loss": 0.964, "step": 2486 }, { "epoch": 1.3355592094225026, "grad_norm": 0.71484375, "learning_rate": 4.174708445538018e-05, "loss": 0.8976, "step": 2487 }, { "epoch": 1.3360961041575785, "grad_norm": 0.62890625, "learning_rate": 4.174081326667746e-05, "loss": 0.9157, "step": 2488 }, { "epoch": 1.3366329988926546, "grad_norm": 0.79296875, "learning_rate": 4.173454016763594e-05, "loss": 0.736, "step": 2489 }, { "epoch": 1.3371698936277305, "grad_norm": 2.0625, "learning_rate": 4.172826515897146e-05, "loss": 0.8345, "step": 2490 }, { "epoch": 1.3377067883628067, "grad_norm": 0.796875, "learning_rate": 4.172198824140008e-05, "loss": 0.8717, "step": 2491 }, { "epoch": 1.3382436830978826, "grad_norm": 0.83203125, "learning_rate": 4.1715709415638074e-05, "loss": 0.819, "step": 2492 }, { "epoch": 1.3387805778329587, "grad_norm": 0.69140625, "learning_rate": 4.170942868240193e-05, "loss": 1.0067, "step": 2493 }, { "epoch": 1.3393174725680346, "grad_norm": 0.6796875, "learning_rate": 4.170314604240837e-05, "loss": 0.8044, "step": 2494 }, { "epoch": 1.3398543673031107, "grad_norm": 0.6953125, "learning_rate": 4.1696861496374315e-05, "loss": 0.8712, "step": 2495 }, { "epoch": 1.3403912620381866, "grad_norm": 0.64453125, "learning_rate": 4.169057504501692e-05, "loss": 0.9447, "step": 2496 }, { "epoch": 1.3409281567732627, "grad_norm": 0.78515625, "learning_rate": 4.168428668905354e-05, "loss": 0.8681, "step": 2497 }, { "epoch": 1.3414650515083386, "grad_norm": 0.7734375, "learning_rate": 4.167799642920176e-05, "loss": 1.0327, "step": 2498 }, { "epoch": 1.3420019462434145, "grad_norm": 0.8203125, "learning_rate": 4.167170426617938e-05, "loss": 0.7492, "step": 2499 }, { "epoch": 1.3425388409784906, "grad_norm": 0.62109375, "learning_rate": 4.166541020070441e-05, "loss": 0.9581, "step": 2500 }, { "epoch": 1.3425388409784906, "eval_loss": 0.9149090051651001, "eval_runtime": 45.9495, "eval_samples_per_second": 20.936, "eval_steps_per_second": 20.936, "step": 2500 }, { "epoch": 1.3430757357135668, "grad_norm": 0.7578125, "learning_rate": 4.165911423349509e-05, "loss": 1.0437, "step": 2501 }, { "epoch": 1.3436126304486427, "grad_norm": 0.671875, "learning_rate": 4.1652816365269866e-05, "loss": 0.9115, "step": 2502 }, { "epoch": 1.3441495251837186, "grad_norm": 0.765625, "learning_rate": 4.1646516596747395e-05, "loss": 0.8741, "step": 2503 }, { "epoch": 1.3446864199187947, "grad_norm": 0.66796875, "learning_rate": 4.1640214928646584e-05, "loss": 0.857, "step": 2504 }, { "epoch": 1.3452233146538708, "grad_norm": 0.67578125, "learning_rate": 4.163391136168651e-05, "loss": 0.8804, "step": 2505 }, { "epoch": 1.3457602093889467, "grad_norm": 0.81640625, "learning_rate": 4.162760589658649e-05, "loss": 0.7245, "step": 2506 }, { "epoch": 1.3462971041240226, "grad_norm": 0.6171875, "learning_rate": 4.162129853406607e-05, "loss": 0.8196, "step": 2507 }, { "epoch": 1.3468339988590987, "grad_norm": 0.7421875, "learning_rate": 4.1614989274845004e-05, "loss": 1.0052, "step": 2508 }, { "epoch": 1.3473708935941746, "grad_norm": 1.1171875, "learning_rate": 4.160867811964324e-05, "loss": 0.9977, "step": 2509 }, { "epoch": 1.3479077883292507, "grad_norm": 0.6796875, "learning_rate": 4.160236506918098e-05, "loss": 0.9316, "step": 2510 }, { "epoch": 1.3484446830643266, "grad_norm": 0.6875, "learning_rate": 4.159605012417861e-05, "loss": 0.9938, "step": 2511 }, { "epoch": 1.3489815777994028, "grad_norm": 0.609375, "learning_rate": 4.158973328535675e-05, "loss": 0.8452, "step": 2512 }, { "epoch": 1.3495184725344787, "grad_norm": 0.671875, "learning_rate": 4.158341455343622e-05, "loss": 0.7538, "step": 2513 }, { "epoch": 1.3500553672695548, "grad_norm": 0.62109375, "learning_rate": 4.1577093929138087e-05, "loss": 0.8579, "step": 2514 }, { "epoch": 1.3505922620046307, "grad_norm": 0.6796875, "learning_rate": 4.1570771413183606e-05, "loss": 0.7398, "step": 2515 }, { "epoch": 1.3511291567397068, "grad_norm": 1.046875, "learning_rate": 4.156444700629425e-05, "loss": 1.0352, "step": 2516 }, { "epoch": 1.3516660514747827, "grad_norm": 0.70703125, "learning_rate": 4.155812070919172e-05, "loss": 0.9783, "step": 2517 }, { "epoch": 1.3522029462098586, "grad_norm": 0.7890625, "learning_rate": 4.155179252259793e-05, "loss": 0.8666, "step": 2518 }, { "epoch": 1.3527398409449347, "grad_norm": 0.76953125, "learning_rate": 4.1545462447234995e-05, "loss": 1.0115, "step": 2519 }, { "epoch": 1.3532767356800108, "grad_norm": 0.5859375, "learning_rate": 4.1539130483825266e-05, "loss": 0.8279, "step": 2520 }, { "epoch": 1.3538136304150867, "grad_norm": 0.69140625, "learning_rate": 4.1532796633091296e-05, "loss": 0.853, "step": 2521 }, { "epoch": 1.3543505251501626, "grad_norm": 0.6328125, "learning_rate": 4.152646089575587e-05, "loss": 0.7318, "step": 2522 }, { "epoch": 1.3548874198852388, "grad_norm": 0.77734375, "learning_rate": 4.152012327254196e-05, "loss": 0.8343, "step": 2523 }, { "epoch": 1.3554243146203149, "grad_norm": 0.67578125, "learning_rate": 4.1513783764172765e-05, "loss": 0.7412, "step": 2524 }, { "epoch": 1.3559612093553908, "grad_norm": 0.92578125, "learning_rate": 4.150744237137171e-05, "loss": 0.9574, "step": 2525 }, { "epoch": 1.3564981040904667, "grad_norm": 0.62890625, "learning_rate": 4.1501099094862444e-05, "loss": 0.8173, "step": 2526 }, { "epoch": 1.3570349988255428, "grad_norm": 0.63671875, "learning_rate": 4.14947539353688e-05, "loss": 0.8508, "step": 2527 }, { "epoch": 1.3575718935606187, "grad_norm": 2.046875, "learning_rate": 4.148840689361484e-05, "loss": 0.8877, "step": 2528 }, { "epoch": 1.3581087882956948, "grad_norm": 0.76171875, "learning_rate": 4.1482057970324837e-05, "loss": 0.935, "step": 2529 }, { "epoch": 1.3586456830307707, "grad_norm": 0.78125, "learning_rate": 4.1475707166223296e-05, "loss": 0.8328, "step": 2530 }, { "epoch": 1.3591825777658468, "grad_norm": 0.70703125, "learning_rate": 4.1469354482034924e-05, "loss": 0.8265, "step": 2531 }, { "epoch": 1.3597194725009227, "grad_norm": 0.68359375, "learning_rate": 4.146299991848464e-05, "loss": 0.8488, "step": 2532 }, { "epoch": 1.3602563672359989, "grad_norm": 0.609375, "learning_rate": 4.1456643476297566e-05, "loss": 0.6766, "step": 2533 }, { "epoch": 1.3607932619710748, "grad_norm": 0.62890625, "learning_rate": 4.145028515619907e-05, "loss": 0.9599, "step": 2534 }, { "epoch": 1.3613301567061509, "grad_norm": 0.66796875, "learning_rate": 4.144392495891473e-05, "loss": 0.7632, "step": 2535 }, { "epoch": 1.3618670514412268, "grad_norm": 0.7265625, "learning_rate": 4.143756288517028e-05, "loss": 0.8555, "step": 2536 }, { "epoch": 1.3624039461763027, "grad_norm": 0.6328125, "learning_rate": 4.143119893569175e-05, "loss": 0.8442, "step": 2537 }, { "epoch": 1.3629408409113788, "grad_norm": 0.9140625, "learning_rate": 4.142483311120534e-05, "loss": 1.082, "step": 2538 }, { "epoch": 1.363477735646455, "grad_norm": 0.73046875, "learning_rate": 4.1418465412437467e-05, "loss": 0.847, "step": 2539 }, { "epoch": 1.3640146303815308, "grad_norm": 0.9140625, "learning_rate": 4.1412095840114764e-05, "loss": 0.9993, "step": 2540 }, { "epoch": 1.3645515251166067, "grad_norm": 0.6796875, "learning_rate": 4.140572439496409e-05, "loss": 0.9044, "step": 2541 }, { "epoch": 1.3650884198516828, "grad_norm": 0.86328125, "learning_rate": 4.1399351077712496e-05, "loss": 0.9764, "step": 2542 }, { "epoch": 1.365625314586759, "grad_norm": 0.63671875, "learning_rate": 4.1392975889087266e-05, "loss": 0.7391, "step": 2543 }, { "epoch": 1.3661622093218349, "grad_norm": 0.70703125, "learning_rate": 4.138659882981588e-05, "loss": 0.7217, "step": 2544 }, { "epoch": 1.3666991040569108, "grad_norm": 0.64453125, "learning_rate": 4.138021990062606e-05, "loss": 0.909, "step": 2545 }, { "epoch": 1.3672359987919869, "grad_norm": 0.6796875, "learning_rate": 4.13738391022457e-05, "loss": 0.9635, "step": 2546 }, { "epoch": 1.3677728935270628, "grad_norm": 0.70703125, "learning_rate": 4.136745643540295e-05, "loss": 0.7904, "step": 2547 }, { "epoch": 1.368309788262139, "grad_norm": 1.0234375, "learning_rate": 4.136107190082613e-05, "loss": 0.8989, "step": 2548 }, { "epoch": 1.3688466829972148, "grad_norm": 0.71484375, "learning_rate": 4.135468549924382e-05, "loss": 0.744, "step": 2549 }, { "epoch": 1.369383577732291, "grad_norm": 0.6875, "learning_rate": 4.134829723138477e-05, "loss": 0.7616, "step": 2550 }, { "epoch": 1.369383577732291, "eval_loss": 0.9148359894752502, "eval_runtime": 46.512, "eval_samples_per_second": 20.683, "eval_steps_per_second": 20.683, "step": 2550 }, { "epoch": 1.3699204724673668, "grad_norm": 0.72265625, "learning_rate": 4.1341907097977985e-05, "loss": 0.7684, "step": 2551 }, { "epoch": 1.370457367202443, "grad_norm": 0.6015625, "learning_rate": 4.133551509975264e-05, "loss": 0.8445, "step": 2552 }, { "epoch": 1.3709942619375188, "grad_norm": 0.65625, "learning_rate": 4.132912123743815e-05, "loss": 1.0183, "step": 2553 }, { "epoch": 1.371531156672595, "grad_norm": 0.640625, "learning_rate": 4.132272551176413e-05, "loss": 0.7382, "step": 2554 }, { "epoch": 1.3720680514076709, "grad_norm": 0.76953125, "learning_rate": 4.131632792346042e-05, "loss": 0.8819, "step": 2555 }, { "epoch": 1.3726049461427468, "grad_norm": 0.875, "learning_rate": 4.130992847325707e-05, "loss": 0.826, "step": 2556 }, { "epoch": 1.3731418408778229, "grad_norm": 0.76171875, "learning_rate": 4.1303527161884336e-05, "loss": 0.8119, "step": 2557 }, { "epoch": 1.373678735612899, "grad_norm": 0.81640625, "learning_rate": 4.129712399007267e-05, "loss": 0.8714, "step": 2558 }, { "epoch": 1.374215630347975, "grad_norm": 0.8125, "learning_rate": 4.129071895855279e-05, "loss": 1.1081, "step": 2559 }, { "epoch": 1.3747525250830508, "grad_norm": 0.64453125, "learning_rate": 4.128431206805557e-05, "loss": 0.7996, "step": 2560 }, { "epoch": 1.375289419818127, "grad_norm": 0.75390625, "learning_rate": 4.12779033193121e-05, "loss": 1.1367, "step": 2561 }, { "epoch": 1.375826314553203, "grad_norm": 0.7890625, "learning_rate": 4.1271492713053725e-05, "loss": 0.8228, "step": 2562 }, { "epoch": 1.376363209288279, "grad_norm": 0.76171875, "learning_rate": 4.126508025001198e-05, "loss": 0.9288, "step": 2563 }, { "epoch": 1.3769001040233548, "grad_norm": 0.91015625, "learning_rate": 4.125866593091859e-05, "loss": 1.1671, "step": 2564 }, { "epoch": 1.377436998758431, "grad_norm": 0.8984375, "learning_rate": 4.1252249756505514e-05, "loss": 0.9771, "step": 2565 }, { "epoch": 1.3779738934935069, "grad_norm": 0.82421875, "learning_rate": 4.1245831727504925e-05, "loss": 0.9341, "step": 2566 }, { "epoch": 1.378510788228583, "grad_norm": 0.75, "learning_rate": 4.123941184464921e-05, "loss": 1.0186, "step": 2567 }, { "epoch": 1.3790476829636589, "grad_norm": 0.58984375, "learning_rate": 4.1232990108670935e-05, "loss": 0.7287, "step": 2568 }, { "epoch": 1.379584577698735, "grad_norm": 0.7734375, "learning_rate": 4.122656652030292e-05, "loss": 0.8416, "step": 2569 }, { "epoch": 1.380121472433811, "grad_norm": 0.71875, "learning_rate": 4.122014108027817e-05, "loss": 1.0048, "step": 2570 }, { "epoch": 1.380658367168887, "grad_norm": 0.703125, "learning_rate": 4.121371378932991e-05, "loss": 0.9034, "step": 2571 }, { "epoch": 1.381195261903963, "grad_norm": 0.61328125, "learning_rate": 4.120728464819158e-05, "loss": 0.7797, "step": 2572 }, { "epoch": 1.381732156639039, "grad_norm": 0.59375, "learning_rate": 4.1200853657596826e-05, "loss": 0.7304, "step": 2573 }, { "epoch": 1.382269051374115, "grad_norm": 0.59375, "learning_rate": 4.11944208182795e-05, "loss": 0.7157, "step": 2574 }, { "epoch": 1.3828059461091908, "grad_norm": 0.67578125, "learning_rate": 4.118798613097366e-05, "loss": 0.9234, "step": 2575 }, { "epoch": 1.383342840844267, "grad_norm": 0.8203125, "learning_rate": 4.118154959641362e-05, "loss": 0.9838, "step": 2576 }, { "epoch": 1.383879735579343, "grad_norm": 0.65234375, "learning_rate": 4.117511121533384e-05, "loss": 0.7853, "step": 2577 }, { "epoch": 1.384416630314419, "grad_norm": 0.62890625, "learning_rate": 4.116867098846903e-05, "loss": 0.8374, "step": 2578 }, { "epoch": 1.3849535250494949, "grad_norm": 1.046875, "learning_rate": 4.116222891655409e-05, "loss": 1.1056, "step": 2579 }, { "epoch": 1.385490419784571, "grad_norm": 0.65234375, "learning_rate": 4.1155785000324165e-05, "loss": 0.8676, "step": 2580 }, { "epoch": 1.3860273145196471, "grad_norm": 0.734375, "learning_rate": 4.1149339240514574e-05, "loss": 0.8654, "step": 2581 }, { "epoch": 1.386564209254723, "grad_norm": 0.84765625, "learning_rate": 4.1142891637860855e-05, "loss": 0.9874, "step": 2582 }, { "epoch": 1.387101103989799, "grad_norm": 0.58203125, "learning_rate": 4.113644219309877e-05, "loss": 0.8685, "step": 2583 }, { "epoch": 1.387637998724875, "grad_norm": 0.7578125, "learning_rate": 4.112999090696427e-05, "loss": 1.203, "step": 2584 }, { "epoch": 1.388174893459951, "grad_norm": 0.6953125, "learning_rate": 4.112353778019355e-05, "loss": 0.9733, "step": 2585 }, { "epoch": 1.388711788195027, "grad_norm": 0.62109375, "learning_rate": 4.111708281352298e-05, "loss": 0.8137, "step": 2586 }, { "epoch": 1.389248682930103, "grad_norm": 0.64453125, "learning_rate": 4.1110626007689145e-05, "loss": 0.9915, "step": 2587 }, { "epoch": 1.389785577665179, "grad_norm": 0.69140625, "learning_rate": 4.1104167363428866e-05, "loss": 0.8851, "step": 2588 }, { "epoch": 1.390322472400255, "grad_norm": 0.67578125, "learning_rate": 4.109770688147915e-05, "loss": 0.8586, "step": 2589 }, { "epoch": 1.390859367135331, "grad_norm": 0.71484375, "learning_rate": 4.109124456257721e-05, "loss": 0.6883, "step": 2590 }, { "epoch": 1.391396261870407, "grad_norm": 0.6640625, "learning_rate": 4.10847804074605e-05, "loss": 0.9585, "step": 2591 }, { "epoch": 1.3919331566054831, "grad_norm": 0.609375, "learning_rate": 4.1078314416866636e-05, "loss": 0.7392, "step": 2592 }, { "epoch": 1.392470051340559, "grad_norm": 0.640625, "learning_rate": 4.107184659153348e-05, "loss": 1.0139, "step": 2593 }, { "epoch": 1.393006946075635, "grad_norm": 0.6796875, "learning_rate": 4.1065376932199104e-05, "loss": 0.8706, "step": 2594 }, { "epoch": 1.393543840810711, "grad_norm": 0.875, "learning_rate": 4.1058905439601766e-05, "loss": 0.9932, "step": 2595 }, { "epoch": 1.3940807355457872, "grad_norm": 0.72265625, "learning_rate": 4.1052432114479946e-05, "loss": 0.8429, "step": 2596 }, { "epoch": 1.394617630280863, "grad_norm": 0.75, "learning_rate": 4.104595695757233e-05, "loss": 0.7455, "step": 2597 }, { "epoch": 1.395154525015939, "grad_norm": 0.68359375, "learning_rate": 4.103947996961783e-05, "loss": 0.6963, "step": 2598 }, { "epoch": 1.395691419751015, "grad_norm": 0.8984375, "learning_rate": 4.103300115135553e-05, "loss": 0.9878, "step": 2599 }, { "epoch": 1.3962283144860912, "grad_norm": 0.64453125, "learning_rate": 4.1026520503524765e-05, "loss": 0.77, "step": 2600 }, { "epoch": 1.3962283144860912, "eval_loss": 0.914279043674469, "eval_runtime": 46.8263, "eval_samples_per_second": 20.544, "eval_steps_per_second": 20.544, "step": 2600 }, { "epoch": 1.396765209221167, "grad_norm": 0.703125, "learning_rate": 4.102003802686505e-05, "loss": 0.8192, "step": 2601 }, { "epoch": 1.397302103956243, "grad_norm": 0.671875, "learning_rate": 4.1013553722116114e-05, "loss": 0.8407, "step": 2602 }, { "epoch": 1.3978389986913191, "grad_norm": 0.84375, "learning_rate": 4.100706759001791e-05, "loss": 0.9569, "step": 2603 }, { "epoch": 1.398375893426395, "grad_norm": 0.70703125, "learning_rate": 4.1000579631310574e-05, "loss": 1.0608, "step": 2604 }, { "epoch": 1.3989127881614711, "grad_norm": 0.75, "learning_rate": 4.099408984673447e-05, "loss": 0.8907, "step": 2605 }, { "epoch": 1.399449682896547, "grad_norm": 0.76953125, "learning_rate": 4.098759823703018e-05, "loss": 0.9854, "step": 2606 }, { "epoch": 1.3999865776316232, "grad_norm": 0.6171875, "learning_rate": 4.098110480293845e-05, "loss": 0.8458, "step": 2607 }, { "epoch": 1.400523472366699, "grad_norm": 0.6953125, "learning_rate": 4.097460954520028e-05, "loss": 0.7117, "step": 2608 }, { "epoch": 1.4010603671017752, "grad_norm": 0.72265625, "learning_rate": 4.0968112464556864e-05, "loss": 0.8811, "step": 2609 }, { "epoch": 1.401597261836851, "grad_norm": 0.6875, "learning_rate": 4.096161356174959e-05, "loss": 1.094, "step": 2610 }, { "epoch": 1.4021341565719272, "grad_norm": 0.73828125, "learning_rate": 4.095511283752007e-05, "loss": 0.8327, "step": 2611 }, { "epoch": 1.402671051307003, "grad_norm": 0.67578125, "learning_rate": 4.094861029261013e-05, "loss": 0.768, "step": 2612 }, { "epoch": 1.403207946042079, "grad_norm": 0.7265625, "learning_rate": 4.0942105927761776e-05, "loss": 0.8954, "step": 2613 }, { "epoch": 1.4037448407771551, "grad_norm": 0.7109375, "learning_rate": 4.093559974371725e-05, "loss": 0.8423, "step": 2614 }, { "epoch": 1.4042817355122312, "grad_norm": 0.6953125, "learning_rate": 4.092909174121898e-05, "loss": 0.8002, "step": 2615 }, { "epoch": 1.4048186302473071, "grad_norm": 0.83203125, "learning_rate": 4.0922581921009615e-05, "loss": 1.0378, "step": 2616 }, { "epoch": 1.405355524982383, "grad_norm": 0.65625, "learning_rate": 4.091607028383201e-05, "loss": 0.8302, "step": 2617 }, { "epoch": 1.4058924197174592, "grad_norm": 0.75, "learning_rate": 4.090955683042923e-05, "loss": 0.8857, "step": 2618 }, { "epoch": 1.4064293144525353, "grad_norm": 0.66015625, "learning_rate": 4.0903041561544534e-05, "loss": 0.8998, "step": 2619 }, { "epoch": 1.4069662091876112, "grad_norm": 0.734375, "learning_rate": 4.08965244779214e-05, "loss": 0.9169, "step": 2620 }, { "epoch": 1.407503103922687, "grad_norm": 0.6796875, "learning_rate": 4.0890005580303516e-05, "loss": 0.7286, "step": 2621 }, { "epoch": 1.4080399986577632, "grad_norm": 0.703125, "learning_rate": 4.088348486943476e-05, "loss": 0.8804, "step": 2622 }, { "epoch": 1.408576893392839, "grad_norm": 0.76953125, "learning_rate": 4.087696234605923e-05, "loss": 0.9798, "step": 2623 }, { "epoch": 1.4091137881279152, "grad_norm": 0.70703125, "learning_rate": 4.087043801092123e-05, "loss": 0.9896, "step": 2624 }, { "epoch": 1.4096506828629911, "grad_norm": 0.64453125, "learning_rate": 4.0863911864765274e-05, "loss": 0.9417, "step": 2625 }, { "epoch": 1.4101875775980672, "grad_norm": 0.69921875, "learning_rate": 4.0857383908336076e-05, "loss": 0.948, "step": 2626 }, { "epoch": 1.4107244723331431, "grad_norm": 0.66015625, "learning_rate": 4.085085414237856e-05, "loss": 0.9259, "step": 2627 }, { "epoch": 1.4112613670682193, "grad_norm": 0.55859375, "learning_rate": 4.084432256763784e-05, "loss": 0.7512, "step": 2628 }, { "epoch": 1.4117982618032952, "grad_norm": 0.7578125, "learning_rate": 4.083778918485926e-05, "loss": 0.8592, "step": 2629 }, { "epoch": 1.4123351565383713, "grad_norm": 0.84375, "learning_rate": 4.083125399478838e-05, "loss": 0.8346, "step": 2630 }, { "epoch": 1.4128720512734472, "grad_norm": 0.73828125, "learning_rate": 4.0824716998170926e-05, "loss": 0.8997, "step": 2631 }, { "epoch": 1.413408946008523, "grad_norm": 0.65234375, "learning_rate": 4.081817819575285e-05, "loss": 0.9205, "step": 2632 }, { "epoch": 1.4139458407435992, "grad_norm": 0.6875, "learning_rate": 4.0811637588280335e-05, "loss": 0.8972, "step": 2633 }, { "epoch": 1.4144827354786753, "grad_norm": 0.6875, "learning_rate": 4.0805095176499724e-05, "loss": 0.8703, "step": 2634 }, { "epoch": 1.4150196302137512, "grad_norm": 0.59765625, "learning_rate": 4.07985509611576e-05, "loss": 0.7751, "step": 2635 }, { "epoch": 1.415556524948827, "grad_norm": 0.640625, "learning_rate": 4.079200494300074e-05, "loss": 0.7056, "step": 2636 }, { "epoch": 1.4160934196839032, "grad_norm": 0.83984375, "learning_rate": 4.0785457122776125e-05, "loss": 0.8981, "step": 2637 }, { "epoch": 1.4166303144189794, "grad_norm": 0.66015625, "learning_rate": 4.077890750123095e-05, "loss": 0.8363, "step": 2638 }, { "epoch": 1.4171672091540553, "grad_norm": 0.83984375, "learning_rate": 4.07723560791126e-05, "loss": 0.9147, "step": 2639 }, { "epoch": 1.4177041038891312, "grad_norm": 0.7265625, "learning_rate": 4.0765802857168687e-05, "loss": 0.9733, "step": 2640 }, { "epoch": 1.4182409986242073, "grad_norm": 0.66796875, "learning_rate": 4.0759247836147005e-05, "loss": 0.9805, "step": 2641 }, { "epoch": 1.4187778933592832, "grad_norm": 0.7265625, "learning_rate": 4.075269101679557e-05, "loss": 0.7929, "step": 2642 }, { "epoch": 1.4193147880943593, "grad_norm": 0.80859375, "learning_rate": 4.0746132399862605e-05, "loss": 0.8564, "step": 2643 }, { "epoch": 1.4198516828294352, "grad_norm": 0.5546875, "learning_rate": 4.073957198609651e-05, "loss": 0.7691, "step": 2644 }, { "epoch": 1.4203885775645113, "grad_norm": 0.6796875, "learning_rate": 4.073300977624594e-05, "loss": 0.8256, "step": 2645 }, { "epoch": 1.4209254722995872, "grad_norm": 0.65234375, "learning_rate": 4.072644577105971e-05, "loss": 0.8678, "step": 2646 }, { "epoch": 1.4214623670346633, "grad_norm": 0.66015625, "learning_rate": 4.071987997128685e-05, "loss": 0.8509, "step": 2647 }, { "epoch": 1.4219992617697392, "grad_norm": 0.69140625, "learning_rate": 4.071331237767662e-05, "loss": 0.9182, "step": 2648 }, { "epoch": 1.4225361565048154, "grad_norm": 0.640625, "learning_rate": 4.070674299097845e-05, "loss": 0.7763, "step": 2649 }, { "epoch": 1.4230730512398913, "grad_norm": 0.71875, "learning_rate": 4.070017181194199e-05, "loss": 0.8474, "step": 2650 }, { "epoch": 1.4230730512398913, "eval_loss": 0.9133580923080444, "eval_runtime": 46.4653, "eval_samples_per_second": 20.704, "eval_steps_per_second": 20.704, "step": 2650 }, { "epoch": 1.4236099459749672, "grad_norm": 0.76953125, "learning_rate": 4.06935988413171e-05, "loss": 0.7508, "step": 2651 }, { "epoch": 1.4241468407100433, "grad_norm": 0.68359375, "learning_rate": 4.068702407985384e-05, "loss": 0.9745, "step": 2652 }, { "epoch": 1.4246837354451194, "grad_norm": 0.68359375, "learning_rate": 4.068044752830248e-05, "loss": 0.7427, "step": 2653 }, { "epoch": 1.4252206301801953, "grad_norm": 0.71875, "learning_rate": 4.067386918741347e-05, "loss": 0.9359, "step": 2654 }, { "epoch": 1.4257575249152712, "grad_norm": 0.671875, "learning_rate": 4.066728905793749e-05, "loss": 0.8267, "step": 2655 }, { "epoch": 1.4262944196503473, "grad_norm": 0.68359375, "learning_rate": 4.0660707140625434e-05, "loss": 0.8708, "step": 2656 }, { "epoch": 1.4268313143854234, "grad_norm": 0.6953125, "learning_rate": 4.065412343622836e-05, "loss": 0.9951, "step": 2657 }, { "epoch": 1.4273682091204993, "grad_norm": 0.6875, "learning_rate": 4.064753794549756e-05, "loss": 0.8867, "step": 2658 }, { "epoch": 1.4279051038555752, "grad_norm": 0.86328125, "learning_rate": 4.064095066918451e-05, "loss": 1.146, "step": 2659 }, { "epoch": 1.4284419985906514, "grad_norm": 0.84375, "learning_rate": 4.063436160804092e-05, "loss": 0.774, "step": 2660 }, { "epoch": 1.4289788933257273, "grad_norm": 0.79296875, "learning_rate": 4.0627770762818684e-05, "loss": 0.8943, "step": 2661 }, { "epoch": 1.4295157880608034, "grad_norm": 0.609375, "learning_rate": 4.0621178134269884e-05, "loss": 0.8359, "step": 2662 }, { "epoch": 1.4300526827958793, "grad_norm": 0.62890625, "learning_rate": 4.061458372314684e-05, "loss": 0.8759, "step": 2663 }, { "epoch": 1.4305895775309554, "grad_norm": 0.5859375, "learning_rate": 4.060798753020204e-05, "loss": 0.8779, "step": 2664 }, { "epoch": 1.4311264722660313, "grad_norm": 0.79296875, "learning_rate": 4.060138955618821e-05, "loss": 0.9663, "step": 2665 }, { "epoch": 1.4316633670011074, "grad_norm": 0.81640625, "learning_rate": 4.059478980185826e-05, "loss": 1.3966, "step": 2666 }, { "epoch": 1.4322002617361833, "grad_norm": 0.8046875, "learning_rate": 4.058818826796529e-05, "loss": 0.9054, "step": 2667 }, { "epoch": 1.4327371564712594, "grad_norm": 0.7265625, "learning_rate": 4.058158495526264e-05, "loss": 0.9305, "step": 2668 }, { "epoch": 1.4332740512063353, "grad_norm": 0.7578125, "learning_rate": 4.0574979864503814e-05, "loss": 0.9078, "step": 2669 }, { "epoch": 1.4338109459414112, "grad_norm": 0.72265625, "learning_rate": 4.056837299644255e-05, "loss": 0.8311, "step": 2670 }, { "epoch": 1.4343478406764874, "grad_norm": 0.6328125, "learning_rate": 4.056176435183277e-05, "loss": 0.8182, "step": 2671 }, { "epoch": 1.4348847354115635, "grad_norm": 0.6875, "learning_rate": 4.0555153931428605e-05, "loss": 0.7843, "step": 2672 }, { "epoch": 1.4354216301466394, "grad_norm": 0.87109375, "learning_rate": 4.0548541735984385e-05, "loss": 1.1244, "step": 2673 }, { "epoch": 1.4359585248817153, "grad_norm": 0.7578125, "learning_rate": 4.054192776625464e-05, "loss": 1.0792, "step": 2674 }, { "epoch": 1.4364954196167914, "grad_norm": 0.76953125, "learning_rate": 4.053531202299412e-05, "loss": 1.007, "step": 2675 }, { "epoch": 1.4370323143518675, "grad_norm": 0.90625, "learning_rate": 4.052869450695776e-05, "loss": 0.9834, "step": 2676 }, { "epoch": 1.4375692090869434, "grad_norm": 0.76171875, "learning_rate": 4.0522075218900704e-05, "loss": 0.9715, "step": 2677 }, { "epoch": 1.4381061038220193, "grad_norm": 0.67578125, "learning_rate": 4.051545415957829e-05, "loss": 0.7714, "step": 2678 }, { "epoch": 1.4386429985570954, "grad_norm": 0.640625, "learning_rate": 4.050883132974607e-05, "loss": 0.7648, "step": 2679 }, { "epoch": 1.4391798932921713, "grad_norm": 0.7265625, "learning_rate": 4.050220673015979e-05, "loss": 0.9879, "step": 2680 }, { "epoch": 1.4397167880272475, "grad_norm": 0.671875, "learning_rate": 4.0495580361575405e-05, "loss": 0.8947, "step": 2681 }, { "epoch": 1.4402536827623234, "grad_norm": 0.7109375, "learning_rate": 4.0488952224749066e-05, "loss": 0.8378, "step": 2682 }, { "epoch": 1.4407905774973995, "grad_norm": 0.796875, "learning_rate": 4.048232232043713e-05, "loss": 0.9889, "step": 2683 }, { "epoch": 1.4413274722324754, "grad_norm": 0.765625, "learning_rate": 4.0475690649396134e-05, "loss": 0.952, "step": 2684 }, { "epoch": 1.4418643669675515, "grad_norm": 0.828125, "learning_rate": 4.0469057212382864e-05, "loss": 1.0912, "step": 2685 }, { "epoch": 1.4424012617026274, "grad_norm": 0.703125, "learning_rate": 4.0462422010154266e-05, "loss": 0.73, "step": 2686 }, { "epoch": 1.4429381564377035, "grad_norm": 0.7890625, "learning_rate": 4.04557850434675e-05, "loss": 0.7924, "step": 2687 }, { "epoch": 1.4434750511727794, "grad_norm": 0.7265625, "learning_rate": 4.044914631307993e-05, "loss": 0.8306, "step": 2688 }, { "epoch": 1.4440119459078553, "grad_norm": 0.6953125, "learning_rate": 4.044250581974911e-05, "loss": 0.872, "step": 2689 }, { "epoch": 1.4445488406429314, "grad_norm": 0.67578125, "learning_rate": 4.0435863564232826e-05, "loss": 0.96, "step": 2690 }, { "epoch": 1.4450857353780076, "grad_norm": 0.73828125, "learning_rate": 4.042921954728902e-05, "loss": 0.9316, "step": 2691 }, { "epoch": 1.4456226301130835, "grad_norm": 0.640625, "learning_rate": 4.042257376967586e-05, "loss": 0.8192, "step": 2692 }, { "epoch": 1.4461595248481594, "grad_norm": 0.734375, "learning_rate": 4.041592623215174e-05, "loss": 0.9176, "step": 2693 }, { "epoch": 1.4466964195832355, "grad_norm": 0.70703125, "learning_rate": 4.04092769354752e-05, "loss": 0.829, "step": 2694 }, { "epoch": 1.4472333143183116, "grad_norm": 0.66015625, "learning_rate": 4.040262588040503e-05, "loss": 0.7773, "step": 2695 }, { "epoch": 1.4477702090533875, "grad_norm": 0.69921875, "learning_rate": 4.0395973067700176e-05, "loss": 0.9758, "step": 2696 }, { "epoch": 1.4483071037884634, "grad_norm": 0.703125, "learning_rate": 4.038931849811982e-05, "loss": 0.8894, "step": 2697 }, { "epoch": 1.4488439985235395, "grad_norm": 0.69921875, "learning_rate": 4.038266217242334e-05, "loss": 0.8639, "step": 2698 }, { "epoch": 1.4493808932586154, "grad_norm": 0.6640625, "learning_rate": 4.03760040913703e-05, "loss": 0.7328, "step": 2699 }, { "epoch": 1.4499177879936915, "grad_norm": 0.76171875, "learning_rate": 4.0369344255720474e-05, "loss": 0.8242, "step": 2700 }, { "epoch": 1.4499177879936915, "eval_loss": 0.9133481979370117, "eval_runtime": 46.0758, "eval_samples_per_second": 20.879, "eval_steps_per_second": 20.879, "step": 2700 }, { "epoch": 1.4504546827287674, "grad_norm": 0.71875, "learning_rate": 4.036268266623382e-05, "loss": 0.8669, "step": 2701 }, { "epoch": 1.4509915774638436, "grad_norm": 0.8671875, "learning_rate": 4.035601932367054e-05, "loss": 0.9937, "step": 2702 }, { "epoch": 1.4515284721989195, "grad_norm": 0.65234375, "learning_rate": 4.034935422879097e-05, "loss": 0.8431, "step": 2703 }, { "epoch": 1.4520653669339956, "grad_norm": 0.55859375, "learning_rate": 4.03426873823557e-05, "loss": 0.756, "step": 2704 }, { "epoch": 1.4526022616690715, "grad_norm": 0.6953125, "learning_rate": 4.033601878512551e-05, "loss": 0.6655, "step": 2705 }, { "epoch": 1.4531391564041476, "grad_norm": 0.6640625, "learning_rate": 4.0329348437861354e-05, "loss": 0.8231, "step": 2706 }, { "epoch": 1.4536760511392235, "grad_norm": 0.73046875, "learning_rate": 4.0322676341324415e-05, "loss": 0.9908, "step": 2707 }, { "epoch": 1.4542129458742994, "grad_norm": 0.64453125, "learning_rate": 4.031600249627606e-05, "loss": 0.9434, "step": 2708 }, { "epoch": 1.4547498406093755, "grad_norm": 0.62890625, "learning_rate": 4.030932690347785e-05, "loss": 0.781, "step": 2709 }, { "epoch": 1.4552867353444516, "grad_norm": 0.7109375, "learning_rate": 4.030264956369157e-05, "loss": 0.852, "step": 2710 }, { "epoch": 1.4558236300795275, "grad_norm": 0.671875, "learning_rate": 4.029597047767919e-05, "loss": 0.8335, "step": 2711 }, { "epoch": 1.4563605248146034, "grad_norm": 0.66796875, "learning_rate": 4.028928964620286e-05, "loss": 0.6993, "step": 2712 }, { "epoch": 1.4568974195496796, "grad_norm": 0.69921875, "learning_rate": 4.028260707002496e-05, "loss": 0.9187, "step": 2713 }, { "epoch": 1.4574343142847557, "grad_norm": 0.74609375, "learning_rate": 4.0275922749908055e-05, "loss": 0.9053, "step": 2714 }, { "epoch": 1.4579712090198316, "grad_norm": 0.67578125, "learning_rate": 4.026923668661491e-05, "loss": 0.9959, "step": 2715 }, { "epoch": 1.4585081037549075, "grad_norm": 0.53515625, "learning_rate": 4.026254888090849e-05, "loss": 0.6651, "step": 2716 }, { "epoch": 1.4590449984899836, "grad_norm": 0.6640625, "learning_rate": 4.025585933355196e-05, "loss": 0.9886, "step": 2717 }, { "epoch": 1.4595818932250595, "grad_norm": 0.734375, "learning_rate": 4.024916804530868e-05, "loss": 1.0021, "step": 2718 }, { "epoch": 1.4601187879601356, "grad_norm": 0.7109375, "learning_rate": 4.024247501694221e-05, "loss": 0.902, "step": 2719 }, { "epoch": 1.4606556826952115, "grad_norm": 0.734375, "learning_rate": 4.0235780249216314e-05, "loss": 1.1045, "step": 2720 }, { "epoch": 1.4611925774302876, "grad_norm": 0.7109375, "learning_rate": 4.022908374289494e-05, "loss": 0.9118, "step": 2721 }, { "epoch": 1.4617294721653635, "grad_norm": 0.7109375, "learning_rate": 4.022238549874226e-05, "loss": 0.7801, "step": 2722 }, { "epoch": 1.4622663669004397, "grad_norm": 0.7734375, "learning_rate": 4.02156855175226e-05, "loss": 0.8942, "step": 2723 }, { "epoch": 1.4628032616355156, "grad_norm": 0.73828125, "learning_rate": 4.020898380000055e-05, "loss": 0.8445, "step": 2724 }, { "epoch": 1.4633401563705917, "grad_norm": 0.609375, "learning_rate": 4.020228034694084e-05, "loss": 0.7591, "step": 2725 }, { "epoch": 1.4638770511056676, "grad_norm": 0.7109375, "learning_rate": 4.0195575159108414e-05, "loss": 0.9358, "step": 2726 }, { "epoch": 1.4644139458407435, "grad_norm": 0.734375, "learning_rate": 4.018886823726842e-05, "loss": 1.0574, "step": 2727 }, { "epoch": 1.4649508405758196, "grad_norm": 1.0078125, "learning_rate": 4.018215958218622e-05, "loss": 1.0147, "step": 2728 }, { "epoch": 1.4654877353108957, "grad_norm": 0.6171875, "learning_rate": 4.0175449194627343e-05, "loss": 0.7976, "step": 2729 }, { "epoch": 1.4660246300459716, "grad_norm": 0.69921875, "learning_rate": 4.0168737075357535e-05, "loss": 0.8205, "step": 2730 }, { "epoch": 1.4665615247810475, "grad_norm": 0.73828125, "learning_rate": 4.016202322514272e-05, "loss": 0.8443, "step": 2731 }, { "epoch": 1.4670984195161236, "grad_norm": 0.7578125, "learning_rate": 4.015530764474905e-05, "loss": 0.9848, "step": 2732 }, { "epoch": 1.4676353142511998, "grad_norm": 0.69140625, "learning_rate": 4.014859033494285e-05, "loss": 0.9751, "step": 2733 }, { "epoch": 1.4681722089862757, "grad_norm": 0.83984375, "learning_rate": 4.014187129649065e-05, "loss": 0.8844, "step": 2734 }, { "epoch": 1.4687091037213515, "grad_norm": 0.7421875, "learning_rate": 4.013515053015918e-05, "loss": 0.8448, "step": 2735 }, { "epoch": 1.4692459984564277, "grad_norm": 0.79296875, "learning_rate": 4.0128428036715365e-05, "loss": 1.0072, "step": 2736 }, { "epoch": 1.4697828931915036, "grad_norm": 0.58203125, "learning_rate": 4.012170381692632e-05, "loss": 0.7699, "step": 2737 }, { "epoch": 1.4703197879265797, "grad_norm": 0.71484375, "learning_rate": 4.011497787155938e-05, "loss": 0.9128, "step": 2738 }, { "epoch": 1.4708566826616556, "grad_norm": 0.77734375, "learning_rate": 4.0108250201382036e-05, "loss": 0.8606, "step": 2739 }, { "epoch": 1.4713935773967317, "grad_norm": 0.69140625, "learning_rate": 4.010152080716202e-05, "loss": 0.9566, "step": 2740 }, { "epoch": 1.4719304721318076, "grad_norm": 0.72265625, "learning_rate": 4.0094789689667234e-05, "loss": 0.8603, "step": 2741 }, { "epoch": 1.4724673668668837, "grad_norm": 0.6640625, "learning_rate": 4.008805684966578e-05, "loss": 0.6741, "step": 2742 }, { "epoch": 1.4730042616019596, "grad_norm": 0.63671875, "learning_rate": 4.008132228792597e-05, "loss": 0.8336, "step": 2743 }, { "epoch": 1.4735411563370358, "grad_norm": 0.72265625, "learning_rate": 4.00745860052163e-05, "loss": 1.0235, "step": 2744 }, { "epoch": 1.4740780510721117, "grad_norm": 0.65625, "learning_rate": 4.0067848002305455e-05, "loss": 0.9239, "step": 2745 }, { "epoch": 1.4746149458071875, "grad_norm": 0.74609375, "learning_rate": 4.0061108279962336e-05, "loss": 0.9004, "step": 2746 }, { "epoch": 1.4751518405422637, "grad_norm": 0.6484375, "learning_rate": 4.005436683895603e-05, "loss": 0.7801, "step": 2747 }, { "epoch": 1.4756887352773398, "grad_norm": 0.66796875, "learning_rate": 4.004762368005583e-05, "loss": 0.887, "step": 2748 }, { "epoch": 1.4762256300124157, "grad_norm": 0.6171875, "learning_rate": 4.004087880403119e-05, "loss": 0.6446, "step": 2749 }, { "epoch": 1.4767625247474916, "grad_norm": 0.65625, "learning_rate": 4.0034132211651806e-05, "loss": 0.8491, "step": 2750 }, { "epoch": 1.4767625247474916, "eval_loss": 0.9130702018737793, "eval_runtime": 45.7937, "eval_samples_per_second": 21.007, "eval_steps_per_second": 21.007, "step": 2750 }, { "epoch": 1.4772994194825677, "grad_norm": 0.7734375, "learning_rate": 4.002738390368754e-05, "loss": 0.9299, "step": 2751 }, { "epoch": 1.4778363142176438, "grad_norm": 0.640625, "learning_rate": 4.0020633880908476e-05, "loss": 0.8185, "step": 2752 }, { "epoch": 1.4783732089527197, "grad_norm": 0.6171875, "learning_rate": 4.0013882144084855e-05, "loss": 0.7305, "step": 2753 }, { "epoch": 1.4789101036877956, "grad_norm": 0.6640625, "learning_rate": 4.000712869398715e-05, "loss": 0.813, "step": 2754 }, { "epoch": 1.4794469984228718, "grad_norm": 0.765625, "learning_rate": 4.0000373531386015e-05, "loss": 0.8307, "step": 2755 }, { "epoch": 1.4799838931579476, "grad_norm": 0.671875, "learning_rate": 3.999361665705229e-05, "loss": 0.9109, "step": 2756 }, { "epoch": 1.4805207878930238, "grad_norm": 0.7109375, "learning_rate": 3.9986858071757026e-05, "loss": 0.9161, "step": 2757 }, { "epoch": 1.4810576826280997, "grad_norm": 0.8125, "learning_rate": 3.998009777627146e-05, "loss": 0.9108, "step": 2758 }, { "epoch": 1.4815945773631758, "grad_norm": 0.63671875, "learning_rate": 3.9973335771367035e-05, "loss": 0.905, "step": 2759 }, { "epoch": 1.4821314720982517, "grad_norm": 0.71875, "learning_rate": 3.9966572057815373e-05, "loss": 0.8476, "step": 2760 }, { "epoch": 1.4826683668333278, "grad_norm": 0.7578125, "learning_rate": 3.9959806636388314e-05, "loss": 0.9646, "step": 2761 }, { "epoch": 1.4832052615684037, "grad_norm": 0.6171875, "learning_rate": 3.995303950785786e-05, "loss": 0.8415, "step": 2762 }, { "epoch": 1.4837421563034798, "grad_norm": 0.609375, "learning_rate": 3.994627067299623e-05, "loss": 1.0265, "step": 2763 }, { "epoch": 1.4842790510385557, "grad_norm": 0.9140625, "learning_rate": 3.9939500132575834e-05, "loss": 1.058, "step": 2764 }, { "epoch": 1.4848159457736316, "grad_norm": 0.69140625, "learning_rate": 3.993272788736928e-05, "loss": 0.8433, "step": 2765 }, { "epoch": 1.4853528405087078, "grad_norm": 0.9140625, "learning_rate": 3.992595393814938e-05, "loss": 1.1677, "step": 2766 }, { "epoch": 1.4858897352437839, "grad_norm": 0.640625, "learning_rate": 3.9919178285689104e-05, "loss": 0.9373, "step": 2767 }, { "epoch": 1.4864266299788598, "grad_norm": 0.671875, "learning_rate": 3.9912400930761654e-05, "loss": 0.7869, "step": 2768 }, { "epoch": 1.4869635247139357, "grad_norm": 0.609375, "learning_rate": 3.99056218741404e-05, "loss": 0.7845, "step": 2769 }, { "epoch": 1.4875004194490118, "grad_norm": 0.77734375, "learning_rate": 3.989884111659893e-05, "loss": 0.8824, "step": 2770 }, { "epoch": 1.488037314184088, "grad_norm": 0.68359375, "learning_rate": 3.989205865891101e-05, "loss": 0.7785, "step": 2771 }, { "epoch": 1.4885742089191638, "grad_norm": 0.75390625, "learning_rate": 3.98852745018506e-05, "loss": 0.7917, "step": 2772 }, { "epoch": 1.4891111036542397, "grad_norm": 0.703125, "learning_rate": 3.987848864619186e-05, "loss": 1.0772, "step": 2773 }, { "epoch": 1.4896479983893158, "grad_norm": 0.64453125, "learning_rate": 3.9871701092709155e-05, "loss": 0.9172, "step": 2774 }, { "epoch": 1.4901848931243917, "grad_norm": 0.69140625, "learning_rate": 3.9864911842177024e-05, "loss": 0.9747, "step": 2775 }, { "epoch": 1.4907217878594679, "grad_norm": 0.6875, "learning_rate": 3.985812089537019e-05, "loss": 0.8586, "step": 2776 }, { "epoch": 1.4912586825945437, "grad_norm": 0.6640625, "learning_rate": 3.98513282530636e-05, "loss": 0.8061, "step": 2777 }, { "epoch": 1.4917955773296199, "grad_norm": 0.75390625, "learning_rate": 3.984453391603239e-05, "loss": 1.0231, "step": 2778 }, { "epoch": 1.4923324720646958, "grad_norm": 0.7265625, "learning_rate": 3.983773788505186e-05, "loss": 1.0016, "step": 2779 }, { "epoch": 1.492869366799772, "grad_norm": 0.72265625, "learning_rate": 3.9830940160897535e-05, "loss": 0.9346, "step": 2780 }, { "epoch": 1.4934062615348478, "grad_norm": 0.76171875, "learning_rate": 3.982414074434512e-05, "loss": 0.894, "step": 2781 }, { "epoch": 1.493943156269924, "grad_norm": 0.59375, "learning_rate": 3.9817339636170515e-05, "loss": 0.7455, "step": 2782 }, { "epoch": 1.4944800510049998, "grad_norm": 0.6796875, "learning_rate": 3.981053683714981e-05, "loss": 0.7339, "step": 2783 }, { "epoch": 1.4950169457400757, "grad_norm": 0.6796875, "learning_rate": 3.980373234805928e-05, "loss": 0.9382, "step": 2784 }, { "epoch": 1.4955538404751518, "grad_norm": 0.890625, "learning_rate": 3.979692616967543e-05, "loss": 0.7895, "step": 2785 }, { "epoch": 1.496090735210228, "grad_norm": 0.65625, "learning_rate": 3.9790118302774915e-05, "loss": 0.8784, "step": 2786 }, { "epoch": 1.4966276299453039, "grad_norm": 0.71875, "learning_rate": 3.97833087481346e-05, "loss": 0.9706, "step": 2787 }, { "epoch": 1.4971645246803797, "grad_norm": 0.75390625, "learning_rate": 3.9776497506531536e-05, "loss": 0.7745, "step": 2788 }, { "epoch": 1.4977014194154559, "grad_norm": 0.6875, "learning_rate": 3.976968457874298e-05, "loss": 0.8367, "step": 2789 }, { "epoch": 1.498238314150532, "grad_norm": 0.71875, "learning_rate": 3.976286996554637e-05, "loss": 0.9974, "step": 2790 }, { "epoch": 1.498775208885608, "grad_norm": 0.734375, "learning_rate": 3.975605366771934e-05, "loss": 0.974, "step": 2791 }, { "epoch": 1.4993121036206838, "grad_norm": 0.8515625, "learning_rate": 3.9749235686039724e-05, "loss": 0.957, "step": 2792 }, { "epoch": 1.49984899835576, "grad_norm": 0.73046875, "learning_rate": 3.974241602128552e-05, "loss": 0.8976, "step": 2793 }, { "epoch": 1.500385893090836, "grad_norm": 0.65234375, "learning_rate": 3.9735594674234956e-05, "loss": 0.8671, "step": 2794 }, { "epoch": 1.500922787825912, "grad_norm": 0.90234375, "learning_rate": 3.972877164566643e-05, "loss": 0.8571, "step": 2795 }, { "epoch": 1.5014596825609878, "grad_norm": 0.73828125, "learning_rate": 3.972194693635853e-05, "loss": 1.0287, "step": 2796 }, { "epoch": 1.501996577296064, "grad_norm": 0.94140625, "learning_rate": 3.9715120547090046e-05, "loss": 1.0705, "step": 2797 }, { "epoch": 1.5025334720311398, "grad_norm": 0.76953125, "learning_rate": 3.970829247863996e-05, "loss": 0.8678, "step": 2798 }, { "epoch": 1.5030703667662157, "grad_norm": 0.671875, "learning_rate": 3.970146273178743e-05, "loss": 0.9473, "step": 2799 }, { "epoch": 1.5036072615012919, "grad_norm": 0.69921875, "learning_rate": 3.969463130731183e-05, "loss": 0.8286, "step": 2800 }, { "epoch": 1.5036072615012919, "eval_loss": 0.9118887186050415, "eval_runtime": 46.142, "eval_samples_per_second": 20.849, "eval_steps_per_second": 20.849, "step": 2800 }, { "epoch": 1.504144156236368, "grad_norm": 0.7421875, "learning_rate": 3.9687798205992705e-05, "loss": 0.7714, "step": 2801 }, { "epoch": 1.504681050971444, "grad_norm": 0.8046875, "learning_rate": 3.96809634286098e-05, "loss": 0.9371, "step": 2802 }, { "epoch": 1.5052179457065198, "grad_norm": 0.640625, "learning_rate": 3.9674126975943045e-05, "loss": 1.0391, "step": 2803 }, { "epoch": 1.505754840441596, "grad_norm": 0.7890625, "learning_rate": 3.966728884877256e-05, "loss": 0.9517, "step": 2804 }, { "epoch": 1.506291735176672, "grad_norm": 0.734375, "learning_rate": 3.966044904787869e-05, "loss": 0.8353, "step": 2805 }, { "epoch": 1.506828629911748, "grad_norm": 0.703125, "learning_rate": 3.9653607574041914e-05, "loss": 0.8159, "step": 2806 }, { "epoch": 1.5073655246468238, "grad_norm": 0.7734375, "learning_rate": 3.964676442804294e-05, "loss": 0.9357, "step": 2807 }, { "epoch": 1.5079024193819, "grad_norm": 0.83203125, "learning_rate": 3.963991961066266e-05, "loss": 0.8781, "step": 2808 }, { "epoch": 1.508439314116976, "grad_norm": 0.671875, "learning_rate": 3.9633073122682156e-05, "loss": 0.7878, "step": 2809 }, { "epoch": 1.508976208852052, "grad_norm": 0.6796875, "learning_rate": 3.962622496488269e-05, "loss": 0.8794, "step": 2810 }, { "epoch": 1.5095131035871279, "grad_norm": 0.65625, "learning_rate": 3.961937513804573e-05, "loss": 0.9357, "step": 2811 }, { "epoch": 1.510049998322204, "grad_norm": 0.796875, "learning_rate": 3.961252364295294e-05, "loss": 0.9121, "step": 2812 }, { "epoch": 1.5105868930572801, "grad_norm": 0.67578125, "learning_rate": 3.9605670480386134e-05, "loss": 0.8773, "step": 2813 }, { "epoch": 1.511123787792356, "grad_norm": 0.9921875, "learning_rate": 3.9598815651127364e-05, "loss": 0.9364, "step": 2814 }, { "epoch": 1.511660682527432, "grad_norm": 0.671875, "learning_rate": 3.9591959155958855e-05, "loss": 0.8412, "step": 2815 }, { "epoch": 1.512197577262508, "grad_norm": 0.67578125, "learning_rate": 3.958510099566301e-05, "loss": 0.9101, "step": 2816 }, { "epoch": 1.512734471997584, "grad_norm": 0.63671875, "learning_rate": 3.9578241171022436e-05, "loss": 0.9992, "step": 2817 }, { "epoch": 1.5132713667326598, "grad_norm": 0.69140625, "learning_rate": 3.9571379682819925e-05, "loss": 0.8525, "step": 2818 }, { "epoch": 1.513808261467736, "grad_norm": 0.69921875, "learning_rate": 3.956451653183846e-05, "loss": 0.8469, "step": 2819 }, { "epoch": 1.514345156202812, "grad_norm": 0.66796875, "learning_rate": 3.9557651718861224e-05, "loss": 0.8208, "step": 2820 }, { "epoch": 1.514882050937888, "grad_norm": 0.671875, "learning_rate": 3.955078524467156e-05, "loss": 0.8166, "step": 2821 }, { "epoch": 1.5154189456729639, "grad_norm": 0.76171875, "learning_rate": 3.9543917110053025e-05, "loss": 0.9855, "step": 2822 }, { "epoch": 1.51595584040804, "grad_norm": 0.84765625, "learning_rate": 3.9537047315789366e-05, "loss": 0.8982, "step": 2823 }, { "epoch": 1.516492735143116, "grad_norm": 0.7578125, "learning_rate": 3.953017586266452e-05, "loss": 0.943, "step": 2824 }, { "epoch": 1.517029629878192, "grad_norm": 0.66015625, "learning_rate": 3.9523302751462586e-05, "loss": 0.8065, "step": 2825 }, { "epoch": 1.517566524613268, "grad_norm": 0.63671875, "learning_rate": 3.95164279829679e-05, "loss": 0.9145, "step": 2826 }, { "epoch": 1.518103419348344, "grad_norm": 0.625, "learning_rate": 3.950955155796493e-05, "loss": 0.7812, "step": 2827 }, { "epoch": 1.5186403140834202, "grad_norm": 0.6953125, "learning_rate": 3.950267347723838e-05, "loss": 0.9267, "step": 2828 }, { "epoch": 1.519177208818496, "grad_norm": 0.65234375, "learning_rate": 3.949579374157314e-05, "loss": 0.6184, "step": 2829 }, { "epoch": 1.519714103553572, "grad_norm": 0.75390625, "learning_rate": 3.948891235175425e-05, "loss": 0.9621, "step": 2830 }, { "epoch": 1.520250998288648, "grad_norm": 0.66015625, "learning_rate": 3.948202930856697e-05, "loss": 0.8378, "step": 2831 }, { "epoch": 1.5207878930237242, "grad_norm": 0.765625, "learning_rate": 3.947514461279676e-05, "loss": 0.8502, "step": 2832 }, { "epoch": 1.5213247877588, "grad_norm": 0.5390625, "learning_rate": 3.946825826522922e-05, "loss": 0.7776, "step": 2833 }, { "epoch": 1.521861682493876, "grad_norm": 0.72265625, "learning_rate": 3.946137026665019e-05, "loss": 0.9325, "step": 2834 }, { "epoch": 1.522398577228952, "grad_norm": 0.7421875, "learning_rate": 3.945448061784568e-05, "loss": 0.7118, "step": 2835 }, { "epoch": 1.522935471964028, "grad_norm": 0.64453125, "learning_rate": 3.944758931960188e-05, "loss": 0.943, "step": 2836 }, { "epoch": 1.523472366699104, "grad_norm": 0.69140625, "learning_rate": 3.944069637270517e-05, "loss": 0.9437, "step": 2837 }, { "epoch": 1.52400926143418, "grad_norm": 0.71875, "learning_rate": 3.943380177794213e-05, "loss": 0.9411, "step": 2838 }, { "epoch": 1.5245461561692562, "grad_norm": 0.7890625, "learning_rate": 3.942690553609951e-05, "loss": 0.9696, "step": 2839 }, { "epoch": 1.525083050904332, "grad_norm": 0.73828125, "learning_rate": 3.942000764796427e-05, "loss": 0.8598, "step": 2840 }, { "epoch": 1.525619945639408, "grad_norm": 0.62890625, "learning_rate": 3.9413108114323555e-05, "loss": 0.75, "step": 2841 }, { "epoch": 1.526156840374484, "grad_norm": 0.59765625, "learning_rate": 3.940620693596467e-05, "loss": 0.7114, "step": 2842 }, { "epoch": 1.5266937351095602, "grad_norm": 0.765625, "learning_rate": 3.939930411367513e-05, "loss": 0.9994, "step": 2843 }, { "epoch": 1.527230629844636, "grad_norm": 0.73828125, "learning_rate": 3.939239964824263e-05, "loss": 0.8982, "step": 2844 }, { "epoch": 1.527767524579712, "grad_norm": 0.76953125, "learning_rate": 3.938549354045508e-05, "loss": 0.8759, "step": 2845 }, { "epoch": 1.528304419314788, "grad_norm": 0.7265625, "learning_rate": 3.9378585791100545e-05, "loss": 1.2196, "step": 2846 }, { "epoch": 1.5288413140498642, "grad_norm": 0.7109375, "learning_rate": 3.937167640096726e-05, "loss": 0.8423, "step": 2847 }, { "epoch": 1.5293782087849401, "grad_norm": 0.69140625, "learning_rate": 3.936476537084371e-05, "loss": 0.821, "step": 2848 }, { "epoch": 1.529915103520016, "grad_norm": 0.7734375, "learning_rate": 3.935785270151851e-05, "loss": 0.7115, "step": 2849 }, { "epoch": 1.5304519982550921, "grad_norm": 0.6328125, "learning_rate": 3.935093839378049e-05, "loss": 0.7373, "step": 2850 }, { "epoch": 1.5304519982550921, "eval_loss": 0.9116076231002808, "eval_runtime": 46.0198, "eval_samples_per_second": 20.904, "eval_steps_per_second": 20.904, "step": 2850 }, { "epoch": 1.5309888929901683, "grad_norm": 0.77734375, "learning_rate": 3.934402244841866e-05, "loss": 1.0201, "step": 2851 }, { "epoch": 1.5315257877252442, "grad_norm": 0.7890625, "learning_rate": 3.933710486622222e-05, "loss": 1.0381, "step": 2852 }, { "epoch": 1.53206268246032, "grad_norm": 0.77734375, "learning_rate": 3.933018564798055e-05, "loss": 0.9738, "step": 2853 }, { "epoch": 1.5325995771953962, "grad_norm": 0.69921875, "learning_rate": 3.932326479448322e-05, "loss": 0.8155, "step": 2854 }, { "epoch": 1.533136471930472, "grad_norm": 0.75390625, "learning_rate": 3.9316342306519977e-05, "loss": 0.7815, "step": 2855 }, { "epoch": 1.533673366665548, "grad_norm": 0.79296875, "learning_rate": 3.9309418184880786e-05, "loss": 0.8796, "step": 2856 }, { "epoch": 1.534210261400624, "grad_norm": 0.609375, "learning_rate": 3.930249243035576e-05, "loss": 0.7324, "step": 2857 }, { "epoch": 1.5347471561357002, "grad_norm": 0.7265625, "learning_rate": 3.9295565043735226e-05, "loss": 0.8448, "step": 2858 }, { "epoch": 1.5352840508707761, "grad_norm": 0.765625, "learning_rate": 3.928863602580968e-05, "loss": 0.7994, "step": 2859 }, { "epoch": 1.535820945605852, "grad_norm": 0.95703125, "learning_rate": 3.928170537736981e-05, "loss": 0.9257, "step": 2860 }, { "epoch": 1.5363578403409281, "grad_norm": 0.69921875, "learning_rate": 3.9274773099206496e-05, "loss": 0.8972, "step": 2861 }, { "epoch": 1.5368947350760043, "grad_norm": 0.6171875, "learning_rate": 3.92678391921108e-05, "loss": 0.8927, "step": 2862 }, { "epoch": 1.5374316298110802, "grad_norm": 0.75390625, "learning_rate": 3.926090365687395e-05, "loss": 0.9338, "step": 2863 }, { "epoch": 1.537968524546156, "grad_norm": 0.76953125, "learning_rate": 3.9253966494287405e-05, "loss": 0.9745, "step": 2864 }, { "epoch": 1.5385054192812322, "grad_norm": 0.7265625, "learning_rate": 3.924702770514277e-05, "loss": 1.0386, "step": 2865 }, { "epoch": 1.5390423140163083, "grad_norm": 0.8515625, "learning_rate": 3.924008729023185e-05, "loss": 0.9243, "step": 2866 }, { "epoch": 1.5395792087513842, "grad_norm": 0.7578125, "learning_rate": 3.923314525034664e-05, "loss": 0.7536, "step": 2867 }, { "epoch": 1.54011610348646, "grad_norm": 0.6875, "learning_rate": 3.92262015862793e-05, "loss": 0.8662, "step": 2868 }, { "epoch": 1.5406529982215362, "grad_norm": 0.74609375, "learning_rate": 3.921925629882219e-05, "loss": 0.9451, "step": 2869 }, { "epoch": 1.5411898929566124, "grad_norm": 0.671875, "learning_rate": 3.921230938876788e-05, "loss": 0.9485, "step": 2870 }, { "epoch": 1.5417267876916882, "grad_norm": 0.7890625, "learning_rate": 3.920536085690908e-05, "loss": 1.1434, "step": 2871 }, { "epoch": 1.5422636824267641, "grad_norm": 0.65234375, "learning_rate": 3.919841070403871e-05, "loss": 1.1414, "step": 2872 }, { "epoch": 1.5428005771618403, "grad_norm": 0.734375, "learning_rate": 3.919145893094987e-05, "loss": 0.7403, "step": 2873 }, { "epoch": 1.5433374718969162, "grad_norm": 0.75390625, "learning_rate": 3.9184505538435844e-05, "loss": 0.9405, "step": 2874 }, { "epoch": 1.543874366631992, "grad_norm": 0.71875, "learning_rate": 3.917755052729011e-05, "loss": 0.7924, "step": 2875 }, { "epoch": 1.5444112613670682, "grad_norm": 0.671875, "learning_rate": 3.9170593898306306e-05, "loss": 0.7892, "step": 2876 }, { "epoch": 1.5449481561021443, "grad_norm": 0.68359375, "learning_rate": 3.916363565227829e-05, "loss": 0.7871, "step": 2877 }, { "epoch": 1.5454850508372202, "grad_norm": 0.8515625, "learning_rate": 3.915667579000007e-05, "loss": 1.0402, "step": 2878 }, { "epoch": 1.546021945572296, "grad_norm": 0.796875, "learning_rate": 3.914971431226587e-05, "loss": 1.0749, "step": 2879 }, { "epoch": 1.5465588403073722, "grad_norm": 0.69921875, "learning_rate": 3.9142751219870075e-05, "loss": 0.8598, "step": 2880 }, { "epoch": 1.5470957350424484, "grad_norm": 0.7109375, "learning_rate": 3.9135786513607264e-05, "loss": 0.9311, "step": 2881 }, { "epoch": 1.5476326297775242, "grad_norm": 0.75390625, "learning_rate": 3.912882019427219e-05, "loss": 0.9104, "step": 2882 }, { "epoch": 1.5481695245126001, "grad_norm": 0.546875, "learning_rate": 3.912185226265981e-05, "loss": 0.7468, "step": 2883 }, { "epoch": 1.5487064192476763, "grad_norm": 0.6015625, "learning_rate": 3.911488271956525e-05, "loss": 0.6917, "step": 2884 }, { "epoch": 1.5492433139827524, "grad_norm": 0.69140625, "learning_rate": 3.910791156578382e-05, "loss": 0.9262, "step": 2885 }, { "epoch": 1.5497802087178283, "grad_norm": 0.703125, "learning_rate": 3.910093880211101e-05, "loss": 0.9821, "step": 2886 }, { "epoch": 1.5503171034529042, "grad_norm": 0.6640625, "learning_rate": 3.909396442934252e-05, "loss": 0.8115, "step": 2887 }, { "epoch": 1.5508539981879803, "grad_norm": 0.6796875, "learning_rate": 3.90869884482742e-05, "loss": 1.065, "step": 2888 }, { "epoch": 1.5513908929230564, "grad_norm": 0.73828125, "learning_rate": 3.9080010859702086e-05, "loss": 0.8692, "step": 2889 }, { "epoch": 1.5519277876581323, "grad_norm": 0.80859375, "learning_rate": 3.9073031664422444e-05, "loss": 0.9388, "step": 2890 }, { "epoch": 1.5524646823932082, "grad_norm": 0.80078125, "learning_rate": 3.906605086323166e-05, "loss": 0.712, "step": 2891 }, { "epoch": 1.5530015771282843, "grad_norm": 0.73046875, "learning_rate": 3.905906845692634e-05, "loss": 0.9071, "step": 2892 }, { "epoch": 1.5535384718633602, "grad_norm": 0.73046875, "learning_rate": 3.905208444630327e-05, "loss": 0.8565, "step": 2893 }, { "epoch": 1.5540753665984361, "grad_norm": 0.5859375, "learning_rate": 3.9045098832159406e-05, "loss": 0.8977, "step": 2894 }, { "epoch": 1.5546122613335123, "grad_norm": 0.6484375, "learning_rate": 3.9038111615291894e-05, "loss": 0.9619, "step": 2895 }, { "epoch": 1.5551491560685884, "grad_norm": 0.9296875, "learning_rate": 3.9031122796498074e-05, "loss": 0.8718, "step": 2896 }, { "epoch": 1.5556860508036643, "grad_norm": 0.6171875, "learning_rate": 3.902413237657545e-05, "loss": 0.8214, "step": 2897 }, { "epoch": 1.5562229455387402, "grad_norm": 0.625, "learning_rate": 3.901714035632172e-05, "loss": 0.9797, "step": 2898 }, { "epoch": 1.5567598402738163, "grad_norm": 0.6640625, "learning_rate": 3.901014673653477e-05, "loss": 0.9068, "step": 2899 }, { "epoch": 1.5572967350088924, "grad_norm": 0.82421875, "learning_rate": 3.900315151801265e-05, "loss": 1.1709, "step": 2900 }, { "epoch": 1.5572967350088924, "eval_loss": 0.9109301567077637, "eval_runtime": 45.6924, "eval_samples_per_second": 21.054, "eval_steps_per_second": 21.054, "step": 2900 }, { "epoch": 1.5578336297439683, "grad_norm": 0.609375, "learning_rate": 3.899615470155361e-05, "loss": 0.8851, "step": 2901 }, { "epoch": 1.5583705244790442, "grad_norm": 0.6328125, "learning_rate": 3.898915628795606e-05, "loss": 0.8582, "step": 2902 }, { "epoch": 1.5589074192141203, "grad_norm": 0.65625, "learning_rate": 3.898215627801862e-05, "loss": 0.8282, "step": 2903 }, { "epoch": 1.5594443139491965, "grad_norm": 0.9140625, "learning_rate": 3.8975154672540094e-05, "loss": 0.8162, "step": 2904 }, { "epoch": 1.5599812086842724, "grad_norm": 0.74609375, "learning_rate": 3.8968151472319425e-05, "loss": 0.8415, "step": 2905 }, { "epoch": 1.5605181034193483, "grad_norm": 0.66015625, "learning_rate": 3.896114667815579e-05, "loss": 0.9145, "step": 2906 }, { "epoch": 1.5610549981544244, "grad_norm": 0.7265625, "learning_rate": 3.8954140290848505e-05, "loss": 0.9159, "step": 2907 }, { "epoch": 1.5615918928895005, "grad_norm": 0.7109375, "learning_rate": 3.8947132311197106e-05, "loss": 0.9453, "step": 2908 }, { "epoch": 1.5621287876245764, "grad_norm": 0.69921875, "learning_rate": 3.8940122740001285e-05, "loss": 0.9828, "step": 2909 }, { "epoch": 1.5626656823596523, "grad_norm": 0.7890625, "learning_rate": 3.893311157806091e-05, "loss": 0.8762, "step": 2910 }, { "epoch": 1.5632025770947284, "grad_norm": 0.67578125, "learning_rate": 3.892609882617606e-05, "loss": 1.0379, "step": 2911 }, { "epoch": 1.5637394718298043, "grad_norm": 0.6640625, "learning_rate": 3.8919084485146985e-05, "loss": 0.8265, "step": 2912 }, { "epoch": 1.5642763665648802, "grad_norm": 0.6328125, "learning_rate": 3.89120685557741e-05, "loss": 0.9004, "step": 2913 }, { "epoch": 1.5648132612999563, "grad_norm": 0.6875, "learning_rate": 3.8905051038857996e-05, "loss": 0.7486, "step": 2914 }, { "epoch": 1.5653501560350325, "grad_norm": 0.765625, "learning_rate": 3.889803193519948e-05, "loss": 0.9516, "step": 2915 }, { "epoch": 1.5658870507701084, "grad_norm": 0.703125, "learning_rate": 3.8891011245599515e-05, "loss": 0.9366, "step": 2916 }, { "epoch": 1.5664239455051843, "grad_norm": 0.78125, "learning_rate": 3.888398897085927e-05, "loss": 0.9448, "step": 2917 }, { "epoch": 1.5669608402402604, "grad_norm": 0.87890625, "learning_rate": 3.887696511178003e-05, "loss": 1.0717, "step": 2918 }, { "epoch": 1.5674977349753365, "grad_norm": 0.7265625, "learning_rate": 3.8869939669163345e-05, "loss": 0.9569, "step": 2919 }, { "epoch": 1.5680346297104124, "grad_norm": 0.671875, "learning_rate": 3.8862912643810895e-05, "loss": 0.7483, "step": 2920 }, { "epoch": 1.5685715244454883, "grad_norm": 0.74609375, "learning_rate": 3.885588403652455e-05, "loss": 1.0418, "step": 2921 }, { "epoch": 1.5691084191805644, "grad_norm": 0.6953125, "learning_rate": 3.884885384810637e-05, "loss": 0.7563, "step": 2922 }, { "epoch": 1.5696453139156405, "grad_norm": 0.9296875, "learning_rate": 3.884182207935858e-05, "loss": 1.0287, "step": 2923 }, { "epoch": 1.5701822086507164, "grad_norm": 0.75390625, "learning_rate": 3.883478873108361e-05, "loss": 0.752, "step": 2924 }, { "epoch": 1.5707191033857923, "grad_norm": 0.81640625, "learning_rate": 3.882775380408403e-05, "loss": 0.9046, "step": 2925 }, { "epoch": 1.5712559981208685, "grad_norm": 0.58203125, "learning_rate": 3.882071729916263e-05, "loss": 0.78, "step": 2926 }, { "epoch": 1.5717928928559446, "grad_norm": 0.8046875, "learning_rate": 3.8813679217122354e-05, "loss": 0.9863, "step": 2927 }, { "epoch": 1.5723297875910205, "grad_norm": 0.9140625, "learning_rate": 3.880663955876635e-05, "loss": 1.0235, "step": 2928 }, { "epoch": 1.5728666823260964, "grad_norm": 0.62890625, "learning_rate": 3.8799598324897933e-05, "loss": 0.8813, "step": 2929 }, { "epoch": 1.5734035770611725, "grad_norm": 0.73046875, "learning_rate": 3.8792555516320584e-05, "loss": 1.1182, "step": 2930 }, { "epoch": 1.5739404717962484, "grad_norm": 0.7265625, "learning_rate": 3.8785511133837975e-05, "loss": 0.9417, "step": 2931 }, { "epoch": 1.5744773665313243, "grad_norm": 0.71875, "learning_rate": 3.877846517825397e-05, "loss": 0.9112, "step": 2932 }, { "epoch": 1.5750142612664004, "grad_norm": 0.796875, "learning_rate": 3.87714176503726e-05, "loss": 0.8963, "step": 2933 }, { "epoch": 1.5755511560014765, "grad_norm": 0.70703125, "learning_rate": 3.876436855099807e-05, "loss": 0.7229, "step": 2934 }, { "epoch": 1.5760880507365524, "grad_norm": 0.62109375, "learning_rate": 3.8757317880934786e-05, "loss": 0.829, "step": 2935 }, { "epoch": 1.5766249454716283, "grad_norm": 0.71875, "learning_rate": 3.87502656409873e-05, "loss": 0.932, "step": 2936 }, { "epoch": 1.5771618402067045, "grad_norm": 0.7578125, "learning_rate": 3.874321183196038e-05, "loss": 0.9727, "step": 2937 }, { "epoch": 1.5776987349417806, "grad_norm": 0.93359375, "learning_rate": 3.8736156454658944e-05, "loss": 0.9234, "step": 2938 }, { "epoch": 1.5782356296768565, "grad_norm": 0.6328125, "learning_rate": 3.8729099509888104e-05, "loss": 0.9148, "step": 2939 }, { "epoch": 1.5787725244119324, "grad_norm": 0.67578125, "learning_rate": 3.872204099845313e-05, "loss": 0.8338, "step": 2940 }, { "epoch": 1.5793094191470085, "grad_norm": 0.68359375, "learning_rate": 3.871498092115952e-05, "loss": 0.8973, "step": 2941 }, { "epoch": 1.5798463138820846, "grad_norm": 0.63671875, "learning_rate": 3.8707919278812897e-05, "loss": 0.8941, "step": 2942 }, { "epoch": 1.5803832086171605, "grad_norm": 0.7109375, "learning_rate": 3.870085607221908e-05, "loss": 1.052, "step": 2943 }, { "epoch": 1.5809201033522364, "grad_norm": 0.671875, "learning_rate": 3.869379130218409e-05, "loss": 0.9589, "step": 2944 }, { "epoch": 1.5814569980873125, "grad_norm": 0.73046875, "learning_rate": 3.868672496951409e-05, "loss": 0.81, "step": 2945 }, { "epoch": 1.5819938928223887, "grad_norm": 0.59375, "learning_rate": 3.867965707501544e-05, "loss": 0.8769, "step": 2946 }, { "epoch": 1.5825307875574646, "grad_norm": 0.65234375, "learning_rate": 3.8672587619494674e-05, "loss": 0.8472, "step": 2947 }, { "epoch": 1.5830676822925405, "grad_norm": 0.734375, "learning_rate": 3.866551660375851e-05, "loss": 0.9401, "step": 2948 }, { "epoch": 1.5836045770276166, "grad_norm": 1.390625, "learning_rate": 3.8658444028613846e-05, "loss": 0.861, "step": 2949 }, { "epoch": 1.5841414717626925, "grad_norm": 0.67578125, "learning_rate": 3.865136989486776e-05, "loss": 0.918, "step": 2950 }, { "epoch": 1.5841414717626925, "eval_loss": 0.9100096821784973, "eval_runtime": 46.743, "eval_samples_per_second": 20.581, "eval_steps_per_second": 20.581, "step": 2950 }, { "epoch": 1.5846783664977684, "grad_norm": 0.64453125, "learning_rate": 3.864429420332748e-05, "loss": 0.8551, "step": 2951 }, { "epoch": 1.5852152612328445, "grad_norm": 0.77734375, "learning_rate": 3.8637216954800435e-05, "loss": 1.0583, "step": 2952 }, { "epoch": 1.5857521559679206, "grad_norm": 0.671875, "learning_rate": 3.8630138150094244e-05, "loss": 0.7534, "step": 2953 }, { "epoch": 1.5862890507029965, "grad_norm": 0.82421875, "learning_rate": 3.862305779001667e-05, "loss": 0.8902, "step": 2954 }, { "epoch": 1.5868259454380724, "grad_norm": 0.78515625, "learning_rate": 3.861597587537568e-05, "loss": 1.058, "step": 2955 }, { "epoch": 1.5873628401731485, "grad_norm": 0.74609375, "learning_rate": 3.8608892406979416e-05, "loss": 0.8657, "step": 2956 }, { "epoch": 1.5878997349082247, "grad_norm": 0.7265625, "learning_rate": 3.860180738563618e-05, "loss": 0.814, "step": 2957 }, { "epoch": 1.5884366296433006, "grad_norm": 0.796875, "learning_rate": 3.859472081215447e-05, "loss": 0.7906, "step": 2958 }, { "epoch": 1.5889735243783765, "grad_norm": 0.6953125, "learning_rate": 3.8587632687342954e-05, "loss": 0.8657, "step": 2959 }, { "epoch": 1.5895104191134526, "grad_norm": 0.78125, "learning_rate": 3.858054301201047e-05, "loss": 0.7898, "step": 2960 }, { "epoch": 1.5900473138485287, "grad_norm": 0.6484375, "learning_rate": 3.857345178696605e-05, "loss": 0.9045, "step": 2961 }, { "epoch": 1.5905842085836046, "grad_norm": 0.9296875, "learning_rate": 3.856635901301888e-05, "loss": 1.0102, "step": 2962 }, { "epoch": 1.5911211033186805, "grad_norm": 0.6875, "learning_rate": 3.855926469097836e-05, "loss": 1.0804, "step": 2963 }, { "epoch": 1.5916579980537566, "grad_norm": 0.73046875, "learning_rate": 3.855216882165401e-05, "loss": 0.9125, "step": 2964 }, { "epoch": 1.5921948927888327, "grad_norm": 0.66015625, "learning_rate": 3.854507140585557e-05, "loss": 0.7551, "step": 2965 }, { "epoch": 1.5927317875239086, "grad_norm": 0.6875, "learning_rate": 3.8537972444392955e-05, "loss": 0.7987, "step": 2966 }, { "epoch": 1.5932686822589845, "grad_norm": 0.765625, "learning_rate": 3.853087193807624e-05, "loss": 0.8164, "step": 2967 }, { "epoch": 1.5938055769940607, "grad_norm": 0.71484375, "learning_rate": 3.852376988771568e-05, "loss": 0.9392, "step": 2968 }, { "epoch": 1.5943424717291366, "grad_norm": 0.65625, "learning_rate": 3.851666629412172e-05, "loss": 0.7026, "step": 2969 }, { "epoch": 1.5948793664642125, "grad_norm": 0.703125, "learning_rate": 3.850956115810495e-05, "loss": 0.8727, "step": 2970 }, { "epoch": 1.5954162611992886, "grad_norm": 0.77734375, "learning_rate": 3.850245448047617e-05, "loss": 1.0059, "step": 2971 }, { "epoch": 1.5959531559343647, "grad_norm": 0.73828125, "learning_rate": 3.849534626204634e-05, "loss": 0.9154, "step": 2972 }, { "epoch": 1.5964900506694406, "grad_norm": 0.90625, "learning_rate": 3.8488236503626594e-05, "loss": 0.8564, "step": 2973 }, { "epoch": 1.5970269454045165, "grad_norm": 0.8125, "learning_rate": 3.848112520602825e-05, "loss": 0.8836, "step": 2974 }, { "epoch": 1.5975638401395926, "grad_norm": 0.69140625, "learning_rate": 3.84740123700628e-05, "loss": 0.6721, "step": 2975 }, { "epoch": 1.5981007348746687, "grad_norm": 0.703125, "learning_rate": 3.8466897996541894e-05, "loss": 0.8799, "step": 2976 }, { "epoch": 1.5986376296097446, "grad_norm": 10.875, "learning_rate": 3.845978208627739e-05, "loss": 1.0333, "step": 2977 }, { "epoch": 1.5991745243448205, "grad_norm": 0.67578125, "learning_rate": 3.845266464008129e-05, "loss": 0.9886, "step": 2978 }, { "epoch": 1.5997114190798967, "grad_norm": 0.69921875, "learning_rate": 3.844554565876579e-05, "loss": 0.7612, "step": 2979 }, { "epoch": 1.6002483138149728, "grad_norm": 0.83984375, "learning_rate": 3.8438425143143265e-05, "loss": 1.0969, "step": 2980 }, { "epoch": 1.6007852085500487, "grad_norm": 0.74609375, "learning_rate": 3.843130309402624e-05, "loss": 0.806, "step": 2981 }, { "epoch": 1.6013221032851246, "grad_norm": 0.671875, "learning_rate": 3.842417951222744e-05, "loss": 0.7616, "step": 2982 }, { "epoch": 1.6018589980202007, "grad_norm": 0.67578125, "learning_rate": 3.8417054398559755e-05, "loss": 0.8587, "step": 2983 }, { "epoch": 1.6023958927552768, "grad_norm": 0.734375, "learning_rate": 3.840992775383626e-05, "loss": 0.9865, "step": 2984 }, { "epoch": 1.6029327874903525, "grad_norm": 0.65625, "learning_rate": 3.840279957887017e-05, "loss": 0.8252, "step": 2985 }, { "epoch": 1.6034696822254286, "grad_norm": 0.6875, "learning_rate": 3.8395669874474915e-05, "loss": 0.8268, "step": 2986 }, { "epoch": 1.6040065769605047, "grad_norm": 0.62109375, "learning_rate": 3.8388538641464086e-05, "loss": 0.8161, "step": 2987 }, { "epoch": 1.6045434716955806, "grad_norm": 0.671875, "learning_rate": 3.838140588065145e-05, "loss": 0.9033, "step": 2988 }, { "epoch": 1.6050803664306565, "grad_norm": 0.6484375, "learning_rate": 3.8374271592850954e-05, "loss": 0.7718, "step": 2989 }, { "epoch": 1.6056172611657327, "grad_norm": 0.63671875, "learning_rate": 3.836713577887668e-05, "loss": 0.6896, "step": 2990 }, { "epoch": 1.6061541559008088, "grad_norm": 0.7734375, "learning_rate": 3.835999843954294e-05, "loss": 0.9686, "step": 2991 }, { "epoch": 1.6066910506358847, "grad_norm": 0.58984375, "learning_rate": 3.835285957566418e-05, "loss": 0.675, "step": 2992 }, { "epoch": 1.6072279453709606, "grad_norm": 0.85546875, "learning_rate": 3.834571918805505e-05, "loss": 1.0189, "step": 2993 }, { "epoch": 1.6077648401060367, "grad_norm": 0.734375, "learning_rate": 3.833857727753035e-05, "loss": 0.7767, "step": 2994 }, { "epoch": 1.6083017348411128, "grad_norm": 0.66796875, "learning_rate": 3.833143384490506e-05, "loss": 0.872, "step": 2995 }, { "epoch": 1.6088386295761887, "grad_norm": 0.74609375, "learning_rate": 3.832428889099434e-05, "loss": 0.7987, "step": 2996 }, { "epoch": 1.6093755243112646, "grad_norm": 0.5859375, "learning_rate": 3.8317142416613515e-05, "loss": 0.7762, "step": 2997 }, { "epoch": 1.6099124190463407, "grad_norm": 0.84375, "learning_rate": 3.83099944225781e-05, "loss": 0.8574, "step": 2998 }, { "epoch": 1.6104493137814169, "grad_norm": 0.69140625, "learning_rate": 3.830284490970376e-05, "loss": 0.8994, "step": 2999 }, { "epoch": 1.6109862085164928, "grad_norm": 0.73046875, "learning_rate": 3.8295693878806347e-05, "loss": 0.8682, "step": 3000 }, { "epoch": 1.6109862085164928, "eval_loss": 0.9103614091873169, "eval_runtime": 47.1803, "eval_samples_per_second": 20.39, "eval_steps_per_second": 20.39, "step": 3000 }, { "epoch": 1.6115231032515687, "grad_norm": 0.734375, "learning_rate": 3.828854133070189e-05, "loss": 1.0039, "step": 3001 }, { "epoch": 1.6120599979866448, "grad_norm": 0.77734375, "learning_rate": 3.828138726620658e-05, "loss": 0.8209, "step": 3002 }, { "epoch": 1.612596892721721, "grad_norm": 0.7421875, "learning_rate": 3.827423168613679e-05, "loss": 1.0899, "step": 3003 }, { "epoch": 1.6131337874567966, "grad_norm": 0.875, "learning_rate": 3.826707459130906e-05, "loss": 1.0434, "step": 3004 }, { "epoch": 1.6136706821918727, "grad_norm": 0.68359375, "learning_rate": 3.82599159825401e-05, "loss": 1.0561, "step": 3005 }, { "epoch": 1.6142075769269488, "grad_norm": 0.70703125, "learning_rate": 3.8252755860646804e-05, "loss": 0.8762, "step": 3006 }, { "epoch": 1.6147444716620247, "grad_norm": 0.578125, "learning_rate": 3.824559422644624e-05, "loss": 0.671, "step": 3007 }, { "epoch": 1.6152813663971006, "grad_norm": 0.69140625, "learning_rate": 3.823843108075562e-05, "loss": 0.8227, "step": 3008 }, { "epoch": 1.6158182611321767, "grad_norm": 0.62109375, "learning_rate": 3.8231266424392375e-05, "loss": 0.7786, "step": 3009 }, { "epoch": 1.6163551558672529, "grad_norm": 0.734375, "learning_rate": 3.822410025817406e-05, "loss": 0.8902, "step": 3010 }, { "epoch": 1.6168920506023288, "grad_norm": 0.6015625, "learning_rate": 3.821693258291844e-05, "loss": 0.7087, "step": 3011 }, { "epoch": 1.6174289453374047, "grad_norm": 0.8359375, "learning_rate": 3.820976339944344e-05, "loss": 1.0657, "step": 3012 }, { "epoch": 1.6179658400724808, "grad_norm": 0.89453125, "learning_rate": 3.820259270856714e-05, "loss": 0.85, "step": 3013 }, { "epoch": 1.618502734807557, "grad_norm": 0.78515625, "learning_rate": 3.819542051110782e-05, "loss": 1.1769, "step": 3014 }, { "epoch": 1.6190396295426328, "grad_norm": 0.68359375, "learning_rate": 3.818824680788391e-05, "loss": 0.928, "step": 3015 }, { "epoch": 1.6195765242777087, "grad_norm": 0.609375, "learning_rate": 3.818107159971402e-05, "loss": 0.8648, "step": 3016 }, { "epoch": 1.6201134190127848, "grad_norm": 0.640625, "learning_rate": 3.8173894887416945e-05, "loss": 0.9321, "step": 3017 }, { "epoch": 1.620650313747861, "grad_norm": 0.71484375, "learning_rate": 3.816671667181162e-05, "loss": 0.976, "step": 3018 }, { "epoch": 1.6211872084829368, "grad_norm": 0.93359375, "learning_rate": 3.8159536953717185e-05, "loss": 0.9066, "step": 3019 }, { "epoch": 1.6217241032180127, "grad_norm": 0.7578125, "learning_rate": 3.8152355733952925e-05, "loss": 0.8672, "step": 3020 }, { "epoch": 1.6222609979530889, "grad_norm": 0.75390625, "learning_rate": 3.814517301333832e-05, "loss": 0.8575, "step": 3021 }, { "epoch": 1.622797892688165, "grad_norm": 0.5625, "learning_rate": 3.8137988792693006e-05, "loss": 0.7132, "step": 3022 }, { "epoch": 1.6233347874232407, "grad_norm": 0.70703125, "learning_rate": 3.8130803072836796e-05, "loss": 0.8218, "step": 3023 }, { "epoch": 1.6238716821583168, "grad_norm": 0.81640625, "learning_rate": 3.8123615854589657e-05, "loss": 0.8358, "step": 3024 }, { "epoch": 1.624408576893393, "grad_norm": 0.765625, "learning_rate": 3.8116427138771746e-05, "loss": 0.8898, "step": 3025 }, { "epoch": 1.6249454716284688, "grad_norm": 0.609375, "learning_rate": 3.810923692620341e-05, "loss": 0.8229, "step": 3026 }, { "epoch": 1.6254823663635447, "grad_norm": 0.8125, "learning_rate": 3.810204521770512e-05, "loss": 0.9023, "step": 3027 }, { "epoch": 1.6260192610986208, "grad_norm": 0.63671875, "learning_rate": 3.8094852014097544e-05, "loss": 0.7394, "step": 3028 }, { "epoch": 1.626556155833697, "grad_norm": 0.76171875, "learning_rate": 3.808765731620152e-05, "loss": 0.8538, "step": 3029 }, { "epoch": 1.6270930505687728, "grad_norm": 0.76953125, "learning_rate": 3.808046112483805e-05, "loss": 1.0332, "step": 3030 }, { "epoch": 1.6276299453038487, "grad_norm": 0.6875, "learning_rate": 3.807326344082832e-05, "loss": 0.7159, "step": 3031 }, { "epoch": 1.6281668400389249, "grad_norm": 0.6796875, "learning_rate": 3.8066064264993674e-05, "loss": 0.8199, "step": 3032 }, { "epoch": 1.628703734774001, "grad_norm": 0.8515625, "learning_rate": 3.805886359815563e-05, "loss": 0.8384, "step": 3033 }, { "epoch": 1.6292406295090769, "grad_norm": 0.6484375, "learning_rate": 3.8051661441135864e-05, "loss": 0.8304, "step": 3034 }, { "epoch": 1.6297775242441528, "grad_norm": 0.74609375, "learning_rate": 3.804445779475624e-05, "loss": 0.8786, "step": 3035 }, { "epoch": 1.630314418979229, "grad_norm": 0.73828125, "learning_rate": 3.803725265983879e-05, "loss": 0.8432, "step": 3036 }, { "epoch": 1.630851313714305, "grad_norm": 0.75390625, "learning_rate": 3.803004603720571e-05, "loss": 0.896, "step": 3037 }, { "epoch": 1.631388208449381, "grad_norm": 0.68359375, "learning_rate": 3.8022837927679373e-05, "loss": 0.7521, "step": 3038 }, { "epoch": 1.6319251031844568, "grad_norm": 0.6171875, "learning_rate": 3.8015628332082295e-05, "loss": 0.8765, "step": 3039 }, { "epoch": 1.632461997919533, "grad_norm": 1.078125, "learning_rate": 3.80084172512372e-05, "loss": 1.0539, "step": 3040 }, { "epoch": 1.632998892654609, "grad_norm": 0.65625, "learning_rate": 3.800120468596696e-05, "loss": 0.889, "step": 3041 }, { "epoch": 1.6335357873896847, "grad_norm": 0.640625, "learning_rate": 3.799399063709461e-05, "loss": 0.7775, "step": 3042 }, { "epoch": 1.6340726821247609, "grad_norm": 0.62890625, "learning_rate": 3.798677510544338e-05, "loss": 0.7852, "step": 3043 }, { "epoch": 1.634609576859837, "grad_norm": 0.69921875, "learning_rate": 3.797955809183665e-05, "loss": 1.0743, "step": 3044 }, { "epoch": 1.6351464715949129, "grad_norm": 0.78515625, "learning_rate": 3.797233959709796e-05, "loss": 0.8473, "step": 3045 }, { "epoch": 1.6356833663299888, "grad_norm": 1.3515625, "learning_rate": 3.796511962205104e-05, "loss": 0.8339, "step": 3046 }, { "epoch": 1.636220261065065, "grad_norm": 0.671875, "learning_rate": 3.795789816751979e-05, "loss": 0.7525, "step": 3047 }, { "epoch": 1.636757155800141, "grad_norm": 0.62890625, "learning_rate": 3.795067523432826e-05, "loss": 0.7052, "step": 3048 }, { "epoch": 1.637294050535217, "grad_norm": 0.68359375, "learning_rate": 3.794345082330067e-05, "loss": 0.8761, "step": 3049 }, { "epoch": 1.6378309452702928, "grad_norm": 0.62890625, "learning_rate": 3.7936224935261434e-05, "loss": 0.7289, "step": 3050 }, { "epoch": 1.6378309452702928, "eval_loss": 0.9097536206245422, "eval_runtime": 46.8458, "eval_samples_per_second": 20.535, "eval_steps_per_second": 20.535, "step": 3050 }, { "epoch": 1.638367840005369, "grad_norm": 0.71875, "learning_rate": 3.792899757103511e-05, "loss": 1.0245, "step": 3051 }, { "epoch": 1.638904734740445, "grad_norm": 0.7109375, "learning_rate": 3.792176873144642e-05, "loss": 0.8299, "step": 3052 }, { "epoch": 1.639441629475521, "grad_norm": 0.80078125, "learning_rate": 3.791453841732028e-05, "loss": 0.9915, "step": 3053 }, { "epoch": 1.6399785242105969, "grad_norm": 0.7578125, "learning_rate": 3.790730662948176e-05, "loss": 0.9616, "step": 3054 }, { "epoch": 1.640515418945673, "grad_norm": 0.67578125, "learning_rate": 3.79000733687561e-05, "loss": 0.8969, "step": 3055 }, { "epoch": 1.641052313680749, "grad_norm": 0.69140625, "learning_rate": 3.7892838635968696e-05, "loss": 0.8711, "step": 3056 }, { "epoch": 1.641589208415825, "grad_norm": 0.72265625, "learning_rate": 3.788560243194513e-05, "loss": 0.9227, "step": 3057 }, { "epoch": 1.642126103150901, "grad_norm": 0.59375, "learning_rate": 3.787836475751114e-05, "loss": 0.8242, "step": 3058 }, { "epoch": 1.642662997885977, "grad_norm": 0.66015625, "learning_rate": 3.7871125613492654e-05, "loss": 0.8317, "step": 3059 }, { "epoch": 1.6431998926210531, "grad_norm": 0.71484375, "learning_rate": 3.786388500071572e-05, "loss": 0.955, "step": 3060 }, { "epoch": 1.6437367873561288, "grad_norm": 0.7109375, "learning_rate": 3.785664292000661e-05, "loss": 0.8222, "step": 3061 }, { "epoch": 1.644273682091205, "grad_norm": 0.7265625, "learning_rate": 3.784939937219172e-05, "loss": 0.9555, "step": 3062 }, { "epoch": 1.644810576826281, "grad_norm": 0.640625, "learning_rate": 3.7842154358097635e-05, "loss": 0.6415, "step": 3063 }, { "epoch": 1.645347471561357, "grad_norm": 0.5703125, "learning_rate": 3.7834907878551116e-05, "loss": 0.7773, "step": 3064 }, { "epoch": 1.6458843662964329, "grad_norm": 0.78125, "learning_rate": 3.782765993437906e-05, "loss": 0.668, "step": 3065 }, { "epoch": 1.646421261031509, "grad_norm": 0.6953125, "learning_rate": 3.7820410526408556e-05, "loss": 1.02, "step": 3066 }, { "epoch": 1.646958155766585, "grad_norm": 0.58203125, "learning_rate": 3.781315965546686e-05, "loss": 0.6888, "step": 3067 }, { "epoch": 1.647495050501661, "grad_norm": 0.76171875, "learning_rate": 3.780590732238137e-05, "loss": 0.8736, "step": 3068 }, { "epoch": 1.648031945236737, "grad_norm": 0.796875, "learning_rate": 3.779865352797969e-05, "loss": 1.0496, "step": 3069 }, { "epoch": 1.648568839971813, "grad_norm": 0.58203125, "learning_rate": 3.779139827308956e-05, "loss": 0.8068, "step": 3070 }, { "epoch": 1.6491057347068891, "grad_norm": 0.83203125, "learning_rate": 3.778414155853889e-05, "loss": 1.0234, "step": 3071 }, { "epoch": 1.649642629441965, "grad_norm": 0.6015625, "learning_rate": 3.777688338515578e-05, "loss": 0.7916, "step": 3072 }, { "epoch": 1.650179524177041, "grad_norm": 0.65234375, "learning_rate": 3.7769623753768455e-05, "loss": 1.0565, "step": 3073 }, { "epoch": 1.650716418912117, "grad_norm": 0.8046875, "learning_rate": 3.7762362665205364e-05, "loss": 1.0026, "step": 3074 }, { "epoch": 1.6512533136471932, "grad_norm": 0.7265625, "learning_rate": 3.7755100120295064e-05, "loss": 0.9098, "step": 3075 }, { "epoch": 1.651790208382269, "grad_norm": 0.73828125, "learning_rate": 3.7747836119866305e-05, "loss": 0.8866, "step": 3076 }, { "epoch": 1.652327103117345, "grad_norm": 0.71875, "learning_rate": 3.774057066474801e-05, "loss": 0.7804, "step": 3077 }, { "epoch": 1.652863997852421, "grad_norm": 0.65234375, "learning_rate": 3.7733303755769255e-05, "loss": 1.0191, "step": 3078 }, { "epoch": 1.6534008925874972, "grad_norm": 0.55859375, "learning_rate": 3.7726035393759285e-05, "loss": 0.6412, "step": 3079 }, { "epoch": 1.653937787322573, "grad_norm": 0.6328125, "learning_rate": 3.771876557954752e-05, "loss": 0.8218, "step": 3080 }, { "epoch": 1.654474682057649, "grad_norm": 0.734375, "learning_rate": 3.771149431396353e-05, "loss": 0.9572, "step": 3081 }, { "epoch": 1.6550115767927251, "grad_norm": 0.6640625, "learning_rate": 3.7704221597837055e-05, "loss": 0.894, "step": 3082 }, { "epoch": 1.655548471527801, "grad_norm": 0.625, "learning_rate": 3.7696947431998015e-05, "loss": 0.8156, "step": 3083 }, { "epoch": 1.656085366262877, "grad_norm": 0.74609375, "learning_rate": 3.7689671817276477e-05, "loss": 0.9495, "step": 3084 }, { "epoch": 1.656622260997953, "grad_norm": 0.71484375, "learning_rate": 3.768239475450269e-05, "loss": 0.7342, "step": 3085 }, { "epoch": 1.6571591557330292, "grad_norm": 0.77734375, "learning_rate": 3.767511624450705e-05, "loss": 0.7942, "step": 3086 }, { "epoch": 1.657696050468105, "grad_norm": 0.703125, "learning_rate": 3.766783628812012e-05, "loss": 0.7667, "step": 3087 }, { "epoch": 1.658232945203181, "grad_norm": 0.93359375, "learning_rate": 3.766055488617265e-05, "loss": 1.1026, "step": 3088 }, { "epoch": 1.658769839938257, "grad_norm": 0.84765625, "learning_rate": 3.765327203949554e-05, "loss": 0.9175, "step": 3089 }, { "epoch": 1.6593067346733332, "grad_norm": 0.69140625, "learning_rate": 3.7645987748919836e-05, "loss": 0.8211, "step": 3090 }, { "epoch": 1.6598436294084091, "grad_norm": 0.6953125, "learning_rate": 3.763870201527679e-05, "loss": 0.8248, "step": 3091 }, { "epoch": 1.660380524143485, "grad_norm": 0.69140625, "learning_rate": 3.763141483939779e-05, "loss": 0.976, "step": 3092 }, { "epoch": 1.6609174188785611, "grad_norm": 0.6171875, "learning_rate": 3.762412622211439e-05, "loss": 0.869, "step": 3093 }, { "epoch": 1.6614543136136373, "grad_norm": 0.63671875, "learning_rate": 3.761683616425832e-05, "loss": 0.7032, "step": 3094 }, { "epoch": 1.6619912083487132, "grad_norm": 0.6875, "learning_rate": 3.7609544666661455e-05, "loss": 0.8989, "step": 3095 }, { "epoch": 1.662528103083789, "grad_norm": 0.55859375, "learning_rate": 3.7602251730155855e-05, "loss": 0.6087, "step": 3096 }, { "epoch": 1.6630649978188652, "grad_norm": 0.6328125, "learning_rate": 3.7594957355573756e-05, "loss": 0.8414, "step": 3097 }, { "epoch": 1.6636018925539413, "grad_norm": 0.703125, "learning_rate": 3.758766154374751e-05, "loss": 1.0477, "step": 3098 }, { "epoch": 1.664138787289017, "grad_norm": 0.6953125, "learning_rate": 3.7580364295509675e-05, "loss": 0.8542, "step": 3099 }, { "epoch": 1.664675682024093, "grad_norm": 0.7109375, "learning_rate": 3.7573065611692946e-05, "loss": 0.9615, "step": 3100 }, { "epoch": 1.664675682024093, "eval_loss": 0.9097515940666199, "eval_runtime": 48.3801, "eval_samples_per_second": 19.884, "eval_steps_per_second": 19.884, "step": 3100 }, { "epoch": 1.6652125767591692, "grad_norm": 0.69140625, "learning_rate": 3.756576549313022e-05, "loss": 0.8375, "step": 3101 }, { "epoch": 1.6657494714942451, "grad_norm": 0.671875, "learning_rate": 3.755846394065451e-05, "loss": 0.6936, "step": 3102 }, { "epoch": 1.666286366229321, "grad_norm": 0.7109375, "learning_rate": 3.755116095509903e-05, "loss": 0.919, "step": 3103 }, { "epoch": 1.6668232609643971, "grad_norm": 0.7109375, "learning_rate": 3.7543856537297126e-05, "loss": 0.9726, "step": 3104 }, { "epoch": 1.6673601556994733, "grad_norm": 0.640625, "learning_rate": 3.753655068808234e-05, "loss": 0.6599, "step": 3105 }, { "epoch": 1.6678970504345492, "grad_norm": 0.71484375, "learning_rate": 3.752924340828837e-05, "loss": 0.9401, "step": 3106 }, { "epoch": 1.668433945169625, "grad_norm": 0.796875, "learning_rate": 3.7521934698749046e-05, "loss": 0.8985, "step": 3107 }, { "epoch": 1.6689708399047012, "grad_norm": 0.859375, "learning_rate": 3.75146245602984e-05, "loss": 0.9108, "step": 3108 }, { "epoch": 1.6695077346397773, "grad_norm": 0.703125, "learning_rate": 3.7507312993770596e-05, "loss": 0.7507, "step": 3109 }, { "epoch": 1.6700446293748532, "grad_norm": 0.69140625, "learning_rate": 3.7500000000000003e-05, "loss": 0.8606, "step": 3110 }, { "epoch": 1.670581524109929, "grad_norm": 0.6640625, "learning_rate": 3.749268557982111e-05, "loss": 0.6185, "step": 3111 }, { "epoch": 1.6711184188450052, "grad_norm": 0.57421875, "learning_rate": 3.748536973406857e-05, "loss": 0.6909, "step": 3112 }, { "epoch": 1.6716553135800813, "grad_norm": 0.62890625, "learning_rate": 3.747805246357724e-05, "loss": 0.85, "step": 3113 }, { "epoch": 1.6721922083151572, "grad_norm": 0.71875, "learning_rate": 3.74707337691821e-05, "loss": 0.991, "step": 3114 }, { "epoch": 1.6727291030502331, "grad_norm": 0.6640625, "learning_rate": 3.7463413651718303e-05, "loss": 0.8143, "step": 3115 }, { "epoch": 1.6732659977853093, "grad_norm": 0.79296875, "learning_rate": 3.745609211202118e-05, "loss": 0.919, "step": 3116 }, { "epoch": 1.6738028925203854, "grad_norm": 0.65234375, "learning_rate": 3.74487691509262e-05, "loss": 0.8302, "step": 3117 }, { "epoch": 1.674339787255461, "grad_norm": 0.7109375, "learning_rate": 3.7441444769269005e-05, "loss": 0.9914, "step": 3118 }, { "epoch": 1.6748766819905372, "grad_norm": 0.71875, "learning_rate": 3.743411896788541e-05, "loss": 0.9043, "step": 3119 }, { "epoch": 1.6754135767256133, "grad_norm": 0.73828125, "learning_rate": 3.742679174761137e-05, "loss": 0.9318, "step": 3120 }, { "epoch": 1.6759504714606892, "grad_norm": 0.75, "learning_rate": 3.741946310928303e-05, "loss": 0.7836, "step": 3121 }, { "epoch": 1.676487366195765, "grad_norm": 0.796875, "learning_rate": 3.741213305373666e-05, "loss": 0.9275, "step": 3122 }, { "epoch": 1.6770242609308412, "grad_norm": 0.75390625, "learning_rate": 3.740480158180872e-05, "loss": 0.8911, "step": 3123 }, { "epoch": 1.6775611556659173, "grad_norm": 0.6328125, "learning_rate": 3.739746869433583e-05, "loss": 0.807, "step": 3124 }, { "epoch": 1.6780980504009932, "grad_norm": 0.63671875, "learning_rate": 3.739013439215476e-05, "loss": 0.7685, "step": 3125 }, { "epoch": 1.6786349451360691, "grad_norm": 0.671875, "learning_rate": 3.738279867610244e-05, "loss": 1.0855, "step": 3126 }, { "epoch": 1.6791718398711453, "grad_norm": 0.76953125, "learning_rate": 3.7375461547015986e-05, "loss": 0.813, "step": 3127 }, { "epoch": 1.6797087346062214, "grad_norm": 0.80078125, "learning_rate": 3.736812300573264e-05, "loss": 1.0443, "step": 3128 }, { "epoch": 1.6802456293412973, "grad_norm": 0.69140625, "learning_rate": 3.736078305308983e-05, "loss": 0.7276, "step": 3129 }, { "epoch": 1.6807825240763732, "grad_norm": 0.7890625, "learning_rate": 3.735344168992515e-05, "loss": 0.8503, "step": 3130 }, { "epoch": 1.6813194188114493, "grad_norm": 0.82421875, "learning_rate": 3.734609891707632e-05, "loss": 0.7957, "step": 3131 }, { "epoch": 1.6818563135465254, "grad_norm": 0.7421875, "learning_rate": 3.733875473538125e-05, "loss": 0.8538, "step": 3132 }, { "epoch": 1.6823932082816013, "grad_norm": 0.640625, "learning_rate": 3.733140914567801e-05, "loss": 0.7315, "step": 3133 }, { "epoch": 1.6829301030166772, "grad_norm": 0.6953125, "learning_rate": 3.732406214880481e-05, "loss": 0.7941, "step": 3134 }, { "epoch": 1.6834669977517533, "grad_norm": 0.609375, "learning_rate": 3.731671374560007e-05, "loss": 0.7481, "step": 3135 }, { "epoch": 1.6840038924868292, "grad_norm": 0.69140625, "learning_rate": 3.7309363936902294e-05, "loss": 0.879, "step": 3136 }, { "epoch": 1.6845407872219051, "grad_norm": 0.8359375, "learning_rate": 3.730201272355022e-05, "loss": 1.0314, "step": 3137 }, { "epoch": 1.6850776819569813, "grad_norm": 0.7890625, "learning_rate": 3.729466010638269e-05, "loss": 0.9467, "step": 3138 }, { "epoch": 1.6856145766920574, "grad_norm": 1.0859375, "learning_rate": 3.728730608623875e-05, "loss": 0.8344, "step": 3139 }, { "epoch": 1.6861514714271333, "grad_norm": 0.625, "learning_rate": 3.727995066395758e-05, "loss": 0.8438, "step": 3140 }, { "epoch": 1.6866883661622092, "grad_norm": 0.71484375, "learning_rate": 3.727259384037852e-05, "loss": 0.8444, "step": 3141 }, { "epoch": 1.6872252608972853, "grad_norm": 0.7578125, "learning_rate": 3.726523561634109e-05, "loss": 0.9086, "step": 3142 }, { "epoch": 1.6877621556323614, "grad_norm": 0.6875, "learning_rate": 3.725787599268495e-05, "loss": 0.8877, "step": 3143 }, { "epoch": 1.6882990503674373, "grad_norm": 0.70703125, "learning_rate": 3.7250514970249926e-05, "loss": 1.1066, "step": 3144 }, { "epoch": 1.6888359451025132, "grad_norm": 0.62109375, "learning_rate": 3.724315254987599e-05, "loss": 0.9929, "step": 3145 }, { "epoch": 1.6893728398375893, "grad_norm": 0.63671875, "learning_rate": 3.7235788732403315e-05, "loss": 0.7913, "step": 3146 }, { "epoch": 1.6899097345726655, "grad_norm": 0.63671875, "learning_rate": 3.722842351867219e-05, "loss": 0.8937, "step": 3147 }, { "epoch": 1.6904466293077414, "grad_norm": 0.65234375, "learning_rate": 3.7221056909523075e-05, "loss": 0.8757, "step": 3148 }, { "epoch": 1.6909835240428173, "grad_norm": 0.76953125, "learning_rate": 3.721368890579661e-05, "loss": 0.8272, "step": 3149 }, { "epoch": 1.6915204187778934, "grad_norm": 0.6875, "learning_rate": 3.720631950833355e-05, "loss": 0.9054, "step": 3150 }, { "epoch": 1.6915204187778934, "eval_loss": 0.9101051688194275, "eval_runtime": 46.1851, "eval_samples_per_second": 20.829, "eval_steps_per_second": 20.829, "step": 3150 }, { "epoch": 1.6920573135129695, "grad_norm": 0.6484375, "learning_rate": 3.7198948717974866e-05, "loss": 0.8198, "step": 3151 }, { "epoch": 1.6925942082480454, "grad_norm": 0.703125, "learning_rate": 3.719157653556166e-05, "loss": 0.7609, "step": 3152 }, { "epoch": 1.6931311029831213, "grad_norm": 0.6328125, "learning_rate": 3.7184202961935156e-05, "loss": 0.8904, "step": 3153 }, { "epoch": 1.6936679977181974, "grad_norm": 0.77734375, "learning_rate": 3.71768279979368e-05, "loss": 1.0937, "step": 3154 }, { "epoch": 1.6942048924532733, "grad_norm": 0.6796875, "learning_rate": 3.716945164440816e-05, "loss": 0.8533, "step": 3155 }, { "epoch": 1.6947417871883492, "grad_norm": 0.65234375, "learning_rate": 3.7162073902190974e-05, "loss": 0.7931, "step": 3156 }, { "epoch": 1.6952786819234253, "grad_norm": 0.6796875, "learning_rate": 3.715469477212714e-05, "loss": 0.8886, "step": 3157 }, { "epoch": 1.6958155766585015, "grad_norm": 0.734375, "learning_rate": 3.7147314255058704e-05, "loss": 1.003, "step": 3158 }, { "epoch": 1.6963524713935774, "grad_norm": 0.7734375, "learning_rate": 3.7139932351827875e-05, "loss": 0.6984, "step": 3159 }, { "epoch": 1.6968893661286533, "grad_norm": 0.79296875, "learning_rate": 3.713254906327703e-05, "loss": 0.8247, "step": 3160 }, { "epoch": 1.6974262608637294, "grad_norm": 0.6875, "learning_rate": 3.712516439024868e-05, "loss": 0.9855, "step": 3161 }, { "epoch": 1.6979631555988055, "grad_norm": 0.8203125, "learning_rate": 3.7117778333585534e-05, "loss": 1.003, "step": 3162 }, { "epoch": 1.6985000503338814, "grad_norm": 0.796875, "learning_rate": 3.7110390894130414e-05, "loss": 0.9409, "step": 3163 }, { "epoch": 1.6990369450689573, "grad_norm": 0.703125, "learning_rate": 3.710300207272633e-05, "loss": 0.8235, "step": 3164 }, { "epoch": 1.6995738398040334, "grad_norm": 0.70703125, "learning_rate": 3.709561187021644e-05, "loss": 1.2149, "step": 3165 }, { "epoch": 1.7001107345391095, "grad_norm": 0.875, "learning_rate": 3.708822028744405e-05, "loss": 1.0149, "step": 3166 }, { "epoch": 1.7006476292741854, "grad_norm": 0.75, "learning_rate": 3.7080827325252635e-05, "loss": 0.8986, "step": 3167 }, { "epoch": 1.7011845240092613, "grad_norm": 0.6875, "learning_rate": 3.7073432984485846e-05, "loss": 0.9211, "step": 3168 }, { "epoch": 1.7017214187443375, "grad_norm": 0.57421875, "learning_rate": 3.706603726598745e-05, "loss": 0.7196, "step": 3169 }, { "epoch": 1.7022583134794136, "grad_norm": 0.83203125, "learning_rate": 3.7058640170601394e-05, "loss": 0.8942, "step": 3170 }, { "epoch": 1.7027952082144895, "grad_norm": 0.73828125, "learning_rate": 3.705124169917179e-05, "loss": 0.8141, "step": 3171 }, { "epoch": 1.7033321029495654, "grad_norm": 0.6796875, "learning_rate": 3.704384185254288e-05, "loss": 0.9116, "step": 3172 }, { "epoch": 1.7038689976846415, "grad_norm": 0.76953125, "learning_rate": 3.703644063155911e-05, "loss": 0.9862, "step": 3173 }, { "epoch": 1.7044058924197174, "grad_norm": 0.59765625, "learning_rate": 3.702903803706502e-05, "loss": 1.047, "step": 3174 }, { "epoch": 1.7049427871547933, "grad_norm": 0.75390625, "learning_rate": 3.7021634069905356e-05, "loss": 0.8986, "step": 3175 }, { "epoch": 1.7054796818898694, "grad_norm": 0.671875, "learning_rate": 3.7014228730925003e-05, "loss": 0.8705, "step": 3176 }, { "epoch": 1.7060165766249455, "grad_norm": 0.7109375, "learning_rate": 3.700682202096901e-05, "loss": 0.9543, "step": 3177 }, { "epoch": 1.7065534713600214, "grad_norm": 0.625, "learning_rate": 3.699941394088257e-05, "loss": 0.9731, "step": 3178 }, { "epoch": 1.7070903660950973, "grad_norm": 0.82421875, "learning_rate": 3.699200449151103e-05, "loss": 1.0328, "step": 3179 }, { "epoch": 1.7076272608301735, "grad_norm": 0.86328125, "learning_rate": 3.6984593673699926e-05, "loss": 1.0875, "step": 3180 }, { "epoch": 1.7081641555652496, "grad_norm": 0.7734375, "learning_rate": 3.69771814882949e-05, "loss": 1.0672, "step": 3181 }, { "epoch": 1.7087010503003255, "grad_norm": 0.5859375, "learning_rate": 3.69697679361418e-05, "loss": 0.9401, "step": 3182 }, { "epoch": 1.7092379450354014, "grad_norm": 0.8984375, "learning_rate": 3.696235301808657e-05, "loss": 0.9693, "step": 3183 }, { "epoch": 1.7097748397704775, "grad_norm": 0.66796875, "learning_rate": 3.695493673497539e-05, "loss": 0.7992, "step": 3184 }, { "epoch": 1.7103117345055536, "grad_norm": 0.7109375, "learning_rate": 3.6947519087654524e-05, "loss": 0.9487, "step": 3185 }, { "epoch": 1.7108486292406295, "grad_norm": 0.65234375, "learning_rate": 3.694010007697043e-05, "loss": 0.7009, "step": 3186 }, { "epoch": 1.7113855239757054, "grad_norm": 0.71875, "learning_rate": 3.693267970376971e-05, "loss": 0.8556, "step": 3187 }, { "epoch": 1.7119224187107815, "grad_norm": 0.6015625, "learning_rate": 3.6925257968899114e-05, "loss": 0.7355, "step": 3188 }, { "epoch": 1.7124593134458577, "grad_norm": 0.75390625, "learning_rate": 3.691783487320556e-05, "loss": 0.838, "step": 3189 }, { "epoch": 1.7129962081809336, "grad_norm": 0.76171875, "learning_rate": 3.691041041753613e-05, "loss": 1.0657, "step": 3190 }, { "epoch": 1.7135331029160095, "grad_norm": 0.73828125, "learning_rate": 3.6902984602738025e-05, "loss": 0.8307, "step": 3191 }, { "epoch": 1.7140699976510856, "grad_norm": 0.6484375, "learning_rate": 3.689555742965865e-05, "loss": 0.8304, "step": 3192 }, { "epoch": 1.7146068923861615, "grad_norm": 0.71484375, "learning_rate": 3.6888128899145524e-05, "loss": 0.8784, "step": 3193 }, { "epoch": 1.7151437871212374, "grad_norm": 0.640625, "learning_rate": 3.688069901204634e-05, "loss": 0.738, "step": 3194 }, { "epoch": 1.7156806818563135, "grad_norm": 0.6953125, "learning_rate": 3.6873267769208944e-05, "loss": 0.8378, "step": 3195 }, { "epoch": 1.7162175765913896, "grad_norm": 0.73046875, "learning_rate": 3.686583517148133e-05, "loss": 0.8341, "step": 3196 }, { "epoch": 1.7167544713264655, "grad_norm": 0.71484375, "learning_rate": 3.685840121971165e-05, "loss": 0.8852, "step": 3197 }, { "epoch": 1.7172913660615414, "grad_norm": 0.76953125, "learning_rate": 3.6850965914748225e-05, "loss": 0.8795, "step": 3198 }, { "epoch": 1.7178282607966175, "grad_norm": 0.6015625, "learning_rate": 3.68435292574395e-05, "loss": 0.8056, "step": 3199 }, { "epoch": 1.7183651555316937, "grad_norm": 0.71875, "learning_rate": 3.68360912486341e-05, "loss": 0.9033, "step": 3200 }, { "epoch": 1.7183651555316937, "eval_loss": 0.9093900918960571, "eval_runtime": 46.5819, "eval_samples_per_second": 20.652, "eval_steps_per_second": 20.652, "step": 3200 }, { "epoch": 1.7189020502667696, "grad_norm": 0.6640625, "learning_rate": 3.6828651889180807e-05, "loss": 0.9704, "step": 3201 }, { "epoch": 1.7194389450018455, "grad_norm": 1.1171875, "learning_rate": 3.682121117992853e-05, "loss": 1.1052, "step": 3202 }, { "epoch": 1.7199758397369216, "grad_norm": 0.73046875, "learning_rate": 3.681376912172636e-05, "loss": 0.8855, "step": 3203 }, { "epoch": 1.7205127344719977, "grad_norm": 0.66015625, "learning_rate": 3.680632571542351e-05, "loss": 0.9028, "step": 3204 }, { "epoch": 1.7210496292070736, "grad_norm": 0.86328125, "learning_rate": 3.679888096186939e-05, "loss": 0.9277, "step": 3205 }, { "epoch": 1.7215865239421495, "grad_norm": 0.66796875, "learning_rate": 3.6791434861913535e-05, "loss": 0.8151, "step": 3206 }, { "epoch": 1.7221234186772256, "grad_norm": 0.6328125, "learning_rate": 3.678398741640563e-05, "loss": 0.923, "step": 3207 }, { "epoch": 1.7226603134123017, "grad_norm": 0.80859375, "learning_rate": 3.677653862619552e-05, "loss": 1.0484, "step": 3208 }, { "epoch": 1.7231972081473776, "grad_norm": 0.55859375, "learning_rate": 3.676908849213323e-05, "loss": 0.6934, "step": 3209 }, { "epoch": 1.7237341028824535, "grad_norm": 1.078125, "learning_rate": 3.67616370150689e-05, "loss": 0.7035, "step": 3210 }, { "epoch": 1.7242709976175297, "grad_norm": 0.71484375, "learning_rate": 3.6754184195852826e-05, "loss": 0.9222, "step": 3211 }, { "epoch": 1.7248078923526056, "grad_norm": 0.65234375, "learning_rate": 3.6746730035335484e-05, "loss": 0.8372, "step": 3212 }, { "epoch": 1.7253447870876815, "grad_norm": 0.89453125, "learning_rate": 3.6739274534367484e-05, "loss": 0.8406, "step": 3213 }, { "epoch": 1.7258816818227576, "grad_norm": 0.671875, "learning_rate": 3.67318176937996e-05, "loss": 0.8601, "step": 3214 }, { "epoch": 1.7264185765578337, "grad_norm": 0.70703125, "learning_rate": 3.672435951448274e-05, "loss": 0.9916, "step": 3215 }, { "epoch": 1.7269554712929096, "grad_norm": 0.6484375, "learning_rate": 3.671689999726798e-05, "loss": 0.7553, "step": 3216 }, { "epoch": 1.7274923660279855, "grad_norm": 0.73046875, "learning_rate": 3.6709439143006556e-05, "loss": 0.9272, "step": 3217 }, { "epoch": 1.7280292607630616, "grad_norm": 0.8046875, "learning_rate": 3.670197695254984e-05, "loss": 0.9901, "step": 3218 }, { "epoch": 1.7285661554981377, "grad_norm": 0.71484375, "learning_rate": 3.669451342674935e-05, "loss": 0.8426, "step": 3219 }, { "epoch": 1.7291030502332136, "grad_norm": 0.7734375, "learning_rate": 3.668704856645678e-05, "loss": 1.0482, "step": 3220 }, { "epoch": 1.7296399449682895, "grad_norm": 0.70703125, "learning_rate": 3.667958237252397e-05, "loss": 0.787, "step": 3221 }, { "epoch": 1.7301768397033657, "grad_norm": 0.79296875, "learning_rate": 3.6672114845802895e-05, "loss": 0.9153, "step": 3222 }, { "epoch": 1.7307137344384418, "grad_norm": 0.64453125, "learning_rate": 3.666464598714571e-05, "loss": 0.7852, "step": 3223 }, { "epoch": 1.7312506291735177, "grad_norm": 0.76171875, "learning_rate": 3.665717579740469e-05, "loss": 0.8749, "step": 3224 }, { "epoch": 1.7317875239085936, "grad_norm": 0.63671875, "learning_rate": 3.6649704277432286e-05, "loss": 0.805, "step": 3225 }, { "epoch": 1.7323244186436697, "grad_norm": 0.6484375, "learning_rate": 3.66422314280811e-05, "loss": 0.8683, "step": 3226 }, { "epoch": 1.7328613133787458, "grad_norm": 0.69140625, "learning_rate": 3.663475725020386e-05, "loss": 0.9145, "step": 3227 }, { "epoch": 1.7333982081138217, "grad_norm": 0.6171875, "learning_rate": 3.662728174465348e-05, "loss": 0.851, "step": 3228 }, { "epoch": 1.7339351028488976, "grad_norm": 0.66796875, "learning_rate": 3.661980491228301e-05, "loss": 0.8579, "step": 3229 }, { "epoch": 1.7344719975839737, "grad_norm": 0.61328125, "learning_rate": 3.661232675394563e-05, "loss": 0.8754, "step": 3230 }, { "epoch": 1.7350088923190496, "grad_norm": 0.6484375, "learning_rate": 3.660484727049472e-05, "loss": 0.8597, "step": 3231 }, { "epoch": 1.7355457870541255, "grad_norm": 0.80859375, "learning_rate": 3.659736646278378e-05, "loss": 0.8489, "step": 3232 }, { "epoch": 1.7360826817892017, "grad_norm": 0.82421875, "learning_rate": 3.658988433166645e-05, "loss": 0.888, "step": 3233 }, { "epoch": 1.7366195765242778, "grad_norm": 0.66015625, "learning_rate": 3.6582400877996546e-05, "loss": 0.8524, "step": 3234 }, { "epoch": 1.7371564712593537, "grad_norm": 0.88671875, "learning_rate": 3.657491610262802e-05, "loss": 0.9846, "step": 3235 }, { "epoch": 1.7376933659944296, "grad_norm": 0.6640625, "learning_rate": 3.656743000641499e-05, "loss": 0.9523, "step": 3236 }, { "epoch": 1.7382302607295057, "grad_norm": 0.69140625, "learning_rate": 3.65599425902117e-05, "loss": 0.916, "step": 3237 }, { "epoch": 1.7387671554645818, "grad_norm": 0.63671875, "learning_rate": 3.655245385487256e-05, "loss": 0.8783, "step": 3238 }, { "epoch": 1.7393040501996577, "grad_norm": 0.87109375, "learning_rate": 3.654496380125214e-05, "loss": 0.8304, "step": 3239 }, { "epoch": 1.7398409449347336, "grad_norm": 0.625, "learning_rate": 3.653747243020515e-05, "loss": 0.8268, "step": 3240 }, { "epoch": 1.7403778396698097, "grad_norm": 0.8046875, "learning_rate": 3.652997974258644e-05, "loss": 1.0443, "step": 3241 }, { "epoch": 1.7409147344048859, "grad_norm": 0.81640625, "learning_rate": 3.6522485739251034e-05, "loss": 0.89, "step": 3242 }, { "epoch": 1.7414516291399618, "grad_norm": 0.73046875, "learning_rate": 3.651499042105407e-05, "loss": 0.6678, "step": 3243 }, { "epoch": 1.7419885238750377, "grad_norm": 0.703125, "learning_rate": 3.650749378885089e-05, "loss": 0.8595, "step": 3244 }, { "epoch": 1.7425254186101138, "grad_norm": 0.66015625, "learning_rate": 3.649999584349693e-05, "loss": 0.8412, "step": 3245 }, { "epoch": 1.74306231334519, "grad_norm": 0.7734375, "learning_rate": 3.649249658584781e-05, "loss": 0.805, "step": 3246 }, { "epoch": 1.7435992080802658, "grad_norm": 0.73828125, "learning_rate": 3.648499601675928e-05, "loss": 0.9899, "step": 3247 }, { "epoch": 1.7441361028153417, "grad_norm": 0.62890625, "learning_rate": 3.6477494137087276e-05, "loss": 0.8808, "step": 3248 }, { "epoch": 1.7446729975504178, "grad_norm": 0.85546875, "learning_rate": 3.6469990947687825e-05, "loss": 1.1073, "step": 3249 }, { "epoch": 1.7452098922854937, "grad_norm": 0.61328125, "learning_rate": 3.646248644941716e-05, "loss": 0.8673, "step": 3250 }, { "epoch": 1.7452098922854937, "eval_loss": 0.9094784259796143, "eval_runtime": 47.2403, "eval_samples_per_second": 20.364, "eval_steps_per_second": 20.364, "step": 3250 }, { "epoch": 1.7457467870205696, "grad_norm": 0.63671875, "learning_rate": 3.6454980643131627e-05, "loss": 1.0337, "step": 3251 }, { "epoch": 1.7462836817556457, "grad_norm": 0.625, "learning_rate": 3.6447473529687736e-05, "loss": 0.9485, "step": 3252 }, { "epoch": 1.7468205764907219, "grad_norm": 0.73828125, "learning_rate": 3.643996510994215e-05, "loss": 0.8813, "step": 3253 }, { "epoch": 1.7473574712257978, "grad_norm": 0.6640625, "learning_rate": 3.643245538475166e-05, "loss": 0.9719, "step": 3254 }, { "epoch": 1.7478943659608737, "grad_norm": 0.70703125, "learning_rate": 3.642494435497324e-05, "loss": 0.9222, "step": 3255 }, { "epoch": 1.7484312606959498, "grad_norm": 0.7109375, "learning_rate": 3.641743202146398e-05, "loss": 0.8616, "step": 3256 }, { "epoch": 1.748968155431026, "grad_norm": 0.7421875, "learning_rate": 3.6409918385081135e-05, "loss": 0.8157, "step": 3257 }, { "epoch": 1.7495050501661018, "grad_norm": 0.93359375, "learning_rate": 3.640240344668211e-05, "loss": 0.9544, "step": 3258 }, { "epoch": 1.7500419449011777, "grad_norm": 0.91796875, "learning_rate": 3.639488720712445e-05, "loss": 0.9077, "step": 3259 }, { "epoch": 1.7505788396362538, "grad_norm": 0.65234375, "learning_rate": 3.638736966726585e-05, "loss": 0.7943, "step": 3260 }, { "epoch": 1.75111573437133, "grad_norm": 0.67578125, "learning_rate": 3.637985082796417e-05, "loss": 1.0695, "step": 3261 }, { "epoch": 1.7516526291064058, "grad_norm": 0.65625, "learning_rate": 3.6372330690077396e-05, "loss": 0.8446, "step": 3262 }, { "epoch": 1.7521895238414817, "grad_norm": 0.66796875, "learning_rate": 3.6364809254463665e-05, "loss": 0.877, "step": 3263 }, { "epoch": 1.7527264185765579, "grad_norm": 0.6015625, "learning_rate": 3.635728652198128e-05, "loss": 0.8914, "step": 3264 }, { "epoch": 1.753263313311634, "grad_norm": 0.75390625, "learning_rate": 3.634976249348867e-05, "loss": 0.9122, "step": 3265 }, { "epoch": 1.7538002080467099, "grad_norm": 0.7734375, "learning_rate": 3.634223716984443e-05, "loss": 0.9247, "step": 3266 }, { "epoch": 1.7543371027817858, "grad_norm": 0.671875, "learning_rate": 3.633471055190729e-05, "loss": 0.694, "step": 3267 }, { "epoch": 1.754873997516862, "grad_norm": 0.80859375, "learning_rate": 3.632718264053613e-05, "loss": 0.9333, "step": 3268 }, { "epoch": 1.7554108922519378, "grad_norm": 0.71875, "learning_rate": 3.6319653436589996e-05, "loss": 0.8551, "step": 3269 }, { "epoch": 1.7559477869870137, "grad_norm": 0.63671875, "learning_rate": 3.631212294092805e-05, "loss": 0.8005, "step": 3270 }, { "epoch": 1.7564846817220898, "grad_norm": 0.90234375, "learning_rate": 3.630459115440961e-05, "loss": 1.0247, "step": 3271 }, { "epoch": 1.757021576457166, "grad_norm": 0.765625, "learning_rate": 3.629705807789416e-05, "loss": 0.8605, "step": 3272 }, { "epoch": 1.7575584711922418, "grad_norm": 0.69921875, "learning_rate": 3.6289523712241316e-05, "loss": 1.124, "step": 3273 }, { "epoch": 1.7580953659273177, "grad_norm": 0.5703125, "learning_rate": 3.6281988058310856e-05, "loss": 0.7418, "step": 3274 }, { "epoch": 1.7586322606623939, "grad_norm": 0.67578125, "learning_rate": 3.627445111696268e-05, "loss": 0.6977, "step": 3275 }, { "epoch": 1.75916915539747, "grad_norm": 0.71484375, "learning_rate": 3.626691288905684e-05, "loss": 0.7461, "step": 3276 }, { "epoch": 1.7597060501325459, "grad_norm": 0.80859375, "learning_rate": 3.625937337545357e-05, "loss": 0.8126, "step": 3277 }, { "epoch": 1.7602429448676218, "grad_norm": 0.66796875, "learning_rate": 3.6251832577013214e-05, "loss": 0.9651, "step": 3278 }, { "epoch": 1.760779839602698, "grad_norm": 0.78125, "learning_rate": 3.6244290494596254e-05, "loss": 0.8284, "step": 3279 }, { "epoch": 1.761316734337774, "grad_norm": 0.80078125, "learning_rate": 3.623674712906335e-05, "loss": 0.9531, "step": 3280 }, { "epoch": 1.76185362907285, "grad_norm": 0.671875, "learning_rate": 3.6229202481275306e-05, "loss": 0.7829, "step": 3281 }, { "epoch": 1.7623905238079258, "grad_norm": 0.67578125, "learning_rate": 3.622165655209305e-05, "loss": 0.764, "step": 3282 }, { "epoch": 1.762927418543002, "grad_norm": 0.75390625, "learning_rate": 3.621410934237767e-05, "loss": 1.1222, "step": 3283 }, { "epoch": 1.763464313278078, "grad_norm": 0.734375, "learning_rate": 3.62065608529904e-05, "loss": 1.0118, "step": 3284 }, { "epoch": 1.764001208013154, "grad_norm": 0.7421875, "learning_rate": 3.619901108479261e-05, "loss": 1.0181, "step": 3285 }, { "epoch": 1.7645381027482299, "grad_norm": 0.640625, "learning_rate": 3.619146003864585e-05, "loss": 0.8426, "step": 3286 }, { "epoch": 1.765074997483306, "grad_norm": 0.64453125, "learning_rate": 3.618390771541175e-05, "loss": 0.7738, "step": 3287 }, { "epoch": 1.7656118922183819, "grad_norm": 0.6953125, "learning_rate": 3.617635411595215e-05, "loss": 0.931, "step": 3288 }, { "epoch": 1.7661487869534578, "grad_norm": 0.73046875, "learning_rate": 3.616879924112902e-05, "loss": 0.7866, "step": 3289 }, { "epoch": 1.766685681688534, "grad_norm": 0.6796875, "learning_rate": 3.6161243091804446e-05, "loss": 0.9237, "step": 3290 }, { "epoch": 1.76722257642361, "grad_norm": 0.69921875, "learning_rate": 3.61536856688407e-05, "loss": 0.8205, "step": 3291 }, { "epoch": 1.767759471158686, "grad_norm": 0.75390625, "learning_rate": 3.614612697310016e-05, "loss": 1.009, "step": 3292 }, { "epoch": 1.7682963658937618, "grad_norm": 0.77734375, "learning_rate": 3.6138567005445375e-05, "loss": 1.1373, "step": 3293 }, { "epoch": 1.768833260628838, "grad_norm": 0.7578125, "learning_rate": 3.6131005766739046e-05, "loss": 1.1374, "step": 3294 }, { "epoch": 1.769370155363914, "grad_norm": 0.76171875, "learning_rate": 3.612344325784399e-05, "loss": 0.9462, "step": 3295 }, { "epoch": 1.76990705009899, "grad_norm": 0.7265625, "learning_rate": 3.611587947962319e-05, "loss": 1.124, "step": 3296 }, { "epoch": 1.7704439448340659, "grad_norm": 0.65625, "learning_rate": 3.610831443293977e-05, "loss": 0.8779, "step": 3297 }, { "epoch": 1.770980839569142, "grad_norm": 1.046875, "learning_rate": 3.6100748118657e-05, "loss": 0.8598, "step": 3298 }, { "epoch": 1.771517734304218, "grad_norm": 0.86328125, "learning_rate": 3.6093180537638294e-05, "loss": 0.8016, "step": 3299 }, { "epoch": 1.772054629039294, "grad_norm": 1.125, "learning_rate": 3.60856116907472e-05, "loss": 1.0133, "step": 3300 }, { "epoch": 1.772054629039294, "eval_loss": 0.9077935218811035, "eval_runtime": 49.1719, "eval_samples_per_second": 19.564, "eval_steps_per_second": 19.564, "step": 3300 }, { "epoch": 1.77259152377437, "grad_norm": 0.5625, "learning_rate": 3.607804157884742e-05, "loss": 0.771, "step": 3301 }, { "epoch": 1.773128418509446, "grad_norm": 0.671875, "learning_rate": 3.607047020280281e-05, "loss": 0.9158, "step": 3302 }, { "epoch": 1.7736653132445221, "grad_norm": 0.6328125, "learning_rate": 3.606289756347736e-05, "loss": 0.8109, "step": 3303 }, { "epoch": 1.774202207979598, "grad_norm": 0.70703125, "learning_rate": 3.605532366173518e-05, "loss": 0.933, "step": 3304 }, { "epoch": 1.774739102714674, "grad_norm": 0.9453125, "learning_rate": 3.6047748498440585e-05, "loss": 0.9528, "step": 3305 }, { "epoch": 1.77527599744975, "grad_norm": 0.85546875, "learning_rate": 3.604017207445797e-05, "loss": 0.807, "step": 3306 }, { "epoch": 1.775812892184826, "grad_norm": 0.65625, "learning_rate": 3.603259439065191e-05, "loss": 0.6761, "step": 3307 }, { "epoch": 1.7763497869199019, "grad_norm": 0.8984375, "learning_rate": 3.602501544788711e-05, "loss": 0.8852, "step": 3308 }, { "epoch": 1.776886681654978, "grad_norm": 0.6796875, "learning_rate": 3.6017435247028425e-05, "loss": 0.9164, "step": 3309 }, { "epoch": 1.777423576390054, "grad_norm": 0.80859375, "learning_rate": 3.600985378894086e-05, "loss": 0.8396, "step": 3310 }, { "epoch": 1.77796047112513, "grad_norm": 0.765625, "learning_rate": 3.6002271074489554e-05, "loss": 0.9967, "step": 3311 }, { "epoch": 1.778497365860206, "grad_norm": 0.6796875, "learning_rate": 3.599468710453977e-05, "loss": 0.9166, "step": 3312 }, { "epoch": 1.779034260595282, "grad_norm": 0.6484375, "learning_rate": 3.5987101879956965e-05, "loss": 0.7454, "step": 3313 }, { "epoch": 1.7795711553303581, "grad_norm": 0.5859375, "learning_rate": 3.597951540160669e-05, "loss": 0.792, "step": 3314 }, { "epoch": 1.780108050065434, "grad_norm": 0.734375, "learning_rate": 3.597192767035466e-05, "loss": 0.9001, "step": 3315 }, { "epoch": 1.78064494480051, "grad_norm": 0.84375, "learning_rate": 3.596433868706674e-05, "loss": 0.9441, "step": 3316 }, { "epoch": 1.781181839535586, "grad_norm": 0.7421875, "learning_rate": 3.595674845260891e-05, "loss": 0.8817, "step": 3317 }, { "epoch": 1.7817187342706622, "grad_norm": 0.8671875, "learning_rate": 3.594915696784734e-05, "loss": 1.0732, "step": 3318 }, { "epoch": 1.782255629005738, "grad_norm": 0.83203125, "learning_rate": 3.59415642336483e-05, "loss": 1.038, "step": 3319 }, { "epoch": 1.782792523740814, "grad_norm": 0.65625, "learning_rate": 3.593397025087821e-05, "loss": 0.8258, "step": 3320 }, { "epoch": 1.78332941847589, "grad_norm": 0.7109375, "learning_rate": 3.5926375020403655e-05, "loss": 0.7235, "step": 3321 }, { "epoch": 1.7838663132109662, "grad_norm": 0.7578125, "learning_rate": 3.5918778543091334e-05, "loss": 0.8121, "step": 3322 }, { "epoch": 1.7844032079460421, "grad_norm": 0.73046875, "learning_rate": 3.5911180819808103e-05, "loss": 1.0629, "step": 3323 }, { "epoch": 1.784940102681118, "grad_norm": 0.734375, "learning_rate": 3.590358185142098e-05, "loss": 0.8761, "step": 3324 }, { "epoch": 1.7854769974161941, "grad_norm": 0.7109375, "learning_rate": 3.589598163879706e-05, "loss": 0.8972, "step": 3325 }, { "epoch": 1.78601389215127, "grad_norm": 0.73046875, "learning_rate": 3.588838018280366e-05, "loss": 0.8558, "step": 3326 }, { "epoch": 1.786550786886346, "grad_norm": 0.6328125, "learning_rate": 3.588077748430819e-05, "loss": 0.6925, "step": 3327 }, { "epoch": 1.787087681621422, "grad_norm": 0.75, "learning_rate": 3.587317354417822e-05, "loss": 0.9284, "step": 3328 }, { "epoch": 1.7876245763564982, "grad_norm": 0.7265625, "learning_rate": 3.5865568363281443e-05, "loss": 1.1003, "step": 3329 }, { "epoch": 1.788161471091574, "grad_norm": 0.84765625, "learning_rate": 3.585796194248572e-05, "loss": 0.8994, "step": 3330 }, { "epoch": 1.78869836582665, "grad_norm": 0.76171875, "learning_rate": 3.585035428265903e-05, "loss": 1.1384, "step": 3331 }, { "epoch": 1.789235260561726, "grad_norm": 0.65625, "learning_rate": 3.584274538466951e-05, "loss": 0.8725, "step": 3332 }, { "epoch": 1.7897721552968022, "grad_norm": 0.640625, "learning_rate": 3.5835135249385433e-05, "loss": 0.9772, "step": 3333 }, { "epoch": 1.7903090500318781, "grad_norm": 0.625, "learning_rate": 3.5827523877675196e-05, "loss": 0.9266, "step": 3334 }, { "epoch": 1.790845944766954, "grad_norm": 0.69921875, "learning_rate": 3.5819911270407377e-05, "loss": 0.9851, "step": 3335 }, { "epoch": 1.7913828395020301, "grad_norm": 0.64453125, "learning_rate": 3.581229742845065e-05, "loss": 0.837, "step": 3336 }, { "epoch": 1.7919197342371063, "grad_norm": 0.7578125, "learning_rate": 3.580468235267387e-05, "loss": 0.8346, "step": 3337 }, { "epoch": 1.7924566289721822, "grad_norm": 0.68359375, "learning_rate": 3.5797066043946e-05, "loss": 0.9218, "step": 3338 }, { "epoch": 1.792993523707258, "grad_norm": 0.7265625, "learning_rate": 3.578944850313615e-05, "loss": 1.0078, "step": 3339 }, { "epoch": 1.7935304184423342, "grad_norm": 0.71484375, "learning_rate": 3.5781829731113605e-05, "loss": 0.7248, "step": 3340 }, { "epoch": 1.7940673131774103, "grad_norm": 0.7578125, "learning_rate": 3.577420972874774e-05, "loss": 0.9012, "step": 3341 }, { "epoch": 1.7946042079124862, "grad_norm": 0.640625, "learning_rate": 3.5766588496908095e-05, "loss": 0.5642, "step": 3342 }, { "epoch": 1.795141102647562, "grad_norm": 0.83984375, "learning_rate": 3.5758966036464356e-05, "loss": 0.9413, "step": 3343 }, { "epoch": 1.7956779973826382, "grad_norm": 0.72265625, "learning_rate": 3.5751342348286354e-05, "loss": 0.8961, "step": 3344 }, { "epoch": 1.7962148921177141, "grad_norm": 0.66015625, "learning_rate": 3.5743717433244025e-05, "loss": 0.8161, "step": 3345 }, { "epoch": 1.79675178685279, "grad_norm": 0.65234375, "learning_rate": 3.573609129220748e-05, "loss": 0.8581, "step": 3346 }, { "epoch": 1.7972886815878661, "grad_norm": 0.6875, "learning_rate": 3.572846392604695e-05, "loss": 0.9598, "step": 3347 }, { "epoch": 1.7978255763229423, "grad_norm": 0.75390625, "learning_rate": 3.572083533563283e-05, "loss": 0.7201, "step": 3348 }, { "epoch": 1.7983624710580182, "grad_norm": 0.80078125, "learning_rate": 3.571320552183563e-05, "loss": 0.92, "step": 3349 }, { "epoch": 1.798899365793094, "grad_norm": 0.625, "learning_rate": 3.570557448552601e-05, "loss": 0.8208, "step": 3350 }, { "epoch": 1.798899365793094, "eval_loss": 0.9074547290802002, "eval_runtime": 46.6661, "eval_samples_per_second": 20.615, "eval_steps_per_second": 20.615, "step": 3350 }, { "epoch": 1.7994362605281702, "grad_norm": 0.609375, "learning_rate": 3.569794222757475e-05, "loss": 0.8413, "step": 3351 }, { "epoch": 1.7999731552632463, "grad_norm": 0.7265625, "learning_rate": 3.569030874885282e-05, "loss": 0.7694, "step": 3352 }, { "epoch": 1.8005100499983222, "grad_norm": 0.9296875, "learning_rate": 3.568267405023128e-05, "loss": 0.8728, "step": 3353 }, { "epoch": 1.801046944733398, "grad_norm": 0.7734375, "learning_rate": 3.567503813258134e-05, "loss": 1.0077, "step": 3354 }, { "epoch": 1.8015838394684742, "grad_norm": 0.82421875, "learning_rate": 3.5667400996774356e-05, "loss": 0.911, "step": 3355 }, { "epoch": 1.8021207342035503, "grad_norm": 0.6015625, "learning_rate": 3.5659762643681834e-05, "loss": 0.7527, "step": 3356 }, { "epoch": 1.8026576289386262, "grad_norm": 0.80078125, "learning_rate": 3.56521230741754e-05, "loss": 0.9561, "step": 3357 }, { "epoch": 1.8031945236737021, "grad_norm": 0.84765625, "learning_rate": 3.564448228912682e-05, "loss": 0.9424, "step": 3358 }, { "epoch": 1.8037314184087783, "grad_norm": 0.76171875, "learning_rate": 3.563684028940801e-05, "loss": 0.8091, "step": 3359 }, { "epoch": 1.8042683131438544, "grad_norm": 0.6953125, "learning_rate": 3.562919707589102e-05, "loss": 0.8003, "step": 3360 }, { "epoch": 1.8048052078789303, "grad_norm": 0.80078125, "learning_rate": 3.562155264944803e-05, "loss": 0.9384, "step": 3361 }, { "epoch": 1.8053421026140062, "grad_norm": 0.69921875, "learning_rate": 3.5613907010951384e-05, "loss": 0.9769, "step": 3362 }, { "epoch": 1.8058789973490823, "grad_norm": 0.61328125, "learning_rate": 3.560626016127353e-05, "loss": 1.0228, "step": 3363 }, { "epoch": 1.8064158920841582, "grad_norm": 0.71484375, "learning_rate": 3.559861210128706e-05, "loss": 0.8613, "step": 3364 }, { "epoch": 1.806952786819234, "grad_norm": 0.6171875, "learning_rate": 3.559096283186475e-05, "loss": 0.8571, "step": 3365 }, { "epoch": 1.8074896815543102, "grad_norm": 0.75390625, "learning_rate": 3.5583312353879444e-05, "loss": 1.0133, "step": 3366 }, { "epoch": 1.8080265762893863, "grad_norm": 0.703125, "learning_rate": 3.5575660668204176e-05, "loss": 0.9155, "step": 3367 }, { "epoch": 1.8085634710244622, "grad_norm": 0.7421875, "learning_rate": 3.55680077757121e-05, "loss": 0.9624, "step": 3368 }, { "epoch": 1.8091003657595381, "grad_norm": 0.78125, "learning_rate": 3.556035367727649e-05, "loss": 1.0062, "step": 3369 }, { "epoch": 1.8096372604946143, "grad_norm": 0.69140625, "learning_rate": 3.55526983737708e-05, "loss": 0.8793, "step": 3370 }, { "epoch": 1.8101741552296904, "grad_norm": 0.65625, "learning_rate": 3.554504186606859e-05, "loss": 0.9257, "step": 3371 }, { "epoch": 1.8107110499647663, "grad_norm": 0.5859375, "learning_rate": 3.5537384155043546e-05, "loss": 0.6916, "step": 3372 }, { "epoch": 1.8112479446998422, "grad_norm": 0.70703125, "learning_rate": 3.5529725241569536e-05, "loss": 0.885, "step": 3373 }, { "epoch": 1.8117848394349183, "grad_norm": 0.63671875, "learning_rate": 3.552206512652053e-05, "loss": 0.8285, "step": 3374 }, { "epoch": 1.8123217341699944, "grad_norm": 0.64453125, "learning_rate": 3.5514403810770635e-05, "loss": 0.7443, "step": 3375 }, { "epoch": 1.8128586289050703, "grad_norm": 0.6953125, "learning_rate": 3.5506741295194115e-05, "loss": 0.8971, "step": 3376 }, { "epoch": 1.8133955236401462, "grad_norm": 0.59375, "learning_rate": 3.5499077580665355e-05, "loss": 0.7787, "step": 3377 }, { "epoch": 1.8139324183752223, "grad_norm": 0.95703125, "learning_rate": 3.549141266805888e-05, "loss": 0.6698, "step": 3378 }, { "epoch": 1.8144693131102985, "grad_norm": 0.92578125, "learning_rate": 3.548374655824936e-05, "loss": 0.8459, "step": 3379 }, { "epoch": 1.8150062078453744, "grad_norm": 0.76953125, "learning_rate": 3.547607925211159e-05, "loss": 0.9377, "step": 3380 }, { "epoch": 1.8155431025804503, "grad_norm": 0.91796875, "learning_rate": 3.546841075052051e-05, "loss": 0.9742, "step": 3381 }, { "epoch": 1.8160799973155264, "grad_norm": 0.8359375, "learning_rate": 3.546074105435119e-05, "loss": 1.0094, "step": 3382 }, { "epoch": 1.8166168920506023, "grad_norm": 0.85546875, "learning_rate": 3.545307016447884e-05, "loss": 0.9406, "step": 3383 }, { "epoch": 1.8171537867856782, "grad_norm": 0.73046875, "learning_rate": 3.5445398081778814e-05, "loss": 0.9025, "step": 3384 }, { "epoch": 1.8176906815207543, "grad_norm": 0.875, "learning_rate": 3.543772480712658e-05, "loss": 0.9076, "step": 3385 }, { "epoch": 1.8182275762558304, "grad_norm": 0.60546875, "learning_rate": 3.543005034139776e-05, "loss": 0.8067, "step": 3386 }, { "epoch": 1.8187644709909063, "grad_norm": 0.65234375, "learning_rate": 3.542237468546812e-05, "loss": 0.6781, "step": 3387 }, { "epoch": 1.8193013657259822, "grad_norm": 0.88671875, "learning_rate": 3.541469784021353e-05, "loss": 0.9106, "step": 3388 }, { "epoch": 1.8198382604610583, "grad_norm": 0.609375, "learning_rate": 3.540701980651003e-05, "loss": 0.6246, "step": 3389 }, { "epoch": 1.8203751551961345, "grad_norm": 0.671875, "learning_rate": 3.5399340585233774e-05, "loss": 0.9278, "step": 3390 }, { "epoch": 1.8209120499312104, "grad_norm": 0.703125, "learning_rate": 3.5391660177261064e-05, "loss": 0.9695, "step": 3391 }, { "epoch": 1.8214489446662863, "grad_norm": 0.67578125, "learning_rate": 3.5383978583468324e-05, "loss": 0.7959, "step": 3392 }, { "epoch": 1.8219858394013624, "grad_norm": 0.82421875, "learning_rate": 3.537629580473213e-05, "loss": 1.0399, "step": 3393 }, { "epoch": 1.8225227341364385, "grad_norm": 0.7734375, "learning_rate": 3.536861184192917e-05, "loss": 0.9706, "step": 3394 }, { "epoch": 1.8230596288715144, "grad_norm": 0.65625, "learning_rate": 3.53609266959363e-05, "loss": 0.7903, "step": 3395 }, { "epoch": 1.8235965236065903, "grad_norm": 0.74609375, "learning_rate": 3.535324036763048e-05, "loss": 0.9438, "step": 3396 }, { "epoch": 1.8241334183416664, "grad_norm": 0.6796875, "learning_rate": 3.534555285788882e-05, "loss": 0.9121, "step": 3397 }, { "epoch": 1.8246703130767425, "grad_norm": 0.6484375, "learning_rate": 3.5337864167588566e-05, "loss": 0.9203, "step": 3398 }, { "epoch": 1.8252072078118184, "grad_norm": 0.71875, "learning_rate": 3.533017429760709e-05, "loss": 0.8828, "step": 3399 }, { "epoch": 1.8257441025468943, "grad_norm": 0.69921875, "learning_rate": 3.5322483248821904e-05, "loss": 0.8854, "step": 3400 }, { "epoch": 1.8257441025468943, "eval_loss": 0.9071779847145081, "eval_runtime": 46.4737, "eval_samples_per_second": 20.7, "eval_steps_per_second": 20.7, "step": 3400 }, { "epoch": 1.8262809972819705, "grad_norm": 0.65625, "learning_rate": 3.531479102211066e-05, "loss": 0.9293, "step": 3401 }, { "epoch": 1.8268178920170464, "grad_norm": 0.78515625, "learning_rate": 3.5307097618351126e-05, "loss": 1.0142, "step": 3402 }, { "epoch": 1.8273547867521223, "grad_norm": 0.66015625, "learning_rate": 3.529940303842123e-05, "loss": 0.77, "step": 3403 }, { "epoch": 1.8278916814871984, "grad_norm": 0.6796875, "learning_rate": 3.529170728319903e-05, "loss": 0.8665, "step": 3404 }, { "epoch": 1.8284285762222745, "grad_norm": 0.8203125, "learning_rate": 3.528401035356267e-05, "loss": 0.9725, "step": 3405 }, { "epoch": 1.8289654709573504, "grad_norm": 0.67578125, "learning_rate": 3.527631225039051e-05, "loss": 0.7628, "step": 3406 }, { "epoch": 1.8295023656924263, "grad_norm": 0.6796875, "learning_rate": 3.526861297456097e-05, "loss": 0.954, "step": 3407 }, { "epoch": 1.8300392604275024, "grad_norm": 0.7421875, "learning_rate": 3.526091252695266e-05, "loss": 0.8412, "step": 3408 }, { "epoch": 1.8305761551625785, "grad_norm": 0.59765625, "learning_rate": 3.5253210908444276e-05, "loss": 0.7154, "step": 3409 }, { "epoch": 1.8311130498976544, "grad_norm": 0.73046875, "learning_rate": 3.5245508119914687e-05, "loss": 0.759, "step": 3410 }, { "epoch": 1.8316499446327303, "grad_norm": 0.6953125, "learning_rate": 3.5237804162242874e-05, "loss": 0.8561, "step": 3411 }, { "epoch": 1.8321868393678065, "grad_norm": 0.6328125, "learning_rate": 3.523009903630796e-05, "loss": 0.8855, "step": 3412 }, { "epoch": 1.8327237341028826, "grad_norm": 0.69140625, "learning_rate": 3.522239274298918e-05, "loss": 0.7588, "step": 3413 }, { "epoch": 1.8332606288379585, "grad_norm": 0.65625, "learning_rate": 3.521468528316594e-05, "loss": 0.7918, "step": 3414 }, { "epoch": 1.8337975235730344, "grad_norm": 0.8984375, "learning_rate": 3.520697665771774e-05, "loss": 0.9868, "step": 3415 }, { "epoch": 1.8343344183081105, "grad_norm": 0.73046875, "learning_rate": 3.5199266867524254e-05, "loss": 0.9829, "step": 3416 }, { "epoch": 1.8348713130431866, "grad_norm": 0.8359375, "learning_rate": 3.5191555913465254e-05, "loss": 0.9624, "step": 3417 }, { "epoch": 1.8354082077782625, "grad_norm": 0.671875, "learning_rate": 3.518384379642066e-05, "loss": 0.8279, "step": 3418 }, { "epoch": 1.8359451025133384, "grad_norm": 0.74609375, "learning_rate": 3.517613051727051e-05, "loss": 0.8205, "step": 3419 }, { "epoch": 1.8364819972484145, "grad_norm": 0.69921875, "learning_rate": 3.516841607689501e-05, "loss": 0.8706, "step": 3420 }, { "epoch": 1.8370188919834904, "grad_norm": 0.671875, "learning_rate": 3.5160700476174456e-05, "loss": 0.8847, "step": 3421 }, { "epoch": 1.8375557867185663, "grad_norm": 0.703125, "learning_rate": 3.515298371598931e-05, "loss": 0.9828, "step": 3422 }, { "epoch": 1.8380926814536425, "grad_norm": 0.82421875, "learning_rate": 3.5145265797220134e-05, "loss": 0.975, "step": 3423 }, { "epoch": 1.8386295761887186, "grad_norm": 0.58984375, "learning_rate": 3.513754672074765e-05, "loss": 0.8465, "step": 3424 }, { "epoch": 1.8391664709237945, "grad_norm": 0.66796875, "learning_rate": 3.5129826487452724e-05, "loss": 0.8347, "step": 3425 }, { "epoch": 1.8397033656588704, "grad_norm": 0.79296875, "learning_rate": 3.51221050982163e-05, "loss": 0.8954, "step": 3426 }, { "epoch": 1.8402402603939465, "grad_norm": 0.66015625, "learning_rate": 3.511438255391948e-05, "loss": 0.772, "step": 3427 }, { "epoch": 1.8407771551290226, "grad_norm": 0.79296875, "learning_rate": 3.510665885544354e-05, "loss": 1.1601, "step": 3428 }, { "epoch": 1.8413140498640985, "grad_norm": 0.68359375, "learning_rate": 3.509893400366984e-05, "loss": 0.9285, "step": 3429 }, { "epoch": 1.8418509445991744, "grad_norm": 0.74609375, "learning_rate": 3.509120799947987e-05, "loss": 0.7532, "step": 3430 }, { "epoch": 1.8423878393342505, "grad_norm": 0.68359375, "learning_rate": 3.508348084375527e-05, "loss": 0.8838, "step": 3431 }, { "epoch": 1.8429247340693267, "grad_norm": 0.92578125, "learning_rate": 3.5075752537377816e-05, "loss": 0.7139, "step": 3432 }, { "epoch": 1.8434616288044026, "grad_norm": 0.88671875, "learning_rate": 3.5068023081229396e-05, "loss": 0.9805, "step": 3433 }, { "epoch": 1.8439985235394785, "grad_norm": 1.171875, "learning_rate": 3.5060292476192035e-05, "loss": 0.8156, "step": 3434 }, { "epoch": 1.8445354182745546, "grad_norm": 0.6484375, "learning_rate": 3.5052560723147895e-05, "loss": 0.8591, "step": 3435 }, { "epoch": 1.8450723130096307, "grad_norm": 0.69140625, "learning_rate": 3.5044827822979286e-05, "loss": 0.8497, "step": 3436 }, { "epoch": 1.8456092077447066, "grad_norm": 0.59375, "learning_rate": 3.503709377656861e-05, "loss": 0.8224, "step": 3437 }, { "epoch": 1.8461461024797825, "grad_norm": 0.7890625, "learning_rate": 3.5029358584798424e-05, "loss": 1.0091, "step": 3438 }, { "epoch": 1.8466829972148586, "grad_norm": 0.68359375, "learning_rate": 3.5021622248551414e-05, "loss": 0.8467, "step": 3439 }, { "epoch": 1.8472198919499345, "grad_norm": 0.8984375, "learning_rate": 3.501388476871039e-05, "loss": 0.8345, "step": 3440 }, { "epoch": 1.8477567866850104, "grad_norm": 0.83984375, "learning_rate": 3.500614614615829e-05, "loss": 0.7775, "step": 3441 }, { "epoch": 1.8482936814200865, "grad_norm": 0.79296875, "learning_rate": 3.4998406381778203e-05, "loss": 0.7873, "step": 3442 }, { "epoch": 1.8488305761551627, "grad_norm": 0.7265625, "learning_rate": 3.499066547645333e-05, "loss": 1.1564, "step": 3443 }, { "epoch": 1.8493674708902386, "grad_norm": 0.69140625, "learning_rate": 3.4982923431067e-05, "loss": 0.7893, "step": 3444 }, { "epoch": 1.8499043656253145, "grad_norm": 0.71484375, "learning_rate": 3.497518024650269e-05, "loss": 0.8709, "step": 3445 }, { "epoch": 1.8504412603603906, "grad_norm": 0.76953125, "learning_rate": 3.496743592364399e-05, "loss": 0.9887, "step": 3446 }, { "epoch": 1.8509781550954667, "grad_norm": 0.7265625, "learning_rate": 3.4959690463374615e-05, "loss": 1.0011, "step": 3447 }, { "epoch": 1.8515150498305426, "grad_norm": 0.640625, "learning_rate": 3.495194386657843e-05, "loss": 0.9166, "step": 3448 }, { "epoch": 1.8520519445656185, "grad_norm": 0.64453125, "learning_rate": 3.494419613413942e-05, "loss": 0.7669, "step": 3449 }, { "epoch": 1.8525888393006946, "grad_norm": 1.0625, "learning_rate": 3.493644726694171e-05, "loss": 0.81, "step": 3450 }, { "epoch": 1.8525888393006946, "eval_loss": 0.9073987603187561, "eval_runtime": 46.54, "eval_samples_per_second": 20.67, "eval_steps_per_second": 20.67, "step": 3450 }, { "epoch": 1.8531257340357707, "grad_norm": 0.71484375, "learning_rate": 3.4928697265869515e-05, "loss": 0.8877, "step": 3451 }, { "epoch": 1.8536626287708466, "grad_norm": 0.6953125, "learning_rate": 3.492094613180723e-05, "loss": 0.8905, "step": 3452 }, { "epoch": 1.8541995235059225, "grad_norm": 0.80078125, "learning_rate": 3.491319386563936e-05, "loss": 0.9149, "step": 3453 }, { "epoch": 1.8547364182409987, "grad_norm": 0.62109375, "learning_rate": 3.490544046825052e-05, "loss": 0.8345, "step": 3454 }, { "epoch": 1.8552733129760748, "grad_norm": 0.6171875, "learning_rate": 3.4897685940525485e-05, "loss": 0.7826, "step": 3455 }, { "epoch": 1.8558102077111507, "grad_norm": 0.7734375, "learning_rate": 3.4889930283349145e-05, "loss": 0.9327, "step": 3456 }, { "epoch": 1.8563471024462266, "grad_norm": 0.703125, "learning_rate": 3.48821734976065e-05, "loss": 0.617, "step": 3457 }, { "epoch": 1.8568839971813027, "grad_norm": 0.62890625, "learning_rate": 3.487441558418273e-05, "loss": 0.7964, "step": 3458 }, { "epoch": 1.8574208919163786, "grad_norm": 0.66796875, "learning_rate": 3.4866656543963084e-05, "loss": 1.0588, "step": 3459 }, { "epoch": 1.8579577866514545, "grad_norm": 0.80859375, "learning_rate": 3.4858896377832966e-05, "loss": 0.9347, "step": 3460 }, { "epoch": 1.8584946813865306, "grad_norm": 0.76171875, "learning_rate": 3.485113508667793e-05, "loss": 0.9807, "step": 3461 }, { "epoch": 1.8590315761216067, "grad_norm": 0.66796875, "learning_rate": 3.484337267138362e-05, "loss": 1.0128, "step": 3462 }, { "epoch": 1.8595684708566826, "grad_norm": 0.7578125, "learning_rate": 3.4835609132835834e-05, "loss": 1.0562, "step": 3463 }, { "epoch": 1.8601053655917585, "grad_norm": 0.76953125, "learning_rate": 3.482784447192049e-05, "loss": 0.7675, "step": 3464 }, { "epoch": 1.8606422603268347, "grad_norm": 0.87109375, "learning_rate": 3.482007868952362e-05, "loss": 1.0954, "step": 3465 }, { "epoch": 1.8611791550619108, "grad_norm": 0.66015625, "learning_rate": 3.4812311786531426e-05, "loss": 0.8647, "step": 3466 }, { "epoch": 1.8617160497969867, "grad_norm": 0.7734375, "learning_rate": 3.4804543763830195e-05, "loss": 0.8386, "step": 3467 }, { "epoch": 1.8622529445320626, "grad_norm": 0.7734375, "learning_rate": 3.4796774622306345e-05, "loss": 0.8632, "step": 3468 }, { "epoch": 1.8627898392671387, "grad_norm": 0.8125, "learning_rate": 3.478900436284645e-05, "loss": 1.1912, "step": 3469 }, { "epoch": 1.8633267340022148, "grad_norm": 0.7109375, "learning_rate": 3.4781232986337184e-05, "loss": 0.7997, "step": 3470 }, { "epoch": 1.8638636287372907, "grad_norm": 0.8984375, "learning_rate": 3.4773460493665376e-05, "loss": 0.8008, "step": 3471 }, { "epoch": 1.8644005234723666, "grad_norm": 0.77734375, "learning_rate": 3.4765686885717945e-05, "loss": 0.9192, "step": 3472 }, { "epoch": 1.8649374182074427, "grad_norm": 0.78515625, "learning_rate": 3.475791216338197e-05, "loss": 0.8563, "step": 3473 }, { "epoch": 1.8654743129425189, "grad_norm": 0.69921875, "learning_rate": 3.475013632754465e-05, "loss": 1.0418, "step": 3474 }, { "epoch": 1.8660112076775948, "grad_norm": 0.765625, "learning_rate": 3.474235937909329e-05, "loss": 0.967, "step": 3475 }, { "epoch": 1.8665481024126707, "grad_norm": 0.72265625, "learning_rate": 3.473458131891535e-05, "loss": 0.7923, "step": 3476 }, { "epoch": 1.8670849971477468, "grad_norm": 0.7109375, "learning_rate": 3.472680214789841e-05, "loss": 0.9474, "step": 3477 }, { "epoch": 1.8676218918828227, "grad_norm": 0.6875, "learning_rate": 3.471902186693016e-05, "loss": 0.8369, "step": 3478 }, { "epoch": 1.8681587866178986, "grad_norm": 0.61328125, "learning_rate": 3.471124047689843e-05, "loss": 0.7111, "step": 3479 }, { "epoch": 1.8686956813529747, "grad_norm": 0.67578125, "learning_rate": 3.470345797869118e-05, "loss": 0.8305, "step": 3480 }, { "epoch": 1.8692325760880508, "grad_norm": 0.73046875, "learning_rate": 3.46956743731965e-05, "loss": 0.9186, "step": 3481 }, { "epoch": 1.8697694708231267, "grad_norm": 0.953125, "learning_rate": 3.4687889661302576e-05, "loss": 0.8437, "step": 3482 }, { "epoch": 1.8703063655582026, "grad_norm": 0.67578125, "learning_rate": 3.468010384389777e-05, "loss": 0.6737, "step": 3483 }, { "epoch": 1.8708432602932787, "grad_norm": 0.77734375, "learning_rate": 3.4672316921870504e-05, "loss": 0.8664, "step": 3484 }, { "epoch": 1.8713801550283549, "grad_norm": 0.76953125, "learning_rate": 3.46645288961094e-05, "loss": 0.8586, "step": 3485 }, { "epoch": 1.8719170497634308, "grad_norm": 0.859375, "learning_rate": 3.4656739767503163e-05, "loss": 1.059, "step": 3486 }, { "epoch": 1.8724539444985067, "grad_norm": 0.6875, "learning_rate": 3.464894953694061e-05, "loss": 0.7063, "step": 3487 }, { "epoch": 1.8729908392335828, "grad_norm": 0.640625, "learning_rate": 3.464115820531074e-05, "loss": 0.7961, "step": 3488 }, { "epoch": 1.873527733968659, "grad_norm": 0.71484375, "learning_rate": 3.463336577350261e-05, "loss": 0.8118, "step": 3489 }, { "epoch": 1.8740646287037348, "grad_norm": 0.83984375, "learning_rate": 3.462557224240545e-05, "loss": 0.8889, "step": 3490 }, { "epoch": 1.8746015234388107, "grad_norm": 0.6484375, "learning_rate": 3.46177776129086e-05, "loss": 0.8418, "step": 3491 }, { "epoch": 1.8751384181738868, "grad_norm": 0.7578125, "learning_rate": 3.4609981885901524e-05, "loss": 1.0085, "step": 3492 }, { "epoch": 1.875675312908963, "grad_norm": 0.81640625, "learning_rate": 3.4602185062273826e-05, "loss": 1.0661, "step": 3493 }, { "epoch": 1.8762122076440388, "grad_norm": 0.828125, "learning_rate": 3.459438714291519e-05, "loss": 1.0243, "step": 3494 }, { "epoch": 1.8767491023791147, "grad_norm": 0.65625, "learning_rate": 3.458658812871549e-05, "loss": 0.9302, "step": 3495 }, { "epoch": 1.8772859971141909, "grad_norm": 0.80859375, "learning_rate": 3.457878802056469e-05, "loss": 0.8507, "step": 3496 }, { "epoch": 1.8778228918492668, "grad_norm": 0.6953125, "learning_rate": 3.457098681935286e-05, "loss": 0.7617, "step": 3497 }, { "epoch": 1.8783597865843427, "grad_norm": 0.6484375, "learning_rate": 3.456318452597022e-05, "loss": 0.8152, "step": 3498 }, { "epoch": 1.8788966813194188, "grad_norm": 0.6796875, "learning_rate": 3.455538114130713e-05, "loss": 0.8631, "step": 3499 }, { "epoch": 1.879433576054495, "grad_norm": 0.703125, "learning_rate": 3.454757666625404e-05, "loss": 0.9013, "step": 3500 }, { "epoch": 1.879433576054495, "eval_loss": 0.9068979024887085, "eval_runtime": 47.2179, "eval_samples_per_second": 20.374, "eval_steps_per_second": 20.374, "step": 3500 }, { "epoch": 1.8799704707895708, "grad_norm": 0.6484375, "learning_rate": 3.4539771101701537e-05, "loss": 0.8096, "step": 3501 }, { "epoch": 1.8805073655246467, "grad_norm": 0.76171875, "learning_rate": 3.453196444854034e-05, "loss": 1.0139, "step": 3502 }, { "epoch": 1.8810442602597228, "grad_norm": 0.65234375, "learning_rate": 3.4524156707661296e-05, "loss": 0.7859, "step": 3503 }, { "epoch": 1.881581154994799, "grad_norm": 0.62890625, "learning_rate": 3.451634787995535e-05, "loss": 0.7599, "step": 3504 }, { "epoch": 1.8821180497298748, "grad_norm": 0.7734375, "learning_rate": 3.45085379663136e-05, "loss": 0.9438, "step": 3505 }, { "epoch": 1.8826549444649507, "grad_norm": 0.6796875, "learning_rate": 3.450072696762725e-05, "loss": 0.9305, "step": 3506 }, { "epoch": 1.8831918392000269, "grad_norm": 0.75390625, "learning_rate": 3.449291488478763e-05, "loss": 1.0345, "step": 3507 }, { "epoch": 1.883728733935103, "grad_norm": 0.76953125, "learning_rate": 3.448510171868622e-05, "loss": 0.9365, "step": 3508 }, { "epoch": 1.8842656286701789, "grad_norm": 0.671875, "learning_rate": 3.447728747021457e-05, "loss": 0.9009, "step": 3509 }, { "epoch": 1.8848025234052548, "grad_norm": 0.6640625, "learning_rate": 3.44694721402644e-05, "loss": 0.9301, "step": 3510 }, { "epoch": 1.885339418140331, "grad_norm": 0.7109375, "learning_rate": 3.446165572972755e-05, "loss": 0.9063, "step": 3511 }, { "epoch": 1.885876312875407, "grad_norm": 0.6875, "learning_rate": 3.4453838239495946e-05, "loss": 0.7689, "step": 3512 }, { "epoch": 1.886413207610483, "grad_norm": 0.64453125, "learning_rate": 3.444601967046168e-05, "loss": 0.7711, "step": 3513 }, { "epoch": 1.8869501023455588, "grad_norm": 0.78125, "learning_rate": 3.443820002351694e-05, "loss": 0.85, "step": 3514 }, { "epoch": 1.887486997080635, "grad_norm": 0.9609375, "learning_rate": 3.443037929955406e-05, "loss": 0.9214, "step": 3515 }, { "epoch": 1.8880238918157108, "grad_norm": 0.69140625, "learning_rate": 3.442255749946548e-05, "loss": 0.7868, "step": 3516 }, { "epoch": 1.8885607865507867, "grad_norm": 0.734375, "learning_rate": 3.441473462414375e-05, "loss": 0.8949, "step": 3517 }, { "epoch": 1.8890976812858629, "grad_norm": 0.66015625, "learning_rate": 3.440691067448159e-05, "loss": 0.7483, "step": 3518 }, { "epoch": 1.889634576020939, "grad_norm": 0.671875, "learning_rate": 3.439908565137178e-05, "loss": 0.9311, "step": 3519 }, { "epoch": 1.8901714707560149, "grad_norm": 0.96484375, "learning_rate": 3.439125955570725e-05, "loss": 1.1153, "step": 3520 }, { "epoch": 1.8907083654910908, "grad_norm": 0.67578125, "learning_rate": 3.438343238838111e-05, "loss": 0.8369, "step": 3521 }, { "epoch": 1.891245260226167, "grad_norm": 0.71484375, "learning_rate": 3.437560415028648e-05, "loss": 1.0822, "step": 3522 }, { "epoch": 1.891782154961243, "grad_norm": 0.7890625, "learning_rate": 3.4367774842316683e-05, "loss": 0.8995, "step": 3523 }, { "epoch": 1.892319049696319, "grad_norm": 0.59375, "learning_rate": 3.4359944465365156e-05, "loss": 0.8483, "step": 3524 }, { "epoch": 1.8928559444313948, "grad_norm": 0.8515625, "learning_rate": 3.4352113020325424e-05, "loss": 0.9264, "step": 3525 }, { "epoch": 1.893392839166471, "grad_norm": 0.58984375, "learning_rate": 3.434428050809116e-05, "loss": 0.7832, "step": 3526 }, { "epoch": 1.893929733901547, "grad_norm": 0.77734375, "learning_rate": 3.433644692955616e-05, "loss": 0.9841, "step": 3527 }, { "epoch": 1.894466628636623, "grad_norm": 0.65234375, "learning_rate": 3.432861228561432e-05, "loss": 0.9326, "step": 3528 }, { "epoch": 1.8950035233716989, "grad_norm": 0.73828125, "learning_rate": 3.432077657715969e-05, "loss": 1.0263, "step": 3529 }, { "epoch": 1.895540418106775, "grad_norm": 1.0078125, "learning_rate": 3.4312939805086413e-05, "loss": 0.8403, "step": 3530 }, { "epoch": 1.896077312841851, "grad_norm": 0.69140625, "learning_rate": 3.4305101970288764e-05, "loss": 0.9852, "step": 3531 }, { "epoch": 1.896614207576927, "grad_norm": 0.66796875, "learning_rate": 3.4297263073661136e-05, "loss": 1.024, "step": 3532 }, { "epoch": 1.897151102312003, "grad_norm": 0.80078125, "learning_rate": 3.4289423116098056e-05, "loss": 0.9328, "step": 3533 }, { "epoch": 1.897687997047079, "grad_norm": 0.65234375, "learning_rate": 3.428158209849416e-05, "loss": 0.9998, "step": 3534 }, { "epoch": 1.898224891782155, "grad_norm": 0.70703125, "learning_rate": 3.42737400217442e-05, "loss": 0.9602, "step": 3535 }, { "epoch": 1.8987617865172308, "grad_norm": 0.76171875, "learning_rate": 3.4265896886743054e-05, "loss": 1.1247, "step": 3536 }, { "epoch": 1.899298681252307, "grad_norm": 0.6640625, "learning_rate": 3.425805269438574e-05, "loss": 0.852, "step": 3537 }, { "epoch": 1.899835575987383, "grad_norm": 0.671875, "learning_rate": 3.4250207445567376e-05, "loss": 0.8087, "step": 3538 }, { "epoch": 1.900372470722459, "grad_norm": 0.69921875, "learning_rate": 3.424236114118319e-05, "loss": 0.787, "step": 3539 }, { "epoch": 1.9009093654575349, "grad_norm": 0.66796875, "learning_rate": 3.423451378212855e-05, "loss": 0.8916, "step": 3540 }, { "epoch": 1.901446260192611, "grad_norm": 0.66796875, "learning_rate": 3.4226665369298945e-05, "loss": 0.9666, "step": 3541 }, { "epoch": 1.901983154927687, "grad_norm": 0.625, "learning_rate": 3.4218815903589974e-05, "loss": 0.8278, "step": 3542 }, { "epoch": 1.902520049662763, "grad_norm": 0.69921875, "learning_rate": 3.421096538589736e-05, "loss": 0.7801, "step": 3543 }, { "epoch": 1.903056944397839, "grad_norm": 0.6875, "learning_rate": 3.4203113817116957e-05, "loss": 0.8558, "step": 3544 }, { "epoch": 1.903593839132915, "grad_norm": 0.73046875, "learning_rate": 3.4195261198144704e-05, "loss": 0.7254, "step": 3545 }, { "epoch": 1.9041307338679911, "grad_norm": 0.8671875, "learning_rate": 3.418740752987672e-05, "loss": 0.9899, "step": 3546 }, { "epoch": 1.904667628603067, "grad_norm": 0.72265625, "learning_rate": 3.417955281320917e-05, "loss": 1.0206, "step": 3547 }, { "epoch": 1.905204523338143, "grad_norm": 0.65625, "learning_rate": 3.41716970490384e-05, "loss": 0.92, "step": 3548 }, { "epoch": 1.905741418073219, "grad_norm": 0.6796875, "learning_rate": 3.416384023826085e-05, "loss": 0.7991, "step": 3549 }, { "epoch": 1.9062783128082952, "grad_norm": 0.85546875, "learning_rate": 3.415598238177307e-05, "loss": 0.8539, "step": 3550 }, { "epoch": 1.9062783128082952, "eval_loss": 0.9063857793807983, "eval_runtime": 46.3907, "eval_samples_per_second": 20.737, "eval_steps_per_second": 20.737, "step": 3550 }, { "epoch": 1.906815207543371, "grad_norm": 0.66015625, "learning_rate": 3.4148123480471764e-05, "loss": 0.9112, "step": 3551 }, { "epoch": 1.907352102278447, "grad_norm": 0.88671875, "learning_rate": 3.41402635352537e-05, "loss": 0.8875, "step": 3552 }, { "epoch": 1.907888997013523, "grad_norm": 0.66796875, "learning_rate": 3.413240254701582e-05, "loss": 0.8938, "step": 3553 }, { "epoch": 1.908425891748599, "grad_norm": 0.859375, "learning_rate": 3.412454051665516e-05, "loss": 0.9102, "step": 3554 }, { "epoch": 1.908962786483675, "grad_norm": 0.65234375, "learning_rate": 3.411667744506887e-05, "loss": 0.8858, "step": 3555 }, { "epoch": 1.909499681218751, "grad_norm": 0.6875, "learning_rate": 3.4108813333154236e-05, "loss": 0.8897, "step": 3556 }, { "epoch": 1.9100365759538271, "grad_norm": 0.6953125, "learning_rate": 3.4100948181808645e-05, "loss": 0.7843, "step": 3557 }, { "epoch": 1.910573470688903, "grad_norm": 0.734375, "learning_rate": 3.409308199192961e-05, "loss": 0.8233, "step": 3558 }, { "epoch": 1.911110365423979, "grad_norm": 0.7734375, "learning_rate": 3.408521476441477e-05, "loss": 0.9662, "step": 3559 }, { "epoch": 1.911647260159055, "grad_norm": 0.88671875, "learning_rate": 3.407734650016187e-05, "loss": 0.909, "step": 3560 }, { "epoch": 1.9121841548941312, "grad_norm": 0.94140625, "learning_rate": 3.4069477200068776e-05, "loss": 0.9564, "step": 3561 }, { "epoch": 1.912721049629207, "grad_norm": 0.578125, "learning_rate": 3.406160686503348e-05, "loss": 0.7775, "step": 3562 }, { "epoch": 1.913257944364283, "grad_norm": 0.66796875, "learning_rate": 3.405373549595408e-05, "loss": 0.9723, "step": 3563 }, { "epoch": 1.913794839099359, "grad_norm": 0.67578125, "learning_rate": 3.40458630937288e-05, "loss": 0.916, "step": 3564 }, { "epoch": 1.9143317338344352, "grad_norm": 0.71484375, "learning_rate": 3.4037989659256e-05, "loss": 0.752, "step": 3565 }, { "epoch": 1.9148686285695111, "grad_norm": 0.62109375, "learning_rate": 3.403011519343411e-05, "loss": 0.7295, "step": 3566 }, { "epoch": 1.915405523304587, "grad_norm": 0.72265625, "learning_rate": 3.4022239697161724e-05, "loss": 0.8212, "step": 3567 }, { "epoch": 1.9159424180396631, "grad_norm": 0.68359375, "learning_rate": 3.401436317133753e-05, "loss": 0.7814, "step": 3568 }, { "epoch": 1.9164793127747393, "grad_norm": 0.71484375, "learning_rate": 3.400648561686034e-05, "loss": 0.932, "step": 3569 }, { "epoch": 1.9170162075098152, "grad_norm": 0.71875, "learning_rate": 3.399860703462908e-05, "loss": 1.0507, "step": 3570 }, { "epoch": 1.917553102244891, "grad_norm": 0.72265625, "learning_rate": 3.3990727425542814e-05, "loss": 0.8958, "step": 3571 }, { "epoch": 1.9180899969799672, "grad_norm": 0.66796875, "learning_rate": 3.398284679050067e-05, "loss": 0.7969, "step": 3572 }, { "epoch": 1.918626891715043, "grad_norm": 0.77734375, "learning_rate": 3.397496513040196e-05, "loss": 0.9821, "step": 3573 }, { "epoch": 1.919163786450119, "grad_norm": 0.8046875, "learning_rate": 3.396708244614607e-05, "loss": 1.1287, "step": 3574 }, { "epoch": 1.919700681185195, "grad_norm": 0.8203125, "learning_rate": 3.39591987386325e-05, "loss": 0.8573, "step": 3575 }, { "epoch": 1.9202375759202712, "grad_norm": 0.69140625, "learning_rate": 3.395131400876092e-05, "loss": 0.7293, "step": 3576 }, { "epoch": 1.9207744706553471, "grad_norm": 0.75390625, "learning_rate": 3.3943428257431034e-05, "loss": 0.8483, "step": 3577 }, { "epoch": 1.921311365390423, "grad_norm": 0.65625, "learning_rate": 3.393554148554273e-05, "loss": 0.8788, "step": 3578 }, { "epoch": 1.9218482601254991, "grad_norm": 0.75, "learning_rate": 3.392765369399599e-05, "loss": 0.9468, "step": 3579 }, { "epoch": 1.9223851548605753, "grad_norm": 0.78515625, "learning_rate": 3.391976488369089e-05, "loss": 0.7401, "step": 3580 }, { "epoch": 1.9229220495956512, "grad_norm": 0.75390625, "learning_rate": 3.3911875055527666e-05, "loss": 0.8656, "step": 3581 }, { "epoch": 1.923458944330727, "grad_norm": 0.6953125, "learning_rate": 3.390398421040664e-05, "loss": 0.9307, "step": 3582 }, { "epoch": 1.9239958390658032, "grad_norm": 0.65625, "learning_rate": 3.389609234922824e-05, "loss": 0.9472, "step": 3583 }, { "epoch": 1.9245327338008793, "grad_norm": 0.609375, "learning_rate": 3.388819947289307e-05, "loss": 0.7706, "step": 3584 }, { "epoch": 1.9250696285359552, "grad_norm": 0.578125, "learning_rate": 3.3880305582301766e-05, "loss": 0.7794, "step": 3585 }, { "epoch": 1.925606523271031, "grad_norm": 0.8125, "learning_rate": 3.387241067835513e-05, "loss": 1.0654, "step": 3586 }, { "epoch": 1.9261434180061072, "grad_norm": 0.671875, "learning_rate": 3.386451476195408e-05, "loss": 0.8464, "step": 3587 }, { "epoch": 1.9266803127411833, "grad_norm": 0.6796875, "learning_rate": 3.3856617833999635e-05, "loss": 0.7479, "step": 3588 }, { "epoch": 1.927217207476259, "grad_norm": 0.76171875, "learning_rate": 3.384871989539293e-05, "loss": 0.7932, "step": 3589 }, { "epoch": 1.9277541022113351, "grad_norm": 0.65625, "learning_rate": 3.3840820947035234e-05, "loss": 0.7046, "step": 3590 }, { "epoch": 1.9282909969464113, "grad_norm": 0.75, "learning_rate": 3.38329209898279e-05, "loss": 0.9476, "step": 3591 }, { "epoch": 1.9288278916814872, "grad_norm": 0.6875, "learning_rate": 3.3825020024672434e-05, "loss": 0.8789, "step": 3592 }, { "epoch": 1.929364786416563, "grad_norm": 0.78125, "learning_rate": 3.381711805247041e-05, "loss": 0.9042, "step": 3593 }, { "epoch": 1.9299016811516392, "grad_norm": 0.6015625, "learning_rate": 3.380921507412356e-05, "loss": 0.7703, "step": 3594 }, { "epoch": 1.9304385758867153, "grad_norm": 0.68359375, "learning_rate": 3.380131109053371e-05, "loss": 0.9292, "step": 3595 }, { "epoch": 1.9309754706217912, "grad_norm": 0.640625, "learning_rate": 3.379340610260281e-05, "loss": 0.9689, "step": 3596 }, { "epoch": 1.931512365356867, "grad_norm": 0.625, "learning_rate": 3.378550011123291e-05, "loss": 0.7642, "step": 3597 }, { "epoch": 1.9320492600919432, "grad_norm": 0.73828125, "learning_rate": 3.377759311732619e-05, "loss": 0.9143, "step": 3598 }, { "epoch": 1.9325861548270193, "grad_norm": 0.71484375, "learning_rate": 3.3769685121784935e-05, "loss": 0.8401, "step": 3599 }, { "epoch": 1.9331230495620952, "grad_norm": 0.703125, "learning_rate": 3.376177612551155e-05, "loss": 0.7346, "step": 3600 }, { "epoch": 1.9331230495620952, "eval_loss": 0.9065521955490112, "eval_runtime": 46.0053, "eval_samples_per_second": 20.911, "eval_steps_per_second": 20.911, "step": 3600 }, { "epoch": 1.9336599442971711, "grad_norm": 0.6953125, "learning_rate": 3.375386612940855e-05, "loss": 1.0007, "step": 3601 }, { "epoch": 1.9341968390322473, "grad_norm": 0.66796875, "learning_rate": 3.374595513437856e-05, "loss": 0.8313, "step": 3602 }, { "epoch": 1.9347337337673234, "grad_norm": 0.640625, "learning_rate": 3.3738043141324336e-05, "loss": 0.799, "step": 3603 }, { "epoch": 1.9352706285023993, "grad_norm": 0.6640625, "learning_rate": 3.373013015114873e-05, "loss": 0.8217, "step": 3604 }, { "epoch": 1.9358075232374752, "grad_norm": 0.75390625, "learning_rate": 3.372221616475472e-05, "loss": 0.8775, "step": 3605 }, { "epoch": 1.9363444179725513, "grad_norm": 0.69921875, "learning_rate": 3.3714301183045385e-05, "loss": 0.8575, "step": 3606 }, { "epoch": 1.9368813127076274, "grad_norm": 0.7265625, "learning_rate": 3.370638520692393e-05, "loss": 1.1144, "step": 3607 }, { "epoch": 1.937418207442703, "grad_norm": 0.61328125, "learning_rate": 3.369846823729366e-05, "loss": 0.7982, "step": 3608 }, { "epoch": 1.9379551021777792, "grad_norm": 0.76953125, "learning_rate": 3.369055027505801e-05, "loss": 0.8817, "step": 3609 }, { "epoch": 1.9384919969128553, "grad_norm": 0.73046875, "learning_rate": 3.3682631321120504e-05, "loss": 0.749, "step": 3610 }, { "epoch": 1.9390288916479312, "grad_norm": 0.62890625, "learning_rate": 3.3674711376384824e-05, "loss": 0.7168, "step": 3611 }, { "epoch": 1.9395657863830071, "grad_norm": 0.6484375, "learning_rate": 3.366679044175471e-05, "loss": 0.73, "step": 3612 }, { "epoch": 1.9401026811180833, "grad_norm": 0.60546875, "learning_rate": 3.365886851813406e-05, "loss": 0.819, "step": 3613 }, { "epoch": 1.9406395758531594, "grad_norm": 0.83984375, "learning_rate": 3.365094560642685e-05, "loss": 0.8556, "step": 3614 }, { "epoch": 1.9411764705882353, "grad_norm": 0.875, "learning_rate": 3.36430217075372e-05, "loss": 0.8645, "step": 3615 }, { "epoch": 1.9417133653233112, "grad_norm": 0.71484375, "learning_rate": 3.363509682236931e-05, "loss": 0.8661, "step": 3616 }, { "epoch": 1.9422502600583873, "grad_norm": 0.62890625, "learning_rate": 3.362717095182751e-05, "loss": 0.8436, "step": 3617 }, { "epoch": 1.9427871547934634, "grad_norm": 0.70703125, "learning_rate": 3.361924409681626e-05, "loss": 0.9804, "step": 3618 }, { "epoch": 1.9433240495285393, "grad_norm": 0.6171875, "learning_rate": 3.36113162582401e-05, "loss": 0.8653, "step": 3619 }, { "epoch": 1.9438609442636152, "grad_norm": 0.80859375, "learning_rate": 3.360338743700371e-05, "loss": 0.9412, "step": 3620 }, { "epoch": 1.9443978389986913, "grad_norm": 0.69921875, "learning_rate": 3.359545763401185e-05, "loss": 0.6573, "step": 3621 }, { "epoch": 1.9449347337337675, "grad_norm": 0.66015625, "learning_rate": 3.358752685016943e-05, "loss": 0.9258, "step": 3622 }, { "epoch": 1.9454716284688434, "grad_norm": 0.6953125, "learning_rate": 3.357959508638144e-05, "loss": 0.8442, "step": 3623 }, { "epoch": 1.9460085232039193, "grad_norm": 0.6015625, "learning_rate": 3.3571662343553e-05, "loss": 0.7089, "step": 3624 }, { "epoch": 1.9465454179389954, "grad_norm": 0.66015625, "learning_rate": 3.356372862258934e-05, "loss": 0.8417, "step": 3625 }, { "epoch": 1.9470823126740715, "grad_norm": 0.65625, "learning_rate": 3.355579392439578e-05, "loss": 0.894, "step": 3626 }, { "epoch": 1.9476192074091472, "grad_norm": 0.71875, "learning_rate": 3.354785824987779e-05, "loss": 0.9974, "step": 3627 }, { "epoch": 1.9481561021442233, "grad_norm": 0.71875, "learning_rate": 3.3539921599940925e-05, "loss": 0.7023, "step": 3628 }, { "epoch": 1.9486929968792994, "grad_norm": 0.58984375, "learning_rate": 3.353198397549086e-05, "loss": 0.7872, "step": 3629 }, { "epoch": 1.9492298916143753, "grad_norm": 0.78515625, "learning_rate": 3.352404537743337e-05, "loss": 0.9705, "step": 3630 }, { "epoch": 1.9497667863494512, "grad_norm": 0.75, "learning_rate": 3.351610580667436e-05, "loss": 1.1664, "step": 3631 }, { "epoch": 1.9503036810845273, "grad_norm": 0.78125, "learning_rate": 3.350816526411982e-05, "loss": 0.9449, "step": 3632 }, { "epoch": 1.9508405758196035, "grad_norm": 0.703125, "learning_rate": 3.350022375067588e-05, "loss": 0.924, "step": 3633 }, { "epoch": 1.9513774705546794, "grad_norm": 0.63671875, "learning_rate": 3.3492281267248774e-05, "loss": 0.9257, "step": 3634 }, { "epoch": 1.9519143652897553, "grad_norm": 0.6328125, "learning_rate": 3.348433781474481e-05, "loss": 0.8889, "step": 3635 }, { "epoch": 1.9524512600248314, "grad_norm": 0.81640625, "learning_rate": 3.347639339407047e-05, "loss": 0.8841, "step": 3636 }, { "epoch": 1.9529881547599075, "grad_norm": 0.6875, "learning_rate": 3.346844800613229e-05, "loss": 0.9369, "step": 3637 }, { "epoch": 1.9535250494949834, "grad_norm": 0.7265625, "learning_rate": 3.346050165183695e-05, "loss": 0.9178, "step": 3638 }, { "epoch": 1.9540619442300593, "grad_norm": 0.6640625, "learning_rate": 3.3452554332091234e-05, "loss": 0.8964, "step": 3639 }, { "epoch": 1.9545988389651354, "grad_norm": 0.69140625, "learning_rate": 3.344460604780202e-05, "loss": 0.8634, "step": 3640 }, { "epoch": 1.9551357337002115, "grad_norm": 0.71875, "learning_rate": 3.343665679987631e-05, "loss": 0.7428, "step": 3641 }, { "epoch": 1.9556726284352874, "grad_norm": 0.68359375, "learning_rate": 3.342870658922123e-05, "loss": 0.9646, "step": 3642 }, { "epoch": 1.9562095231703633, "grad_norm": 0.765625, "learning_rate": 3.342075541674398e-05, "loss": 0.996, "step": 3643 }, { "epoch": 1.9567464179054395, "grad_norm": 0.65234375, "learning_rate": 3.3412803283351895e-05, "loss": 0.8717, "step": 3644 }, { "epoch": 1.9572833126405156, "grad_norm": 0.6953125, "learning_rate": 3.3404850189952416e-05, "loss": 0.8542, "step": 3645 }, { "epoch": 1.9578202073755913, "grad_norm": 0.765625, "learning_rate": 3.339689613745309e-05, "loss": 0.8331, "step": 3646 }, { "epoch": 1.9583571021106674, "grad_norm": 0.76171875, "learning_rate": 3.3388941126761584e-05, "loss": 0.9255, "step": 3647 }, { "epoch": 1.9588939968457435, "grad_norm": 0.75390625, "learning_rate": 3.3380985158785654e-05, "loss": 0.9237, "step": 3648 }, { "epoch": 1.9594308915808194, "grad_norm": 0.78515625, "learning_rate": 3.337302823443318e-05, "loss": 1.1332, "step": 3649 }, { "epoch": 1.9599677863158953, "grad_norm": 0.90625, "learning_rate": 3.336507035461215e-05, "loss": 0.9698, "step": 3650 }, { "epoch": 1.9599677863158953, "eval_loss": 0.9060580730438232, "eval_runtime": 46.6383, "eval_samples_per_second": 20.627, "eval_steps_per_second": 20.627, "step": 3650 }, { "epoch": 1.9605046810509714, "grad_norm": 0.7734375, "learning_rate": 3.335711152023066e-05, "loss": 0.9829, "step": 3651 }, { "epoch": 1.9610415757860475, "grad_norm": 0.703125, "learning_rate": 3.3349151732196904e-05, "loss": 0.9752, "step": 3652 }, { "epoch": 1.9615784705211234, "grad_norm": 0.83203125, "learning_rate": 3.3341190991419204e-05, "loss": 1.0297, "step": 3653 }, { "epoch": 1.9621153652561993, "grad_norm": 0.66015625, "learning_rate": 3.333322929880598e-05, "loss": 0.9164, "step": 3654 }, { "epoch": 1.9626522599912755, "grad_norm": 0.73046875, "learning_rate": 3.332526665526577e-05, "loss": 0.8236, "step": 3655 }, { "epoch": 1.9631891547263516, "grad_norm": 0.66796875, "learning_rate": 3.331730306170719e-05, "loss": 0.8501, "step": 3656 }, { "epoch": 1.9637260494614275, "grad_norm": 0.70703125, "learning_rate": 3.3309338519039015e-05, "loss": 0.9423, "step": 3657 }, { "epoch": 1.9642629441965034, "grad_norm": 1.28125, "learning_rate": 3.330137302817007e-05, "loss": 1.0223, "step": 3658 }, { "epoch": 1.9647998389315795, "grad_norm": 0.63671875, "learning_rate": 3.329340659000934e-05, "loss": 0.8156, "step": 3659 }, { "epoch": 1.9653367336666556, "grad_norm": 0.75, "learning_rate": 3.32854392054659e-05, "loss": 0.8239, "step": 3660 }, { "epoch": 1.9658736284017315, "grad_norm": 0.59375, "learning_rate": 3.327747087544891e-05, "loss": 0.6689, "step": 3661 }, { "epoch": 1.9664105231368074, "grad_norm": 0.890625, "learning_rate": 3.326950160086766e-05, "loss": 1.016, "step": 3662 }, { "epoch": 1.9669474178718835, "grad_norm": 0.640625, "learning_rate": 3.3261531382631574e-05, "loss": 0.7536, "step": 3663 }, { "epoch": 1.9674843126069597, "grad_norm": 0.91015625, "learning_rate": 3.325356022165012e-05, "loss": 1.2018, "step": 3664 }, { "epoch": 1.9680212073420353, "grad_norm": 0.66015625, "learning_rate": 3.324558811883292e-05, "loss": 0.9699, "step": 3665 }, { "epoch": 1.9685581020771115, "grad_norm": 0.80859375, "learning_rate": 3.3237615075089704e-05, "loss": 0.997, "step": 3666 }, { "epoch": 1.9690949968121876, "grad_norm": 0.69921875, "learning_rate": 3.3229641091330274e-05, "loss": 0.8508, "step": 3667 }, { "epoch": 1.9696318915472635, "grad_norm": 0.8203125, "learning_rate": 3.322166616846458e-05, "loss": 0.9971, "step": 3668 }, { "epoch": 1.9701687862823394, "grad_norm": 0.82421875, "learning_rate": 3.3213690307402666e-05, "loss": 0.976, "step": 3669 }, { "epoch": 1.9707056810174155, "grad_norm": 0.6484375, "learning_rate": 3.320571350905466e-05, "loss": 0.9101, "step": 3670 }, { "epoch": 1.9712425757524916, "grad_norm": 0.74609375, "learning_rate": 3.319773577433083e-05, "loss": 0.9938, "step": 3671 }, { "epoch": 1.9717794704875675, "grad_norm": 0.703125, "learning_rate": 3.3189757104141536e-05, "loss": 0.949, "step": 3672 }, { "epoch": 1.9723163652226434, "grad_norm": 0.73828125, "learning_rate": 3.318177749939723e-05, "loss": 1.0143, "step": 3673 }, { "epoch": 1.9728532599577195, "grad_norm": 0.74609375, "learning_rate": 3.31737969610085e-05, "loss": 0.8021, "step": 3674 }, { "epoch": 1.9733901546927957, "grad_norm": 0.6171875, "learning_rate": 3.316581548988603e-05, "loss": 0.8257, "step": 3675 }, { "epoch": 1.9739270494278716, "grad_norm": 0.640625, "learning_rate": 3.31578330869406e-05, "loss": 0.7043, "step": 3676 }, { "epoch": 1.9744639441629475, "grad_norm": 0.859375, "learning_rate": 3.314984975308311e-05, "loss": 0.8382, "step": 3677 }, { "epoch": 1.9750008388980236, "grad_norm": 0.65234375, "learning_rate": 3.3141865489224546e-05, "loss": 0.7103, "step": 3678 }, { "epoch": 1.9755377336330997, "grad_norm": 0.85546875, "learning_rate": 3.313388029627601e-05, "loss": 0.799, "step": 3679 }, { "epoch": 1.9760746283681756, "grad_norm": 0.62109375, "learning_rate": 3.312589417514874e-05, "loss": 0.8044, "step": 3680 }, { "epoch": 1.9766115231032515, "grad_norm": 0.7578125, "learning_rate": 3.311790712675403e-05, "loss": 0.8228, "step": 3681 }, { "epoch": 1.9771484178383276, "grad_norm": 0.828125, "learning_rate": 3.310991915200332e-05, "loss": 1.1587, "step": 3682 }, { "epoch": 1.9776853125734037, "grad_norm": 0.66796875, "learning_rate": 3.3101930251808134e-05, "loss": 0.7751, "step": 3683 }, { "epoch": 1.9782222073084794, "grad_norm": 0.71484375, "learning_rate": 3.309394042708009e-05, "loss": 0.6996, "step": 3684 }, { "epoch": 1.9787591020435555, "grad_norm": 0.90234375, "learning_rate": 3.308594967873095e-05, "loss": 0.8803, "step": 3685 }, { "epoch": 1.9792959967786317, "grad_norm": 0.609375, "learning_rate": 3.307795800767256e-05, "loss": 0.9486, "step": 3686 }, { "epoch": 1.9798328915137076, "grad_norm": 0.68359375, "learning_rate": 3.3069965414816853e-05, "loss": 0.74, "step": 3687 }, { "epoch": 1.9803697862487835, "grad_norm": 0.90234375, "learning_rate": 3.306197190107589e-05, "loss": 0.8933, "step": 3688 }, { "epoch": 1.9809066809838596, "grad_norm": 0.8515625, "learning_rate": 3.305397746736184e-05, "loss": 1.0735, "step": 3689 }, { "epoch": 1.9814435757189357, "grad_norm": 0.6015625, "learning_rate": 3.304598211458697e-05, "loss": 0.8555, "step": 3690 }, { "epoch": 1.9819804704540116, "grad_norm": 0.85546875, "learning_rate": 3.3037985843663643e-05, "loss": 1.0123, "step": 3691 }, { "epoch": 1.9825173651890875, "grad_norm": 0.69921875, "learning_rate": 3.302998865550434e-05, "loss": 0.7485, "step": 3692 }, { "epoch": 1.9830542599241636, "grad_norm": 0.7109375, "learning_rate": 3.302199055102163e-05, "loss": 0.8797, "step": 3693 }, { "epoch": 1.9835911546592397, "grad_norm": 0.73828125, "learning_rate": 3.301399153112822e-05, "loss": 1.0695, "step": 3694 }, { "epoch": 1.9841280493943156, "grad_norm": 0.59765625, "learning_rate": 3.3005991596736884e-05, "loss": 0.7878, "step": 3695 }, { "epoch": 1.9846649441293915, "grad_norm": 0.578125, "learning_rate": 3.299799074876052e-05, "loss": 0.7779, "step": 3696 }, { "epoch": 1.9852018388644677, "grad_norm": 0.6875, "learning_rate": 3.298998898811213e-05, "loss": 0.7775, "step": 3697 }, { "epoch": 1.9857387335995438, "grad_norm": 0.83203125, "learning_rate": 3.29819863157048e-05, "loss": 0.9006, "step": 3698 }, { "epoch": 1.9862756283346197, "grad_norm": 0.75390625, "learning_rate": 3.2973982732451755e-05, "loss": 0.7849, "step": 3699 }, { "epoch": 1.9868125230696956, "grad_norm": 0.6640625, "learning_rate": 3.296597823926629e-05, "loss": 0.7256, "step": 3700 }, { "epoch": 1.9868125230696956, "eval_loss": 0.9062338471412659, "eval_runtime": 46.4266, "eval_samples_per_second": 20.721, "eval_steps_per_second": 20.721, "step": 3700 }, { "epoch": 1.9873494178047717, "grad_norm": 0.72265625, "learning_rate": 3.295797283706183e-05, "loss": 0.8203, "step": 3701 }, { "epoch": 1.9878863125398478, "grad_norm": 0.61328125, "learning_rate": 3.29499665267519e-05, "loss": 0.8417, "step": 3702 }, { "epoch": 1.9884232072749235, "grad_norm": 0.65625, "learning_rate": 3.2941959309250094e-05, "loss": 0.8639, "step": 3703 }, { "epoch": 1.9889601020099996, "grad_norm": 0.640625, "learning_rate": 3.293395118547017e-05, "loss": 0.9724, "step": 3704 }, { "epoch": 1.9894969967450757, "grad_norm": 0.89453125, "learning_rate": 3.292594215632593e-05, "loss": 1.0296, "step": 3705 }, { "epoch": 1.9900338914801516, "grad_norm": 0.765625, "learning_rate": 3.2917932222731325e-05, "loss": 0.8509, "step": 3706 }, { "epoch": 1.9905707862152275, "grad_norm": 0.65234375, "learning_rate": 3.290992138560038e-05, "loss": 0.8924, "step": 3707 }, { "epoch": 1.9911076809503037, "grad_norm": 0.84375, "learning_rate": 3.290190964584723e-05, "loss": 0.9232, "step": 3708 }, { "epoch": 1.9916445756853798, "grad_norm": 0.8203125, "learning_rate": 3.289389700438611e-05, "loss": 1.0032, "step": 3709 }, { "epoch": 1.9921814704204557, "grad_norm": 0.8125, "learning_rate": 3.2885883462131394e-05, "loss": 0.7262, "step": 3710 }, { "epoch": 1.9927183651555316, "grad_norm": 0.609375, "learning_rate": 3.28778690199975e-05, "loss": 0.9565, "step": 3711 }, { "epoch": 1.9932552598906077, "grad_norm": 0.66015625, "learning_rate": 3.2869853678898974e-05, "loss": 0.8002, "step": 3712 }, { "epoch": 1.9937921546256838, "grad_norm": 0.75390625, "learning_rate": 3.286183743975049e-05, "loss": 0.9041, "step": 3713 }, { "epoch": 1.9943290493607597, "grad_norm": 0.71875, "learning_rate": 3.2853820303466794e-05, "loss": 0.9013, "step": 3714 }, { "epoch": 1.9948659440958356, "grad_norm": 0.671875, "learning_rate": 3.284580227096273e-05, "loss": 0.8556, "step": 3715 }, { "epoch": 1.9954028388309117, "grad_norm": 0.76953125, "learning_rate": 3.283778334315328e-05, "loss": 0.8141, "step": 3716 }, { "epoch": 1.9959397335659879, "grad_norm": 0.65625, "learning_rate": 3.2829763520953486e-05, "loss": 0.7905, "step": 3717 }, { "epoch": 1.9964766283010638, "grad_norm": 0.6796875, "learning_rate": 3.2821742805278525e-05, "loss": 0.9433, "step": 3718 }, { "epoch": 1.9970135230361397, "grad_norm": 0.984375, "learning_rate": 3.281372119704366e-05, "loss": 0.9418, "step": 3719 }, { "epoch": 1.9975504177712158, "grad_norm": 0.71484375, "learning_rate": 3.2805698697164234e-05, "loss": 0.8866, "step": 3720 }, { "epoch": 1.998087312506292, "grad_norm": 0.6484375, "learning_rate": 3.279767530655575e-05, "loss": 0.7644, "step": 3721 }, { "epoch": 1.9986242072413676, "grad_norm": 0.65234375, "learning_rate": 3.278965102613376e-05, "loss": 0.7099, "step": 3722 }, { "epoch": 1.9991611019764437, "grad_norm": 0.625, "learning_rate": 3.278162585681394e-05, "loss": 0.8385, "step": 3723 }, { "epoch": 1.9996979967115198, "grad_norm": 0.82421875, "learning_rate": 3.2773599799512066e-05, "loss": 0.7835, "step": 3724 }, { "epoch": 2.0, "grad_norm": 0.46875, "learning_rate": 3.2765572855144006e-05, "loss": 0.5844, "step": 3725 }, { "epoch": 2.000536894735076, "grad_norm": 0.61328125, "learning_rate": 3.2757545024625744e-05, "loss": 0.896, "step": 3726 }, { "epoch": 2.001073789470152, "grad_norm": 0.7578125, "learning_rate": 3.274951630887335e-05, "loss": 0.8918, "step": 3727 }, { "epoch": 2.001610684205228, "grad_norm": 0.75390625, "learning_rate": 3.2741486708803005e-05, "loss": 0.7393, "step": 3728 }, { "epoch": 2.002147578940304, "grad_norm": 0.703125, "learning_rate": 3.273345622533099e-05, "loss": 0.937, "step": 3729 }, { "epoch": 2.00268447367538, "grad_norm": 0.65625, "learning_rate": 3.272542485937369e-05, "loss": 0.7374, "step": 3730 }, { "epoch": 2.003221368410456, "grad_norm": 0.7578125, "learning_rate": 3.271739261184757e-05, "loss": 0.8095, "step": 3731 }, { "epoch": 2.003758263145532, "grad_norm": 0.671875, "learning_rate": 3.2709359483669227e-05, "loss": 0.7696, "step": 3732 }, { "epoch": 2.004295157880608, "grad_norm": 0.7421875, "learning_rate": 3.2701325475755326e-05, "loss": 0.8414, "step": 3733 }, { "epoch": 2.004832052615684, "grad_norm": 0.640625, "learning_rate": 3.269329058902266e-05, "loss": 0.7175, "step": 3734 }, { "epoch": 2.00536894735076, "grad_norm": 0.66015625, "learning_rate": 3.268525482438813e-05, "loss": 0.9309, "step": 3735 }, { "epoch": 2.005905842085836, "grad_norm": 0.58984375, "learning_rate": 3.267721818276868e-05, "loss": 0.8925, "step": 3736 }, { "epoch": 2.006442736820912, "grad_norm": 0.68359375, "learning_rate": 3.2669180665081414e-05, "loss": 0.9805, "step": 3737 }, { "epoch": 2.0069796315559882, "grad_norm": 0.69921875, "learning_rate": 3.266114227224351e-05, "loss": 0.8372, "step": 3738 }, { "epoch": 2.007516526291064, "grad_norm": 0.9140625, "learning_rate": 3.2653103005172257e-05, "loss": 0.7892, "step": 3739 }, { "epoch": 2.00805342102614, "grad_norm": 0.58203125, "learning_rate": 3.264506286478503e-05, "loss": 0.6366, "step": 3740 }, { "epoch": 2.008590315761216, "grad_norm": 0.703125, "learning_rate": 3.2637021851999315e-05, "loss": 1.0804, "step": 3741 }, { "epoch": 2.0091272104962923, "grad_norm": 0.72265625, "learning_rate": 3.262897996773269e-05, "loss": 0.9327, "step": 3742 }, { "epoch": 2.009664105231368, "grad_norm": 0.7578125, "learning_rate": 3.2620937212902835e-05, "loss": 0.9353, "step": 3743 }, { "epoch": 2.010200999966444, "grad_norm": 0.85546875, "learning_rate": 3.2612893588427536e-05, "loss": 1.0168, "step": 3744 }, { "epoch": 2.01073789470152, "grad_norm": 0.7421875, "learning_rate": 3.260484909522467e-05, "loss": 0.895, "step": 3745 }, { "epoch": 2.011274789436596, "grad_norm": 0.6171875, "learning_rate": 3.259680373421221e-05, "loss": 0.8307, "step": 3746 }, { "epoch": 2.011811684171672, "grad_norm": 0.62890625, "learning_rate": 3.2588757506308236e-05, "loss": 0.7808, "step": 3747 }, { "epoch": 2.012348578906748, "grad_norm": 0.94140625, "learning_rate": 3.258071041243094e-05, "loss": 1.1381, "step": 3748 }, { "epoch": 2.0128854736418242, "grad_norm": 0.6640625, "learning_rate": 3.257266245349858e-05, "loss": 0.7828, "step": 3749 }, { "epoch": 2.0134223683769, "grad_norm": 0.8125, "learning_rate": 3.256461363042952e-05, "loss": 0.8813, "step": 3750 }, { "epoch": 2.0134223683769, "eval_loss": 0.905744194984436, "eval_runtime": 46.622, "eval_samples_per_second": 20.634, "eval_steps_per_second": 20.634, "step": 3750 }, { "epoch": 2.013959263111976, "grad_norm": 0.6171875, "learning_rate": 3.255656394414226e-05, "loss": 0.7973, "step": 3751 }, { "epoch": 2.014496157847052, "grad_norm": 0.765625, "learning_rate": 3.254851339555535e-05, "loss": 0.8319, "step": 3752 }, { "epoch": 2.0150330525821283, "grad_norm": 0.5390625, "learning_rate": 3.254046198558748e-05, "loss": 0.5987, "step": 3753 }, { "epoch": 2.015569947317204, "grad_norm": 0.56640625, "learning_rate": 3.253240971515739e-05, "loss": 0.745, "step": 3754 }, { "epoch": 2.01610684205228, "grad_norm": 0.75390625, "learning_rate": 3.252435658518397e-05, "loss": 0.775, "step": 3755 }, { "epoch": 2.016643736787356, "grad_norm": 0.70703125, "learning_rate": 3.251630259658618e-05, "loss": 0.8454, "step": 3756 }, { "epoch": 2.0171806315224323, "grad_norm": 0.734375, "learning_rate": 3.2508247750283065e-05, "loss": 0.7807, "step": 3757 }, { "epoch": 2.017717526257508, "grad_norm": 0.62109375, "learning_rate": 3.25001920471938e-05, "loss": 0.6699, "step": 3758 }, { "epoch": 2.018254420992584, "grad_norm": 0.73828125, "learning_rate": 3.249213548823765e-05, "loss": 0.7912, "step": 3759 }, { "epoch": 2.0187913157276602, "grad_norm": 0.7265625, "learning_rate": 3.2484078074333954e-05, "loss": 0.8884, "step": 3760 }, { "epoch": 2.0193282104627364, "grad_norm": 0.75, "learning_rate": 3.247601980640217e-05, "loss": 0.8316, "step": 3761 }, { "epoch": 2.019865105197812, "grad_norm": 0.64453125, "learning_rate": 3.2467960685361854e-05, "loss": 0.8054, "step": 3762 }, { "epoch": 2.020401999932888, "grad_norm": 0.6953125, "learning_rate": 3.245990071213265e-05, "loss": 0.7922, "step": 3763 }, { "epoch": 2.0209388946679643, "grad_norm": 0.69140625, "learning_rate": 3.245183988763429e-05, "loss": 0.7621, "step": 3764 }, { "epoch": 2.02147578940304, "grad_norm": 0.70703125, "learning_rate": 3.2443778212786646e-05, "loss": 0.8941, "step": 3765 }, { "epoch": 2.022012684138116, "grad_norm": 0.64453125, "learning_rate": 3.2435715688509626e-05, "loss": 0.8907, "step": 3766 }, { "epoch": 2.022549578873192, "grad_norm": 0.69921875, "learning_rate": 3.242765231572329e-05, "loss": 0.8796, "step": 3767 }, { "epoch": 2.0230864736082683, "grad_norm": 0.68359375, "learning_rate": 3.241958809534776e-05, "loss": 0.7591, "step": 3768 }, { "epoch": 2.023623368343344, "grad_norm": 0.828125, "learning_rate": 3.241152302830326e-05, "loss": 0.8742, "step": 3769 }, { "epoch": 2.02416026307842, "grad_norm": 0.82421875, "learning_rate": 3.240345711551014e-05, "loss": 1.0462, "step": 3770 }, { "epoch": 2.0246971578134962, "grad_norm": 0.6640625, "learning_rate": 3.2395390357888785e-05, "loss": 0.9092, "step": 3771 }, { "epoch": 2.0252340525485724, "grad_norm": 0.5546875, "learning_rate": 3.2387322756359745e-05, "loss": 0.614, "step": 3772 }, { "epoch": 2.025770947283648, "grad_norm": 0.6328125, "learning_rate": 3.2379254311843634e-05, "loss": 0.7836, "step": 3773 }, { "epoch": 2.026307842018724, "grad_norm": 0.6171875, "learning_rate": 3.237118502526114e-05, "loss": 0.8289, "step": 3774 }, { "epoch": 2.0268447367538003, "grad_norm": 0.73828125, "learning_rate": 3.236311489753309e-05, "loss": 0.8063, "step": 3775 }, { "epoch": 2.0273816314888764, "grad_norm": 0.6484375, "learning_rate": 3.235504392958039e-05, "loss": 0.7739, "step": 3776 }, { "epoch": 2.027918526223952, "grad_norm": 0.7109375, "learning_rate": 3.234697212232403e-05, "loss": 0.7872, "step": 3777 }, { "epoch": 2.028455420959028, "grad_norm": 0.734375, "learning_rate": 3.233889947668511e-05, "loss": 0.8459, "step": 3778 }, { "epoch": 2.0289923156941043, "grad_norm": 0.6875, "learning_rate": 3.2330825993584827e-05, "loss": 0.7905, "step": 3779 }, { "epoch": 2.0295292104291804, "grad_norm": 0.59375, "learning_rate": 3.232275167394444e-05, "loss": 0.952, "step": 3780 }, { "epoch": 2.030066105164256, "grad_norm": 0.62890625, "learning_rate": 3.2314676518685374e-05, "loss": 0.9142, "step": 3781 }, { "epoch": 2.0306029998993322, "grad_norm": 0.6015625, "learning_rate": 3.230660052872907e-05, "loss": 0.8182, "step": 3782 }, { "epoch": 2.0311398946344084, "grad_norm": 0.7890625, "learning_rate": 3.229852370499712e-05, "loss": 0.8295, "step": 3783 }, { "epoch": 2.031676789369484, "grad_norm": 0.62890625, "learning_rate": 3.229044604841119e-05, "loss": 0.8785, "step": 3784 }, { "epoch": 2.03221368410456, "grad_norm": 0.57421875, "learning_rate": 3.228236755989303e-05, "loss": 0.7369, "step": 3785 }, { "epoch": 2.0327505788396363, "grad_norm": 0.65625, "learning_rate": 3.227428824036452e-05, "loss": 0.7634, "step": 3786 }, { "epoch": 2.0332874735747124, "grad_norm": 0.6015625, "learning_rate": 3.2266208090747594e-05, "loss": 0.776, "step": 3787 }, { "epoch": 2.033824368309788, "grad_norm": 0.703125, "learning_rate": 3.22581271119643e-05, "loss": 0.8237, "step": 3788 }, { "epoch": 2.034361263044864, "grad_norm": 0.703125, "learning_rate": 3.2250045304936795e-05, "loss": 0.7268, "step": 3789 }, { "epoch": 2.0348981577799403, "grad_norm": 0.6171875, "learning_rate": 3.2241962670587314e-05, "loss": 0.7474, "step": 3790 }, { "epoch": 2.0354350525150164, "grad_norm": 0.5859375, "learning_rate": 3.223387920983816e-05, "loss": 0.7071, "step": 3791 }, { "epoch": 2.035971947250092, "grad_norm": 0.609375, "learning_rate": 3.222579492361179e-05, "loss": 0.7037, "step": 3792 }, { "epoch": 2.0365088419851682, "grad_norm": 0.66796875, "learning_rate": 3.221770981283071e-05, "loss": 0.9173, "step": 3793 }, { "epoch": 2.0370457367202444, "grad_norm": 0.6015625, "learning_rate": 3.220962387841754e-05, "loss": 0.8577, "step": 3794 }, { "epoch": 2.0375826314553205, "grad_norm": 0.67578125, "learning_rate": 3.220153712129498e-05, "loss": 1.0043, "step": 3795 }, { "epoch": 2.038119526190396, "grad_norm": 0.6640625, "learning_rate": 3.2193449542385835e-05, "loss": 0.926, "step": 3796 }, { "epoch": 2.0386564209254723, "grad_norm": 0.6484375, "learning_rate": 3.218536114261299e-05, "loss": 0.8775, "step": 3797 }, { "epoch": 2.0391933156605484, "grad_norm": 0.625, "learning_rate": 3.217727192289946e-05, "loss": 0.7468, "step": 3798 }, { "epoch": 2.0397302103956245, "grad_norm": 0.71875, "learning_rate": 3.21691818841683e-05, "loss": 0.8912, "step": 3799 }, { "epoch": 2.0402671051307, "grad_norm": 0.6875, "learning_rate": 3.216109102734269e-05, "loss": 1.1117, "step": 3800 }, { "epoch": 2.0402671051307, "eval_loss": 0.906833827495575, "eval_runtime": 46.2156, "eval_samples_per_second": 20.815, "eval_steps_per_second": 20.815, "step": 3800 }, { "epoch": 2.0408039998657763, "grad_norm": 0.68359375, "learning_rate": 3.215299935334592e-05, "loss": 0.7149, "step": 3801 }, { "epoch": 2.0413408946008524, "grad_norm": 0.66015625, "learning_rate": 3.214490686310133e-05, "loss": 0.7853, "step": 3802 }, { "epoch": 2.041877789335928, "grad_norm": 0.7578125, "learning_rate": 3.213681355753239e-05, "loss": 0.8345, "step": 3803 }, { "epoch": 2.0424146840710042, "grad_norm": 0.69140625, "learning_rate": 3.2128719437562646e-05, "loss": 0.944, "step": 3804 }, { "epoch": 2.0429515788060804, "grad_norm": 0.6953125, "learning_rate": 3.212062450411572e-05, "loss": 0.8661, "step": 3805 }, { "epoch": 2.0434884735411565, "grad_norm": 0.69921875, "learning_rate": 3.211252875811538e-05, "loss": 1.0284, "step": 3806 }, { "epoch": 2.044025368276232, "grad_norm": 0.62109375, "learning_rate": 3.2104432200485426e-05, "loss": 0.775, "step": 3807 }, { "epoch": 2.0445622630113083, "grad_norm": 0.671875, "learning_rate": 3.2096334832149785e-05, "loss": 0.8019, "step": 3808 }, { "epoch": 2.0450991577463844, "grad_norm": 0.6640625, "learning_rate": 3.208823665403248e-05, "loss": 0.742, "step": 3809 }, { "epoch": 2.0456360524814605, "grad_norm": 0.703125, "learning_rate": 3.2080137667057595e-05, "loss": 0.7986, "step": 3810 }, { "epoch": 2.046172947216536, "grad_norm": 0.66015625, "learning_rate": 3.207203787214936e-05, "loss": 0.7775, "step": 3811 }, { "epoch": 2.0467098419516123, "grad_norm": 0.703125, "learning_rate": 3.206393727023203e-05, "loss": 0.8301, "step": 3812 }, { "epoch": 2.0472467366866884, "grad_norm": 0.63671875, "learning_rate": 3.205583586223e-05, "loss": 0.8504, "step": 3813 }, { "epoch": 2.0477836314217646, "grad_norm": 0.6953125, "learning_rate": 3.2047733649067746e-05, "loss": 0.7674, "step": 3814 }, { "epoch": 2.0483205261568402, "grad_norm": 0.671875, "learning_rate": 3.203963063166983e-05, "loss": 0.8156, "step": 3815 }, { "epoch": 2.0488574208919164, "grad_norm": 0.7890625, "learning_rate": 3.203152681096091e-05, "loss": 1.0049, "step": 3816 }, { "epoch": 2.0493943156269925, "grad_norm": 0.640625, "learning_rate": 3.202342218786574e-05, "loss": 0.7347, "step": 3817 }, { "epoch": 2.0499312103620686, "grad_norm": 0.703125, "learning_rate": 3.201531676330914e-05, "loss": 0.9347, "step": 3818 }, { "epoch": 2.0504681050971443, "grad_norm": 0.703125, "learning_rate": 3.2007210538216066e-05, "loss": 0.8745, "step": 3819 }, { "epoch": 2.0510049998322204, "grad_norm": 0.6484375, "learning_rate": 3.199910351351153e-05, "loss": 0.8997, "step": 3820 }, { "epoch": 2.0515418945672965, "grad_norm": 0.703125, "learning_rate": 3.1990995690120635e-05, "loss": 0.6886, "step": 3821 }, { "epoch": 2.052078789302372, "grad_norm": 0.57421875, "learning_rate": 3.19828870689686e-05, "loss": 0.6798, "step": 3822 }, { "epoch": 2.0526156840374483, "grad_norm": 0.71484375, "learning_rate": 3.1974777650980735e-05, "loss": 0.8744, "step": 3823 }, { "epoch": 2.0531525787725244, "grad_norm": 0.62109375, "learning_rate": 3.19666674370824e-05, "loss": 0.7508, "step": 3824 }, { "epoch": 2.0536894735076006, "grad_norm": 0.6875, "learning_rate": 3.195855642819909e-05, "loss": 0.9087, "step": 3825 }, { "epoch": 2.0542263682426762, "grad_norm": 0.72265625, "learning_rate": 3.195044462525636e-05, "loss": 1.0194, "step": 3826 }, { "epoch": 2.0547632629777524, "grad_norm": 0.6953125, "learning_rate": 3.194233202917989e-05, "loss": 0.9374, "step": 3827 }, { "epoch": 2.0553001577128285, "grad_norm": 0.8515625, "learning_rate": 3.193421864089541e-05, "loss": 0.9167, "step": 3828 }, { "epoch": 2.0558370524479046, "grad_norm": 0.6484375, "learning_rate": 3.1926104461328766e-05, "loss": 0.9575, "step": 3829 }, { "epoch": 2.0563739471829803, "grad_norm": 0.5859375, "learning_rate": 3.19179894914059e-05, "loss": 0.8427, "step": 3830 }, { "epoch": 2.0569108419180564, "grad_norm": 0.703125, "learning_rate": 3.1909873732052816e-05, "loss": 0.9534, "step": 3831 }, { "epoch": 2.0574477366531325, "grad_norm": 0.76953125, "learning_rate": 3.190175718419564e-05, "loss": 0.9799, "step": 3832 }, { "epoch": 2.0579846313882086, "grad_norm": 0.765625, "learning_rate": 3.189363984876056e-05, "loss": 0.7339, "step": 3833 }, { "epoch": 2.0585215261232843, "grad_norm": 0.625, "learning_rate": 3.188552172667387e-05, "loss": 0.9575, "step": 3834 }, { "epoch": 2.0590584208583604, "grad_norm": 0.69921875, "learning_rate": 3.187740281886195e-05, "loss": 0.783, "step": 3835 }, { "epoch": 2.0595953155934366, "grad_norm": 0.6953125, "learning_rate": 3.186928312625129e-05, "loss": 0.8004, "step": 3836 }, { "epoch": 2.0601322103285127, "grad_norm": 0.77734375, "learning_rate": 3.1861162649768415e-05, "loss": 1.0324, "step": 3837 }, { "epoch": 2.0606691050635884, "grad_norm": 0.78125, "learning_rate": 3.185304139034e-05, "loss": 0.968, "step": 3838 }, { "epoch": 2.0612059997986645, "grad_norm": 0.75, "learning_rate": 3.184491934889278e-05, "loss": 0.8289, "step": 3839 }, { "epoch": 2.0617428945337406, "grad_norm": 0.69140625, "learning_rate": 3.183679652635357e-05, "loss": 0.8071, "step": 3840 }, { "epoch": 2.0622797892688163, "grad_norm": 0.83203125, "learning_rate": 3.182867292364929e-05, "loss": 0.8421, "step": 3841 }, { "epoch": 2.0628166840038924, "grad_norm": 0.66796875, "learning_rate": 3.182054854170697e-05, "loss": 0.8974, "step": 3842 }, { "epoch": 2.0633535787389685, "grad_norm": 1.1484375, "learning_rate": 3.181242338145367e-05, "loss": 0.9089, "step": 3843 }, { "epoch": 2.0638904734740446, "grad_norm": 0.6640625, "learning_rate": 3.18042974438166e-05, "loss": 0.8187, "step": 3844 }, { "epoch": 2.0644273682091203, "grad_norm": 0.71875, "learning_rate": 3.179617072972302e-05, "loss": 0.721, "step": 3845 }, { "epoch": 2.0649642629441964, "grad_norm": 0.734375, "learning_rate": 3.178804324010028e-05, "loss": 0.8762, "step": 3846 }, { "epoch": 2.0655011576792726, "grad_norm": 0.58984375, "learning_rate": 3.1779914975875856e-05, "loss": 0.795, "step": 3847 }, { "epoch": 2.0660380524143487, "grad_norm": 0.71484375, "learning_rate": 3.1771785937977274e-05, "loss": 0.8996, "step": 3848 }, { "epoch": 2.0665749471494244, "grad_norm": 0.7734375, "learning_rate": 3.176365612733216e-05, "loss": 0.7479, "step": 3849 }, { "epoch": 2.0671118418845005, "grad_norm": 0.578125, "learning_rate": 3.175552554486822e-05, "loss": 0.766, "step": 3850 }, { "epoch": 2.0671118418845005, "eval_loss": 0.9062168598175049, "eval_runtime": 46.226, "eval_samples_per_second": 20.811, "eval_steps_per_second": 20.811, "step": 3850 }, { "epoch": 2.0676487366195766, "grad_norm": 0.6640625, "learning_rate": 3.174739419151326e-05, "loss": 0.9449, "step": 3851 }, { "epoch": 2.0681856313546527, "grad_norm": 0.63671875, "learning_rate": 3.173926206819518e-05, "loss": 0.8738, "step": 3852 }, { "epoch": 2.0687225260897284, "grad_norm": 0.7421875, "learning_rate": 3.173112917584196e-05, "loss": 0.937, "step": 3853 }, { "epoch": 2.0692594208248045, "grad_norm": 0.80078125, "learning_rate": 3.172299551538164e-05, "loss": 0.9554, "step": 3854 }, { "epoch": 2.0697963155598806, "grad_norm": 0.56640625, "learning_rate": 3.171486108774241e-05, "loss": 0.7058, "step": 3855 }, { "epoch": 2.0703332102949563, "grad_norm": 0.765625, "learning_rate": 3.1706725893852484e-05, "loss": 0.9259, "step": 3856 }, { "epoch": 2.0708701050300324, "grad_norm": 0.67578125, "learning_rate": 3.16985899346402e-05, "loss": 0.8209, "step": 3857 }, { "epoch": 2.0714069997651086, "grad_norm": 0.67578125, "learning_rate": 3.169045321103398e-05, "loss": 1.0139, "step": 3858 }, { "epoch": 2.0719438945001847, "grad_norm": 0.640625, "learning_rate": 3.168231572396231e-05, "loss": 0.824, "step": 3859 }, { "epoch": 2.0724807892352604, "grad_norm": 0.73046875, "learning_rate": 3.167417747435379e-05, "loss": 0.9313, "step": 3860 }, { "epoch": 2.0730176839703365, "grad_norm": 0.66796875, "learning_rate": 3.166603846313711e-05, "loss": 0.9802, "step": 3861 }, { "epoch": 2.0735545787054126, "grad_norm": 0.83984375, "learning_rate": 3.165789869124102e-05, "loss": 0.7642, "step": 3862 }, { "epoch": 2.0740914734404887, "grad_norm": 0.703125, "learning_rate": 3.1649758159594365e-05, "loss": 0.8664, "step": 3863 }, { "epoch": 2.0746283681755644, "grad_norm": 0.72265625, "learning_rate": 3.1641616869126103e-05, "loss": 0.952, "step": 3864 }, { "epoch": 2.0751652629106405, "grad_norm": 0.65625, "learning_rate": 3.163347482076524e-05, "loss": 1.0794, "step": 3865 }, { "epoch": 2.0757021576457166, "grad_norm": 0.765625, "learning_rate": 3.16253320154409e-05, "loss": 0.8358, "step": 3866 }, { "epoch": 2.0762390523807928, "grad_norm": 0.68359375, "learning_rate": 3.161718845408227e-05, "loss": 0.9434, "step": 3867 }, { "epoch": 2.0767759471158684, "grad_norm": 0.64453125, "learning_rate": 3.160904413761863e-05, "loss": 0.6919, "step": 3868 }, { "epoch": 2.0773128418509446, "grad_norm": 0.66796875, "learning_rate": 3.1600899066979376e-05, "loss": 0.8196, "step": 3869 }, { "epoch": 2.0778497365860207, "grad_norm": 0.671875, "learning_rate": 3.159275324309393e-05, "loss": 0.8148, "step": 3870 }, { "epoch": 2.078386631321097, "grad_norm": 0.72265625, "learning_rate": 3.158460666689185e-05, "loss": 1.1962, "step": 3871 }, { "epoch": 2.0789235260561725, "grad_norm": 0.671875, "learning_rate": 3.1576459339302763e-05, "loss": 0.8825, "step": 3872 }, { "epoch": 2.0794604207912486, "grad_norm": 0.7265625, "learning_rate": 3.156831126125638e-05, "loss": 0.9035, "step": 3873 }, { "epoch": 2.0799973155263247, "grad_norm": 0.69140625, "learning_rate": 3.156016243368251e-05, "loss": 0.9358, "step": 3874 }, { "epoch": 2.0805342102614004, "grad_norm": 0.78515625, "learning_rate": 3.1552012857511014e-05, "loss": 0.8911, "step": 3875 }, { "epoch": 2.0810711049964765, "grad_norm": 0.81640625, "learning_rate": 3.1543862533671886e-05, "loss": 1.2351, "step": 3876 }, { "epoch": 2.0816079997315526, "grad_norm": 0.671875, "learning_rate": 3.1535711463095166e-05, "loss": 0.8813, "step": 3877 }, { "epoch": 2.0821448944666288, "grad_norm": 0.65234375, "learning_rate": 3.152755964671101e-05, "loss": 0.8181, "step": 3878 }, { "epoch": 2.0826817892017044, "grad_norm": 0.6015625, "learning_rate": 3.1519407085449627e-05, "loss": 0.819, "step": 3879 }, { "epoch": 2.0832186839367806, "grad_norm": 0.73046875, "learning_rate": 3.151125378024133e-05, "loss": 0.8729, "step": 3880 }, { "epoch": 2.0837555786718567, "grad_norm": 0.734375, "learning_rate": 3.150309973201653e-05, "loss": 0.872, "step": 3881 }, { "epoch": 2.084292473406933, "grad_norm": 0.546875, "learning_rate": 3.149494494170569e-05, "loss": 0.6089, "step": 3882 }, { "epoch": 2.0848293681420085, "grad_norm": 0.67578125, "learning_rate": 3.1486789410239376e-05, "loss": 0.8081, "step": 3883 }, { "epoch": 2.0853662628770846, "grad_norm": 0.76171875, "learning_rate": 3.1478633138548235e-05, "loss": 1.0491, "step": 3884 }, { "epoch": 2.0859031576121607, "grad_norm": 0.7421875, "learning_rate": 3.147047612756302e-05, "loss": 0.9228, "step": 3885 }, { "epoch": 2.086440052347237, "grad_norm": 0.76953125, "learning_rate": 3.146231837821454e-05, "loss": 0.8107, "step": 3886 }, { "epoch": 2.0869769470823125, "grad_norm": 0.671875, "learning_rate": 3.1454159891433685e-05, "loss": 0.9009, "step": 3887 }, { "epoch": 2.0875138418173886, "grad_norm": 0.6875, "learning_rate": 3.144600066815145e-05, "loss": 0.9105, "step": 3888 }, { "epoch": 2.0880507365524648, "grad_norm": 0.69140625, "learning_rate": 3.1437840709298914e-05, "loss": 0.9284, "step": 3889 }, { "epoch": 2.088587631287541, "grad_norm": 0.6328125, "learning_rate": 3.142968001580722e-05, "loss": 0.7902, "step": 3890 }, { "epoch": 2.0891245260226166, "grad_norm": 0.734375, "learning_rate": 3.142151858860761e-05, "loss": 0.9536, "step": 3891 }, { "epoch": 2.0896614207576927, "grad_norm": 0.63671875, "learning_rate": 3.1413356428631414e-05, "loss": 1.0451, "step": 3892 }, { "epoch": 2.090198315492769, "grad_norm": 0.73046875, "learning_rate": 3.140519353681003e-05, "loss": 0.8377, "step": 3893 }, { "epoch": 2.0907352102278445, "grad_norm": 0.765625, "learning_rate": 3.139702991407494e-05, "loss": 0.7791, "step": 3894 }, { "epoch": 2.0912721049629206, "grad_norm": 0.6328125, "learning_rate": 3.138886556135773e-05, "loss": 0.7932, "step": 3895 }, { "epoch": 2.0918089996979967, "grad_norm": 0.71484375, "learning_rate": 3.138070047959005e-05, "loss": 0.775, "step": 3896 }, { "epoch": 2.092345894433073, "grad_norm": 0.66796875, "learning_rate": 3.137253466970364e-05, "loss": 0.7762, "step": 3897 }, { "epoch": 2.0928827891681485, "grad_norm": 0.69921875, "learning_rate": 3.1364368132630316e-05, "loss": 0.932, "step": 3898 }, { "epoch": 2.0934196839032246, "grad_norm": 0.73046875, "learning_rate": 3.1356200869302e-05, "loss": 0.8704, "step": 3899 }, { "epoch": 2.0939565786383008, "grad_norm": 0.7265625, "learning_rate": 3.1348032880650675e-05, "loss": 0.8469, "step": 3900 }, { "epoch": 2.0939565786383008, "eval_loss": 0.9066072702407837, "eval_runtime": 46.1752, "eval_samples_per_second": 20.834, "eval_steps_per_second": 20.834, "step": 3900 }, { "epoch": 2.094493473373377, "grad_norm": 0.75, "learning_rate": 3.133986416760839e-05, "loss": 0.9143, "step": 3901 }, { "epoch": 2.0950303681084526, "grad_norm": 0.84765625, "learning_rate": 3.133169473110733e-05, "loss": 0.7819, "step": 3902 }, { "epoch": 2.0955672628435287, "grad_norm": 0.61328125, "learning_rate": 3.132352457207971e-05, "loss": 0.9131, "step": 3903 }, { "epoch": 2.096104157578605, "grad_norm": 0.71875, "learning_rate": 3.1315353691457864e-05, "loss": 0.8014, "step": 3904 }, { "epoch": 2.096641052313681, "grad_norm": 0.60546875, "learning_rate": 3.130718209017417e-05, "loss": 0.7388, "step": 3905 }, { "epoch": 2.0971779470487566, "grad_norm": 0.6640625, "learning_rate": 3.1299009769161133e-05, "loss": 0.6695, "step": 3906 }, { "epoch": 2.0977148417838327, "grad_norm": 0.80078125, "learning_rate": 3.129083672935131e-05, "loss": 0.863, "step": 3907 }, { "epoch": 2.098251736518909, "grad_norm": 0.8515625, "learning_rate": 3.1282662971677356e-05, "loss": 1.1562, "step": 3908 }, { "epoch": 2.098788631253985, "grad_norm": 0.79296875, "learning_rate": 3.127448849707198e-05, "loss": 0.9662, "step": 3909 }, { "epoch": 2.0993255259890606, "grad_norm": 0.69140625, "learning_rate": 3.126631330646802e-05, "loss": 0.8017, "step": 3910 }, { "epoch": 2.0998624207241368, "grad_norm": 0.82421875, "learning_rate": 3.1258137400798345e-05, "loss": 0.7675, "step": 3911 }, { "epoch": 2.100399315459213, "grad_norm": 0.59375, "learning_rate": 3.1249960780995946e-05, "loss": 0.7322, "step": 3912 }, { "epoch": 2.1009362101942886, "grad_norm": 0.67578125, "learning_rate": 3.1241783447993865e-05, "loss": 0.8878, "step": 3913 }, { "epoch": 2.1014731049293647, "grad_norm": 0.56640625, "learning_rate": 3.123360540272525e-05, "loss": 0.7652, "step": 3914 }, { "epoch": 2.102009999664441, "grad_norm": 0.734375, "learning_rate": 3.122542664612332e-05, "loss": 0.7856, "step": 3915 }, { "epoch": 2.102546894399517, "grad_norm": 0.78125, "learning_rate": 3.121724717912138e-05, "loss": 0.8477, "step": 3916 }, { "epoch": 2.1030837891345926, "grad_norm": 0.71875, "learning_rate": 3.120906700265278e-05, "loss": 0.7319, "step": 3917 }, { "epoch": 2.1036206838696687, "grad_norm": 0.59375, "learning_rate": 3.1200886117651016e-05, "loss": 0.7902, "step": 3918 }, { "epoch": 2.104157578604745, "grad_norm": 0.6875, "learning_rate": 3.119270452504962e-05, "loss": 0.8681, "step": 3919 }, { "epoch": 2.104694473339821, "grad_norm": 0.91015625, "learning_rate": 3.118452222578221e-05, "loss": 1.0723, "step": 3920 }, { "epoch": 2.1052313680748966, "grad_norm": 0.7265625, "learning_rate": 3.1176339220782494e-05, "loss": 0.898, "step": 3921 }, { "epoch": 2.1057682628099728, "grad_norm": 0.6171875, "learning_rate": 3.1168155510984255e-05, "loss": 0.8005, "step": 3922 }, { "epoch": 2.106305157545049, "grad_norm": 0.7734375, "learning_rate": 3.1159971097321355e-05, "loss": 0.9795, "step": 3923 }, { "epoch": 2.106842052280125, "grad_norm": 0.73828125, "learning_rate": 3.1151785980727755e-05, "loss": 0.9722, "step": 3924 }, { "epoch": 2.1073789470152007, "grad_norm": 0.66015625, "learning_rate": 3.114360016213745e-05, "loss": 0.7907, "step": 3925 }, { "epoch": 2.107915841750277, "grad_norm": 0.75390625, "learning_rate": 3.1135413642484576e-05, "loss": 0.9692, "step": 3926 }, { "epoch": 2.108452736485353, "grad_norm": 0.59375, "learning_rate": 3.112722642270331e-05, "loss": 0.9058, "step": 3927 }, { "epoch": 2.108989631220429, "grad_norm": 0.60546875, "learning_rate": 3.111903850372791e-05, "loss": 0.7444, "step": 3928 }, { "epoch": 2.1095265259555047, "grad_norm": 0.7265625, "learning_rate": 3.1110849886492724e-05, "loss": 0.8687, "step": 3929 }, { "epoch": 2.110063420690581, "grad_norm": 0.84375, "learning_rate": 3.1102660571932184e-05, "loss": 0.7473, "step": 3930 }, { "epoch": 2.110600315425657, "grad_norm": 0.57421875, "learning_rate": 3.109447056098078e-05, "loss": 0.656, "step": 3931 }, { "epoch": 2.1111372101607326, "grad_norm": 0.61328125, "learning_rate": 3.1086279854573116e-05, "loss": 0.979, "step": 3932 }, { "epoch": 2.1116741048958088, "grad_norm": 0.76171875, "learning_rate": 3.1078088453643827e-05, "loss": 0.8921, "step": 3933 }, { "epoch": 2.112210999630885, "grad_norm": 0.671875, "learning_rate": 3.1069896359127686e-05, "loss": 0.7848, "step": 3934 }, { "epoch": 2.112747894365961, "grad_norm": 0.69140625, "learning_rate": 3.10617035719595e-05, "loss": 0.6612, "step": 3935 }, { "epoch": 2.1132847891010367, "grad_norm": 0.64453125, "learning_rate": 3.1053510093074166e-05, "loss": 0.8053, "step": 3936 }, { "epoch": 2.113821683836113, "grad_norm": 0.73828125, "learning_rate": 3.1045315923406676e-05, "loss": 0.8986, "step": 3937 }, { "epoch": 2.114358578571189, "grad_norm": 0.703125, "learning_rate": 3.103712106389208e-05, "loss": 0.9231, "step": 3938 }, { "epoch": 2.114895473306265, "grad_norm": 0.66796875, "learning_rate": 3.1028925515465504e-05, "loss": 0.6654, "step": 3939 }, { "epoch": 2.1154323680413407, "grad_norm": 0.63671875, "learning_rate": 3.102072927906219e-05, "loss": 0.7507, "step": 3940 }, { "epoch": 2.115969262776417, "grad_norm": 0.6796875, "learning_rate": 3.101253235561742e-05, "loss": 0.7998, "step": 3941 }, { "epoch": 2.116506157511493, "grad_norm": 0.63671875, "learning_rate": 3.100433474606656e-05, "loss": 1.0736, "step": 3942 }, { "epoch": 2.117043052246569, "grad_norm": 0.65234375, "learning_rate": 3.099613645134507e-05, "loss": 0.749, "step": 3943 }, { "epoch": 2.1175799469816448, "grad_norm": 0.55859375, "learning_rate": 3.0987937472388475e-05, "loss": 0.7379, "step": 3944 }, { "epoch": 2.118116841716721, "grad_norm": 0.62109375, "learning_rate": 3.097973781013238e-05, "loss": 0.697, "step": 3945 }, { "epoch": 2.118653736451797, "grad_norm": 0.84765625, "learning_rate": 3.097153746551248e-05, "loss": 1.005, "step": 3946 }, { "epoch": 2.119190631186873, "grad_norm": 0.671875, "learning_rate": 3.0963336439464526e-05, "loss": 0.8847, "step": 3947 }, { "epoch": 2.119727525921949, "grad_norm": 0.6953125, "learning_rate": 3.095513473292436e-05, "loss": 0.8669, "step": 3948 }, { "epoch": 2.120264420657025, "grad_norm": 0.625, "learning_rate": 3.094693234682792e-05, "loss": 0.7202, "step": 3949 }, { "epoch": 2.120801315392101, "grad_norm": 0.71875, "learning_rate": 3.0938729282111176e-05, "loss": 0.9628, "step": 3950 }, { "epoch": 2.120801315392101, "eval_loss": 0.906258225440979, "eval_runtime": 46.8303, "eval_samples_per_second": 20.542, "eval_steps_per_second": 20.542, "step": 3950 }, { "epoch": 2.1213382101271767, "grad_norm": 0.625, "learning_rate": 3.093052553971022e-05, "loss": 0.7971, "step": 3951 }, { "epoch": 2.121875104862253, "grad_norm": 0.65234375, "learning_rate": 3.0922321120561186e-05, "loss": 0.8604, "step": 3952 }, { "epoch": 2.122411999597329, "grad_norm": 0.7890625, "learning_rate": 3.091411602560032e-05, "loss": 0.8655, "step": 3953 }, { "epoch": 2.122948894332405, "grad_norm": 0.625, "learning_rate": 3.090591025576392e-05, "loss": 0.7767, "step": 3954 }, { "epoch": 2.1234857890674808, "grad_norm": 0.71484375, "learning_rate": 3.089770381198836e-05, "loss": 0.8675, "step": 3955 }, { "epoch": 2.124022683802557, "grad_norm": 0.62109375, "learning_rate": 3.088949669521011e-05, "loss": 0.833, "step": 3956 }, { "epoch": 2.124559578537633, "grad_norm": 0.73046875, "learning_rate": 3.088128890636571e-05, "loss": 0.8068, "step": 3957 }, { "epoch": 2.125096473272709, "grad_norm": 0.66796875, "learning_rate": 3.087308044639176e-05, "loss": 0.924, "step": 3958 }, { "epoch": 2.125633368007785, "grad_norm": 0.75390625, "learning_rate": 3.0864871316224955e-05, "loss": 0.7982, "step": 3959 }, { "epoch": 2.126170262742861, "grad_norm": 0.6953125, "learning_rate": 3.0856661516802054e-05, "loss": 1.0301, "step": 3960 }, { "epoch": 2.126707157477937, "grad_norm": 0.69921875, "learning_rate": 3.084845104905991e-05, "loss": 0.8342, "step": 3961 }, { "epoch": 2.127244052213013, "grad_norm": 0.76171875, "learning_rate": 3.084023991393544e-05, "loss": 1.0664, "step": 3962 }, { "epoch": 2.127780946948089, "grad_norm": 0.61328125, "learning_rate": 3.083202811236564e-05, "loss": 0.8307, "step": 3963 }, { "epoch": 2.128317841683165, "grad_norm": 0.703125, "learning_rate": 3.082381564528756e-05, "loss": 1.0423, "step": 3964 }, { "epoch": 2.128854736418241, "grad_norm": 0.78515625, "learning_rate": 3.081560251363837e-05, "loss": 1.0167, "step": 3965 }, { "epoch": 2.129391631153317, "grad_norm": 0.65625, "learning_rate": 3.0807388718355286e-05, "loss": 0.8246, "step": 3966 }, { "epoch": 2.129928525888393, "grad_norm": 0.6875, "learning_rate": 3.07991742603756e-05, "loss": 0.7927, "step": 3967 }, { "epoch": 2.130465420623469, "grad_norm": 0.68359375, "learning_rate": 3.07909591406367e-05, "loss": 0.6947, "step": 3968 }, { "epoch": 2.131002315358545, "grad_norm": 0.61328125, "learning_rate": 3.078274336007601e-05, "loss": 0.7996, "step": 3969 }, { "epoch": 2.131539210093621, "grad_norm": 0.6328125, "learning_rate": 3.077452691963108e-05, "loss": 0.8553, "step": 3970 }, { "epoch": 2.132076104828697, "grad_norm": 0.65234375, "learning_rate": 3.076630982023951e-05, "loss": 0.8175, "step": 3971 }, { "epoch": 2.132612999563773, "grad_norm": 0.609375, "learning_rate": 3.075809206283895e-05, "loss": 0.9002, "step": 3972 }, { "epoch": 2.133149894298849, "grad_norm": 0.5546875, "learning_rate": 3.0749873648367164e-05, "loss": 0.6067, "step": 3973 }, { "epoch": 2.133686789033925, "grad_norm": 0.640625, "learning_rate": 3.074165457776199e-05, "loss": 0.7013, "step": 3974 }, { "epoch": 2.134223683769001, "grad_norm": 0.6484375, "learning_rate": 3.073343485196131e-05, "loss": 0.8546, "step": 3975 }, { "epoch": 2.134760578504077, "grad_norm": 0.72265625, "learning_rate": 3.072521447190311e-05, "loss": 0.9153, "step": 3976 }, { "epoch": 2.135297473239153, "grad_norm": 0.6796875, "learning_rate": 3.071699343852542e-05, "loss": 0.8468, "step": 3977 }, { "epoch": 2.135834367974229, "grad_norm": 0.65625, "learning_rate": 3.0708771752766394e-05, "loss": 0.8384, "step": 3978 }, { "epoch": 2.136371262709305, "grad_norm": 0.796875, "learning_rate": 3.070054941556422e-05, "loss": 0.8359, "step": 3979 }, { "epoch": 2.136908157444381, "grad_norm": 0.6328125, "learning_rate": 3.069232642785715e-05, "loss": 0.6869, "step": 3980 }, { "epoch": 2.1374450521794572, "grad_norm": 0.6875, "learning_rate": 3.068410279058355e-05, "loss": 1.1633, "step": 3981 }, { "epoch": 2.137981946914533, "grad_norm": 0.66015625, "learning_rate": 3.067587850468184e-05, "loss": 0.9488, "step": 3982 }, { "epoch": 2.138518841649609, "grad_norm": 0.6484375, "learning_rate": 3.066765357109052e-05, "loss": 0.8224, "step": 3983 }, { "epoch": 2.139055736384685, "grad_norm": 0.78125, "learning_rate": 3.0659427990748154e-05, "loss": 1.083, "step": 3984 }, { "epoch": 2.1395926311197613, "grad_norm": 0.76171875, "learning_rate": 3.065120176459338e-05, "loss": 0.8218, "step": 3985 }, { "epoch": 2.140129525854837, "grad_norm": 0.703125, "learning_rate": 3.0642974893564906e-05, "loss": 0.9032, "step": 3986 }, { "epoch": 2.140666420589913, "grad_norm": 0.671875, "learning_rate": 3.063474737860155e-05, "loss": 0.864, "step": 3987 }, { "epoch": 2.141203315324989, "grad_norm": 0.69140625, "learning_rate": 3.062651922064215e-05, "loss": 0.7869, "step": 3988 }, { "epoch": 2.141740210060065, "grad_norm": 0.6796875, "learning_rate": 3.0618290420625656e-05, "loss": 0.8007, "step": 3989 }, { "epoch": 2.142277104795141, "grad_norm": 0.60546875, "learning_rate": 3.0610060979491076e-05, "loss": 0.8238, "step": 3990 }, { "epoch": 2.142813999530217, "grad_norm": 0.640625, "learning_rate": 3.060183089817749e-05, "loss": 1.0372, "step": 3991 }, { "epoch": 2.1433508942652932, "grad_norm": 0.75, "learning_rate": 3.059360017762407e-05, "loss": 0.8935, "step": 3992 }, { "epoch": 2.143887789000369, "grad_norm": 0.73828125, "learning_rate": 3.058536881877002e-05, "loss": 1.0801, "step": 3993 }, { "epoch": 2.144424683735445, "grad_norm": 0.58984375, "learning_rate": 3.0577136822554645e-05, "loss": 0.8193, "step": 3994 }, { "epoch": 2.144961578470521, "grad_norm": 0.70703125, "learning_rate": 3.0568904189917346e-05, "loss": 0.9261, "step": 3995 }, { "epoch": 2.1454984732055973, "grad_norm": 0.7578125, "learning_rate": 3.056067092179755e-05, "loss": 1.0825, "step": 3996 }, { "epoch": 2.146035367940673, "grad_norm": 0.77734375, "learning_rate": 3.055243701913478e-05, "loss": 1.0628, "step": 3997 }, { "epoch": 2.146572262675749, "grad_norm": 0.6640625, "learning_rate": 3.054420248286863e-05, "loss": 0.6996, "step": 3998 }, { "epoch": 2.147109157410825, "grad_norm": 0.796875, "learning_rate": 3.0535967313938766e-05, "loss": 1.0626, "step": 3999 }, { "epoch": 2.1476460521459013, "grad_norm": 0.765625, "learning_rate": 3.052773151328493e-05, "loss": 0.9167, "step": 4000 }, { "epoch": 2.1476460521459013, "eval_loss": 0.906206488609314, "eval_runtime": 47.0017, "eval_samples_per_second": 20.467, "eval_steps_per_second": 20.467, "step": 4000 }, { "epoch": 2.148182946880977, "grad_norm": 0.59375, "learning_rate": 3.051949508184692e-05, "loss": 0.7831, "step": 4001 }, { "epoch": 2.148719841616053, "grad_norm": 0.70703125, "learning_rate": 3.051125802056462e-05, "loss": 0.8307, "step": 4002 }, { "epoch": 2.1492567363511292, "grad_norm": 0.7421875, "learning_rate": 3.0503020330377985e-05, "loss": 0.9071, "step": 4003 }, { "epoch": 2.1497936310862054, "grad_norm": 0.7421875, "learning_rate": 3.0494782012227048e-05, "loss": 0.8799, "step": 4004 }, { "epoch": 2.150330525821281, "grad_norm": 0.58203125, "learning_rate": 3.048654306705189e-05, "loss": 0.72, "step": 4005 }, { "epoch": 2.150867420556357, "grad_norm": 0.6953125, "learning_rate": 3.047830349579269e-05, "loss": 0.9297, "step": 4006 }, { "epoch": 2.1514043152914333, "grad_norm": 0.8984375, "learning_rate": 3.0470063299389684e-05, "loss": 0.7495, "step": 4007 }, { "epoch": 2.151941210026509, "grad_norm": 0.87890625, "learning_rate": 3.0461822478783185e-05, "loss": 0.8434, "step": 4008 }, { "epoch": 2.152478104761585, "grad_norm": 0.69921875, "learning_rate": 3.045358103491357e-05, "loss": 0.9524, "step": 4009 }, { "epoch": 2.153014999496661, "grad_norm": 0.66796875, "learning_rate": 3.0445338968721287e-05, "loss": 0.8771, "step": 4010 }, { "epoch": 2.1535518942317373, "grad_norm": 0.76171875, "learning_rate": 3.0437096281146877e-05, "loss": 1.0115, "step": 4011 }, { "epoch": 2.154088788966813, "grad_norm": 0.8203125, "learning_rate": 3.0428852973130933e-05, "loss": 1.0401, "step": 4012 }, { "epoch": 2.154625683701889, "grad_norm": 0.73828125, "learning_rate": 3.0420609045614103e-05, "loss": 0.9135, "step": 4013 }, { "epoch": 2.1551625784369652, "grad_norm": 0.75, "learning_rate": 3.0412364499537137e-05, "loss": 0.9513, "step": 4014 }, { "epoch": 2.1556994731720414, "grad_norm": 0.6796875, "learning_rate": 3.0404119335840843e-05, "loss": 0.9136, "step": 4015 }, { "epoch": 2.156236367907117, "grad_norm": 0.64453125, "learning_rate": 3.0395873555466088e-05, "loss": 0.6858, "step": 4016 }, { "epoch": 2.156773262642193, "grad_norm": 0.578125, "learning_rate": 3.038762715935383e-05, "loss": 0.8168, "step": 4017 }, { "epoch": 2.1573101573772693, "grad_norm": 0.73046875, "learning_rate": 3.037938014844508e-05, "loss": 0.9354, "step": 4018 }, { "epoch": 2.1578470521123454, "grad_norm": 0.6328125, "learning_rate": 3.037113252368094e-05, "loss": 0.818, "step": 4019 }, { "epoch": 2.158383946847421, "grad_norm": 0.64453125, "learning_rate": 3.0362884286002558e-05, "loss": 0.841, "step": 4020 }, { "epoch": 2.158920841582497, "grad_norm": 0.78125, "learning_rate": 3.035463543635116e-05, "loss": 1.0041, "step": 4021 }, { "epoch": 2.1594577363175733, "grad_norm": 0.578125, "learning_rate": 3.0346385975668047e-05, "loss": 0.7238, "step": 4022 }, { "epoch": 2.1599946310526494, "grad_norm": 0.6171875, "learning_rate": 3.0338135904894594e-05, "loss": 0.8383, "step": 4023 }, { "epoch": 2.160531525787725, "grad_norm": 0.703125, "learning_rate": 3.032988522497222e-05, "loss": 1.1517, "step": 4024 }, { "epoch": 2.1610684205228012, "grad_norm": 0.59765625, "learning_rate": 3.032163393684246e-05, "loss": 0.7259, "step": 4025 }, { "epoch": 2.1616053152578774, "grad_norm": 0.66796875, "learning_rate": 3.0313382041446858e-05, "loss": 0.7314, "step": 4026 }, { "epoch": 2.162142209992953, "grad_norm": 0.796875, "learning_rate": 3.0305129539727085e-05, "loss": 0.903, "step": 4027 }, { "epoch": 2.162679104728029, "grad_norm": 0.74609375, "learning_rate": 3.0296876432624848e-05, "loss": 0.8733, "step": 4028 }, { "epoch": 2.1632159994631053, "grad_norm": 0.671875, "learning_rate": 3.0288622721081933e-05, "loss": 0.7457, "step": 4029 }, { "epoch": 2.1637528941981814, "grad_norm": 0.82421875, "learning_rate": 3.028036840604019e-05, "loss": 0.9793, "step": 4030 }, { "epoch": 2.164289788933257, "grad_norm": 0.73828125, "learning_rate": 3.027211348844154e-05, "loss": 0.913, "step": 4031 }, { "epoch": 2.164826683668333, "grad_norm": 0.6015625, "learning_rate": 3.0263857969227966e-05, "loss": 0.7289, "step": 4032 }, { "epoch": 2.1653635784034093, "grad_norm": 0.71875, "learning_rate": 3.0255601849341554e-05, "loss": 0.8946, "step": 4033 }, { "epoch": 2.1659004731384854, "grad_norm": 0.7265625, "learning_rate": 3.02473451297244e-05, "loss": 1.0224, "step": 4034 }, { "epoch": 2.166437367873561, "grad_norm": 0.6171875, "learning_rate": 3.0239087811318715e-05, "loss": 0.7468, "step": 4035 }, { "epoch": 2.1669742626086372, "grad_norm": 0.66796875, "learning_rate": 3.023082989506677e-05, "loss": 0.7348, "step": 4036 }, { "epoch": 2.1675111573437134, "grad_norm": 0.6640625, "learning_rate": 3.0222571381910884e-05, "loss": 0.8514, "step": 4037 }, { "epoch": 2.1680480520787895, "grad_norm": 0.65625, "learning_rate": 3.0214312272793472e-05, "loss": 0.8658, "step": 4038 }, { "epoch": 2.168584946813865, "grad_norm": 0.765625, "learning_rate": 3.0206052568656996e-05, "loss": 1.0283, "step": 4039 }, { "epoch": 2.1691218415489413, "grad_norm": 0.6015625, "learning_rate": 3.0197792270443982e-05, "loss": 0.7336, "step": 4040 }, { "epoch": 2.1696587362840174, "grad_norm": 0.61328125, "learning_rate": 3.0189531379097057e-05, "loss": 0.7364, "step": 4041 }, { "epoch": 2.1701956310190935, "grad_norm": 0.65234375, "learning_rate": 3.018126989555888e-05, "loss": 0.774, "step": 4042 }, { "epoch": 2.170732525754169, "grad_norm": 0.6953125, "learning_rate": 3.017300782077218e-05, "loss": 0.8639, "step": 4043 }, { "epoch": 2.1712694204892453, "grad_norm": 0.640625, "learning_rate": 3.016474515567979e-05, "loss": 0.8877, "step": 4044 }, { "epoch": 2.1718063152243214, "grad_norm": 0.75, "learning_rate": 3.015648190122457e-05, "loss": 0.7921, "step": 4045 }, { "epoch": 2.172343209959397, "grad_norm": 0.7265625, "learning_rate": 3.014821805834946e-05, "loss": 0.8884, "step": 4046 }, { "epoch": 2.1728801046944732, "grad_norm": 0.64453125, "learning_rate": 3.0139953627997475e-05, "loss": 0.856, "step": 4047 }, { "epoch": 2.1734169994295494, "grad_norm": 0.75, "learning_rate": 3.0131688611111685e-05, "loss": 0.892, "step": 4048 }, { "epoch": 2.1739538941646255, "grad_norm": 0.70703125, "learning_rate": 3.0123423008635236e-05, "loss": 0.9259, "step": 4049 }, { "epoch": 2.174490788899701, "grad_norm": 0.65625, "learning_rate": 3.011515682151135e-05, "loss": 0.8287, "step": 4050 }, { "epoch": 2.174490788899701, "eval_loss": 0.9057601690292358, "eval_runtime": 46.7747, "eval_samples_per_second": 20.567, "eval_steps_per_second": 20.567, "step": 4050 }, { "epoch": 2.1750276836347773, "grad_norm": 0.70703125, "learning_rate": 3.0106890050683278e-05, "loss": 1.0291, "step": 4051 }, { "epoch": 2.1755645783698534, "grad_norm": 0.7109375, "learning_rate": 3.0098622697094387e-05, "loss": 0.8809, "step": 4052 }, { "epoch": 2.1761014731049295, "grad_norm": 0.62109375, "learning_rate": 3.009035476168808e-05, "loss": 0.7938, "step": 4053 }, { "epoch": 2.176638367840005, "grad_norm": 0.6875, "learning_rate": 3.008208624540782e-05, "loss": 0.8107, "step": 4054 }, { "epoch": 2.1771752625750813, "grad_norm": 0.64453125, "learning_rate": 3.007381714919717e-05, "loss": 0.5469, "step": 4055 }, { "epoch": 2.1777121573101574, "grad_norm": 0.75390625, "learning_rate": 3.0065547473999723e-05, "loss": 1.1463, "step": 4056 }, { "epoch": 2.1782490520452336, "grad_norm": 0.703125, "learning_rate": 3.0057277220759156e-05, "loss": 1.0264, "step": 4057 }, { "epoch": 2.1787859467803092, "grad_norm": 0.73828125, "learning_rate": 3.004900639041922e-05, "loss": 0.775, "step": 4058 }, { "epoch": 2.1793228415153854, "grad_norm": 1.25, "learning_rate": 3.004073498392371e-05, "loss": 0.9768, "step": 4059 }, { "epoch": 2.1798597362504615, "grad_norm": 0.796875, "learning_rate": 3.0032463002216505e-05, "loss": 0.918, "step": 4060 }, { "epoch": 2.1803966309855376, "grad_norm": 0.67578125, "learning_rate": 3.002419044624154e-05, "loss": 0.8603, "step": 4061 }, { "epoch": 2.1809335257206133, "grad_norm": 0.79296875, "learning_rate": 3.001591731694282e-05, "loss": 0.9137, "step": 4062 }, { "epoch": 2.1814704204556894, "grad_norm": 0.60546875, "learning_rate": 3.000764361526441e-05, "loss": 0.8619, "step": 4063 }, { "epoch": 2.1820073151907655, "grad_norm": 0.8125, "learning_rate": 2.9999369342150446e-05, "loss": 0.7769, "step": 4064 }, { "epoch": 2.182544209925841, "grad_norm": 0.84375, "learning_rate": 2.9991094498545126e-05, "loss": 0.8459, "step": 4065 }, { "epoch": 2.1830811046609173, "grad_norm": 0.84765625, "learning_rate": 2.998281908539272e-05, "loss": 0.9062, "step": 4066 }, { "epoch": 2.1836179993959934, "grad_norm": 0.8828125, "learning_rate": 2.9974543103637558e-05, "loss": 0.9375, "step": 4067 }, { "epoch": 2.1841548941310696, "grad_norm": 0.70703125, "learning_rate": 2.9966266554224027e-05, "loss": 0.7708, "step": 4068 }, { "epoch": 2.1846917888661452, "grad_norm": 0.703125, "learning_rate": 2.995798943809659e-05, "loss": 0.9111, "step": 4069 }, { "epoch": 2.1852286836012214, "grad_norm": 0.609375, "learning_rate": 2.9949711756199772e-05, "loss": 0.6897, "step": 4070 }, { "epoch": 2.1857655783362975, "grad_norm": 0.64453125, "learning_rate": 2.9941433509478156e-05, "loss": 0.8613, "step": 4071 }, { "epoch": 2.1863024730713736, "grad_norm": 0.7578125, "learning_rate": 2.9933154698876403e-05, "loss": 1.0753, "step": 4072 }, { "epoch": 2.1868393678064493, "grad_norm": 0.85546875, "learning_rate": 2.992487532533922e-05, "loss": 0.9237, "step": 4073 }, { "epoch": 2.1873762625415254, "grad_norm": 0.69140625, "learning_rate": 2.9916595389811397e-05, "loss": 0.8755, "step": 4074 }, { "epoch": 2.1879131572766015, "grad_norm": 0.73046875, "learning_rate": 2.9908314893237776e-05, "loss": 0.8316, "step": 4075 }, { "epoch": 2.1884500520116776, "grad_norm": 0.68359375, "learning_rate": 2.9900033836563263e-05, "loss": 0.9919, "step": 4076 }, { "epoch": 2.1889869467467533, "grad_norm": 0.74609375, "learning_rate": 2.9891752220732838e-05, "loss": 0.9578, "step": 4077 }, { "epoch": 2.1895238414818294, "grad_norm": 0.62109375, "learning_rate": 2.9883470046691543e-05, "loss": 0.8043, "step": 4078 }, { "epoch": 2.1900607362169056, "grad_norm": 0.63671875, "learning_rate": 2.987518731538446e-05, "loss": 0.7975, "step": 4079 }, { "epoch": 2.1905976309519817, "grad_norm": 0.70703125, "learning_rate": 2.986690402775677e-05, "loss": 0.9073, "step": 4080 }, { "epoch": 2.1911345256870574, "grad_norm": 0.77734375, "learning_rate": 2.9858620184753695e-05, "loss": 0.8354, "step": 4081 }, { "epoch": 2.1916714204221335, "grad_norm": 0.5859375, "learning_rate": 2.9850335787320526e-05, "loss": 0.7135, "step": 4082 }, { "epoch": 2.1922083151572096, "grad_norm": 0.6640625, "learning_rate": 2.984205083640263e-05, "loss": 0.6937, "step": 4083 }, { "epoch": 2.1927452098922853, "grad_norm": 0.61328125, "learning_rate": 2.9833765332945402e-05, "loss": 0.7958, "step": 4084 }, { "epoch": 2.1932821046273614, "grad_norm": 0.90625, "learning_rate": 2.9825479277894343e-05, "loss": 1.0131, "step": 4085 }, { "epoch": 2.1938189993624375, "grad_norm": 0.7265625, "learning_rate": 2.9817192672194984e-05, "loss": 0.8615, "step": 4086 }, { "epoch": 2.1943558940975136, "grad_norm": 0.69140625, "learning_rate": 2.980890551679293e-05, "loss": 0.8079, "step": 4087 }, { "epoch": 2.1948927888325893, "grad_norm": 0.66796875, "learning_rate": 2.980061781263387e-05, "loss": 0.7669, "step": 4088 }, { "epoch": 2.1954296835676654, "grad_norm": 0.7109375, "learning_rate": 2.9792329560663507e-05, "loss": 0.721, "step": 4089 }, { "epoch": 2.1959665783027416, "grad_norm": 0.66015625, "learning_rate": 2.9784040761827658e-05, "loss": 0.8621, "step": 4090 }, { "epoch": 2.1965034730378177, "grad_norm": 0.7421875, "learning_rate": 2.977575141707218e-05, "loss": 1.0204, "step": 4091 }, { "epoch": 2.1970403677728934, "grad_norm": 0.63671875, "learning_rate": 2.9767461527342983e-05, "loss": 0.8186, "step": 4092 }, { "epoch": 2.1975772625079695, "grad_norm": 0.74609375, "learning_rate": 2.9759171093586047e-05, "loss": 1.0321, "step": 4093 }, { "epoch": 2.1981141572430456, "grad_norm": 0.6640625, "learning_rate": 2.975088011674742e-05, "loss": 0.8158, "step": 4094 }, { "epoch": 2.1986510519781217, "grad_norm": 0.87109375, "learning_rate": 2.9742588597773204e-05, "loss": 0.9527, "step": 4095 }, { "epoch": 2.1991879467131974, "grad_norm": 0.6953125, "learning_rate": 2.9734296537609575e-05, "loss": 0.773, "step": 4096 }, { "epoch": 2.1997248414482735, "grad_norm": 0.6484375, "learning_rate": 2.9726003937202757e-05, "loss": 0.7991, "step": 4097 }, { "epoch": 2.2002617361833496, "grad_norm": 0.70703125, "learning_rate": 2.9717710797499025e-05, "loss": 0.7457, "step": 4098 }, { "epoch": 2.2007986309184258, "grad_norm": 0.6796875, "learning_rate": 2.970941711944476e-05, "loss": 0.827, "step": 4099 }, { "epoch": 2.2013355256535014, "grad_norm": 0.8046875, "learning_rate": 2.970112290398636e-05, "loss": 0.866, "step": 4100 }, { "epoch": 2.2013355256535014, "eval_loss": 0.9053437113761902, "eval_runtime": 46.5428, "eval_samples_per_second": 20.669, "eval_steps_per_second": 20.669, "step": 4100 }, { "epoch": 2.2018724203885776, "grad_norm": 0.65625, "learning_rate": 2.96928281520703e-05, "loss": 0.9705, "step": 4101 }, { "epoch": 2.2024093151236537, "grad_norm": 0.6484375, "learning_rate": 2.9684532864643122e-05, "loss": 0.7725, "step": 4102 }, { "epoch": 2.2029462098587294, "grad_norm": 0.83984375, "learning_rate": 2.9676237042651412e-05, "loss": 1.1466, "step": 4103 }, { "epoch": 2.2034831045938055, "grad_norm": 0.70703125, "learning_rate": 2.966794068704184e-05, "loss": 0.9232, "step": 4104 }, { "epoch": 2.2040199993288816, "grad_norm": 0.83203125, "learning_rate": 2.9659643798761123e-05, "loss": 0.8079, "step": 4105 }, { "epoch": 2.2045568940639577, "grad_norm": 0.63671875, "learning_rate": 2.9651346378756028e-05, "loss": 0.7778, "step": 4106 }, { "epoch": 2.2050937887990334, "grad_norm": 0.796875, "learning_rate": 2.9643048427973417e-05, "loss": 0.8648, "step": 4107 }, { "epoch": 2.2056306835341095, "grad_norm": 0.7734375, "learning_rate": 2.9634749947360178e-05, "loss": 0.993, "step": 4108 }, { "epoch": 2.2061675782691856, "grad_norm": 0.71484375, "learning_rate": 2.962645093786328e-05, "loss": 0.8057, "step": 4109 }, { "epoch": 2.2067044730042618, "grad_norm": 0.66796875, "learning_rate": 2.961815140042974e-05, "loss": 0.7218, "step": 4110 }, { "epoch": 2.2072413677393374, "grad_norm": 1.0703125, "learning_rate": 2.9609851336006632e-05, "loss": 0.9607, "step": 4111 }, { "epoch": 2.2077782624744136, "grad_norm": 0.79296875, "learning_rate": 2.9601550745541112e-05, "loss": 0.9372, "step": 4112 }, { "epoch": 2.2083151572094897, "grad_norm": 0.6796875, "learning_rate": 2.9593249629980386e-05, "loss": 0.919, "step": 4113 }, { "epoch": 2.208852051944566, "grad_norm": 0.8203125, "learning_rate": 2.95849479902717e-05, "loss": 0.9681, "step": 4114 }, { "epoch": 2.2093889466796415, "grad_norm": 0.62890625, "learning_rate": 2.9576645827362386e-05, "loss": 0.8681, "step": 4115 }, { "epoch": 2.2099258414147176, "grad_norm": 0.81640625, "learning_rate": 2.9568343142199824e-05, "loss": 0.8769, "step": 4116 }, { "epoch": 2.2104627361497937, "grad_norm": 0.76953125, "learning_rate": 2.956003993573146e-05, "loss": 0.9149, "step": 4117 }, { "epoch": 2.21099963088487, "grad_norm": 0.69140625, "learning_rate": 2.955173620890479e-05, "loss": 0.8632, "step": 4118 }, { "epoch": 2.2115365256199455, "grad_norm": 0.703125, "learning_rate": 2.9543431962667372e-05, "loss": 0.8461, "step": 4119 }, { "epoch": 2.2120734203550216, "grad_norm": 0.65234375, "learning_rate": 2.9535127197966828e-05, "loss": 0.8977, "step": 4120 }, { "epoch": 2.2126103150900978, "grad_norm": 0.72265625, "learning_rate": 2.9526821915750847e-05, "loss": 0.9606, "step": 4121 }, { "epoch": 2.2131472098251734, "grad_norm": 0.65234375, "learning_rate": 2.9518516116967145e-05, "loss": 0.8072, "step": 4122 }, { "epoch": 2.2136841045602496, "grad_norm": 0.6328125, "learning_rate": 2.9510209802563537e-05, "loss": 0.843, "step": 4123 }, { "epoch": 2.2142209992953257, "grad_norm": 0.84375, "learning_rate": 2.9501902973487877e-05, "loss": 1.114, "step": 4124 }, { "epoch": 2.214757894030402, "grad_norm": 0.671875, "learning_rate": 2.9493595630688076e-05, "loss": 0.8705, "step": 4125 }, { "epoch": 2.2152947887654775, "grad_norm": 1.3671875, "learning_rate": 2.9485287775112103e-05, "loss": 0.9069, "step": 4126 }, { "epoch": 2.2158316835005536, "grad_norm": 0.73046875, "learning_rate": 2.9476979407707995e-05, "loss": 0.8303, "step": 4127 }, { "epoch": 2.2163685782356297, "grad_norm": 0.65625, "learning_rate": 2.9468670529423835e-05, "loss": 0.9907, "step": 4128 }, { "epoch": 2.216905472970706, "grad_norm": 0.7890625, "learning_rate": 2.9460361141207782e-05, "loss": 1.0083, "step": 4129 }, { "epoch": 2.2174423677057815, "grad_norm": 0.70703125, "learning_rate": 2.945205124400804e-05, "loss": 0.7639, "step": 4130 }, { "epoch": 2.2179792624408576, "grad_norm": 0.578125, "learning_rate": 2.9443740838772865e-05, "loss": 0.7762, "step": 4131 }, { "epoch": 2.2185161571759338, "grad_norm": 0.65625, "learning_rate": 2.9435429926450586e-05, "loss": 0.8291, "step": 4132 }, { "epoch": 2.21905305191101, "grad_norm": 0.65234375, "learning_rate": 2.9427118507989586e-05, "loss": 0.7603, "step": 4133 }, { "epoch": 2.2195899466460856, "grad_norm": 0.77734375, "learning_rate": 2.94188065843383e-05, "loss": 1.0706, "step": 4134 }, { "epoch": 2.2201268413811617, "grad_norm": 0.6875, "learning_rate": 2.9410494156445216e-05, "loss": 0.9553, "step": 4135 }, { "epoch": 2.220663736116238, "grad_norm": 0.82421875, "learning_rate": 2.9402181225258894e-05, "loss": 0.7941, "step": 4136 }, { "epoch": 2.221200630851314, "grad_norm": 0.671875, "learning_rate": 2.9393867791727947e-05, "loss": 0.895, "step": 4137 }, { "epoch": 2.2217375255863896, "grad_norm": 0.56640625, "learning_rate": 2.938555385680105e-05, "loss": 0.6571, "step": 4138 }, { "epoch": 2.2222744203214657, "grad_norm": 0.7890625, "learning_rate": 2.9377239421426905e-05, "loss": 0.9236, "step": 4139 }, { "epoch": 2.222811315056542, "grad_norm": 0.78125, "learning_rate": 2.9368924486554317e-05, "loss": 0.9655, "step": 4140 }, { "epoch": 2.2233482097916175, "grad_norm": 0.75, "learning_rate": 2.9360609053132116e-05, "loss": 0.901, "step": 4141 }, { "epoch": 2.2238851045266936, "grad_norm": 0.6328125, "learning_rate": 2.9352293122109202e-05, "loss": 0.7299, "step": 4142 }, { "epoch": 2.2244219992617698, "grad_norm": 0.55859375, "learning_rate": 2.934397669443452e-05, "loss": 0.6925, "step": 4143 }, { "epoch": 2.224958893996846, "grad_norm": 0.70703125, "learning_rate": 2.933565977105709e-05, "loss": 0.8147, "step": 4144 }, { "epoch": 2.2254957887319216, "grad_norm": 0.921875, "learning_rate": 2.932734235292597e-05, "loss": 1.0086, "step": 4145 }, { "epoch": 2.2260326834669977, "grad_norm": 0.69921875, "learning_rate": 2.9319024440990296e-05, "loss": 1.1011, "step": 4146 }, { "epoch": 2.226569578202074, "grad_norm": 0.7109375, "learning_rate": 2.931070603619922e-05, "loss": 0.9351, "step": 4147 }, { "epoch": 2.22710647293715, "grad_norm": 0.65234375, "learning_rate": 2.9302387139502003e-05, "loss": 0.8345, "step": 4148 }, { "epoch": 2.2276433676722256, "grad_norm": 0.71484375, "learning_rate": 2.9294067751847933e-05, "loss": 0.9992, "step": 4149 }, { "epoch": 2.2281802624073017, "grad_norm": 0.640625, "learning_rate": 2.9285747874186342e-05, "loss": 0.9124, "step": 4150 }, { "epoch": 2.2281802624073017, "eval_loss": 0.9054576754570007, "eval_runtime": 46.6126, "eval_samples_per_second": 20.638, "eval_steps_per_second": 20.638, "step": 4150 }, { "epoch": 2.228717157142378, "grad_norm": 0.8125, "learning_rate": 2.9277427507466654e-05, "loss": 0.8543, "step": 4151 }, { "epoch": 2.229254051877454, "grad_norm": 0.54296875, "learning_rate": 2.926910665263831e-05, "loss": 0.6306, "step": 4152 }, { "epoch": 2.2297909466125296, "grad_norm": 0.62890625, "learning_rate": 2.926078531065084e-05, "loss": 0.7358, "step": 4153 }, { "epoch": 2.2303278413476058, "grad_norm": 0.69140625, "learning_rate": 2.9252463482453802e-05, "loss": 0.8626, "step": 4154 }, { "epoch": 2.230864736082682, "grad_norm": 0.64453125, "learning_rate": 2.924414116899683e-05, "loss": 0.8718, "step": 4155 }, { "epoch": 2.231401630817758, "grad_norm": 0.73046875, "learning_rate": 2.9235818371229606e-05, "loss": 0.7792, "step": 4156 }, { "epoch": 2.2319385255528337, "grad_norm": 0.65234375, "learning_rate": 2.9227495090101863e-05, "loss": 0.7288, "step": 4157 }, { "epoch": 2.23247542028791, "grad_norm": 0.6484375, "learning_rate": 2.921917132656339e-05, "loss": 0.9498, "step": 4158 }, { "epoch": 2.233012315022986, "grad_norm": 0.7109375, "learning_rate": 2.9210847081564042e-05, "loss": 0.9954, "step": 4159 }, { "epoch": 2.2335492097580616, "grad_norm": 0.6875, "learning_rate": 2.920252235605371e-05, "loss": 0.8263, "step": 4160 }, { "epoch": 2.2340861044931377, "grad_norm": 0.56640625, "learning_rate": 2.9194197150982357e-05, "loss": 0.7274, "step": 4161 }, { "epoch": 2.234622999228214, "grad_norm": 0.62890625, "learning_rate": 2.918587146729999e-05, "loss": 0.7906, "step": 4162 }, { "epoch": 2.23515989396329, "grad_norm": 0.73046875, "learning_rate": 2.917754530595669e-05, "loss": 0.926, "step": 4163 }, { "epoch": 2.2356967886983656, "grad_norm": 0.6171875, "learning_rate": 2.916921866790256e-05, "loss": 0.7595, "step": 4164 }, { "epoch": 2.2362336834334418, "grad_norm": 0.859375, "learning_rate": 2.916089155408778e-05, "loss": 0.9828, "step": 4165 }, { "epoch": 2.236770578168518, "grad_norm": 0.60546875, "learning_rate": 2.915256396546258e-05, "loss": 0.7342, "step": 4166 }, { "epoch": 2.237307472903594, "grad_norm": 0.83203125, "learning_rate": 2.914423590297725e-05, "loss": 0.8831, "step": 4167 }, { "epoch": 2.2378443676386697, "grad_norm": 0.74609375, "learning_rate": 2.9135907367582117e-05, "loss": 0.9026, "step": 4168 }, { "epoch": 2.238381262373746, "grad_norm": 0.63671875, "learning_rate": 2.912757836022757e-05, "loss": 0.8294, "step": 4169 }, { "epoch": 2.238918157108822, "grad_norm": 0.66015625, "learning_rate": 2.911924888186407e-05, "loss": 0.868, "step": 4170 }, { "epoch": 2.239455051843898, "grad_norm": 0.72265625, "learning_rate": 2.9110918933442104e-05, "loss": 0.8918, "step": 4171 }, { "epoch": 2.2399919465789737, "grad_norm": 0.68359375, "learning_rate": 2.9102588515912225e-05, "loss": 0.8856, "step": 4172 }, { "epoch": 2.24052884131405, "grad_norm": 0.73828125, "learning_rate": 2.909425763022504e-05, "loss": 1.046, "step": 4173 }, { "epoch": 2.241065736049126, "grad_norm": 0.75390625, "learning_rate": 2.9085926277331214e-05, "loss": 0.9102, "step": 4174 }, { "epoch": 2.241602630784202, "grad_norm": 0.59375, "learning_rate": 2.907759445818145e-05, "loss": 0.7687, "step": 4175 }, { "epoch": 2.2421395255192778, "grad_norm": 0.69140625, "learning_rate": 2.906926217372653e-05, "loss": 0.9139, "step": 4176 }, { "epoch": 2.242676420254354, "grad_norm": 0.65625, "learning_rate": 2.9060929424917254e-05, "loss": 0.9126, "step": 4177 }, { "epoch": 2.24321331498943, "grad_norm": 0.7421875, "learning_rate": 2.9052596212704507e-05, "loss": 1.0514, "step": 4178 }, { "epoch": 2.2437502097245057, "grad_norm": 0.62890625, "learning_rate": 2.9044262538039206e-05, "loss": 0.8539, "step": 4179 }, { "epoch": 2.244287104459582, "grad_norm": 0.73046875, "learning_rate": 2.9035928401872336e-05, "loss": 1.0017, "step": 4180 }, { "epoch": 2.244823999194658, "grad_norm": 0.5859375, "learning_rate": 2.9027593805154927e-05, "loss": 0.7458, "step": 4181 }, { "epoch": 2.245360893929734, "grad_norm": 0.91796875, "learning_rate": 2.9019258748838058e-05, "loss": 0.7076, "step": 4182 }, { "epoch": 2.2458977886648097, "grad_norm": 0.796875, "learning_rate": 2.901092323387286e-05, "loss": 0.9812, "step": 4183 }, { "epoch": 2.246434683399886, "grad_norm": 0.6328125, "learning_rate": 2.9002587261210534e-05, "loss": 0.8256, "step": 4184 }, { "epoch": 2.246971578134962, "grad_norm": 0.6796875, "learning_rate": 2.8994250831802307e-05, "loss": 0.7752, "step": 4185 }, { "epoch": 2.247508472870038, "grad_norm": 0.60546875, "learning_rate": 2.898591394659948e-05, "loss": 0.5932, "step": 4186 }, { "epoch": 2.2480453676051138, "grad_norm": 0.6015625, "learning_rate": 2.8977576606553398e-05, "loss": 0.6987, "step": 4187 }, { "epoch": 2.24858226234019, "grad_norm": 0.98046875, "learning_rate": 2.8969238812615452e-05, "loss": 1.0829, "step": 4188 }, { "epoch": 2.249119157075266, "grad_norm": 0.6171875, "learning_rate": 2.8960900565737086e-05, "loss": 0.8561, "step": 4189 }, { "epoch": 2.249656051810342, "grad_norm": 0.703125, "learning_rate": 2.895256186686981e-05, "loss": 0.7907, "step": 4190 }, { "epoch": 2.250192946545418, "grad_norm": 0.78125, "learning_rate": 2.8944222716965163e-05, "loss": 1.0313, "step": 4191 }, { "epoch": 2.250729841280494, "grad_norm": 0.79296875, "learning_rate": 2.893588311697476e-05, "loss": 0.9696, "step": 4192 }, { "epoch": 2.25126673601557, "grad_norm": 0.6484375, "learning_rate": 2.8927543067850248e-05, "loss": 0.6992, "step": 4193 }, { "epoch": 2.251803630750646, "grad_norm": 0.69921875, "learning_rate": 2.8919202570543336e-05, "loss": 0.7765, "step": 4194 }, { "epoch": 2.252340525485722, "grad_norm": 0.68359375, "learning_rate": 2.8910861626005776e-05, "loss": 0.6895, "step": 4195 }, { "epoch": 2.252877420220798, "grad_norm": 0.609375, "learning_rate": 2.890252023518938e-05, "loss": 0.7656, "step": 4196 }, { "epoch": 2.253414314955874, "grad_norm": 0.671875, "learning_rate": 2.8894178399046002e-05, "loss": 0.6924, "step": 4197 }, { "epoch": 2.2539512096909498, "grad_norm": 0.5390625, "learning_rate": 2.888583611852755e-05, "loss": 0.7085, "step": 4198 }, { "epoch": 2.254488104426026, "grad_norm": 0.6171875, "learning_rate": 2.8877493394585982e-05, "loss": 0.817, "step": 4199 }, { "epoch": 2.255024999161102, "grad_norm": 0.6796875, "learning_rate": 2.886915022817332e-05, "loss": 0.722, "step": 4200 }, { "epoch": 2.255024999161102, "eval_loss": 0.9057018160820007, "eval_runtime": 46.7193, "eval_samples_per_second": 20.591, "eval_steps_per_second": 20.591, "step": 4200 }, { "epoch": 2.255561893896178, "grad_norm": 0.57421875, "learning_rate": 2.8860806620241627e-05, "loss": 0.9053, "step": 4201 }, { "epoch": 2.256098788631254, "grad_norm": 0.7265625, "learning_rate": 2.885246257174299e-05, "loss": 0.9915, "step": 4202 }, { "epoch": 2.25663568336633, "grad_norm": 0.703125, "learning_rate": 2.8844118083629596e-05, "loss": 0.9003, "step": 4203 }, { "epoch": 2.257172578101406, "grad_norm": 0.78515625, "learning_rate": 2.883577315685364e-05, "loss": 0.9268, "step": 4204 }, { "epoch": 2.257709472836482, "grad_norm": 0.54296875, "learning_rate": 2.8827427792367394e-05, "loss": 0.7192, "step": 4205 }, { "epoch": 2.258246367571558, "grad_norm": 0.6640625, "learning_rate": 2.881908199112317e-05, "loss": 0.8311, "step": 4206 }, { "epoch": 2.258783262306634, "grad_norm": 0.578125, "learning_rate": 2.8810735754073315e-05, "loss": 0.5093, "step": 4207 }, { "epoch": 2.25932015704171, "grad_norm": 0.6875, "learning_rate": 2.880238908217026e-05, "loss": 0.851, "step": 4208 }, { "epoch": 2.259857051776786, "grad_norm": 0.921875, "learning_rate": 2.8794041976366463e-05, "loss": 0.7708, "step": 4209 }, { "epoch": 2.260393946511862, "grad_norm": 0.66796875, "learning_rate": 2.878569443761442e-05, "loss": 0.8941, "step": 4210 }, { "epoch": 2.260930841246938, "grad_norm": 0.6796875, "learning_rate": 2.87773464668667e-05, "loss": 0.8557, "step": 4211 }, { "epoch": 2.261467735982014, "grad_norm": 0.6328125, "learning_rate": 2.8768998065075915e-05, "loss": 0.7612, "step": 4212 }, { "epoch": 2.2620046307170902, "grad_norm": 0.66015625, "learning_rate": 2.8760649233194714e-05, "loss": 0.8401, "step": 4213 }, { "epoch": 2.262541525452166, "grad_norm": 0.6796875, "learning_rate": 2.8752299972175824e-05, "loss": 0.8999, "step": 4214 }, { "epoch": 2.263078420187242, "grad_norm": 0.66796875, "learning_rate": 2.8743950282971977e-05, "loss": 0.8894, "step": 4215 }, { "epoch": 2.263615314922318, "grad_norm": 0.66015625, "learning_rate": 2.873560016653599e-05, "loss": 0.8505, "step": 4216 }, { "epoch": 2.264152209657394, "grad_norm": 0.6796875, "learning_rate": 2.8727249623820717e-05, "loss": 0.893, "step": 4217 }, { "epoch": 2.26468910439247, "grad_norm": 0.671875, "learning_rate": 2.8718898655779066e-05, "loss": 0.7213, "step": 4218 }, { "epoch": 2.265225999127546, "grad_norm": 0.77734375, "learning_rate": 2.871054726336398e-05, "loss": 1.1274, "step": 4219 }, { "epoch": 2.265762893862622, "grad_norm": 0.7421875, "learning_rate": 2.8702195447528456e-05, "loss": 1.0309, "step": 4220 }, { "epoch": 2.266299788597698, "grad_norm": 0.6484375, "learning_rate": 2.8693843209225545e-05, "loss": 0.9051, "step": 4221 }, { "epoch": 2.266836683332774, "grad_norm": 0.703125, "learning_rate": 2.8685490549408363e-05, "loss": 0.7285, "step": 4222 }, { "epoch": 2.26737357806785, "grad_norm": 0.59765625, "learning_rate": 2.867713746903002e-05, "loss": 0.6962, "step": 4223 }, { "epoch": 2.2679104728029262, "grad_norm": 0.70703125, "learning_rate": 2.8668783969043723e-05, "loss": 0.9066, "step": 4224 }, { "epoch": 2.268447367538002, "grad_norm": 0.84375, "learning_rate": 2.866043005040272e-05, "loss": 0.7681, "step": 4225 }, { "epoch": 2.268984262273078, "grad_norm": 0.84765625, "learning_rate": 2.8652075714060295e-05, "loss": 0.8514, "step": 4226 }, { "epoch": 2.269521157008154, "grad_norm": 0.76953125, "learning_rate": 2.8643720960969778e-05, "loss": 1.0333, "step": 4227 }, { "epoch": 2.2700580517432303, "grad_norm": 0.69140625, "learning_rate": 2.8635365792084556e-05, "loss": 0.7421, "step": 4228 }, { "epoch": 2.270594946478306, "grad_norm": 0.73828125, "learning_rate": 2.8627010208358053e-05, "loss": 0.845, "step": 4229 }, { "epoch": 2.271131841213382, "grad_norm": 0.6796875, "learning_rate": 2.861865421074376e-05, "loss": 0.8226, "step": 4230 }, { "epoch": 2.271668735948458, "grad_norm": 0.734375, "learning_rate": 2.8610297800195194e-05, "loss": 0.8869, "step": 4231 }, { "epoch": 2.2722056306835343, "grad_norm": 0.62109375, "learning_rate": 2.8601940977665927e-05, "loss": 0.7535, "step": 4232 }, { "epoch": 2.27274252541861, "grad_norm": 0.69921875, "learning_rate": 2.8593583744109574e-05, "loss": 0.8447, "step": 4233 }, { "epoch": 2.273279420153686, "grad_norm": 0.7265625, "learning_rate": 2.858522610047982e-05, "loss": 1.0643, "step": 4234 }, { "epoch": 2.2738163148887622, "grad_norm": 0.66796875, "learning_rate": 2.8576868047730354e-05, "loss": 0.8225, "step": 4235 }, { "epoch": 2.274353209623838, "grad_norm": 0.56640625, "learning_rate": 2.856850958681495e-05, "loss": 0.6639, "step": 4236 }, { "epoch": 2.274890104358914, "grad_norm": 0.8359375, "learning_rate": 2.8560150718687413e-05, "loss": 0.9331, "step": 4237 }, { "epoch": 2.27542699909399, "grad_norm": 0.640625, "learning_rate": 2.8551791444301583e-05, "loss": 0.7256, "step": 4238 }, { "epoch": 2.2759638938290663, "grad_norm": 0.69921875, "learning_rate": 2.8543431764611383e-05, "loss": 0.7501, "step": 4239 }, { "epoch": 2.276500788564142, "grad_norm": 0.734375, "learning_rate": 2.8535071680570734e-05, "loss": 0.9966, "step": 4240 }, { "epoch": 2.277037683299218, "grad_norm": 0.82421875, "learning_rate": 2.852671119313365e-05, "loss": 0.8804, "step": 4241 }, { "epoch": 2.277574578034294, "grad_norm": 0.65234375, "learning_rate": 2.8518350303254147e-05, "loss": 0.7867, "step": 4242 }, { "epoch": 2.2781114727693703, "grad_norm": 0.69921875, "learning_rate": 2.8509989011886325e-05, "loss": 0.8485, "step": 4243 }, { "epoch": 2.278648367504446, "grad_norm": 0.95703125, "learning_rate": 2.8501627319984296e-05, "loss": 0.9357, "step": 4244 }, { "epoch": 2.279185262239522, "grad_norm": 0.71484375, "learning_rate": 2.8493265228502253e-05, "loss": 0.8212, "step": 4245 }, { "epoch": 2.2797221569745982, "grad_norm": 0.72265625, "learning_rate": 2.8484902738394403e-05, "loss": 0.8409, "step": 4246 }, { "epoch": 2.2802590517096744, "grad_norm": 0.66015625, "learning_rate": 2.847653985061503e-05, "loss": 0.729, "step": 4247 }, { "epoch": 2.28079594644475, "grad_norm": 0.64453125, "learning_rate": 2.846817656611842e-05, "loss": 0.7594, "step": 4248 }, { "epoch": 2.281332841179826, "grad_norm": 0.6875, "learning_rate": 2.845981288585895e-05, "loss": 0.7443, "step": 4249 }, { "epoch": 2.2818697359149023, "grad_norm": 0.80859375, "learning_rate": 2.8451448810791014e-05, "loss": 0.956, "step": 4250 }, { "epoch": 2.2818697359149023, "eval_loss": 0.9055709838867188, "eval_runtime": 46.8289, "eval_samples_per_second": 20.543, "eval_steps_per_second": 20.543, "step": 4250 }, { "epoch": 2.2824066306499784, "grad_norm": 0.703125, "learning_rate": 2.844308434186906e-05, "loss": 0.8582, "step": 4251 }, { "epoch": 2.282943525385054, "grad_norm": 0.76953125, "learning_rate": 2.8434719480047577e-05, "loss": 0.8337, "step": 4252 }, { "epoch": 2.28348042012013, "grad_norm": 0.703125, "learning_rate": 2.8426354226281105e-05, "loss": 0.9313, "step": 4253 }, { "epoch": 2.2840173148552063, "grad_norm": 0.8984375, "learning_rate": 2.8417988581524217e-05, "loss": 0.9046, "step": 4254 }, { "epoch": 2.284554209590282, "grad_norm": 0.6953125, "learning_rate": 2.8409622546731558e-05, "loss": 0.873, "step": 4255 }, { "epoch": 2.285091104325358, "grad_norm": 0.6875, "learning_rate": 2.8401256122857784e-05, "loss": 0.8176, "step": 4256 }, { "epoch": 2.2856279990604342, "grad_norm": 0.8125, "learning_rate": 2.8392889310857612e-05, "loss": 0.8911, "step": 4257 }, { "epoch": 2.2861648937955104, "grad_norm": 0.69140625, "learning_rate": 2.8384522111685803e-05, "loss": 1.085, "step": 4258 }, { "epoch": 2.286701788530586, "grad_norm": 0.671875, "learning_rate": 2.837615452629716e-05, "loss": 0.6702, "step": 4259 }, { "epoch": 2.287238683265662, "grad_norm": 0.59765625, "learning_rate": 2.836778655564653e-05, "loss": 0.6827, "step": 4260 }, { "epoch": 2.2877755780007383, "grad_norm": 0.8046875, "learning_rate": 2.8359418200688805e-05, "loss": 0.9794, "step": 4261 }, { "epoch": 2.2883124727358144, "grad_norm": 0.7109375, "learning_rate": 2.835104946237891e-05, "loss": 0.9336, "step": 4262 }, { "epoch": 2.28884936747089, "grad_norm": 0.703125, "learning_rate": 2.8342680341671845e-05, "loss": 0.9572, "step": 4263 }, { "epoch": 2.289386262205966, "grad_norm": 0.8125, "learning_rate": 2.833431083952262e-05, "loss": 0.7498, "step": 4264 }, { "epoch": 2.2899231569410423, "grad_norm": 0.64453125, "learning_rate": 2.83259409568863e-05, "loss": 0.9209, "step": 4265 }, { "epoch": 2.2904600516761184, "grad_norm": 0.7421875, "learning_rate": 2.8317570694717998e-05, "loss": 0.7977, "step": 4266 }, { "epoch": 2.290996946411194, "grad_norm": 0.8046875, "learning_rate": 2.8309200053972868e-05, "loss": 1.0071, "step": 4267 }, { "epoch": 2.2915338411462702, "grad_norm": 0.61328125, "learning_rate": 2.8300829035606102e-05, "loss": 0.7656, "step": 4268 }, { "epoch": 2.2920707358813464, "grad_norm": 0.6171875, "learning_rate": 2.829245764057294e-05, "loss": 0.6606, "step": 4269 }, { "epoch": 2.2926076306164225, "grad_norm": 0.703125, "learning_rate": 2.8284085869828665e-05, "loss": 0.7588, "step": 4270 }, { "epoch": 2.293144525351498, "grad_norm": 0.63671875, "learning_rate": 2.82757137243286e-05, "loss": 0.842, "step": 4271 }, { "epoch": 2.2936814200865743, "grad_norm": 0.671875, "learning_rate": 2.826734120502813e-05, "loss": 0.7822, "step": 4272 }, { "epoch": 2.2942183148216504, "grad_norm": 0.61328125, "learning_rate": 2.8258968312882638e-05, "loss": 0.6891, "step": 4273 }, { "epoch": 2.294755209556726, "grad_norm": 0.73828125, "learning_rate": 2.8250595048847594e-05, "loss": 0.9425, "step": 4274 }, { "epoch": 2.295292104291802, "grad_norm": 0.6953125, "learning_rate": 2.824222141387849e-05, "loss": 0.9684, "step": 4275 }, { "epoch": 2.2958289990268783, "grad_norm": 0.6875, "learning_rate": 2.8233847408930857e-05, "loss": 0.861, "step": 4276 }, { "epoch": 2.2963658937619544, "grad_norm": 0.671875, "learning_rate": 2.822547303496029e-05, "loss": 0.8275, "step": 4277 }, { "epoch": 2.29690278849703, "grad_norm": 0.76953125, "learning_rate": 2.8217098292922393e-05, "loss": 0.7691, "step": 4278 }, { "epoch": 2.2974396832321062, "grad_norm": 0.7734375, "learning_rate": 2.8208723183772844e-05, "loss": 1.0145, "step": 4279 }, { "epoch": 2.2979765779671824, "grad_norm": 0.6640625, "learning_rate": 2.820034770846735e-05, "loss": 0.7483, "step": 4280 }, { "epoch": 2.2985134727022585, "grad_norm": 0.640625, "learning_rate": 2.8191971867961652e-05, "loss": 0.6169, "step": 4281 }, { "epoch": 2.299050367437334, "grad_norm": 0.7421875, "learning_rate": 2.8183595663211536e-05, "loss": 0.708, "step": 4282 }, { "epoch": 2.2995872621724103, "grad_norm": 0.8125, "learning_rate": 2.8175219095172838e-05, "loss": 1.0034, "step": 4283 }, { "epoch": 2.3001241569074864, "grad_norm": 0.71875, "learning_rate": 2.816684216480143e-05, "loss": 0.8352, "step": 4284 }, { "epoch": 2.3006610516425625, "grad_norm": 0.77734375, "learning_rate": 2.8158464873053237e-05, "loss": 1.0078, "step": 4285 }, { "epoch": 2.301197946377638, "grad_norm": 0.75390625, "learning_rate": 2.815008722088419e-05, "loss": 0.8073, "step": 4286 }, { "epoch": 2.3017348411127143, "grad_norm": 0.68359375, "learning_rate": 2.81417092092503e-05, "loss": 0.8694, "step": 4287 }, { "epoch": 2.3022717358477904, "grad_norm": 0.703125, "learning_rate": 2.8133330839107608e-05, "loss": 0.9218, "step": 4288 }, { "epoch": 2.3028086305828666, "grad_norm": 0.78515625, "learning_rate": 2.8124952111412183e-05, "loss": 0.9545, "step": 4289 }, { "epoch": 2.3033455253179422, "grad_norm": 0.6328125, "learning_rate": 2.8116573027120152e-05, "loss": 0.8647, "step": 4290 }, { "epoch": 2.3038824200530184, "grad_norm": 0.61328125, "learning_rate": 2.810819358718767e-05, "loss": 0.695, "step": 4291 }, { "epoch": 2.3044193147880945, "grad_norm": 0.9140625, "learning_rate": 2.8099813792570928e-05, "loss": 0.8527, "step": 4292 }, { "epoch": 2.30495620952317, "grad_norm": 0.7265625, "learning_rate": 2.809143364422619e-05, "loss": 0.7821, "step": 4293 }, { "epoch": 2.3054931042582463, "grad_norm": 0.6875, "learning_rate": 2.8083053143109717e-05, "loss": 0.9497, "step": 4294 }, { "epoch": 2.3060299989933224, "grad_norm": 0.83984375, "learning_rate": 2.8074672290177832e-05, "loss": 1.2043, "step": 4295 }, { "epoch": 2.3065668937283985, "grad_norm": 0.68359375, "learning_rate": 2.806629108638691e-05, "loss": 0.8137, "step": 4296 }, { "epoch": 2.307103788463474, "grad_norm": 0.6953125, "learning_rate": 2.8057909532693338e-05, "loss": 0.9738, "step": 4297 }, { "epoch": 2.3076406831985503, "grad_norm": 0.63671875, "learning_rate": 2.8049527630053562e-05, "loss": 0.6856, "step": 4298 }, { "epoch": 2.3081775779336264, "grad_norm": 0.66015625, "learning_rate": 2.8041145379424073e-05, "loss": 0.8664, "step": 4299 }, { "epoch": 2.3087144726687026, "grad_norm": 0.6015625, "learning_rate": 2.8032762781761374e-05, "loss": 0.6837, "step": 4300 }, { "epoch": 2.3087144726687026, "eval_loss": 0.9049941897392273, "eval_runtime": 46.0915, "eval_samples_per_second": 20.872, "eval_steps_per_second": 20.872, "step": 4300 }, { "epoch": 2.3092513674037782, "grad_norm": 0.70703125, "learning_rate": 2.8024379838022036e-05, "loss": 0.7317, "step": 4301 }, { "epoch": 2.3097882621388544, "grad_norm": 0.85546875, "learning_rate": 2.8015996549162664e-05, "loss": 0.8046, "step": 4302 }, { "epoch": 2.3103251568739305, "grad_norm": 0.578125, "learning_rate": 2.800761291613988e-05, "loss": 0.7216, "step": 4303 }, { "epoch": 2.3108620516090066, "grad_norm": 0.75, "learning_rate": 2.7999228939910382e-05, "loss": 0.9202, "step": 4304 }, { "epoch": 2.3113989463440823, "grad_norm": 0.8359375, "learning_rate": 2.799084462143088e-05, "loss": 0.8792, "step": 4305 }, { "epoch": 2.3119358410791584, "grad_norm": 0.69921875, "learning_rate": 2.798245996165813e-05, "loss": 0.8558, "step": 4306 }, { "epoch": 2.3124727358142345, "grad_norm": 0.7109375, "learning_rate": 2.7974074961548928e-05, "loss": 0.7988, "step": 4307 }, { "epoch": 2.3130096305493106, "grad_norm": 0.75, "learning_rate": 2.796568962206011e-05, "loss": 0.9657, "step": 4308 }, { "epoch": 2.3135465252843863, "grad_norm": 0.59375, "learning_rate": 2.7957303944148543e-05, "loss": 0.8314, "step": 4309 }, { "epoch": 2.3140834200194624, "grad_norm": 0.74609375, "learning_rate": 2.7948917928771158e-05, "loss": 1.2084, "step": 4310 }, { "epoch": 2.3146203147545386, "grad_norm": 0.7265625, "learning_rate": 2.7940531576884876e-05, "loss": 0.9992, "step": 4311 }, { "epoch": 2.3151572094896142, "grad_norm": 0.6640625, "learning_rate": 2.793214488944671e-05, "loss": 0.8511, "step": 4312 }, { "epoch": 2.3156941042246904, "grad_norm": 0.7890625, "learning_rate": 2.792375786741368e-05, "loss": 0.9781, "step": 4313 }, { "epoch": 2.3162309989597665, "grad_norm": 0.7109375, "learning_rate": 2.7915370511742855e-05, "loss": 0.889, "step": 4314 }, { "epoch": 2.3167678936948426, "grad_norm": 0.7265625, "learning_rate": 2.7906982823391332e-05, "loss": 0.6994, "step": 4315 }, { "epoch": 2.3173047884299183, "grad_norm": 0.7890625, "learning_rate": 2.7898594803316253e-05, "loss": 0.7376, "step": 4316 }, { "epoch": 2.3178416831649944, "grad_norm": 0.78125, "learning_rate": 2.7890206452474794e-05, "loss": 0.9348, "step": 4317 }, { "epoch": 2.3183785779000705, "grad_norm": 0.68359375, "learning_rate": 2.7881817771824188e-05, "loss": 0.7226, "step": 4318 }, { "epoch": 2.3189154726351466, "grad_norm": 0.69140625, "learning_rate": 2.787342876232167e-05, "loss": 0.7594, "step": 4319 }, { "epoch": 2.3194523673702223, "grad_norm": 0.66796875, "learning_rate": 2.7865039424924542e-05, "loss": 0.8509, "step": 4320 }, { "epoch": 2.3199892621052984, "grad_norm": 0.8359375, "learning_rate": 2.7856649760590136e-05, "loss": 0.8446, "step": 4321 }, { "epoch": 2.3205261568403746, "grad_norm": 0.9765625, "learning_rate": 2.7848259770275814e-05, "loss": 0.9816, "step": 4322 }, { "epoch": 2.3210630515754507, "grad_norm": 0.74609375, "learning_rate": 2.7839869454938983e-05, "loss": 0.9402, "step": 4323 }, { "epoch": 2.3215999463105264, "grad_norm": 0.8046875, "learning_rate": 2.7831478815537087e-05, "loss": 0.9757, "step": 4324 }, { "epoch": 2.3221368410456025, "grad_norm": 1.0703125, "learning_rate": 2.782308785302759e-05, "loss": 0.966, "step": 4325 }, { "epoch": 2.3226737357806786, "grad_norm": 0.6484375, "learning_rate": 2.7814696568368027e-05, "loss": 0.8309, "step": 4326 }, { "epoch": 2.3232106305157547, "grad_norm": 0.7265625, "learning_rate": 2.7806304962515938e-05, "loss": 0.8635, "step": 4327 }, { "epoch": 2.3237475252508304, "grad_norm": 0.68359375, "learning_rate": 2.7797913036428914e-05, "loss": 0.7369, "step": 4328 }, { "epoch": 2.3242844199859065, "grad_norm": 0.734375, "learning_rate": 2.7789520791064578e-05, "loss": 0.8928, "step": 4329 }, { "epoch": 2.3248213147209826, "grad_norm": 0.72265625, "learning_rate": 2.778112822738059e-05, "loss": 0.7585, "step": 4330 }, { "epoch": 2.3253582094560583, "grad_norm": 0.640625, "learning_rate": 2.777273534633466e-05, "loss": 0.702, "step": 4331 }, { "epoch": 2.3258951041911344, "grad_norm": 0.70703125, "learning_rate": 2.776434214888451e-05, "loss": 0.9025, "step": 4332 }, { "epoch": 2.3264319989262106, "grad_norm": 0.75, "learning_rate": 2.7755948635987906e-05, "loss": 0.9347, "step": 4333 }, { "epoch": 2.3269688936612867, "grad_norm": 0.66796875, "learning_rate": 2.7747554808602665e-05, "loss": 0.8172, "step": 4334 }, { "epoch": 2.3275057883963624, "grad_norm": 0.75390625, "learning_rate": 2.7739160667686637e-05, "loss": 0.907, "step": 4335 }, { "epoch": 2.3280426831314385, "grad_norm": 0.7734375, "learning_rate": 2.7730766214197674e-05, "loss": 0.8014, "step": 4336 }, { "epoch": 2.3285795778665146, "grad_norm": 0.62890625, "learning_rate": 2.772237144909371e-05, "loss": 0.8979, "step": 4337 }, { "epoch": 2.3291164726015907, "grad_norm": 0.71875, "learning_rate": 2.7713976373332686e-05, "loss": 0.8025, "step": 4338 }, { "epoch": 2.3296533673366664, "grad_norm": 0.71875, "learning_rate": 2.7705580987872588e-05, "loss": 0.8223, "step": 4339 }, { "epoch": 2.3301902620717425, "grad_norm": 0.66015625, "learning_rate": 2.7697185293671434e-05, "loss": 0.7898, "step": 4340 }, { "epoch": 2.3307271568068186, "grad_norm": 0.70703125, "learning_rate": 2.768878929168728e-05, "loss": 0.806, "step": 4341 }, { "epoch": 2.3312640515418948, "grad_norm": 0.62890625, "learning_rate": 2.7680392982878223e-05, "loss": 0.9565, "step": 4342 }, { "epoch": 2.3318009462769704, "grad_norm": 0.61328125, "learning_rate": 2.7671996368202374e-05, "loss": 0.9082, "step": 4343 }, { "epoch": 2.3323378410120466, "grad_norm": 0.68359375, "learning_rate": 2.766359944861791e-05, "loss": 0.8444, "step": 4344 }, { "epoch": 2.3328747357471227, "grad_norm": 0.640625, "learning_rate": 2.7655202225083015e-05, "loss": 0.991, "step": 4345 }, { "epoch": 2.333411630482199, "grad_norm": 0.6796875, "learning_rate": 2.7646804698555917e-05, "loss": 0.8506, "step": 4346 }, { "epoch": 2.3339485252172745, "grad_norm": 0.66015625, "learning_rate": 2.763840686999488e-05, "loss": 0.8369, "step": 4347 }, { "epoch": 2.3344854199523506, "grad_norm": 0.73046875, "learning_rate": 2.763000874035822e-05, "loss": 0.9519, "step": 4348 }, { "epoch": 2.3350223146874267, "grad_norm": 0.66015625, "learning_rate": 2.762161031060425e-05, "loss": 0.7414, "step": 4349 }, { "epoch": 2.3355592094225024, "grad_norm": 0.84375, "learning_rate": 2.761321158169134e-05, "loss": 1.0191, "step": 4350 }, { "epoch": 2.3355592094225024, "eval_loss": 0.9044721126556396, "eval_runtime": 46.2168, "eval_samples_per_second": 20.815, "eval_steps_per_second": 20.815, "step": 4350 }, { "epoch": 2.3360961041575785, "grad_norm": 0.6953125, "learning_rate": 2.76048125545779e-05, "loss": 1.0184, "step": 4351 }, { "epoch": 2.3366329988926546, "grad_norm": 0.703125, "learning_rate": 2.759641323022235e-05, "loss": 0.6846, "step": 4352 }, { "epoch": 2.3371698936277308, "grad_norm": 0.640625, "learning_rate": 2.758801360958318e-05, "loss": 0.8757, "step": 4353 }, { "epoch": 2.3377067883628064, "grad_norm": 0.61328125, "learning_rate": 2.7579613693618884e-05, "loss": 0.8561, "step": 4354 }, { "epoch": 2.3382436830978826, "grad_norm": 0.69921875, "learning_rate": 2.7571213483287984e-05, "loss": 0.8019, "step": 4355 }, { "epoch": 2.3387805778329587, "grad_norm": 0.6796875, "learning_rate": 2.756281297954908e-05, "loss": 0.7533, "step": 4356 }, { "epoch": 2.339317472568035, "grad_norm": 0.62109375, "learning_rate": 2.7554412183360757e-05, "loss": 0.8404, "step": 4357 }, { "epoch": 2.3398543673031105, "grad_norm": 0.77734375, "learning_rate": 2.7546011095681638e-05, "loss": 0.8433, "step": 4358 }, { "epoch": 2.3403912620381866, "grad_norm": 0.59375, "learning_rate": 2.7537609717470425e-05, "loss": 0.9055, "step": 4359 }, { "epoch": 2.3409281567732627, "grad_norm": 0.71875, "learning_rate": 2.7529208049685807e-05, "loss": 0.8937, "step": 4360 }, { "epoch": 2.341465051508339, "grad_norm": 0.86328125, "learning_rate": 2.7520806093286516e-05, "loss": 0.798, "step": 4361 }, { "epoch": 2.3420019462434145, "grad_norm": 0.7265625, "learning_rate": 2.7512403849231332e-05, "loss": 1.106, "step": 4362 }, { "epoch": 2.3425388409784906, "grad_norm": 0.65234375, "learning_rate": 2.750400131847905e-05, "loss": 0.7766, "step": 4363 }, { "epoch": 2.3430757357135668, "grad_norm": 0.6640625, "learning_rate": 2.7495598501988502e-05, "loss": 0.7044, "step": 4364 }, { "epoch": 2.343612630448643, "grad_norm": 0.640625, "learning_rate": 2.7487195400718568e-05, "loss": 0.8082, "step": 4365 }, { "epoch": 2.3441495251837186, "grad_norm": 0.7734375, "learning_rate": 2.747879201562813e-05, "loss": 0.927, "step": 4366 }, { "epoch": 2.3446864199187947, "grad_norm": 0.91015625, "learning_rate": 2.747038834767614e-05, "loss": 0.9076, "step": 4367 }, { "epoch": 2.345223314653871, "grad_norm": 0.625, "learning_rate": 2.7461984397821555e-05, "loss": 0.7095, "step": 4368 }, { "epoch": 2.3457602093889465, "grad_norm": 0.62109375, "learning_rate": 2.7453580167023367e-05, "loss": 0.8225, "step": 4369 }, { "epoch": 2.3462971041240226, "grad_norm": 0.72265625, "learning_rate": 2.7445175656240617e-05, "loss": 0.8181, "step": 4370 }, { "epoch": 2.3468339988590987, "grad_norm": 0.68359375, "learning_rate": 2.7436770866432356e-05, "loss": 0.8887, "step": 4371 }, { "epoch": 2.347370893594175, "grad_norm": 0.65234375, "learning_rate": 2.7428365798557676e-05, "loss": 0.7915, "step": 4372 }, { "epoch": 2.3479077883292505, "grad_norm": 0.88671875, "learning_rate": 2.741996045357571e-05, "loss": 0.7064, "step": 4373 }, { "epoch": 2.3484446830643266, "grad_norm": 0.7109375, "learning_rate": 2.7411554832445608e-05, "loss": 0.8799, "step": 4374 }, { "epoch": 2.3489815777994028, "grad_norm": 0.56640625, "learning_rate": 2.740314893612656e-05, "loss": 0.762, "step": 4375 }, { "epoch": 2.349518472534479, "grad_norm": 0.640625, "learning_rate": 2.7394742765577785e-05, "loss": 0.8482, "step": 4376 }, { "epoch": 2.3500553672695546, "grad_norm": 0.69140625, "learning_rate": 2.7386336321758534e-05, "loss": 0.8223, "step": 4377 }, { "epoch": 2.3505922620046307, "grad_norm": 0.765625, "learning_rate": 2.737792960562809e-05, "loss": 0.8307, "step": 4378 }, { "epoch": 2.351129156739707, "grad_norm": 0.640625, "learning_rate": 2.736952261814576e-05, "loss": 0.7123, "step": 4379 }, { "epoch": 2.351666051474783, "grad_norm": 0.953125, "learning_rate": 2.736111536027089e-05, "loss": 0.9244, "step": 4380 }, { "epoch": 2.3522029462098586, "grad_norm": 0.68359375, "learning_rate": 2.7352707832962865e-05, "loss": 0.8727, "step": 4381 }, { "epoch": 2.3527398409449347, "grad_norm": 0.60546875, "learning_rate": 2.734430003718107e-05, "loss": 0.8808, "step": 4382 }, { "epoch": 2.353276735680011, "grad_norm": 0.640625, "learning_rate": 2.733589197388496e-05, "loss": 0.6586, "step": 4383 }, { "epoch": 2.353813630415087, "grad_norm": 0.73046875, "learning_rate": 2.7327483644033992e-05, "loss": 0.9529, "step": 4384 }, { "epoch": 2.3543505251501626, "grad_norm": 0.65625, "learning_rate": 2.7319075048587666e-05, "loss": 0.649, "step": 4385 }, { "epoch": 2.3548874198852388, "grad_norm": 0.6875, "learning_rate": 2.731066618850551e-05, "loss": 0.7666, "step": 4386 }, { "epoch": 2.355424314620315, "grad_norm": 0.77734375, "learning_rate": 2.7302257064747077e-05, "loss": 0.9609, "step": 4387 }, { "epoch": 2.3559612093553906, "grad_norm": 0.5546875, "learning_rate": 2.7293847678271954e-05, "loss": 0.6664, "step": 4388 }, { "epoch": 2.3564981040904667, "grad_norm": 0.6875, "learning_rate": 2.7285438030039768e-05, "loss": 0.8773, "step": 4389 }, { "epoch": 2.357034998825543, "grad_norm": 0.7109375, "learning_rate": 2.7277028121010162e-05, "loss": 0.8702, "step": 4390 }, { "epoch": 2.357571893560619, "grad_norm": 0.89453125, "learning_rate": 2.7268617952142806e-05, "loss": 0.9229, "step": 4391 }, { "epoch": 2.3581087882956946, "grad_norm": 0.69921875, "learning_rate": 2.7260207524397413e-05, "loss": 0.9931, "step": 4392 }, { "epoch": 2.3586456830307707, "grad_norm": 0.7109375, "learning_rate": 2.7251796838733716e-05, "loss": 0.7213, "step": 4393 }, { "epoch": 2.359182577765847, "grad_norm": 0.62109375, "learning_rate": 2.724338589611149e-05, "loss": 0.7953, "step": 4394 }, { "epoch": 2.359719472500923, "grad_norm": 0.62890625, "learning_rate": 2.723497469749052e-05, "loss": 0.6998, "step": 4395 }, { "epoch": 2.3602563672359986, "grad_norm": 0.69140625, "learning_rate": 2.722656324383062e-05, "loss": 0.812, "step": 4396 }, { "epoch": 2.3607932619710748, "grad_norm": 0.7109375, "learning_rate": 2.7218151536091674e-05, "loss": 1.0321, "step": 4397 }, { "epoch": 2.361330156706151, "grad_norm": 0.578125, "learning_rate": 2.7209739575233544e-05, "loss": 0.693, "step": 4398 }, { "epoch": 2.361867051441227, "grad_norm": 0.5859375, "learning_rate": 2.720132736221614e-05, "loss": 0.7624, "step": 4399 }, { "epoch": 2.3624039461763027, "grad_norm": 0.62890625, "learning_rate": 2.7192914897999404e-05, "loss": 0.9707, "step": 4400 }, { "epoch": 2.3624039461763027, "eval_loss": 0.9049584865570068, "eval_runtime": 45.9822, "eval_samples_per_second": 20.921, "eval_steps_per_second": 20.921, "step": 4400 }, { "epoch": 2.362940840911379, "grad_norm": 0.74609375, "learning_rate": 2.7184502183543314e-05, "loss": 0.886, "step": 4401 }, { "epoch": 2.363477735646455, "grad_norm": 0.984375, "learning_rate": 2.717608921980786e-05, "loss": 0.857, "step": 4402 }, { "epoch": 2.364014630381531, "grad_norm": 0.6875, "learning_rate": 2.7167676007753067e-05, "loss": 0.6939, "step": 4403 }, { "epoch": 2.3645515251166067, "grad_norm": 0.65234375, "learning_rate": 2.715926254833898e-05, "loss": 0.6873, "step": 4404 }, { "epoch": 2.365088419851683, "grad_norm": 0.76171875, "learning_rate": 2.7150848842525704e-05, "loss": 0.8498, "step": 4405 }, { "epoch": 2.365625314586759, "grad_norm": 0.7578125, "learning_rate": 2.714243489127333e-05, "loss": 0.9702, "step": 4406 }, { "epoch": 2.3661622093218346, "grad_norm": 0.73046875, "learning_rate": 2.7134020695542012e-05, "loss": 0.91, "step": 4407 }, { "epoch": 2.3666991040569108, "grad_norm": 0.66796875, "learning_rate": 2.7125606256291902e-05, "loss": 0.6683, "step": 4408 }, { "epoch": 2.367235998791987, "grad_norm": 0.66796875, "learning_rate": 2.7117191574483197e-05, "loss": 0.9067, "step": 4409 }, { "epoch": 2.367772893527063, "grad_norm": 0.69140625, "learning_rate": 2.7108776651076118e-05, "loss": 1.0223, "step": 4410 }, { "epoch": 2.3683097882621387, "grad_norm": 0.703125, "learning_rate": 2.7100361487030933e-05, "loss": 0.8826, "step": 4411 }, { "epoch": 2.368846682997215, "grad_norm": 0.8046875, "learning_rate": 2.7091946083307896e-05, "loss": 1.0356, "step": 4412 }, { "epoch": 2.369383577732291, "grad_norm": 0.5546875, "learning_rate": 2.708353044086731e-05, "loss": 0.6455, "step": 4413 }, { "epoch": 2.369920472467367, "grad_norm": 0.75, "learning_rate": 2.7075114560669518e-05, "loss": 0.9449, "step": 4414 }, { "epoch": 2.3704573672024427, "grad_norm": 0.65234375, "learning_rate": 2.7066698443674883e-05, "loss": 0.7601, "step": 4415 }, { "epoch": 2.370994261937519, "grad_norm": 0.6875, "learning_rate": 2.7058282090843778e-05, "loss": 0.8004, "step": 4416 }, { "epoch": 2.371531156672595, "grad_norm": 0.69921875, "learning_rate": 2.7049865503136616e-05, "loss": 0.7394, "step": 4417 }, { "epoch": 2.372068051407671, "grad_norm": 0.63671875, "learning_rate": 2.7041448681513842e-05, "loss": 0.6308, "step": 4418 }, { "epoch": 2.3726049461427468, "grad_norm": 0.7265625, "learning_rate": 2.7033031626935934e-05, "loss": 0.8589, "step": 4419 }, { "epoch": 2.373141840877823, "grad_norm": 0.671875, "learning_rate": 2.7024614340363365e-05, "loss": 0.9038, "step": 4420 }, { "epoch": 2.373678735612899, "grad_norm": 0.64453125, "learning_rate": 2.7016196822756644e-05, "loss": 0.8403, "step": 4421 }, { "epoch": 2.374215630347975, "grad_norm": 0.95703125, "learning_rate": 2.700777907507635e-05, "loss": 0.8906, "step": 4422 }, { "epoch": 2.374752525083051, "grad_norm": 0.66796875, "learning_rate": 2.6999361098283033e-05, "loss": 0.942, "step": 4423 }, { "epoch": 2.375289419818127, "grad_norm": 0.87109375, "learning_rate": 2.699094289333729e-05, "loss": 0.9625, "step": 4424 }, { "epoch": 2.375826314553203, "grad_norm": 0.6171875, "learning_rate": 2.6982524461199753e-05, "loss": 0.7712, "step": 4425 }, { "epoch": 2.3763632092882787, "grad_norm": 0.72265625, "learning_rate": 2.697410580283107e-05, "loss": 0.9637, "step": 4426 }, { "epoch": 2.376900104023355, "grad_norm": 0.67578125, "learning_rate": 2.696568691919191e-05, "loss": 0.7743, "step": 4427 }, { "epoch": 2.377436998758431, "grad_norm": 0.63671875, "learning_rate": 2.6957267811242986e-05, "loss": 0.7868, "step": 4428 }, { "epoch": 2.377973893493507, "grad_norm": 0.609375, "learning_rate": 2.694884847994501e-05, "loss": 1.0199, "step": 4429 }, { "epoch": 2.3785107882285828, "grad_norm": 0.74609375, "learning_rate": 2.6940428926258748e-05, "loss": 1.0205, "step": 4430 }, { "epoch": 2.379047682963659, "grad_norm": 0.734375, "learning_rate": 2.6932009151144973e-05, "loss": 0.9248, "step": 4431 }, { "epoch": 2.379584577698735, "grad_norm": 0.7109375, "learning_rate": 2.692358915556449e-05, "loss": 0.9396, "step": 4432 }, { "epoch": 2.380121472433811, "grad_norm": 0.78515625, "learning_rate": 2.6915168940478115e-05, "loss": 0.9513, "step": 4433 }, { "epoch": 2.380658367168887, "grad_norm": 0.59375, "learning_rate": 2.6906748506846713e-05, "loss": 0.8836, "step": 4434 }, { "epoch": 2.381195261903963, "grad_norm": 0.7734375, "learning_rate": 2.6898327855631155e-05, "loss": 0.9377, "step": 4435 }, { "epoch": 2.381732156639039, "grad_norm": 0.61328125, "learning_rate": 2.6889906987792362e-05, "loss": 0.7775, "step": 4436 }, { "epoch": 2.382269051374115, "grad_norm": 0.6640625, "learning_rate": 2.688148590429124e-05, "loss": 0.85, "step": 4437 }, { "epoch": 2.382805946109191, "grad_norm": 0.69921875, "learning_rate": 2.6873064606088744e-05, "loss": 0.779, "step": 4438 }, { "epoch": 2.383342840844267, "grad_norm": 0.63671875, "learning_rate": 2.6864643094145857e-05, "loss": 0.7394, "step": 4439 }, { "epoch": 2.383879735579343, "grad_norm": 0.7421875, "learning_rate": 2.685622136942359e-05, "loss": 0.9771, "step": 4440 }, { "epoch": 2.384416630314419, "grad_norm": 0.765625, "learning_rate": 2.684779943288295e-05, "loss": 0.8868, "step": 4441 }, { "epoch": 2.384953525049495, "grad_norm": 0.69921875, "learning_rate": 2.6839377285484997e-05, "loss": 0.7228, "step": 4442 }, { "epoch": 2.385490419784571, "grad_norm": 0.72265625, "learning_rate": 2.6830954928190794e-05, "loss": 0.6917, "step": 4443 }, { "epoch": 2.386027314519647, "grad_norm": 0.68359375, "learning_rate": 2.6822532361961455e-05, "loss": 0.7938, "step": 4444 }, { "epoch": 2.386564209254723, "grad_norm": 0.8828125, "learning_rate": 2.6814109587758086e-05, "loss": 0.7497, "step": 4445 }, { "epoch": 2.387101103989799, "grad_norm": 0.62890625, "learning_rate": 2.6805686606541847e-05, "loss": 0.9161, "step": 4446 }, { "epoch": 2.387637998724875, "grad_norm": 0.609375, "learning_rate": 2.6797263419273893e-05, "loss": 0.7484, "step": 4447 }, { "epoch": 2.388174893459951, "grad_norm": 0.83984375, "learning_rate": 2.6788840026915423e-05, "loss": 0.8598, "step": 4448 }, { "epoch": 2.388711788195027, "grad_norm": 0.66796875, "learning_rate": 2.678041643042766e-05, "loss": 0.8954, "step": 4449 }, { "epoch": 2.389248682930103, "grad_norm": 0.72265625, "learning_rate": 2.6771992630771824e-05, "loss": 0.9852, "step": 4450 }, { "epoch": 2.389248682930103, "eval_loss": 0.9053536057472229, "eval_runtime": 46.1188, "eval_samples_per_second": 20.859, "eval_steps_per_second": 20.859, "step": 4450 }, { "epoch": 2.389785577665179, "grad_norm": 0.6640625, "learning_rate": 2.676356862890919e-05, "loss": 0.7657, "step": 4451 }, { "epoch": 2.390322472400255, "grad_norm": 0.69140625, "learning_rate": 2.6755144425801048e-05, "loss": 0.8893, "step": 4452 }, { "epoch": 2.390859367135331, "grad_norm": 0.703125, "learning_rate": 2.67467200224087e-05, "loss": 0.9203, "step": 4453 }, { "epoch": 2.391396261870407, "grad_norm": 0.734375, "learning_rate": 2.6738295419693473e-05, "loss": 0.9228, "step": 4454 }, { "epoch": 2.391933156605483, "grad_norm": 0.71484375, "learning_rate": 2.6729870618616724e-05, "loss": 0.7608, "step": 4455 }, { "epoch": 2.3924700513405592, "grad_norm": 0.9453125, "learning_rate": 2.6721445620139835e-05, "loss": 1.0952, "step": 4456 }, { "epoch": 2.393006946075635, "grad_norm": 0.76171875, "learning_rate": 2.6713020425224195e-05, "loss": 0.9739, "step": 4457 }, { "epoch": 2.393543840810711, "grad_norm": 0.640625, "learning_rate": 2.6704595034831236e-05, "loss": 0.9615, "step": 4458 }, { "epoch": 2.394080735545787, "grad_norm": 0.6015625, "learning_rate": 2.6696169449922386e-05, "loss": 0.8106, "step": 4459 }, { "epoch": 2.3946176302808633, "grad_norm": 0.7421875, "learning_rate": 2.668774367145913e-05, "loss": 0.9132, "step": 4460 }, { "epoch": 2.395154525015939, "grad_norm": 0.7109375, "learning_rate": 2.6679317700402956e-05, "loss": 0.8524, "step": 4461 }, { "epoch": 2.395691419751015, "grad_norm": 0.82421875, "learning_rate": 2.6670891537715354e-05, "loss": 0.9832, "step": 4462 }, { "epoch": 2.396228314486091, "grad_norm": 0.7421875, "learning_rate": 2.666246518435787e-05, "loss": 0.8812, "step": 4463 }, { "epoch": 2.396765209221167, "grad_norm": 0.6953125, "learning_rate": 2.6654038641292056e-05, "loss": 0.7877, "step": 4464 }, { "epoch": 2.397302103956243, "grad_norm": 0.73828125, "learning_rate": 2.6645611909479484e-05, "loss": 0.844, "step": 4465 }, { "epoch": 2.397838998691319, "grad_norm": 0.7421875, "learning_rate": 2.663718498988176e-05, "loss": 0.9999, "step": 4466 }, { "epoch": 2.3983758934263952, "grad_norm": 0.8203125, "learning_rate": 2.662875788346048e-05, "loss": 1.0071, "step": 4467 }, { "epoch": 2.398912788161471, "grad_norm": 0.6328125, "learning_rate": 2.6620330591177313e-05, "loss": 0.7417, "step": 4468 }, { "epoch": 2.399449682896547, "grad_norm": 0.66796875, "learning_rate": 2.6611903113993897e-05, "loss": 1.0774, "step": 4469 }, { "epoch": 2.399986577631623, "grad_norm": 0.72265625, "learning_rate": 2.660347545287193e-05, "loss": 1.0166, "step": 4470 }, { "epoch": 2.4005234723666993, "grad_norm": 0.83203125, "learning_rate": 2.6595047608773103e-05, "loss": 0.848, "step": 4471 }, { "epoch": 2.401060367101775, "grad_norm": 0.5625, "learning_rate": 2.658661958265915e-05, "loss": 0.781, "step": 4472 }, { "epoch": 2.401597261836851, "grad_norm": 0.73828125, "learning_rate": 2.65781913754918e-05, "loss": 0.9022, "step": 4473 }, { "epoch": 2.402134156571927, "grad_norm": 0.57421875, "learning_rate": 2.656976298823284e-05, "loss": 0.7464, "step": 4474 }, { "epoch": 2.4026710513070033, "grad_norm": 0.64453125, "learning_rate": 2.6561334421844037e-05, "loss": 0.8152, "step": 4475 }, { "epoch": 2.403207946042079, "grad_norm": 0.6875, "learning_rate": 2.6552905677287197e-05, "loss": 0.83, "step": 4476 }, { "epoch": 2.403744840777155, "grad_norm": 0.8125, "learning_rate": 2.654447675552416e-05, "loss": 0.8393, "step": 4477 }, { "epoch": 2.4042817355122312, "grad_norm": 0.7265625, "learning_rate": 2.6536047657516772e-05, "loss": 0.8914, "step": 4478 }, { "epoch": 2.4048186302473074, "grad_norm": 0.69921875, "learning_rate": 2.6527618384226887e-05, "loss": 0.7263, "step": 4479 }, { "epoch": 2.405355524982383, "grad_norm": 0.765625, "learning_rate": 2.65191889366164e-05, "loss": 0.9114, "step": 4480 }, { "epoch": 2.405892419717459, "grad_norm": 0.71484375, "learning_rate": 2.651075931564721e-05, "loss": 0.8877, "step": 4481 }, { "epoch": 2.4064293144525353, "grad_norm": 0.80859375, "learning_rate": 2.6502329522281262e-05, "loss": 0.7874, "step": 4482 }, { "epoch": 2.406966209187611, "grad_norm": 0.7421875, "learning_rate": 2.6493899557480485e-05, "loss": 0.9375, "step": 4483 }, { "epoch": 2.407503103922687, "grad_norm": 0.5546875, "learning_rate": 2.6485469422206843e-05, "loss": 0.7289, "step": 4484 }, { "epoch": 2.408039998657763, "grad_norm": 0.609375, "learning_rate": 2.6477039117422335e-05, "loss": 0.752, "step": 4485 }, { "epoch": 2.4085768933928393, "grad_norm": 0.67578125, "learning_rate": 2.6468608644088953e-05, "loss": 0.975, "step": 4486 }, { "epoch": 2.409113788127915, "grad_norm": 0.6640625, "learning_rate": 2.6460178003168734e-05, "loss": 1.0165, "step": 4487 }, { "epoch": 2.409650682862991, "grad_norm": 0.7578125, "learning_rate": 2.645174719562371e-05, "loss": 0.9423, "step": 4488 }, { "epoch": 2.4101875775980672, "grad_norm": 0.6484375, "learning_rate": 2.6443316222415943e-05, "loss": 1.0277, "step": 4489 }, { "epoch": 2.4107244723331434, "grad_norm": 0.671875, "learning_rate": 2.6434885084507517e-05, "loss": 0.8967, "step": 4490 }, { "epoch": 2.411261367068219, "grad_norm": 0.69921875, "learning_rate": 2.6426453782860545e-05, "loss": 0.8638, "step": 4491 }, { "epoch": 2.411798261803295, "grad_norm": 0.81640625, "learning_rate": 2.6418022318437114e-05, "loss": 0.9491, "step": 4492 }, { "epoch": 2.4123351565383713, "grad_norm": 0.72265625, "learning_rate": 2.640959069219938e-05, "loss": 0.9281, "step": 4493 }, { "epoch": 2.4128720512734474, "grad_norm": 0.62890625, "learning_rate": 2.6401158905109506e-05, "loss": 0.9745, "step": 4494 }, { "epoch": 2.413408946008523, "grad_norm": 0.7734375, "learning_rate": 2.6392726958129653e-05, "loss": 0.9925, "step": 4495 }, { "epoch": 2.413945840743599, "grad_norm": 0.76171875, "learning_rate": 2.6384294852222018e-05, "loss": 0.8337, "step": 4496 }, { "epoch": 2.4144827354786753, "grad_norm": 0.57421875, "learning_rate": 2.637586258834881e-05, "loss": 0.6912, "step": 4497 }, { "epoch": 2.4150196302137514, "grad_norm": 0.78125, "learning_rate": 2.636743016747225e-05, "loss": 0.9982, "step": 4498 }, { "epoch": 2.415556524948827, "grad_norm": 0.80859375, "learning_rate": 2.63589975905546e-05, "loss": 0.8983, "step": 4499 }, { "epoch": 2.4160934196839032, "grad_norm": 0.70703125, "learning_rate": 2.6350564858558107e-05, "loss": 0.8172, "step": 4500 }, { "epoch": 2.4160934196839032, "eval_loss": 0.9049860835075378, "eval_runtime": 45.937, "eval_samples_per_second": 20.942, "eval_steps_per_second": 20.942, "step": 4500 }, { "epoch": 2.4166303144189794, "grad_norm": 0.7109375, "learning_rate": 2.6342131972445067e-05, "loss": 0.8297, "step": 4501 }, { "epoch": 2.417167209154055, "grad_norm": 0.921875, "learning_rate": 2.633369893317777e-05, "loss": 0.9613, "step": 4502 }, { "epoch": 2.417704103889131, "grad_norm": 0.65625, "learning_rate": 2.6325265741718537e-05, "loss": 0.7996, "step": 4503 }, { "epoch": 2.4182409986242073, "grad_norm": 0.6796875, "learning_rate": 2.6316832399029696e-05, "loss": 0.6878, "step": 4504 }, { "epoch": 2.4187778933592834, "grad_norm": 0.71484375, "learning_rate": 2.63083989060736e-05, "loss": 0.9636, "step": 4505 }, { "epoch": 2.419314788094359, "grad_norm": 0.84375, "learning_rate": 2.6299965263812616e-05, "loss": 0.9298, "step": 4506 }, { "epoch": 2.419851682829435, "grad_norm": 0.85546875, "learning_rate": 2.6291531473209148e-05, "loss": 0.7767, "step": 4507 }, { "epoch": 2.4203885775645113, "grad_norm": 0.71484375, "learning_rate": 2.6283097535225565e-05, "loss": 0.699, "step": 4508 }, { "epoch": 2.4209254722995874, "grad_norm": 0.703125, "learning_rate": 2.6274663450824318e-05, "loss": 0.7578, "step": 4509 }, { "epoch": 2.421462367034663, "grad_norm": 0.6484375, "learning_rate": 2.6266229220967818e-05, "loss": 0.8686, "step": 4510 }, { "epoch": 2.4219992617697392, "grad_norm": 0.61328125, "learning_rate": 2.625779484661854e-05, "loss": 0.6708, "step": 4511 }, { "epoch": 2.4225361565048154, "grad_norm": 0.69921875, "learning_rate": 2.6249360328738932e-05, "loss": 0.7383, "step": 4512 }, { "epoch": 2.423073051239891, "grad_norm": 0.59765625, "learning_rate": 2.6240925668291496e-05, "loss": 0.7092, "step": 4513 }, { "epoch": 2.423609945974967, "grad_norm": 0.62890625, "learning_rate": 2.6232490866238714e-05, "loss": 0.9468, "step": 4514 }, { "epoch": 2.4241468407100433, "grad_norm": 0.75390625, "learning_rate": 2.622405592354312e-05, "loss": 0.8644, "step": 4515 }, { "epoch": 2.4246837354451194, "grad_norm": 0.67578125, "learning_rate": 2.621562084116726e-05, "loss": 0.9886, "step": 4516 }, { "epoch": 2.4252206301801955, "grad_norm": 0.7109375, "learning_rate": 2.6207185620073648e-05, "loss": 0.75, "step": 4517 }, { "epoch": 2.425757524915271, "grad_norm": 0.76953125, "learning_rate": 2.6198750261224873e-05, "loss": 0.7946, "step": 4518 }, { "epoch": 2.4262944196503473, "grad_norm": 0.74609375, "learning_rate": 2.6190314765583517e-05, "loss": 0.8414, "step": 4519 }, { "epoch": 2.4268313143854234, "grad_norm": 0.58984375, "learning_rate": 2.6181879134112173e-05, "loss": 0.7495, "step": 4520 }, { "epoch": 2.427368209120499, "grad_norm": 0.703125, "learning_rate": 2.6173443367773452e-05, "loss": 0.9884, "step": 4521 }, { "epoch": 2.4279051038555752, "grad_norm": 0.703125, "learning_rate": 2.6165007467529973e-05, "loss": 0.9648, "step": 4522 }, { "epoch": 2.4284419985906514, "grad_norm": 0.68359375, "learning_rate": 2.6156571434344394e-05, "loss": 0.7987, "step": 4523 }, { "epoch": 2.4289788933257275, "grad_norm": 0.69921875, "learning_rate": 2.614813526917938e-05, "loss": 0.8769, "step": 4524 }, { "epoch": 2.429515788060803, "grad_norm": 0.71875, "learning_rate": 2.6139698972997573e-05, "loss": 0.8598, "step": 4525 }, { "epoch": 2.4300526827958793, "grad_norm": 0.62890625, "learning_rate": 2.6131262546761693e-05, "loss": 0.9152, "step": 4526 }, { "epoch": 2.4305895775309554, "grad_norm": 0.8125, "learning_rate": 2.6122825991434425e-05, "loss": 0.8841, "step": 4527 }, { "epoch": 2.4311264722660315, "grad_norm": 0.74609375, "learning_rate": 2.611438930797849e-05, "loss": 0.8423, "step": 4528 }, { "epoch": 2.431663367001107, "grad_norm": 0.7421875, "learning_rate": 2.6105952497356628e-05, "loss": 0.926, "step": 4529 }, { "epoch": 2.4322002617361833, "grad_norm": 0.81640625, "learning_rate": 2.609751556053157e-05, "loss": 0.866, "step": 4530 }, { "epoch": 2.4327371564712594, "grad_norm": 0.671875, "learning_rate": 2.6089078498466096e-05, "loss": 0.8663, "step": 4531 }, { "epoch": 2.433274051206335, "grad_norm": 0.73828125, "learning_rate": 2.608064131212298e-05, "loss": 0.7887, "step": 4532 }, { "epoch": 2.4338109459414112, "grad_norm": 0.75, "learning_rate": 2.607220400246499e-05, "loss": 0.8505, "step": 4533 }, { "epoch": 2.4343478406764874, "grad_norm": 0.76171875, "learning_rate": 2.606376657045495e-05, "loss": 0.8075, "step": 4534 }, { "epoch": 2.4348847354115635, "grad_norm": 0.61328125, "learning_rate": 2.6055329017055673e-05, "loss": 0.8791, "step": 4535 }, { "epoch": 2.4354216301466396, "grad_norm": 0.76171875, "learning_rate": 2.604689134322999e-05, "loss": 1.1443, "step": 4536 }, { "epoch": 2.4359585248817153, "grad_norm": 0.89453125, "learning_rate": 2.6038453549940755e-05, "loss": 0.9011, "step": 4537 }, { "epoch": 2.4364954196167914, "grad_norm": 0.625, "learning_rate": 2.603001563815081e-05, "loss": 0.797, "step": 4538 }, { "epoch": 2.4370323143518675, "grad_norm": 0.75390625, "learning_rate": 2.602157760882304e-05, "loss": 0.9502, "step": 4539 }, { "epoch": 2.437569209086943, "grad_norm": 0.65625, "learning_rate": 2.6013139462920328e-05, "loss": 0.85, "step": 4540 }, { "epoch": 2.4381061038220193, "grad_norm": 0.69921875, "learning_rate": 2.6004701201405568e-05, "loss": 0.7806, "step": 4541 }, { "epoch": 2.4386429985570954, "grad_norm": 0.6875, "learning_rate": 2.5996262825241686e-05, "loss": 0.7601, "step": 4542 }, { "epoch": 2.4391798932921716, "grad_norm": 0.8046875, "learning_rate": 2.598782433539159e-05, "loss": 0.7733, "step": 4543 }, { "epoch": 2.4397167880272472, "grad_norm": 0.75, "learning_rate": 2.5979385732818228e-05, "loss": 0.8172, "step": 4544 }, { "epoch": 2.4402536827623234, "grad_norm": 0.73828125, "learning_rate": 2.5970947018484564e-05, "loss": 0.824, "step": 4545 }, { "epoch": 2.4407905774973995, "grad_norm": 0.69140625, "learning_rate": 2.5962508193353542e-05, "loss": 0.8564, "step": 4546 }, { "epoch": 2.4413274722324756, "grad_norm": 1.0, "learning_rate": 2.5954069258388137e-05, "loss": 1.0499, "step": 4547 }, { "epoch": 2.4418643669675513, "grad_norm": 0.71875, "learning_rate": 2.594563021455135e-05, "loss": 0.8332, "step": 4548 }, { "epoch": 2.4424012617026274, "grad_norm": 0.72265625, "learning_rate": 2.5937191062806187e-05, "loss": 0.7358, "step": 4549 }, { "epoch": 2.4429381564377035, "grad_norm": 0.73046875, "learning_rate": 2.5928751804115652e-05, "loss": 0.979, "step": 4550 }, { "epoch": 2.4429381564377035, "eval_loss": 0.9049565196037292, "eval_runtime": 47.1158, "eval_samples_per_second": 20.418, "eval_steps_per_second": 20.418, "step": 4550 }, { "epoch": 2.443475051172779, "grad_norm": 0.76953125, "learning_rate": 2.592031243944278e-05, "loss": 1.0015, "step": 4551 }, { "epoch": 2.4440119459078553, "grad_norm": 0.69140625, "learning_rate": 2.5911872969750595e-05, "loss": 0.7109, "step": 4552 }, { "epoch": 2.4445488406429314, "grad_norm": 0.68359375, "learning_rate": 2.5903433396002164e-05, "loss": 0.8791, "step": 4553 }, { "epoch": 2.4450857353780076, "grad_norm": 0.765625, "learning_rate": 2.5894993719160536e-05, "loss": 0.9258, "step": 4554 }, { "epoch": 2.4456226301130837, "grad_norm": 0.71484375, "learning_rate": 2.588655394018879e-05, "loss": 0.8183, "step": 4555 }, { "epoch": 2.4461595248481594, "grad_norm": 0.62890625, "learning_rate": 2.587811406005001e-05, "loss": 0.8501, "step": 4556 }, { "epoch": 2.4466964195832355, "grad_norm": 1.03125, "learning_rate": 2.58696740797073e-05, "loss": 0.8437, "step": 4557 }, { "epoch": 2.4472333143183116, "grad_norm": 0.73046875, "learning_rate": 2.5861234000123767e-05, "loss": 0.8139, "step": 4558 }, { "epoch": 2.4477702090533873, "grad_norm": 0.6875, "learning_rate": 2.5852793822262528e-05, "loss": 0.9045, "step": 4559 }, { "epoch": 2.4483071037884634, "grad_norm": 0.66015625, "learning_rate": 2.584435354708671e-05, "loss": 0.7598, "step": 4560 }, { "epoch": 2.4488439985235395, "grad_norm": 0.70703125, "learning_rate": 2.583591317555945e-05, "loss": 0.9663, "step": 4561 }, { "epoch": 2.4493808932586156, "grad_norm": 0.70703125, "learning_rate": 2.5827472708643928e-05, "loss": 0.9587, "step": 4562 }, { "epoch": 2.4499177879936913, "grad_norm": 0.67578125, "learning_rate": 2.581903214730328e-05, "loss": 0.8771, "step": 4563 }, { "epoch": 2.4504546827287674, "grad_norm": 0.84765625, "learning_rate": 2.5810591492500695e-05, "loss": 0.9174, "step": 4564 }, { "epoch": 2.4509915774638436, "grad_norm": 0.73046875, "learning_rate": 2.5802150745199355e-05, "loss": 0.788, "step": 4565 }, { "epoch": 2.4515284721989197, "grad_norm": 0.58984375, "learning_rate": 2.5793709906362463e-05, "loss": 0.6663, "step": 4566 }, { "epoch": 2.4520653669339953, "grad_norm": 0.6875, "learning_rate": 2.578526897695321e-05, "loss": 0.9211, "step": 4567 }, { "epoch": 2.4526022616690715, "grad_norm": 0.73828125, "learning_rate": 2.577682795793483e-05, "loss": 1.0319, "step": 4568 }, { "epoch": 2.4531391564041476, "grad_norm": 0.7109375, "learning_rate": 2.576838685027053e-05, "loss": 0.8483, "step": 4569 }, { "epoch": 2.4536760511392233, "grad_norm": 0.66015625, "learning_rate": 2.5759945654923575e-05, "loss": 0.7972, "step": 4570 }, { "epoch": 2.4542129458742994, "grad_norm": 0.71875, "learning_rate": 2.575150437285718e-05, "loss": 0.916, "step": 4571 }, { "epoch": 2.4547498406093755, "grad_norm": 0.75390625, "learning_rate": 2.5743063005034628e-05, "loss": 0.7371, "step": 4572 }, { "epoch": 2.4552867353444516, "grad_norm": 0.73828125, "learning_rate": 2.573462155241918e-05, "loss": 0.7676, "step": 4573 }, { "epoch": 2.4558236300795278, "grad_norm": 0.79296875, "learning_rate": 2.572618001597411e-05, "loss": 0.851, "step": 4574 }, { "epoch": 2.4563605248146034, "grad_norm": 0.66796875, "learning_rate": 2.57177383966627e-05, "loss": 0.7246, "step": 4575 }, { "epoch": 2.4568974195496796, "grad_norm": 0.69140625, "learning_rate": 2.570929669544825e-05, "loss": 0.9511, "step": 4576 }, { "epoch": 2.4574343142847557, "grad_norm": 0.85546875, "learning_rate": 2.570085491329406e-05, "loss": 0.8849, "step": 4577 }, { "epoch": 2.4579712090198313, "grad_norm": 0.6953125, "learning_rate": 2.5692413051163457e-05, "loss": 0.8535, "step": 4578 }, { "epoch": 2.4585081037549075, "grad_norm": 0.82421875, "learning_rate": 2.5683971110019755e-05, "loss": 0.807, "step": 4579 }, { "epoch": 2.4590449984899836, "grad_norm": 0.6796875, "learning_rate": 2.5675529090826277e-05, "loss": 0.9063, "step": 4580 }, { "epoch": 2.4595818932250597, "grad_norm": 0.67578125, "learning_rate": 2.5667086994546387e-05, "loss": 0.7945, "step": 4581 }, { "epoch": 2.4601187879601354, "grad_norm": 0.65234375, "learning_rate": 2.565864482214342e-05, "loss": 0.8773, "step": 4582 }, { "epoch": 2.4606556826952115, "grad_norm": 0.66015625, "learning_rate": 2.565020257458074e-05, "loss": 0.6864, "step": 4583 }, { "epoch": 2.4611925774302876, "grad_norm": 0.71484375, "learning_rate": 2.5641760252821706e-05, "loss": 0.8717, "step": 4584 }, { "epoch": 2.4617294721653638, "grad_norm": 0.74609375, "learning_rate": 2.5633317857829697e-05, "loss": 0.7126, "step": 4585 }, { "epoch": 2.4622663669004394, "grad_norm": 0.61328125, "learning_rate": 2.562487539056811e-05, "loss": 0.8311, "step": 4586 }, { "epoch": 2.4628032616355156, "grad_norm": 0.6953125, "learning_rate": 2.5616432852000326e-05, "loss": 0.8652, "step": 4587 }, { "epoch": 2.4633401563705917, "grad_norm": 0.62890625, "learning_rate": 2.560799024308974e-05, "loss": 0.6975, "step": 4588 }, { "epoch": 2.4638770511056673, "grad_norm": 0.6953125, "learning_rate": 2.5599547564799777e-05, "loss": 0.885, "step": 4589 }, { "epoch": 2.4644139458407435, "grad_norm": 0.6171875, "learning_rate": 2.5591104818093837e-05, "loss": 0.9691, "step": 4590 }, { "epoch": 2.4649508405758196, "grad_norm": 0.578125, "learning_rate": 2.5582662003935353e-05, "loss": 0.5909, "step": 4591 }, { "epoch": 2.4654877353108957, "grad_norm": 0.66796875, "learning_rate": 2.5574219123287758e-05, "loss": 0.6404, "step": 4592 }, { "epoch": 2.466024630045972, "grad_norm": 0.73828125, "learning_rate": 2.556577617711448e-05, "loss": 0.9185, "step": 4593 }, { "epoch": 2.4665615247810475, "grad_norm": 0.87109375, "learning_rate": 2.5557333166378982e-05, "loss": 0.9454, "step": 4594 }, { "epoch": 2.4670984195161236, "grad_norm": 0.62890625, "learning_rate": 2.554889009204472e-05, "loss": 0.909, "step": 4595 }, { "epoch": 2.4676353142511998, "grad_norm": 0.70703125, "learning_rate": 2.5540446955075137e-05, "loss": 0.9791, "step": 4596 }, { "epoch": 2.4681722089862754, "grad_norm": 0.66796875, "learning_rate": 2.5532003756433714e-05, "loss": 0.9859, "step": 4597 }, { "epoch": 2.4687091037213515, "grad_norm": 0.6640625, "learning_rate": 2.5523560497083926e-05, "loss": 0.9977, "step": 4598 }, { "epoch": 2.4692459984564277, "grad_norm": 0.82421875, "learning_rate": 2.551511717798925e-05, "loss": 0.8019, "step": 4599 }, { "epoch": 2.469782893191504, "grad_norm": 0.7890625, "learning_rate": 2.5506673800113194e-05, "loss": 0.9173, "step": 4600 }, { "epoch": 2.469782893191504, "eval_loss": 0.9041844606399536, "eval_runtime": 46.0744, "eval_samples_per_second": 20.879, "eval_steps_per_second": 20.879, "step": 4600 }, { "epoch": 2.4703197879265795, "grad_norm": 0.6328125, "learning_rate": 2.5498230364419234e-05, "loss": 0.8684, "step": 4601 }, { "epoch": 2.4708566826616556, "grad_norm": 0.69140625, "learning_rate": 2.548978687187087e-05, "loss": 0.7525, "step": 4602 }, { "epoch": 2.4713935773967317, "grad_norm": 0.68359375, "learning_rate": 2.5481343323431634e-05, "loss": 0.8157, "step": 4603 }, { "epoch": 2.471930472131808, "grad_norm": 0.67578125, "learning_rate": 2.5472899720065025e-05, "loss": 0.6885, "step": 4604 }, { "epoch": 2.4724673668668835, "grad_norm": 0.7109375, "learning_rate": 2.5464456062734566e-05, "loss": 0.9621, "step": 4605 }, { "epoch": 2.4730042616019596, "grad_norm": 0.69921875, "learning_rate": 2.545601235240379e-05, "loss": 0.847, "step": 4606 }, { "epoch": 2.4735411563370358, "grad_norm": 0.63671875, "learning_rate": 2.5447568590036232e-05, "loss": 0.8625, "step": 4607 }, { "epoch": 2.4740780510721114, "grad_norm": 0.703125, "learning_rate": 2.543912477659543e-05, "loss": 0.9182, "step": 4608 }, { "epoch": 2.4746149458071875, "grad_norm": 0.64453125, "learning_rate": 2.543068091304493e-05, "loss": 0.849, "step": 4609 }, { "epoch": 2.4751518405422637, "grad_norm": 0.625, "learning_rate": 2.5422237000348276e-05, "loss": 0.6942, "step": 4610 }, { "epoch": 2.47568873527734, "grad_norm": 0.83203125, "learning_rate": 2.5413793039469043e-05, "loss": 1.0324, "step": 4611 }, { "epoch": 2.476225630012416, "grad_norm": 0.6328125, "learning_rate": 2.540534903137078e-05, "loss": 0.7678, "step": 4612 }, { "epoch": 2.4767625247474916, "grad_norm": 0.6171875, "learning_rate": 2.539690497701706e-05, "loss": 0.6931, "step": 4613 }, { "epoch": 2.4772994194825677, "grad_norm": 0.69921875, "learning_rate": 2.5388460877371455e-05, "loss": 0.7073, "step": 4614 }, { "epoch": 2.477836314217644, "grad_norm": 0.73828125, "learning_rate": 2.538001673339754e-05, "loss": 0.921, "step": 4615 }, { "epoch": 2.4783732089527195, "grad_norm": 0.72265625, "learning_rate": 2.5371572546058915e-05, "loss": 0.8605, "step": 4616 }, { "epoch": 2.4789101036877956, "grad_norm": 0.59375, "learning_rate": 2.5363128316319153e-05, "loss": 0.7955, "step": 4617 }, { "epoch": 2.4794469984228718, "grad_norm": 0.72265625, "learning_rate": 2.535468404514184e-05, "loss": 0.7811, "step": 4618 }, { "epoch": 2.479983893157948, "grad_norm": 0.60546875, "learning_rate": 2.5346239733490594e-05, "loss": 0.7526, "step": 4619 }, { "epoch": 2.4805207878930235, "grad_norm": 0.67578125, "learning_rate": 2.533779538232901e-05, "loss": 0.7551, "step": 4620 }, { "epoch": 2.4810576826280997, "grad_norm": 0.65234375, "learning_rate": 2.5329350992620697e-05, "loss": 0.7493, "step": 4621 }, { "epoch": 2.481594577363176, "grad_norm": 0.6796875, "learning_rate": 2.532090656532926e-05, "loss": 0.957, "step": 4622 }, { "epoch": 2.482131472098252, "grad_norm": 0.64453125, "learning_rate": 2.5312462101418326e-05, "loss": 0.8087, "step": 4623 }, { "epoch": 2.4826683668333276, "grad_norm": 1.15625, "learning_rate": 2.53040176018515e-05, "loss": 0.6707, "step": 4624 }, { "epoch": 2.4832052615684037, "grad_norm": 0.7578125, "learning_rate": 2.5295573067592428e-05, "loss": 0.9298, "step": 4625 }, { "epoch": 2.48374215630348, "grad_norm": 0.73046875, "learning_rate": 2.5287128499604712e-05, "loss": 0.9217, "step": 4626 }, { "epoch": 2.4842790510385555, "grad_norm": 0.58203125, "learning_rate": 2.5278683898852006e-05, "loss": 0.7417, "step": 4627 }, { "epoch": 2.4848159457736316, "grad_norm": 0.828125, "learning_rate": 2.527023926629794e-05, "loss": 0.8912, "step": 4628 }, { "epoch": 2.4853528405087078, "grad_norm": 0.62109375, "learning_rate": 2.5261794602906145e-05, "loss": 0.8226, "step": 4629 }, { "epoch": 2.485889735243784, "grad_norm": 0.57421875, "learning_rate": 2.5253349909640278e-05, "loss": 0.6685, "step": 4630 }, { "epoch": 2.48642662997886, "grad_norm": 0.92578125, "learning_rate": 2.5244905187463973e-05, "loss": 0.8736, "step": 4631 }, { "epoch": 2.4869635247139357, "grad_norm": 0.78125, "learning_rate": 2.523646043734088e-05, "loss": 0.83, "step": 4632 }, { "epoch": 2.487500419449012, "grad_norm": 0.796875, "learning_rate": 2.522801566023467e-05, "loss": 0.9054, "step": 4633 }, { "epoch": 2.488037314184088, "grad_norm": 0.67578125, "learning_rate": 2.521957085710897e-05, "loss": 0.9039, "step": 4634 }, { "epoch": 2.4885742089191636, "grad_norm": 0.62109375, "learning_rate": 2.5211126028927464e-05, "loss": 0.7891, "step": 4635 }, { "epoch": 2.4891111036542397, "grad_norm": 0.72265625, "learning_rate": 2.5202681176653803e-05, "loss": 0.9205, "step": 4636 }, { "epoch": 2.489647998389316, "grad_norm": 0.6640625, "learning_rate": 2.5194236301251657e-05, "loss": 0.6967, "step": 4637 }, { "epoch": 2.490184893124392, "grad_norm": 0.58984375, "learning_rate": 2.5185791403684682e-05, "loss": 0.7991, "step": 4638 }, { "epoch": 2.4907217878594676, "grad_norm": 0.6015625, "learning_rate": 2.517734648491656e-05, "loss": 0.7712, "step": 4639 }, { "epoch": 2.4912586825945437, "grad_norm": 0.640625, "learning_rate": 2.516890154591095e-05, "loss": 0.7971, "step": 4640 }, { "epoch": 2.49179557732962, "grad_norm": 0.71875, "learning_rate": 2.5160456587631544e-05, "loss": 0.9565, "step": 4641 }, { "epoch": 2.492332472064696, "grad_norm": 0.66796875, "learning_rate": 2.515201161104201e-05, "loss": 0.9156, "step": 4642 }, { "epoch": 2.4928693667997717, "grad_norm": 0.73046875, "learning_rate": 2.514356661710601e-05, "loss": 0.8908, "step": 4643 }, { "epoch": 2.493406261534848, "grad_norm": 0.6484375, "learning_rate": 2.513512160678726e-05, "loss": 0.7587, "step": 4644 }, { "epoch": 2.493943156269924, "grad_norm": 0.7109375, "learning_rate": 2.512667658104941e-05, "loss": 0.7787, "step": 4645 }, { "epoch": 2.4944800510049996, "grad_norm": 0.69140625, "learning_rate": 2.511823154085617e-05, "loss": 0.8485, "step": 4646 }, { "epoch": 2.4950169457400757, "grad_norm": 0.6796875, "learning_rate": 2.510978648717121e-05, "loss": 0.7542, "step": 4647 }, { "epoch": 2.495553840475152, "grad_norm": 0.62109375, "learning_rate": 2.5101341420958212e-05, "loss": 0.8525, "step": 4648 }, { "epoch": 2.496090735210228, "grad_norm": 0.7421875, "learning_rate": 2.509289634318088e-05, "loss": 0.8682, "step": 4649 }, { "epoch": 2.496627629945304, "grad_norm": 0.7890625, "learning_rate": 2.508445125480291e-05, "loss": 0.8936, "step": 4650 }, { "epoch": 2.496627629945304, "eval_loss": 0.9043004512786865, "eval_runtime": 46.2228, "eval_samples_per_second": 20.812, "eval_steps_per_second": 20.812, "step": 4650 }, { "epoch": 2.4971645246803797, "grad_norm": 0.78515625, "learning_rate": 2.507600615678797e-05, "loss": 0.7684, "step": 4651 }, { "epoch": 2.497701419415456, "grad_norm": 0.83203125, "learning_rate": 2.506756105009977e-05, "loss": 0.9203, "step": 4652 }, { "epoch": 2.498238314150532, "grad_norm": 0.7109375, "learning_rate": 2.5059115935701992e-05, "loss": 1.0308, "step": 4653 }, { "epoch": 2.4987752088856077, "grad_norm": 0.59375, "learning_rate": 2.5050670814558346e-05, "loss": 0.7377, "step": 4654 }, { "epoch": 2.499312103620684, "grad_norm": 0.6171875, "learning_rate": 2.5042225687632514e-05, "loss": 0.8715, "step": 4655 }, { "epoch": 2.49984899835576, "grad_norm": 0.71875, "learning_rate": 2.5033780555888193e-05, "loss": 0.9159, "step": 4656 }, { "epoch": 2.500385893090836, "grad_norm": 0.65625, "learning_rate": 2.5025335420289086e-05, "loss": 0.8335, "step": 4657 }, { "epoch": 2.500922787825912, "grad_norm": 0.73046875, "learning_rate": 2.5016890281798893e-05, "loss": 1.0118, "step": 4658 }, { "epoch": 2.501459682560988, "grad_norm": 0.65625, "learning_rate": 2.5008445141381294e-05, "loss": 0.7462, "step": 4659 }, { "epoch": 2.501996577296064, "grad_norm": 0.64453125, "learning_rate": 2.5e-05, "loss": 0.8913, "step": 4660 }, { "epoch": 2.50253347203114, "grad_norm": 0.75, "learning_rate": 2.4991554858618712e-05, "loss": 0.8428, "step": 4661 }, { "epoch": 2.5030703667662157, "grad_norm": 0.68359375, "learning_rate": 2.4983109718201112e-05, "loss": 1.022, "step": 4662 }, { "epoch": 2.503607261501292, "grad_norm": 0.68359375, "learning_rate": 2.4974664579710913e-05, "loss": 0.8398, "step": 4663 }, { "epoch": 2.504144156236368, "grad_norm": 0.6875, "learning_rate": 2.496621944411181e-05, "loss": 0.879, "step": 4664 }, { "epoch": 2.5046810509714437, "grad_norm": 0.65625, "learning_rate": 2.4957774312367495e-05, "loss": 0.8861, "step": 4665 }, { "epoch": 2.50521794570652, "grad_norm": 0.7265625, "learning_rate": 2.4949329185441666e-05, "loss": 0.8208, "step": 4666 }, { "epoch": 2.505754840441596, "grad_norm": 0.65234375, "learning_rate": 2.494088406429801e-05, "loss": 0.8499, "step": 4667 }, { "epoch": 2.506291735176672, "grad_norm": 0.72265625, "learning_rate": 2.493243894990024e-05, "loss": 0.8782, "step": 4668 }, { "epoch": 2.506828629911748, "grad_norm": 0.69921875, "learning_rate": 2.4923993843212037e-05, "loss": 0.768, "step": 4669 }, { "epoch": 2.507365524646824, "grad_norm": 1.0703125, "learning_rate": 2.4915548745197097e-05, "loss": 0.8833, "step": 4670 }, { "epoch": 2.5079024193819, "grad_norm": 0.921875, "learning_rate": 2.4907103656819118e-05, "loss": 0.972, "step": 4671 }, { "epoch": 2.508439314116976, "grad_norm": 0.6171875, "learning_rate": 2.4898658579041787e-05, "loss": 0.8168, "step": 4672 }, { "epoch": 2.5089762088520517, "grad_norm": 0.76953125, "learning_rate": 2.4890213512828803e-05, "loss": 1.0264, "step": 4673 }, { "epoch": 2.509513103587128, "grad_norm": 0.83203125, "learning_rate": 2.4881768459143844e-05, "loss": 0.8988, "step": 4674 }, { "epoch": 2.510049998322204, "grad_norm": 0.63671875, "learning_rate": 2.487332341895059e-05, "loss": 0.6743, "step": 4675 }, { "epoch": 2.51058689305728, "grad_norm": 0.68359375, "learning_rate": 2.4864878393212748e-05, "loss": 0.7572, "step": 4676 }, { "epoch": 2.5111237877923562, "grad_norm": 0.70703125, "learning_rate": 2.485643338289399e-05, "loss": 0.8108, "step": 4677 }, { "epoch": 2.511660682527432, "grad_norm": 0.7109375, "learning_rate": 2.4847988388957997e-05, "loss": 0.7347, "step": 4678 }, { "epoch": 2.512197577262508, "grad_norm": 0.6484375, "learning_rate": 2.483954341236846e-05, "loss": 0.9389, "step": 4679 }, { "epoch": 2.512734471997584, "grad_norm": 0.6796875, "learning_rate": 2.483109845408905e-05, "loss": 1.0592, "step": 4680 }, { "epoch": 2.51327136673266, "grad_norm": 0.6640625, "learning_rate": 2.4822653515083453e-05, "loss": 0.7301, "step": 4681 }, { "epoch": 2.513808261467736, "grad_norm": 0.76953125, "learning_rate": 2.481420859631533e-05, "loss": 1.0802, "step": 4682 }, { "epoch": 2.514345156202812, "grad_norm": 0.93359375, "learning_rate": 2.4805763698748352e-05, "loss": 0.8456, "step": 4683 }, { "epoch": 2.5148820509378877, "grad_norm": 0.7421875, "learning_rate": 2.4797318823346203e-05, "loss": 0.8313, "step": 4684 }, { "epoch": 2.515418945672964, "grad_norm": 0.76171875, "learning_rate": 2.478887397107254e-05, "loss": 0.8066, "step": 4685 }, { "epoch": 2.51595584040804, "grad_norm": 0.62109375, "learning_rate": 2.4780429142891033e-05, "loss": 0.7529, "step": 4686 }, { "epoch": 2.516492735143116, "grad_norm": 0.66796875, "learning_rate": 2.4771984339765338e-05, "loss": 0.7963, "step": 4687 }, { "epoch": 2.5170296298781922, "grad_norm": 0.73046875, "learning_rate": 2.476353956265912e-05, "loss": 0.9988, "step": 4688 }, { "epoch": 2.517566524613268, "grad_norm": 0.5859375, "learning_rate": 2.4755094812536033e-05, "loss": 0.8237, "step": 4689 }, { "epoch": 2.518103419348344, "grad_norm": 0.6796875, "learning_rate": 2.474665009035973e-05, "loss": 0.864, "step": 4690 }, { "epoch": 2.51864031408342, "grad_norm": 0.7421875, "learning_rate": 2.4738205397093864e-05, "loss": 0.7942, "step": 4691 }, { "epoch": 2.519177208818496, "grad_norm": 0.73828125, "learning_rate": 2.472976073370207e-05, "loss": 0.8739, "step": 4692 }, { "epoch": 2.519714103553572, "grad_norm": 0.6484375, "learning_rate": 2.4721316101148e-05, "loss": 0.7663, "step": 4693 }, { "epoch": 2.520250998288648, "grad_norm": 0.6328125, "learning_rate": 2.471287150039529e-05, "loss": 0.9158, "step": 4694 }, { "epoch": 2.520787893023724, "grad_norm": 0.69140625, "learning_rate": 2.4704426932407578e-05, "loss": 0.7818, "step": 4695 }, { "epoch": 2.5213247877588003, "grad_norm": 0.78125, "learning_rate": 2.4695982398148497e-05, "loss": 0.7843, "step": 4696 }, { "epoch": 2.521861682493876, "grad_norm": 0.58203125, "learning_rate": 2.468753789858168e-05, "loss": 0.7717, "step": 4697 }, { "epoch": 2.522398577228952, "grad_norm": 0.66796875, "learning_rate": 2.467909343467075e-05, "loss": 0.8969, "step": 4698 }, { "epoch": 2.5229354719640282, "grad_norm": 0.66015625, "learning_rate": 2.4670649007379315e-05, "loss": 0.8058, "step": 4699 }, { "epoch": 2.523472366699104, "grad_norm": 0.796875, "learning_rate": 2.4662204617670994e-05, "loss": 0.6992, "step": 4700 }, { "epoch": 2.523472366699104, "eval_loss": 0.9045475125312805, "eval_runtime": 45.9539, "eval_samples_per_second": 20.934, "eval_steps_per_second": 20.934, "step": 4700 }, { "epoch": 2.52400926143418, "grad_norm": 0.8046875, "learning_rate": 2.465376026650941e-05, "loss": 0.9277, "step": 4701 }, { "epoch": 2.524546156169256, "grad_norm": 0.74609375, "learning_rate": 2.4645315954858165e-05, "loss": 0.9898, "step": 4702 }, { "epoch": 2.525083050904332, "grad_norm": 0.59765625, "learning_rate": 2.4636871683680856e-05, "loss": 0.7192, "step": 4703 }, { "epoch": 2.525619945639408, "grad_norm": 0.71484375, "learning_rate": 2.462842745394109e-05, "loss": 0.7924, "step": 4704 }, { "epoch": 2.526156840374484, "grad_norm": 0.765625, "learning_rate": 2.461998326660246e-05, "loss": 0.9676, "step": 4705 }, { "epoch": 2.52669373510956, "grad_norm": 0.7265625, "learning_rate": 2.4611539122628544e-05, "loss": 1.0498, "step": 4706 }, { "epoch": 2.5272306298446363, "grad_norm": 1.3359375, "learning_rate": 2.460309502298295e-05, "loss": 0.8755, "step": 4707 }, { "epoch": 2.527767524579712, "grad_norm": 0.6015625, "learning_rate": 2.4594650968629226e-05, "loss": 0.7964, "step": 4708 }, { "epoch": 2.528304419314788, "grad_norm": 0.69921875, "learning_rate": 2.4586206960530963e-05, "loss": 0.9157, "step": 4709 }, { "epoch": 2.5288413140498642, "grad_norm": 0.77734375, "learning_rate": 2.4577762999651726e-05, "loss": 0.9714, "step": 4710 }, { "epoch": 2.52937820878494, "grad_norm": 0.76171875, "learning_rate": 2.4569319086955075e-05, "loss": 0.7733, "step": 4711 }, { "epoch": 2.529915103520016, "grad_norm": 0.63671875, "learning_rate": 2.4560875223404573e-05, "loss": 0.7672, "step": 4712 }, { "epoch": 2.530451998255092, "grad_norm": 0.61328125, "learning_rate": 2.455243140996377e-05, "loss": 0.7962, "step": 4713 }, { "epoch": 2.5309888929901683, "grad_norm": 0.765625, "learning_rate": 2.4543987647596206e-05, "loss": 0.7733, "step": 4714 }, { "epoch": 2.5315257877252444, "grad_norm": 0.65625, "learning_rate": 2.4535543937265443e-05, "loss": 0.7989, "step": 4715 }, { "epoch": 2.53206268246032, "grad_norm": 0.640625, "learning_rate": 2.4527100279934985e-05, "loss": 0.8543, "step": 4716 }, { "epoch": 2.532599577195396, "grad_norm": 1.2109375, "learning_rate": 2.4518656676568376e-05, "loss": 0.9989, "step": 4717 }, { "epoch": 2.5331364719304723, "grad_norm": 0.6171875, "learning_rate": 2.4510213128129132e-05, "loss": 0.7076, "step": 4718 }, { "epoch": 2.533673366665548, "grad_norm": 0.69140625, "learning_rate": 2.4501769635580772e-05, "loss": 0.8046, "step": 4719 }, { "epoch": 2.534210261400624, "grad_norm": 0.72265625, "learning_rate": 2.4493326199886812e-05, "loss": 0.8405, "step": 4720 }, { "epoch": 2.5347471561357002, "grad_norm": 0.62109375, "learning_rate": 2.4484882822010748e-05, "loss": 0.8945, "step": 4721 }, { "epoch": 2.535284050870776, "grad_norm": 0.7109375, "learning_rate": 2.447643950291608e-05, "loss": 0.9577, "step": 4722 }, { "epoch": 2.535820945605852, "grad_norm": 0.62109375, "learning_rate": 2.4467996243566295e-05, "loss": 0.8425, "step": 4723 }, { "epoch": 2.536357840340928, "grad_norm": 0.7890625, "learning_rate": 2.445955304492487e-05, "loss": 1.0657, "step": 4724 }, { "epoch": 2.5368947350760043, "grad_norm": 0.7734375, "learning_rate": 2.445110990795529e-05, "loss": 0.9352, "step": 4725 }, { "epoch": 2.5374316298110804, "grad_norm": 0.6953125, "learning_rate": 2.444266683362102e-05, "loss": 0.836, "step": 4726 }, { "epoch": 2.537968524546156, "grad_norm": 0.6171875, "learning_rate": 2.4434223822885526e-05, "loss": 0.7781, "step": 4727 }, { "epoch": 2.538505419281232, "grad_norm": 0.6171875, "learning_rate": 2.4425780876712248e-05, "loss": 0.7379, "step": 4728 }, { "epoch": 2.5390423140163083, "grad_norm": 0.74609375, "learning_rate": 2.4417337996064653e-05, "loss": 0.9583, "step": 4729 }, { "epoch": 2.539579208751384, "grad_norm": 0.6875, "learning_rate": 2.440889518190617e-05, "loss": 0.8795, "step": 4730 }, { "epoch": 2.54011610348646, "grad_norm": 0.76953125, "learning_rate": 2.4400452435200226e-05, "loss": 0.9928, "step": 4731 }, { "epoch": 2.5406529982215362, "grad_norm": 0.7734375, "learning_rate": 2.439200975691027e-05, "loss": 0.9659, "step": 4732 }, { "epoch": 2.5411898929566124, "grad_norm": 0.59765625, "learning_rate": 2.4383567147999683e-05, "loss": 0.9078, "step": 4733 }, { "epoch": 2.5417267876916885, "grad_norm": 0.62890625, "learning_rate": 2.43751246094319e-05, "loss": 0.666, "step": 4734 }, { "epoch": 2.542263682426764, "grad_norm": 0.671875, "learning_rate": 2.436668214217031e-05, "loss": 0.9913, "step": 4735 }, { "epoch": 2.5428005771618403, "grad_norm": 0.66015625, "learning_rate": 2.43582397471783e-05, "loss": 0.8997, "step": 4736 }, { "epoch": 2.5433374718969164, "grad_norm": 0.75390625, "learning_rate": 2.434979742541927e-05, "loss": 0.9264, "step": 4737 }, { "epoch": 2.543874366631992, "grad_norm": 0.6796875, "learning_rate": 2.4341355177856587e-05, "loss": 0.9566, "step": 4738 }, { "epoch": 2.544411261367068, "grad_norm": 0.78125, "learning_rate": 2.4332913005453612e-05, "loss": 1.0158, "step": 4739 }, { "epoch": 2.5449481561021443, "grad_norm": 0.67578125, "learning_rate": 2.432447090917373e-05, "loss": 0.8305, "step": 4740 }, { "epoch": 2.54548505083722, "grad_norm": 0.66796875, "learning_rate": 2.4316028889980254e-05, "loss": 1.0086, "step": 4741 }, { "epoch": 2.546021945572296, "grad_norm": 0.71484375, "learning_rate": 2.4307586948836552e-05, "loss": 0.8123, "step": 4742 }, { "epoch": 2.5465588403073722, "grad_norm": 0.8203125, "learning_rate": 2.4299145086705947e-05, "loss": 0.9146, "step": 4743 }, { "epoch": 2.5470957350424484, "grad_norm": 0.7421875, "learning_rate": 2.4290703304551756e-05, "loss": 0.8819, "step": 4744 }, { "epoch": 2.5476326297775245, "grad_norm": 0.77734375, "learning_rate": 2.4282261603337306e-05, "loss": 0.9958, "step": 4745 }, { "epoch": 2.5481695245126, "grad_norm": 0.68359375, "learning_rate": 2.4273819984025896e-05, "loss": 0.8316, "step": 4746 }, { "epoch": 2.5487064192476763, "grad_norm": 0.73828125, "learning_rate": 2.4265378447580818e-05, "loss": 0.9154, "step": 4747 }, { "epoch": 2.5492433139827524, "grad_norm": 0.80078125, "learning_rate": 2.4256936994965368e-05, "loss": 0.9037, "step": 4748 }, { "epoch": 2.549780208717828, "grad_norm": 0.79296875, "learning_rate": 2.424849562714282e-05, "loss": 0.8984, "step": 4749 }, { "epoch": 2.550317103452904, "grad_norm": 0.85546875, "learning_rate": 2.4240054345076438e-05, "loss": 0.79, "step": 4750 }, { "epoch": 2.550317103452904, "eval_loss": 0.9044550657272339, "eval_runtime": 46.4744, "eval_samples_per_second": 20.7, "eval_steps_per_second": 20.7, "step": 4750 }, { "epoch": 2.5508539981879803, "grad_norm": 0.64453125, "learning_rate": 2.4231613149729478e-05, "loss": 0.7006, "step": 4751 }, { "epoch": 2.5513908929230564, "grad_norm": 0.78125, "learning_rate": 2.422317204206518e-05, "loss": 0.8896, "step": 4752 }, { "epoch": 2.5519277876581326, "grad_norm": 0.79296875, "learning_rate": 2.4214731023046793e-05, "loss": 0.8646, "step": 4753 }, { "epoch": 2.5524646823932082, "grad_norm": 0.64453125, "learning_rate": 2.4206290093637546e-05, "loss": 0.8121, "step": 4754 }, { "epoch": 2.5530015771282843, "grad_norm": 0.78515625, "learning_rate": 2.419784925480064e-05, "loss": 0.7799, "step": 4755 }, { "epoch": 2.5535384718633605, "grad_norm": 0.6640625, "learning_rate": 2.4189408507499304e-05, "loss": 1.0234, "step": 4756 }, { "epoch": 2.554075366598436, "grad_norm": 0.6171875, "learning_rate": 2.4180967852696725e-05, "loss": 0.7871, "step": 4757 }, { "epoch": 2.5546122613335123, "grad_norm": 0.76953125, "learning_rate": 2.417252729135608e-05, "loss": 0.768, "step": 4758 }, { "epoch": 2.5551491560685884, "grad_norm": 0.64453125, "learning_rate": 2.4164086824440552e-05, "loss": 0.8422, "step": 4759 }, { "epoch": 2.555686050803664, "grad_norm": 0.66796875, "learning_rate": 2.4155646452913296e-05, "loss": 0.9319, "step": 4760 }, { "epoch": 2.55622294553874, "grad_norm": 0.74609375, "learning_rate": 2.414720617773748e-05, "loss": 0.8486, "step": 4761 }, { "epoch": 2.5567598402738163, "grad_norm": 0.53125, "learning_rate": 2.413876599987624e-05, "loss": 0.598, "step": 4762 }, { "epoch": 2.5572967350088924, "grad_norm": 0.62109375, "learning_rate": 2.41303259202927e-05, "loss": 0.67, "step": 4763 }, { "epoch": 2.5578336297439686, "grad_norm": 0.6875, "learning_rate": 2.4121885939949987e-05, "loss": 0.8766, "step": 4764 }, { "epoch": 2.5583705244790442, "grad_norm": 0.671875, "learning_rate": 2.4113446059811217e-05, "loss": 0.7272, "step": 4765 }, { "epoch": 2.5589074192141203, "grad_norm": 0.609375, "learning_rate": 2.4105006280839473e-05, "loss": 0.7251, "step": 4766 }, { "epoch": 2.5594443139491965, "grad_norm": 0.78125, "learning_rate": 2.409656660399785e-05, "loss": 0.7694, "step": 4767 }, { "epoch": 2.559981208684272, "grad_norm": 0.73828125, "learning_rate": 2.4088127030249415e-05, "loss": 0.8735, "step": 4768 }, { "epoch": 2.5605181034193483, "grad_norm": 0.63671875, "learning_rate": 2.4079687560557226e-05, "loss": 0.774, "step": 4769 }, { "epoch": 2.5610549981544244, "grad_norm": 0.7421875, "learning_rate": 2.407124819588435e-05, "loss": 1.0585, "step": 4770 }, { "epoch": 2.5615918928895005, "grad_norm": 0.6171875, "learning_rate": 2.406280893719382e-05, "loss": 0.7963, "step": 4771 }, { "epoch": 2.5621287876245766, "grad_norm": 0.65625, "learning_rate": 2.4054369785448648e-05, "loss": 0.6606, "step": 4772 }, { "epoch": 2.5626656823596523, "grad_norm": 0.66796875, "learning_rate": 2.4045930741611862e-05, "loss": 0.8388, "step": 4773 }, { "epoch": 2.5632025770947284, "grad_norm": 0.53125, "learning_rate": 2.4037491806646474e-05, "loss": 0.7488, "step": 4774 }, { "epoch": 2.5637394718298046, "grad_norm": 0.69921875, "learning_rate": 2.402905298151545e-05, "loss": 0.7551, "step": 4775 }, { "epoch": 2.5642763665648802, "grad_norm": 0.64453125, "learning_rate": 2.4020614267181778e-05, "loss": 0.7906, "step": 4776 }, { "epoch": 2.5648132612999563, "grad_norm": 0.5625, "learning_rate": 2.4012175664608412e-05, "loss": 0.816, "step": 4777 }, { "epoch": 2.5653501560350325, "grad_norm": 0.796875, "learning_rate": 2.4003737174758323e-05, "loss": 0.9098, "step": 4778 }, { "epoch": 2.565887050770108, "grad_norm": 0.63671875, "learning_rate": 2.3995298798594438e-05, "loss": 0.9021, "step": 4779 }, { "epoch": 2.5664239455051843, "grad_norm": 0.984375, "learning_rate": 2.3986860537079675e-05, "loss": 0.8583, "step": 4780 }, { "epoch": 2.5669608402402604, "grad_norm": 0.74609375, "learning_rate": 2.3978422391176963e-05, "loss": 0.8389, "step": 4781 }, { "epoch": 2.5674977349753365, "grad_norm": 0.6875, "learning_rate": 2.3969984361849198e-05, "loss": 0.7442, "step": 4782 }, { "epoch": 2.5680346297104126, "grad_norm": 0.984375, "learning_rate": 2.3961546450059257e-05, "loss": 0.8782, "step": 4783 }, { "epoch": 2.5685715244454883, "grad_norm": 0.72265625, "learning_rate": 2.3953108656770016e-05, "loss": 0.7903, "step": 4784 }, { "epoch": 2.5691084191805644, "grad_norm": 0.80859375, "learning_rate": 2.394467098294433e-05, "loss": 0.9673, "step": 4785 }, { "epoch": 2.5696453139156405, "grad_norm": 0.859375, "learning_rate": 2.3936233429545054e-05, "loss": 0.9672, "step": 4786 }, { "epoch": 2.5701822086507162, "grad_norm": 0.80859375, "learning_rate": 2.392779599753502e-05, "loss": 0.9942, "step": 4787 }, { "epoch": 2.5707191033857923, "grad_norm": 0.79296875, "learning_rate": 2.391935868787703e-05, "loss": 0.8621, "step": 4788 }, { "epoch": 2.5712559981208685, "grad_norm": 0.58203125, "learning_rate": 2.3910921501533906e-05, "loss": 0.8573, "step": 4789 }, { "epoch": 2.5717928928559446, "grad_norm": 0.65234375, "learning_rate": 2.390248443946843e-05, "loss": 0.8559, "step": 4790 }, { "epoch": 2.5723297875910207, "grad_norm": 0.65625, "learning_rate": 2.3894047502643385e-05, "loss": 0.7038, "step": 4791 }, { "epoch": 2.5728666823260964, "grad_norm": 0.65234375, "learning_rate": 2.388561069202152e-05, "loss": 0.757, "step": 4792 }, { "epoch": 2.5734035770611725, "grad_norm": 0.6015625, "learning_rate": 2.387717400856558e-05, "loss": 0.6814, "step": 4793 }, { "epoch": 2.5739404717962486, "grad_norm": 0.66796875, "learning_rate": 2.3868737453238316e-05, "loss": 0.7257, "step": 4794 }, { "epoch": 2.5744773665313243, "grad_norm": 0.69140625, "learning_rate": 2.386030102700243e-05, "loss": 0.8394, "step": 4795 }, { "epoch": 2.5750142612664004, "grad_norm": 0.69921875, "learning_rate": 2.385186473082063e-05, "loss": 0.7631, "step": 4796 }, { "epoch": 2.5755511560014765, "grad_norm": 0.7109375, "learning_rate": 2.38434285656556e-05, "loss": 0.984, "step": 4797 }, { "epoch": 2.5760880507365522, "grad_norm": 0.66796875, "learning_rate": 2.3834992532470023e-05, "loss": 0.737, "step": 4798 }, { "epoch": 2.5766249454716283, "grad_norm": 0.66015625, "learning_rate": 2.382655663222656e-05, "loss": 0.7902, "step": 4799 }, { "epoch": 2.5771618402067045, "grad_norm": 0.6953125, "learning_rate": 2.3818120865887836e-05, "loss": 0.7661, "step": 4800 }, { "epoch": 2.5771618402067045, "eval_loss": 0.9042898416519165, "eval_runtime": 45.9124, "eval_samples_per_second": 20.953, "eval_steps_per_second": 20.953, "step": 4800 }, { "epoch": 2.5776987349417806, "grad_norm": 0.85546875, "learning_rate": 2.3809685234416486e-05, "loss": 0.8461, "step": 4801 }, { "epoch": 2.5782356296768567, "grad_norm": 0.71484375, "learning_rate": 2.380124973877513e-05, "loss": 0.881, "step": 4802 }, { "epoch": 2.5787725244119324, "grad_norm": 0.59765625, "learning_rate": 2.3792814379926358e-05, "loss": 0.7544, "step": 4803 }, { "epoch": 2.5793094191470085, "grad_norm": 0.7890625, "learning_rate": 2.3784379158832747e-05, "loss": 0.8336, "step": 4804 }, { "epoch": 2.5798463138820846, "grad_norm": 0.703125, "learning_rate": 2.3775944076456874e-05, "loss": 0.8285, "step": 4805 }, { "epoch": 2.5803832086171603, "grad_norm": 0.67578125, "learning_rate": 2.376750913376129e-05, "loss": 0.7354, "step": 4806 }, { "epoch": 2.5809201033522364, "grad_norm": 0.74609375, "learning_rate": 2.3759074331708517e-05, "loss": 0.8723, "step": 4807 }, { "epoch": 2.5814569980873125, "grad_norm": 0.66015625, "learning_rate": 2.375063967126108e-05, "loss": 0.7881, "step": 4808 }, { "epoch": 2.5819938928223887, "grad_norm": 0.79296875, "learning_rate": 2.374220515338147e-05, "loss": 0.8679, "step": 4809 }, { "epoch": 2.582530787557465, "grad_norm": 0.6796875, "learning_rate": 2.3733770779032184e-05, "loss": 0.8832, "step": 4810 }, { "epoch": 2.5830676822925405, "grad_norm": 0.63671875, "learning_rate": 2.372533654917569e-05, "loss": 1.0323, "step": 4811 }, { "epoch": 2.5836045770276166, "grad_norm": 0.6171875, "learning_rate": 2.371690246477444e-05, "loss": 0.8778, "step": 4812 }, { "epoch": 2.5841414717626927, "grad_norm": 0.62890625, "learning_rate": 2.3708468526790858e-05, "loss": 0.8339, "step": 4813 }, { "epoch": 2.5846783664977684, "grad_norm": 0.59375, "learning_rate": 2.3700034736187383e-05, "loss": 0.9372, "step": 4814 }, { "epoch": 2.5852152612328445, "grad_norm": 0.63671875, "learning_rate": 2.3691601093926404e-05, "loss": 0.704, "step": 4815 }, { "epoch": 2.5857521559679206, "grad_norm": 0.72265625, "learning_rate": 2.368316760097032e-05, "loss": 0.9305, "step": 4816 }, { "epoch": 2.5862890507029963, "grad_norm": 0.67578125, "learning_rate": 2.367473425828148e-05, "loss": 0.9001, "step": 4817 }, { "epoch": 2.5868259454380724, "grad_norm": 0.72265625, "learning_rate": 2.3666301066822236e-05, "loss": 0.9246, "step": 4818 }, { "epoch": 2.5873628401731485, "grad_norm": 0.7109375, "learning_rate": 2.365786802755494e-05, "loss": 0.9648, "step": 4819 }, { "epoch": 2.5878997349082247, "grad_norm": 0.71875, "learning_rate": 2.36494351414419e-05, "loss": 0.9637, "step": 4820 }, { "epoch": 2.588436629643301, "grad_norm": 0.7265625, "learning_rate": 2.36410024094454e-05, "loss": 0.8621, "step": 4821 }, { "epoch": 2.5889735243783765, "grad_norm": 0.87109375, "learning_rate": 2.363256983252775e-05, "loss": 0.7851, "step": 4822 }, { "epoch": 2.5895104191134526, "grad_norm": 0.6328125, "learning_rate": 2.3624137411651196e-05, "loss": 0.7886, "step": 4823 }, { "epoch": 2.5900473138485287, "grad_norm": 0.6953125, "learning_rate": 2.361570514777799e-05, "loss": 0.804, "step": 4824 }, { "epoch": 2.5905842085836044, "grad_norm": 0.6796875, "learning_rate": 2.3607273041870356e-05, "loss": 0.8343, "step": 4825 }, { "epoch": 2.5911211033186805, "grad_norm": 0.68359375, "learning_rate": 2.3598841094890496e-05, "loss": 0.8373, "step": 4826 }, { "epoch": 2.5916579980537566, "grad_norm": 0.75390625, "learning_rate": 2.359040930780062e-05, "loss": 0.7907, "step": 4827 }, { "epoch": 2.5921948927888327, "grad_norm": 0.60546875, "learning_rate": 2.3581977681562895e-05, "loss": 0.898, "step": 4828 }, { "epoch": 2.592731787523909, "grad_norm": 0.671875, "learning_rate": 2.3573546217139464e-05, "loss": 0.7337, "step": 4829 }, { "epoch": 2.5932686822589845, "grad_norm": 0.7109375, "learning_rate": 2.3565114915492482e-05, "loss": 0.9152, "step": 4830 }, { "epoch": 2.5938055769940607, "grad_norm": 1.3515625, "learning_rate": 2.355668377758406e-05, "loss": 0.8682, "step": 4831 }, { "epoch": 2.594342471729137, "grad_norm": 0.60546875, "learning_rate": 2.3548252804376288e-05, "loss": 0.6563, "step": 4832 }, { "epoch": 2.5948793664642125, "grad_norm": 0.74609375, "learning_rate": 2.3539821996831278e-05, "loss": 0.8853, "step": 4833 }, { "epoch": 2.5954162611992886, "grad_norm": 0.796875, "learning_rate": 2.353139135591105e-05, "loss": 0.9459, "step": 4834 }, { "epoch": 2.5959531559343647, "grad_norm": 0.66796875, "learning_rate": 2.352296088257767e-05, "loss": 0.9339, "step": 4835 }, { "epoch": 2.5964900506694404, "grad_norm": 0.6640625, "learning_rate": 2.3514530577793163e-05, "loss": 0.8762, "step": 4836 }, { "epoch": 2.5970269454045165, "grad_norm": 0.828125, "learning_rate": 2.350610044251952e-05, "loss": 0.9001, "step": 4837 }, { "epoch": 2.5975638401395926, "grad_norm": 0.84765625, "learning_rate": 2.3497670477718744e-05, "loss": 0.8743, "step": 4838 }, { "epoch": 2.5981007348746687, "grad_norm": 0.6328125, "learning_rate": 2.348924068435279e-05, "loss": 0.8777, "step": 4839 }, { "epoch": 2.598637629609745, "grad_norm": 0.828125, "learning_rate": 2.34808110633836e-05, "loss": 0.7591, "step": 4840 }, { "epoch": 2.5991745243448205, "grad_norm": 0.6171875, "learning_rate": 2.3472381615773126e-05, "loss": 0.7358, "step": 4841 }, { "epoch": 2.5997114190798967, "grad_norm": 0.79296875, "learning_rate": 2.3463952342483237e-05, "loss": 0.9206, "step": 4842 }, { "epoch": 2.600248313814973, "grad_norm": 0.7578125, "learning_rate": 2.3455523244475846e-05, "loss": 0.9042, "step": 4843 }, { "epoch": 2.6007852085500485, "grad_norm": 0.73046875, "learning_rate": 2.3447094322712805e-05, "loss": 1.0895, "step": 4844 }, { "epoch": 2.6013221032851246, "grad_norm": 0.79296875, "learning_rate": 2.343866557815597e-05, "loss": 0.9379, "step": 4845 }, { "epoch": 2.6018589980202007, "grad_norm": 0.63671875, "learning_rate": 2.3430237011767167e-05, "loss": 0.8434, "step": 4846 }, { "epoch": 2.602395892755277, "grad_norm": 0.6640625, "learning_rate": 2.3421808624508202e-05, "loss": 0.7852, "step": 4847 }, { "epoch": 2.6029327874903525, "grad_norm": 0.6875, "learning_rate": 2.341338041734086e-05, "loss": 0.8722, "step": 4848 }, { "epoch": 2.6034696822254286, "grad_norm": 0.62890625, "learning_rate": 2.3404952391226906e-05, "loss": 0.973, "step": 4849 }, { "epoch": 2.6040065769605047, "grad_norm": 0.6328125, "learning_rate": 2.3396524547128072e-05, "loss": 0.9067, "step": 4850 }, { "epoch": 2.6040065769605047, "eval_loss": 0.9035809636116028, "eval_runtime": 46.2018, "eval_samples_per_second": 20.822, "eval_steps_per_second": 20.822, "step": 4850 }, { "epoch": 2.604543471695581, "grad_norm": 0.66015625, "learning_rate": 2.3388096886006105e-05, "loss": 0.833, "step": 4851 }, { "epoch": 2.6050803664306565, "grad_norm": 0.6640625, "learning_rate": 2.3379669408822696e-05, "loss": 0.8819, "step": 4852 }, { "epoch": 2.6056172611657327, "grad_norm": 0.68359375, "learning_rate": 2.3371242116539525e-05, "loss": 0.911, "step": 4853 }, { "epoch": 2.606154155900809, "grad_norm": 0.671875, "learning_rate": 2.336281501011825e-05, "loss": 1.0088, "step": 4854 }, { "epoch": 2.6066910506358845, "grad_norm": 0.625, "learning_rate": 2.3354388090520522e-05, "loss": 0.7627, "step": 4855 }, { "epoch": 2.6072279453709606, "grad_norm": 0.60546875, "learning_rate": 2.3345961358707953e-05, "loss": 0.741, "step": 4856 }, { "epoch": 2.6077648401060367, "grad_norm": 0.63671875, "learning_rate": 2.333753481564213e-05, "loss": 0.7867, "step": 4857 }, { "epoch": 2.608301734841113, "grad_norm": 0.6328125, "learning_rate": 2.332910846228466e-05, "loss": 0.8172, "step": 4858 }, { "epoch": 2.608838629576189, "grad_norm": 0.95703125, "learning_rate": 2.3320682299597054e-05, "loss": 1.0437, "step": 4859 }, { "epoch": 2.6093755243112646, "grad_norm": 0.9765625, "learning_rate": 2.331225632854087e-05, "loss": 1.0193, "step": 4860 }, { "epoch": 2.6099124190463407, "grad_norm": 0.70703125, "learning_rate": 2.3303830550077617e-05, "loss": 0.7396, "step": 4861 }, { "epoch": 2.610449313781417, "grad_norm": 0.62109375, "learning_rate": 2.329540496516877e-05, "loss": 0.8171, "step": 4862 }, { "epoch": 2.6109862085164925, "grad_norm": 0.71875, "learning_rate": 2.3286979574775807e-05, "loss": 0.7973, "step": 4863 }, { "epoch": 2.6115231032515687, "grad_norm": 0.62890625, "learning_rate": 2.327855437986017e-05, "loss": 0.8042, "step": 4864 }, { "epoch": 2.612059997986645, "grad_norm": 0.6171875, "learning_rate": 2.3270129381383275e-05, "loss": 0.7434, "step": 4865 }, { "epoch": 2.612596892721721, "grad_norm": 0.8515625, "learning_rate": 2.326170458030654e-05, "loss": 0.9505, "step": 4866 }, { "epoch": 2.6131337874567966, "grad_norm": 0.828125, "learning_rate": 2.3253279977591308e-05, "loss": 0.8996, "step": 4867 }, { "epoch": 2.6136706821918727, "grad_norm": 0.6171875, "learning_rate": 2.3244855574198958e-05, "loss": 0.679, "step": 4868 }, { "epoch": 2.614207576926949, "grad_norm": 0.62109375, "learning_rate": 2.3236431371090814e-05, "loss": 0.8053, "step": 4869 }, { "epoch": 2.614744471662025, "grad_norm": 0.74609375, "learning_rate": 2.3228007369228178e-05, "loss": 0.9276, "step": 4870 }, { "epoch": 2.6152813663971006, "grad_norm": 0.6484375, "learning_rate": 2.3219583569572347e-05, "loss": 0.7546, "step": 4871 }, { "epoch": 2.6158182611321767, "grad_norm": 0.6484375, "learning_rate": 2.321115997308458e-05, "loss": 0.8459, "step": 4872 }, { "epoch": 2.616355155867253, "grad_norm": 0.6953125, "learning_rate": 2.3202736580726106e-05, "loss": 0.8358, "step": 4873 }, { "epoch": 2.6168920506023285, "grad_norm": 0.6484375, "learning_rate": 2.3194313393458156e-05, "loss": 0.8025, "step": 4874 }, { "epoch": 2.6174289453374047, "grad_norm": 0.55078125, "learning_rate": 2.3185890412241917e-05, "loss": 0.7274, "step": 4875 }, { "epoch": 2.617965840072481, "grad_norm": 0.6875, "learning_rate": 2.3177467638038555e-05, "loss": 0.8235, "step": 4876 }, { "epoch": 2.618502734807557, "grad_norm": 0.65234375, "learning_rate": 2.3169045071809215e-05, "loss": 0.9587, "step": 4877 }, { "epoch": 2.619039629542633, "grad_norm": 0.7734375, "learning_rate": 2.3160622714515012e-05, "loss": 0.9466, "step": 4878 }, { "epoch": 2.6195765242777087, "grad_norm": 0.7734375, "learning_rate": 2.3152200567117057e-05, "loss": 0.8643, "step": 4879 }, { "epoch": 2.620113419012785, "grad_norm": 1.0625, "learning_rate": 2.314377863057642e-05, "loss": 0.8484, "step": 4880 }, { "epoch": 2.620650313747861, "grad_norm": 0.74609375, "learning_rate": 2.313535690585414e-05, "loss": 0.7703, "step": 4881 }, { "epoch": 2.6211872084829366, "grad_norm": 0.703125, "learning_rate": 2.3126935393911255e-05, "loss": 0.8477, "step": 4882 }, { "epoch": 2.6217241032180127, "grad_norm": 0.75390625, "learning_rate": 2.311851409570877e-05, "loss": 1.0015, "step": 4883 }, { "epoch": 2.622260997953089, "grad_norm": 0.55078125, "learning_rate": 2.3110093012207647e-05, "loss": 0.6186, "step": 4884 }, { "epoch": 2.622797892688165, "grad_norm": 0.671875, "learning_rate": 2.310167214436885e-05, "loss": 0.8147, "step": 4885 }, { "epoch": 2.6233347874232407, "grad_norm": 0.6953125, "learning_rate": 2.309325149315329e-05, "loss": 0.7688, "step": 4886 }, { "epoch": 2.623871682158317, "grad_norm": 0.68359375, "learning_rate": 2.308483105952189e-05, "loss": 0.8862, "step": 4887 }, { "epoch": 2.624408576893393, "grad_norm": 0.61328125, "learning_rate": 2.307641084443552e-05, "loss": 0.8191, "step": 4888 }, { "epoch": 2.624945471628469, "grad_norm": 0.67578125, "learning_rate": 2.3067990848855026e-05, "loss": 0.6825, "step": 4889 }, { "epoch": 2.6254823663635447, "grad_norm": 0.6796875, "learning_rate": 2.3059571073741248e-05, "loss": 0.7945, "step": 4890 }, { "epoch": 2.626019261098621, "grad_norm": 0.59765625, "learning_rate": 2.3051151520054994e-05, "loss": 0.6775, "step": 4891 }, { "epoch": 2.626556155833697, "grad_norm": 0.734375, "learning_rate": 2.3042732188757023e-05, "loss": 1.176, "step": 4892 }, { "epoch": 2.6270930505687726, "grad_norm": 0.640625, "learning_rate": 2.3034313080808095e-05, "loss": 0.8802, "step": 4893 }, { "epoch": 2.6276299453038487, "grad_norm": 0.66015625, "learning_rate": 2.302589419716894e-05, "loss": 0.7208, "step": 4894 }, { "epoch": 2.628166840038925, "grad_norm": 0.65234375, "learning_rate": 2.301747553880025e-05, "loss": 0.7477, "step": 4895 }, { "epoch": 2.628703734774001, "grad_norm": 0.8203125, "learning_rate": 2.3009057106662714e-05, "loss": 1.0084, "step": 4896 }, { "epoch": 2.629240629509077, "grad_norm": 0.72265625, "learning_rate": 2.3000638901716976e-05, "loss": 0.87, "step": 4897 }, { "epoch": 2.629777524244153, "grad_norm": 0.6640625, "learning_rate": 2.2992220924923652e-05, "loss": 0.7963, "step": 4898 }, { "epoch": 2.630314418979229, "grad_norm": 0.83203125, "learning_rate": 2.2983803177243352e-05, "loss": 0.9892, "step": 4899 }, { "epoch": 2.630851313714305, "grad_norm": 0.7578125, "learning_rate": 2.297538565963665e-05, "loss": 0.7251, "step": 4900 }, { "epoch": 2.630851313714305, "eval_loss": 0.9034591317176819, "eval_runtime": 46.1079, "eval_samples_per_second": 20.864, "eval_steps_per_second": 20.864, "step": 4900 }, { "epoch": 2.6313882084493807, "grad_norm": 0.75, "learning_rate": 2.296696837306408e-05, "loss": 0.9207, "step": 4901 }, { "epoch": 2.631925103184457, "grad_norm": 0.69140625, "learning_rate": 2.295855131848616e-05, "loss": 0.86, "step": 4902 }, { "epoch": 2.632461997919533, "grad_norm": 0.84765625, "learning_rate": 2.2950134496863386e-05, "loss": 1.002, "step": 4903 }, { "epoch": 2.632998892654609, "grad_norm": 0.8125, "learning_rate": 2.2941717909156228e-05, "loss": 0.8904, "step": 4904 }, { "epoch": 2.6335357873896847, "grad_norm": 0.66015625, "learning_rate": 2.2933301556325123e-05, "loss": 0.8564, "step": 4905 }, { "epoch": 2.634072682124761, "grad_norm": 0.73046875, "learning_rate": 2.2924885439330477e-05, "loss": 0.8905, "step": 4906 }, { "epoch": 2.634609576859837, "grad_norm": 0.68359375, "learning_rate": 2.291646955913269e-05, "loss": 0.7895, "step": 4907 }, { "epoch": 2.635146471594913, "grad_norm": 0.68359375, "learning_rate": 2.2908053916692117e-05, "loss": 0.8444, "step": 4908 }, { "epoch": 2.635683366329989, "grad_norm": 0.70703125, "learning_rate": 2.289963851296908e-05, "loss": 0.828, "step": 4909 }, { "epoch": 2.636220261065065, "grad_norm": 0.81640625, "learning_rate": 2.2891223348923884e-05, "loss": 1.0982, "step": 4910 }, { "epoch": 2.636757155800141, "grad_norm": 0.73046875, "learning_rate": 2.2882808425516805e-05, "loss": 0.7755, "step": 4911 }, { "epoch": 2.6372940505352167, "grad_norm": 1.609375, "learning_rate": 2.2874393743708104e-05, "loss": 0.9889, "step": 4912 }, { "epoch": 2.637830945270293, "grad_norm": 0.68359375, "learning_rate": 2.2865979304457997e-05, "loss": 0.7403, "step": 4913 }, { "epoch": 2.638367840005369, "grad_norm": 0.69921875, "learning_rate": 2.2857565108726667e-05, "loss": 0.8106, "step": 4914 }, { "epoch": 2.638904734740445, "grad_norm": 0.6015625, "learning_rate": 2.2849151157474298e-05, "loss": 0.7078, "step": 4915 }, { "epoch": 2.639441629475521, "grad_norm": 0.6015625, "learning_rate": 2.2840737451661018e-05, "loss": 0.7153, "step": 4916 }, { "epoch": 2.639978524210597, "grad_norm": 0.80859375, "learning_rate": 2.2832323992246945e-05, "loss": 0.8945, "step": 4917 }, { "epoch": 2.640515418945673, "grad_norm": 0.7109375, "learning_rate": 2.2823910780192152e-05, "loss": 0.8071, "step": 4918 }, { "epoch": 2.641052313680749, "grad_norm": 0.58203125, "learning_rate": 2.281549781645669e-05, "loss": 0.7665, "step": 4919 }, { "epoch": 2.641589208415825, "grad_norm": 0.73046875, "learning_rate": 2.2807085102000598e-05, "loss": 0.8724, "step": 4920 }, { "epoch": 2.642126103150901, "grad_norm": 0.890625, "learning_rate": 2.2798672637783865e-05, "loss": 0.9079, "step": 4921 }, { "epoch": 2.642662997885977, "grad_norm": 0.8203125, "learning_rate": 2.279026042476646e-05, "loss": 0.9932, "step": 4922 }, { "epoch": 2.643199892621053, "grad_norm": 0.7109375, "learning_rate": 2.2781848463908328e-05, "loss": 0.9764, "step": 4923 }, { "epoch": 2.643736787356129, "grad_norm": 0.74609375, "learning_rate": 2.2773436756169378e-05, "loss": 0.917, "step": 4924 }, { "epoch": 2.644273682091205, "grad_norm": 0.71875, "learning_rate": 2.2765025302509497e-05, "loss": 1.0003, "step": 4925 }, { "epoch": 2.644810576826281, "grad_norm": 0.640625, "learning_rate": 2.2756614103888523e-05, "loss": 0.8843, "step": 4926 }, { "epoch": 2.645347471561357, "grad_norm": 0.59765625, "learning_rate": 2.2748203161266286e-05, "loss": 0.7537, "step": 4927 }, { "epoch": 2.645884366296433, "grad_norm": 0.78125, "learning_rate": 2.2739792475602596e-05, "loss": 0.874, "step": 4928 }, { "epoch": 2.646421261031509, "grad_norm": 0.6171875, "learning_rate": 2.2731382047857204e-05, "loss": 0.8862, "step": 4929 }, { "epoch": 2.646958155766585, "grad_norm": 0.78515625, "learning_rate": 2.272297187898984e-05, "loss": 0.8466, "step": 4930 }, { "epoch": 2.647495050501661, "grad_norm": 0.9609375, "learning_rate": 2.271456196996023e-05, "loss": 0.8819, "step": 4931 }, { "epoch": 2.648031945236737, "grad_norm": 0.671875, "learning_rate": 2.2706152321728045e-05, "loss": 0.7642, "step": 4932 }, { "epoch": 2.648568839971813, "grad_norm": 0.65625, "learning_rate": 2.269774293525293e-05, "loss": 0.8886, "step": 4933 }, { "epoch": 2.649105734706889, "grad_norm": 0.703125, "learning_rate": 2.26893338114945e-05, "loss": 0.9149, "step": 4934 }, { "epoch": 2.6496426294419653, "grad_norm": 0.66015625, "learning_rate": 2.2680924951412337e-05, "loss": 0.7346, "step": 4935 }, { "epoch": 2.650179524177041, "grad_norm": 0.74609375, "learning_rate": 2.267251635596601e-05, "loss": 0.8143, "step": 4936 }, { "epoch": 2.650716418912117, "grad_norm": 0.7265625, "learning_rate": 2.2664108026115043e-05, "loss": 0.9506, "step": 4937 }, { "epoch": 2.651253313647193, "grad_norm": 0.6875, "learning_rate": 2.2655699962818934e-05, "loss": 0.8035, "step": 4938 }, { "epoch": 2.651790208382269, "grad_norm": 0.65625, "learning_rate": 2.2647292167037144e-05, "loss": 0.7886, "step": 4939 }, { "epoch": 2.652327103117345, "grad_norm": 0.6875, "learning_rate": 2.263888463972911e-05, "loss": 0.7561, "step": 4940 }, { "epoch": 2.652863997852421, "grad_norm": 0.6953125, "learning_rate": 2.2630477381854247e-05, "loss": 0.839, "step": 4941 }, { "epoch": 2.6534008925874972, "grad_norm": 0.6484375, "learning_rate": 2.2622070394371926e-05, "loss": 0.7929, "step": 4942 }, { "epoch": 2.653937787322573, "grad_norm": 0.609375, "learning_rate": 2.261366367824148e-05, "loss": 0.7112, "step": 4943 }, { "epoch": 2.654474682057649, "grad_norm": 0.81640625, "learning_rate": 2.2605257234422224e-05, "loss": 0.8821, "step": 4944 }, { "epoch": 2.655011576792725, "grad_norm": 0.66015625, "learning_rate": 2.2596851063873446e-05, "loss": 0.892, "step": 4945 }, { "epoch": 2.6555484715278013, "grad_norm": 0.6328125, "learning_rate": 2.25884451675544e-05, "loss": 0.8032, "step": 4946 }, { "epoch": 2.656085366262877, "grad_norm": 0.703125, "learning_rate": 2.2580039546424292e-05, "loss": 0.9186, "step": 4947 }, { "epoch": 2.656622260997953, "grad_norm": 0.71875, "learning_rate": 2.2571634201442326e-05, "loss": 0.8775, "step": 4948 }, { "epoch": 2.657159155733029, "grad_norm": 0.7421875, "learning_rate": 2.2563229133567647e-05, "loss": 0.807, "step": 4949 }, { "epoch": 2.657696050468105, "grad_norm": 0.66015625, "learning_rate": 2.2554824343759392e-05, "loss": 0.7873, "step": 4950 }, { "epoch": 2.657696050468105, "eval_loss": 0.9035736918449402, "eval_runtime": 46.1198, "eval_samples_per_second": 20.859, "eval_steps_per_second": 20.859, "step": 4950 }, { "epoch": 2.658232945203181, "grad_norm": 2.859375, "learning_rate": 2.254641983297664e-05, "loss": 0.9557, "step": 4951 }, { "epoch": 2.658769839938257, "grad_norm": 0.7421875, "learning_rate": 2.253801560217845e-05, "loss": 0.8769, "step": 4952 }, { "epoch": 2.6593067346733332, "grad_norm": 0.81640625, "learning_rate": 2.2529611652323866e-05, "loss": 1.0339, "step": 4953 }, { "epoch": 2.6598436294084093, "grad_norm": 0.6640625, "learning_rate": 2.2521207984371872e-05, "loss": 0.9272, "step": 4954 }, { "epoch": 2.660380524143485, "grad_norm": 0.73828125, "learning_rate": 2.2512804599281438e-05, "loss": 0.9604, "step": 4955 }, { "epoch": 2.660917418878561, "grad_norm": 0.69921875, "learning_rate": 2.25044014980115e-05, "loss": 0.8449, "step": 4956 }, { "epoch": 2.6614543136136373, "grad_norm": 0.65234375, "learning_rate": 2.249599868152096e-05, "loss": 0.8504, "step": 4957 }, { "epoch": 2.661991208348713, "grad_norm": 0.6640625, "learning_rate": 2.248759615076868e-05, "loss": 0.9573, "step": 4958 }, { "epoch": 2.662528103083789, "grad_norm": 0.66015625, "learning_rate": 2.247919390671349e-05, "loss": 0.7302, "step": 4959 }, { "epoch": 2.663064997818865, "grad_norm": 0.72265625, "learning_rate": 2.24707919503142e-05, "loss": 1.0578, "step": 4960 }, { "epoch": 2.6636018925539413, "grad_norm": 0.58984375, "learning_rate": 2.2462390282529577e-05, "loss": 0.7379, "step": 4961 }, { "epoch": 2.664138787289017, "grad_norm": 0.8671875, "learning_rate": 2.2453988904318364e-05, "loss": 0.9434, "step": 4962 }, { "epoch": 2.664675682024093, "grad_norm": 0.66796875, "learning_rate": 2.2445587816639252e-05, "loss": 0.8265, "step": 4963 }, { "epoch": 2.6652125767591692, "grad_norm": 0.7109375, "learning_rate": 2.2437187020450926e-05, "loss": 1.0917, "step": 4964 }, { "epoch": 2.6657494714942453, "grad_norm": 0.6640625, "learning_rate": 2.2428786516712015e-05, "loss": 0.8258, "step": 4965 }, { "epoch": 2.666286366229321, "grad_norm": 0.75390625, "learning_rate": 2.242038630638112e-05, "loss": 1.0332, "step": 4966 }, { "epoch": 2.666823260964397, "grad_norm": 0.61328125, "learning_rate": 2.241198639041683e-05, "loss": 0.7824, "step": 4967 }, { "epoch": 2.6673601556994733, "grad_norm": 0.64453125, "learning_rate": 2.2403586769777652e-05, "loss": 0.9146, "step": 4968 }, { "epoch": 2.667897050434549, "grad_norm": 0.59765625, "learning_rate": 2.239518744542211e-05, "loss": 0.7834, "step": 4969 }, { "epoch": 2.668433945169625, "grad_norm": 0.68359375, "learning_rate": 2.238678841830867e-05, "loss": 0.7471, "step": 4970 }, { "epoch": 2.668970839904701, "grad_norm": 0.6484375, "learning_rate": 2.2378389689395754e-05, "loss": 0.7633, "step": 4971 }, { "epoch": 2.6695077346397773, "grad_norm": 0.6015625, "learning_rate": 2.2369991259641785e-05, "loss": 0.8676, "step": 4972 }, { "epoch": 2.6700446293748534, "grad_norm": 0.73828125, "learning_rate": 2.2361593130005118e-05, "loss": 0.8312, "step": 4973 }, { "epoch": 2.670581524109929, "grad_norm": 0.69140625, "learning_rate": 2.235319530144409e-05, "loss": 0.7971, "step": 4974 }, { "epoch": 2.6711184188450052, "grad_norm": 0.65234375, "learning_rate": 2.2344797774916998e-05, "loss": 0.833, "step": 4975 }, { "epoch": 2.6716553135800813, "grad_norm": 0.73828125, "learning_rate": 2.2336400551382096e-05, "loss": 0.8202, "step": 4976 }, { "epoch": 2.672192208315157, "grad_norm": 0.6484375, "learning_rate": 2.232800363179763e-05, "loss": 0.743, "step": 4977 }, { "epoch": 2.672729103050233, "grad_norm": 0.6640625, "learning_rate": 2.2319607017121786e-05, "loss": 0.844, "step": 4978 }, { "epoch": 2.6732659977853093, "grad_norm": 0.703125, "learning_rate": 2.2311210708312726e-05, "loss": 0.8646, "step": 4979 }, { "epoch": 2.6738028925203854, "grad_norm": 0.79296875, "learning_rate": 2.230281470632857e-05, "loss": 0.962, "step": 4980 }, { "epoch": 2.674339787255461, "grad_norm": 0.6484375, "learning_rate": 2.2294419012127418e-05, "loss": 1.1699, "step": 4981 }, { "epoch": 2.674876681990537, "grad_norm": 0.6328125, "learning_rate": 2.228602362666732e-05, "loss": 0.849, "step": 4982 }, { "epoch": 2.6754135767256133, "grad_norm": 0.77734375, "learning_rate": 2.2277628550906293e-05, "loss": 0.9349, "step": 4983 }, { "epoch": 2.6759504714606894, "grad_norm": 0.59375, "learning_rate": 2.226923378580234e-05, "loss": 0.6634, "step": 4984 }, { "epoch": 2.676487366195765, "grad_norm": 0.73828125, "learning_rate": 2.2260839332313375e-05, "loss": 0.8338, "step": 4985 }, { "epoch": 2.6770242609308412, "grad_norm": 0.83203125, "learning_rate": 2.2252445191397337e-05, "loss": 0.839, "step": 4986 }, { "epoch": 2.6775611556659173, "grad_norm": 0.69140625, "learning_rate": 2.22440513640121e-05, "loss": 0.8034, "step": 4987 }, { "epoch": 2.678098050400993, "grad_norm": 0.72265625, "learning_rate": 2.2235657851115497e-05, "loss": 0.9674, "step": 4988 }, { "epoch": 2.678634945136069, "grad_norm": 0.66796875, "learning_rate": 2.2227264653665345e-05, "loss": 0.7859, "step": 4989 }, { "epoch": 2.6791718398711453, "grad_norm": 0.65625, "learning_rate": 2.221887177261941e-05, "loss": 0.7488, "step": 4990 }, { "epoch": 2.6797087346062214, "grad_norm": 0.65625, "learning_rate": 2.221047920893542e-05, "loss": 0.6674, "step": 4991 }, { "epoch": 2.6802456293412975, "grad_norm": 0.81640625, "learning_rate": 2.22020869635711e-05, "loss": 0.8728, "step": 4992 }, { "epoch": 2.680782524076373, "grad_norm": 0.68359375, "learning_rate": 2.2193695037484068e-05, "loss": 0.91, "step": 4993 }, { "epoch": 2.6813194188114493, "grad_norm": 0.79296875, "learning_rate": 2.2185303431631982e-05, "loss": 0.9383, "step": 4994 }, { "epoch": 2.6818563135465254, "grad_norm": 0.7421875, "learning_rate": 2.2176912146972414e-05, "loss": 0.8167, "step": 4995 }, { "epoch": 2.682393208281601, "grad_norm": 0.78515625, "learning_rate": 2.216852118446292e-05, "loss": 1.0551, "step": 4996 }, { "epoch": 2.6829301030166772, "grad_norm": 0.88671875, "learning_rate": 2.216013054506102e-05, "loss": 0.9115, "step": 4997 }, { "epoch": 2.6834669977517533, "grad_norm": 0.578125, "learning_rate": 2.215174022972419e-05, "loss": 0.7588, "step": 4998 }, { "epoch": 2.684003892486829, "grad_norm": 0.62109375, "learning_rate": 2.2143350239409867e-05, "loss": 0.7433, "step": 4999 }, { "epoch": 2.684540787221905, "grad_norm": 0.734375, "learning_rate": 2.213496057507547e-05, "loss": 0.8441, "step": 5000 }, { "epoch": 2.684540787221905, "eval_loss": 0.903358519077301, "eval_runtime": 45.9635, "eval_samples_per_second": 20.93, "eval_steps_per_second": 20.93, "step": 5000 }, { "epoch": 2.6850776819569813, "grad_norm": 0.7578125, "learning_rate": 2.212657123767834e-05, "loss": 0.9228, "step": 5001 }, { "epoch": 2.6856145766920574, "grad_norm": 0.72265625, "learning_rate": 2.2118182228175824e-05, "loss": 0.7715, "step": 5002 }, { "epoch": 2.6861514714271335, "grad_norm": 0.65625, "learning_rate": 2.2109793547525215e-05, "loss": 0.7831, "step": 5003 }, { "epoch": 2.686688366162209, "grad_norm": 0.66796875, "learning_rate": 2.2101405196683756e-05, "loss": 0.7604, "step": 5004 }, { "epoch": 2.6872252608972853, "grad_norm": 0.75390625, "learning_rate": 2.2093017176608677e-05, "loss": 0.948, "step": 5005 }, { "epoch": 2.6877621556323614, "grad_norm": 0.74609375, "learning_rate": 2.208462948825715e-05, "loss": 0.9593, "step": 5006 }, { "epoch": 2.688299050367437, "grad_norm": 0.69140625, "learning_rate": 2.2076242132586317e-05, "loss": 0.7838, "step": 5007 }, { "epoch": 2.6888359451025132, "grad_norm": 0.65625, "learning_rate": 2.2067855110553286e-05, "loss": 0.666, "step": 5008 }, { "epoch": 2.6893728398375893, "grad_norm": 0.76171875, "learning_rate": 2.205946842311513e-05, "loss": 0.9892, "step": 5009 }, { "epoch": 2.6899097345726655, "grad_norm": 0.79296875, "learning_rate": 2.2051082071228854e-05, "loss": 0.8428, "step": 5010 }, { "epoch": 2.6904466293077416, "grad_norm": 0.65234375, "learning_rate": 2.2042696055851463e-05, "loss": 0.9376, "step": 5011 }, { "epoch": 2.6909835240428173, "grad_norm": 0.703125, "learning_rate": 2.2034310377939895e-05, "loss": 0.7812, "step": 5012 }, { "epoch": 2.6915204187778934, "grad_norm": 0.640625, "learning_rate": 2.2025925038451075e-05, "loss": 0.8864, "step": 5013 }, { "epoch": 2.6920573135129695, "grad_norm": 0.65234375, "learning_rate": 2.2017540038341875e-05, "loss": 0.8902, "step": 5014 }, { "epoch": 2.692594208248045, "grad_norm": 0.69921875, "learning_rate": 2.2009155378569117e-05, "loss": 0.8702, "step": 5015 }, { "epoch": 2.6931311029831213, "grad_norm": 0.7265625, "learning_rate": 2.2000771060089617e-05, "loss": 0.9905, "step": 5016 }, { "epoch": 2.6936679977181974, "grad_norm": 0.7265625, "learning_rate": 2.199238708386012e-05, "loss": 1.1197, "step": 5017 }, { "epoch": 2.694204892453273, "grad_norm": 0.78515625, "learning_rate": 2.198400345083735e-05, "loss": 0.9472, "step": 5018 }, { "epoch": 2.6947417871883492, "grad_norm": 0.59375, "learning_rate": 2.1975620161977973e-05, "loss": 0.7182, "step": 5019 }, { "epoch": 2.6952786819234253, "grad_norm": 0.640625, "learning_rate": 2.1967237218238628e-05, "loss": 0.8, "step": 5020 }, { "epoch": 2.6958155766585015, "grad_norm": 0.6796875, "learning_rate": 2.1958854620575936e-05, "loss": 0.83, "step": 5021 }, { "epoch": 2.6963524713935776, "grad_norm": 0.6875, "learning_rate": 2.195047236994644e-05, "loss": 0.8183, "step": 5022 }, { "epoch": 2.6968893661286533, "grad_norm": 0.83984375, "learning_rate": 2.1942090467306668e-05, "loss": 0.8642, "step": 5023 }, { "epoch": 2.6974262608637294, "grad_norm": 0.6875, "learning_rate": 2.193370891361309e-05, "loss": 0.7442, "step": 5024 }, { "epoch": 2.6979631555988055, "grad_norm": 0.74609375, "learning_rate": 2.1925327709822167e-05, "loss": 0.9279, "step": 5025 }, { "epoch": 2.698500050333881, "grad_norm": 0.58984375, "learning_rate": 2.1916946856890293e-05, "loss": 0.7511, "step": 5026 }, { "epoch": 2.6990369450689573, "grad_norm": 0.73046875, "learning_rate": 2.1908566355773817e-05, "loss": 0.8585, "step": 5027 }, { "epoch": 2.6995738398040334, "grad_norm": 0.86328125, "learning_rate": 2.1900186207429078e-05, "loss": 0.9793, "step": 5028 }, { "epoch": 2.7001107345391095, "grad_norm": 0.58984375, "learning_rate": 2.1891806412812338e-05, "loss": 0.7375, "step": 5029 }, { "epoch": 2.7006476292741857, "grad_norm": 0.74609375, "learning_rate": 2.1883426972879853e-05, "loss": 0.9499, "step": 5030 }, { "epoch": 2.7011845240092613, "grad_norm": 0.625, "learning_rate": 2.1875047888587823e-05, "loss": 0.814, "step": 5031 }, { "epoch": 2.7017214187443375, "grad_norm": 0.78515625, "learning_rate": 2.186666916089239e-05, "loss": 0.7674, "step": 5032 }, { "epoch": 2.7022583134794136, "grad_norm": 0.69921875, "learning_rate": 2.1858290790749698e-05, "loss": 0.7517, "step": 5033 }, { "epoch": 2.7027952082144893, "grad_norm": 0.6875, "learning_rate": 2.1849912779115818e-05, "loss": 0.8155, "step": 5034 }, { "epoch": 2.7033321029495654, "grad_norm": 0.74609375, "learning_rate": 2.1841535126946776e-05, "loss": 0.9909, "step": 5035 }, { "epoch": 2.7038689976846415, "grad_norm": 0.73046875, "learning_rate": 2.1833157835198574e-05, "loss": 0.9182, "step": 5036 }, { "epoch": 2.704405892419717, "grad_norm": 0.6484375, "learning_rate": 2.1824780904827164e-05, "loss": 0.8444, "step": 5037 }, { "epoch": 2.7049427871547933, "grad_norm": 0.69140625, "learning_rate": 2.181640433678847e-05, "loss": 0.8912, "step": 5038 }, { "epoch": 2.7054796818898694, "grad_norm": 1.03125, "learning_rate": 2.1808028132038354e-05, "loss": 0.7917, "step": 5039 }, { "epoch": 2.7060165766249455, "grad_norm": 0.56640625, "learning_rate": 2.179965229153265e-05, "loss": 0.6682, "step": 5040 }, { "epoch": 2.7065534713600217, "grad_norm": 0.7421875, "learning_rate": 2.1791276816227152e-05, "loss": 0.7958, "step": 5041 }, { "epoch": 2.7070903660950973, "grad_norm": 0.61328125, "learning_rate": 2.178290170707761e-05, "loss": 0.9435, "step": 5042 }, { "epoch": 2.7076272608301735, "grad_norm": 0.6640625, "learning_rate": 2.177452696503972e-05, "loss": 0.796, "step": 5043 }, { "epoch": 2.7081641555652496, "grad_norm": 0.6796875, "learning_rate": 2.176615259106915e-05, "loss": 0.8683, "step": 5044 }, { "epoch": 2.7087010503003253, "grad_norm": 0.58984375, "learning_rate": 2.1757778586121517e-05, "loss": 0.7923, "step": 5045 }, { "epoch": 2.7092379450354014, "grad_norm": 0.73046875, "learning_rate": 2.1749404951152412e-05, "loss": 0.6945, "step": 5046 }, { "epoch": 2.7097748397704775, "grad_norm": 0.67578125, "learning_rate": 2.1741031687117368e-05, "loss": 0.8413, "step": 5047 }, { "epoch": 2.7103117345055536, "grad_norm": 0.671875, "learning_rate": 2.1732658794971874e-05, "loss": 1.0605, "step": 5048 }, { "epoch": 2.7108486292406297, "grad_norm": 0.62890625, "learning_rate": 2.1724286275671396e-05, "loss": 0.7467, "step": 5049 }, { "epoch": 2.7113855239757054, "grad_norm": 0.70703125, "learning_rate": 2.1715914130171337e-05, "loss": 0.9242, "step": 5050 }, { "epoch": 2.7113855239757054, "eval_loss": 0.9034383296966553, "eval_runtime": 46.065, "eval_samples_per_second": 20.884, "eval_steps_per_second": 20.884, "step": 5050 }, { "epoch": 2.7119224187107815, "grad_norm": 0.671875, "learning_rate": 2.170754235942707e-05, "loss": 0.6494, "step": 5051 }, { "epoch": 2.7124593134458577, "grad_norm": 0.66796875, "learning_rate": 2.169917096439391e-05, "loss": 0.8253, "step": 5052 }, { "epoch": 2.7129962081809333, "grad_norm": 0.6640625, "learning_rate": 2.169079994602714e-05, "loss": 0.8121, "step": 5053 }, { "epoch": 2.7135331029160095, "grad_norm": 0.77734375, "learning_rate": 2.1682429305282008e-05, "loss": 0.8765, "step": 5054 }, { "epoch": 2.7140699976510856, "grad_norm": 0.8125, "learning_rate": 2.1674059043113707e-05, "loss": 1.0917, "step": 5055 }, { "epoch": 2.7146068923861613, "grad_norm": 0.71484375, "learning_rate": 2.1665689160477383e-05, "loss": 1.0088, "step": 5056 }, { "epoch": 2.7151437871212374, "grad_norm": 0.765625, "learning_rate": 2.1657319658328158e-05, "loss": 0.8097, "step": 5057 }, { "epoch": 2.7156806818563135, "grad_norm": 0.99609375, "learning_rate": 2.1648950537621087e-05, "loss": 1.2574, "step": 5058 }, { "epoch": 2.7162175765913896, "grad_norm": 0.61328125, "learning_rate": 2.1640581799311208e-05, "loss": 0.8028, "step": 5059 }, { "epoch": 2.7167544713264657, "grad_norm": 0.72265625, "learning_rate": 2.1632213444353482e-05, "loss": 0.7571, "step": 5060 }, { "epoch": 2.7172913660615414, "grad_norm": 0.72265625, "learning_rate": 2.1623845473702848e-05, "loss": 0.743, "step": 5061 }, { "epoch": 2.7178282607966175, "grad_norm": 0.67578125, "learning_rate": 2.1615477888314203e-05, "loss": 0.9261, "step": 5062 }, { "epoch": 2.7183651555316937, "grad_norm": 0.78515625, "learning_rate": 2.1607110689142393e-05, "loss": 0.8333, "step": 5063 }, { "epoch": 2.7189020502667693, "grad_norm": 0.69140625, "learning_rate": 2.1598743877142222e-05, "loss": 0.8227, "step": 5064 }, { "epoch": 2.7194389450018455, "grad_norm": 0.6640625, "learning_rate": 2.159037745326844e-05, "loss": 0.925, "step": 5065 }, { "epoch": 2.7199758397369216, "grad_norm": 0.80078125, "learning_rate": 2.1582011418475782e-05, "loss": 0.973, "step": 5066 }, { "epoch": 2.7205127344719977, "grad_norm": 0.9140625, "learning_rate": 2.15736457737189e-05, "loss": 0.9659, "step": 5067 }, { "epoch": 2.721049629207074, "grad_norm": 0.609375, "learning_rate": 2.1565280519952435e-05, "loss": 0.7824, "step": 5068 }, { "epoch": 2.7215865239421495, "grad_norm": 0.66796875, "learning_rate": 2.1556915658130954e-05, "loss": 0.8146, "step": 5069 }, { "epoch": 2.7221234186772256, "grad_norm": 0.76171875, "learning_rate": 2.154855118920899e-05, "loss": 0.9196, "step": 5070 }, { "epoch": 2.7226603134123017, "grad_norm": 1.59375, "learning_rate": 2.154018711414106e-05, "loss": 1.2276, "step": 5071 }, { "epoch": 2.7231972081473774, "grad_norm": 0.7578125, "learning_rate": 2.1531823433881588e-05, "loss": 0.8539, "step": 5072 }, { "epoch": 2.7237341028824535, "grad_norm": 0.76171875, "learning_rate": 2.152346014938498e-05, "loss": 0.7373, "step": 5073 }, { "epoch": 2.7242709976175297, "grad_norm": 0.78125, "learning_rate": 2.1515097261605596e-05, "loss": 0.6589, "step": 5074 }, { "epoch": 2.7248078923526053, "grad_norm": 0.73046875, "learning_rate": 2.150673477149775e-05, "loss": 0.9421, "step": 5075 }, { "epoch": 2.7253447870876815, "grad_norm": 0.65625, "learning_rate": 2.149837268001571e-05, "loss": 0.8746, "step": 5076 }, { "epoch": 2.7258816818227576, "grad_norm": 0.64453125, "learning_rate": 2.149001098811369e-05, "loss": 0.6623, "step": 5077 }, { "epoch": 2.7264185765578337, "grad_norm": 0.6484375, "learning_rate": 2.148164969674586e-05, "loss": 0.8815, "step": 5078 }, { "epoch": 2.72695547129291, "grad_norm": 0.6640625, "learning_rate": 2.147328880686636e-05, "loss": 0.8938, "step": 5079 }, { "epoch": 2.7274923660279855, "grad_norm": 0.74609375, "learning_rate": 2.146492831942927e-05, "loss": 0.8871, "step": 5080 }, { "epoch": 2.7280292607630616, "grad_norm": 0.984375, "learning_rate": 2.145656823538862e-05, "loss": 0.9893, "step": 5081 }, { "epoch": 2.7285661554981377, "grad_norm": 0.65625, "learning_rate": 2.1448208555698416e-05, "loss": 0.7905, "step": 5082 }, { "epoch": 2.7291030502332134, "grad_norm": 0.77734375, "learning_rate": 2.1439849281312593e-05, "loss": 1.0244, "step": 5083 }, { "epoch": 2.7296399449682895, "grad_norm": 0.6953125, "learning_rate": 2.143149041318506e-05, "loss": 0.7836, "step": 5084 }, { "epoch": 2.7301768397033657, "grad_norm": 0.73046875, "learning_rate": 2.1423131952269655e-05, "loss": 0.8897, "step": 5085 }, { "epoch": 2.730713734438442, "grad_norm": 0.6875, "learning_rate": 2.1414773899520187e-05, "loss": 0.9864, "step": 5086 }, { "epoch": 2.731250629173518, "grad_norm": 0.60546875, "learning_rate": 2.1406416255890428e-05, "loss": 0.6947, "step": 5087 }, { "epoch": 2.7317875239085936, "grad_norm": 0.60546875, "learning_rate": 2.139805902233408e-05, "loss": 0.8749, "step": 5088 }, { "epoch": 2.7323244186436697, "grad_norm": 0.66796875, "learning_rate": 2.1389702199804808e-05, "loss": 0.9514, "step": 5089 }, { "epoch": 2.732861313378746, "grad_norm": 0.671875, "learning_rate": 2.1381345789256245e-05, "loss": 0.7669, "step": 5090 }, { "epoch": 2.7333982081138215, "grad_norm": 0.7421875, "learning_rate": 2.137298979164195e-05, "loss": 1.098, "step": 5091 }, { "epoch": 2.7339351028488976, "grad_norm": 0.7109375, "learning_rate": 2.1364634207915447e-05, "loss": 1.1326, "step": 5092 }, { "epoch": 2.7344719975839737, "grad_norm": 0.69140625, "learning_rate": 2.1356279039030235e-05, "loss": 0.8915, "step": 5093 }, { "epoch": 2.7350088923190494, "grad_norm": 0.66796875, "learning_rate": 2.1347924285939714e-05, "loss": 0.8269, "step": 5094 }, { "epoch": 2.7355457870541255, "grad_norm": 0.74609375, "learning_rate": 2.1339569949597284e-05, "loss": 0.8303, "step": 5095 }, { "epoch": 2.7360826817892017, "grad_norm": 0.84765625, "learning_rate": 2.133121603095628e-05, "loss": 0.9537, "step": 5096 }, { "epoch": 2.736619576524278, "grad_norm": 0.765625, "learning_rate": 2.1322862530969984e-05, "loss": 0.7892, "step": 5097 }, { "epoch": 2.737156471259354, "grad_norm": 0.78515625, "learning_rate": 2.1314509450591647e-05, "loss": 1.0223, "step": 5098 }, { "epoch": 2.7376933659944296, "grad_norm": 1.015625, "learning_rate": 2.130615679077445e-05, "loss": 0.9654, "step": 5099 }, { "epoch": 2.7382302607295057, "grad_norm": 0.9765625, "learning_rate": 2.1297804552471547e-05, "loss": 0.8931, "step": 5100 }, { "epoch": 2.7382302607295057, "eval_loss": 0.9029310345649719, "eval_runtime": 47.2098, "eval_samples_per_second": 20.377, "eval_steps_per_second": 20.377, "step": 5100 }, { "epoch": 2.738767155464582, "grad_norm": 0.70703125, "learning_rate": 2.1289452736636035e-05, "loss": 0.7365, "step": 5101 }, { "epoch": 2.7393040501996575, "grad_norm": 0.66015625, "learning_rate": 2.1281101344220937e-05, "loss": 0.8493, "step": 5102 }, { "epoch": 2.7398409449347336, "grad_norm": 0.6875, "learning_rate": 2.1272750376179286e-05, "loss": 0.8451, "step": 5103 }, { "epoch": 2.7403778396698097, "grad_norm": 0.6328125, "learning_rate": 2.1264399833464015e-05, "loss": 0.8994, "step": 5104 }, { "epoch": 2.740914734404886, "grad_norm": 0.73046875, "learning_rate": 2.1256049717028032e-05, "loss": 0.9397, "step": 5105 }, { "epoch": 2.741451629139962, "grad_norm": 0.69921875, "learning_rate": 2.1247700027824182e-05, "loss": 0.6846, "step": 5106 }, { "epoch": 2.7419885238750377, "grad_norm": 0.609375, "learning_rate": 2.1239350766805285e-05, "loss": 0.738, "step": 5107 }, { "epoch": 2.742525418610114, "grad_norm": 0.7890625, "learning_rate": 2.1231001934924088e-05, "loss": 1.0414, "step": 5108 }, { "epoch": 2.74306231334519, "grad_norm": 0.7734375, "learning_rate": 2.12226535331333e-05, "loss": 0.8378, "step": 5109 }, { "epoch": 2.7435992080802656, "grad_norm": 0.69921875, "learning_rate": 2.1214305562385592e-05, "loss": 0.8825, "step": 5110 }, { "epoch": 2.7441361028153417, "grad_norm": 0.71484375, "learning_rate": 2.1205958023633546e-05, "loss": 0.8217, "step": 5111 }, { "epoch": 2.744672997550418, "grad_norm": 0.74609375, "learning_rate": 2.1197610917829742e-05, "loss": 0.8131, "step": 5112 }, { "epoch": 2.7452098922854935, "grad_norm": 0.65625, "learning_rate": 2.1189264245926687e-05, "loss": 0.7979, "step": 5113 }, { "epoch": 2.7457467870205696, "grad_norm": 0.62109375, "learning_rate": 2.1180918008876836e-05, "loss": 0.7882, "step": 5114 }, { "epoch": 2.7462836817556457, "grad_norm": 0.56640625, "learning_rate": 2.1172572207632608e-05, "loss": 0.7587, "step": 5115 }, { "epoch": 2.746820576490722, "grad_norm": 0.71484375, "learning_rate": 2.1164226843146365e-05, "loss": 0.9854, "step": 5116 }, { "epoch": 2.747357471225798, "grad_norm": 0.72265625, "learning_rate": 2.1155881916370407e-05, "loss": 0.8934, "step": 5117 }, { "epoch": 2.7478943659608737, "grad_norm": 0.69921875, "learning_rate": 2.114753742825702e-05, "loss": 0.8471, "step": 5118 }, { "epoch": 2.74843126069595, "grad_norm": 0.5859375, "learning_rate": 2.1139193379758386e-05, "loss": 0.8186, "step": 5119 }, { "epoch": 2.748968155431026, "grad_norm": 0.71875, "learning_rate": 2.1130849771826683e-05, "loss": 0.8046, "step": 5120 }, { "epoch": 2.7495050501661016, "grad_norm": 0.9375, "learning_rate": 2.112250660541402e-05, "loss": 1.1443, "step": 5121 }, { "epoch": 2.7500419449011777, "grad_norm": 0.85546875, "learning_rate": 2.1114163881472457e-05, "loss": 0.903, "step": 5122 }, { "epoch": 2.750578839636254, "grad_norm": 0.69140625, "learning_rate": 2.1105821600954007e-05, "loss": 0.9971, "step": 5123 }, { "epoch": 2.75111573437133, "grad_norm": 0.703125, "learning_rate": 2.109747976481063e-05, "loss": 0.7847, "step": 5124 }, { "epoch": 2.751652629106406, "grad_norm": 0.72265625, "learning_rate": 2.1089138373994223e-05, "loss": 0.9203, "step": 5125 }, { "epoch": 2.7521895238414817, "grad_norm": 0.69921875, "learning_rate": 2.1080797429456677e-05, "loss": 0.8991, "step": 5126 }, { "epoch": 2.752726418576558, "grad_norm": 0.7109375, "learning_rate": 2.1072456932149758e-05, "loss": 0.9665, "step": 5127 }, { "epoch": 2.753263313311634, "grad_norm": 0.66796875, "learning_rate": 2.106411688302525e-05, "loss": 0.9092, "step": 5128 }, { "epoch": 2.7538002080467097, "grad_norm": 0.64453125, "learning_rate": 2.1055777283034843e-05, "loss": 0.8094, "step": 5129 }, { "epoch": 2.754337102781786, "grad_norm": 0.625, "learning_rate": 2.1047438133130196e-05, "loss": 0.8451, "step": 5130 }, { "epoch": 2.754873997516862, "grad_norm": 0.72265625, "learning_rate": 2.1039099434262917e-05, "loss": 0.8954, "step": 5131 }, { "epoch": 2.7554108922519376, "grad_norm": 0.6171875, "learning_rate": 2.1030761187384557e-05, "loss": 0.7139, "step": 5132 }, { "epoch": 2.7559477869870137, "grad_norm": 0.74609375, "learning_rate": 2.10224233934466e-05, "loss": 0.6447, "step": 5133 }, { "epoch": 2.75648468172209, "grad_norm": 0.6796875, "learning_rate": 2.1014086053400516e-05, "loss": 0.879, "step": 5134 }, { "epoch": 2.757021576457166, "grad_norm": 0.68359375, "learning_rate": 2.1005749168197696e-05, "loss": 0.9523, "step": 5135 }, { "epoch": 2.757558471192242, "grad_norm": 0.70703125, "learning_rate": 2.0997412738789475e-05, "loss": 0.7948, "step": 5136 }, { "epoch": 2.7580953659273177, "grad_norm": 0.703125, "learning_rate": 2.0989076766127148e-05, "loss": 0.8665, "step": 5137 }, { "epoch": 2.758632260662394, "grad_norm": 0.7109375, "learning_rate": 2.0980741251161948e-05, "loss": 0.7218, "step": 5138 }, { "epoch": 2.75916915539747, "grad_norm": 0.87890625, "learning_rate": 2.097240619484508e-05, "loss": 0.9672, "step": 5139 }, { "epoch": 2.7597060501325457, "grad_norm": 0.7734375, "learning_rate": 2.0964071598127666e-05, "loss": 0.968, "step": 5140 }, { "epoch": 2.760242944867622, "grad_norm": 0.68359375, "learning_rate": 2.0955737461960793e-05, "loss": 0.8452, "step": 5141 }, { "epoch": 2.760779839602698, "grad_norm": 0.69921875, "learning_rate": 2.0947403787295495e-05, "loss": 0.8077, "step": 5142 }, { "epoch": 2.761316734337774, "grad_norm": 0.640625, "learning_rate": 2.0939070575082752e-05, "loss": 0.882, "step": 5143 }, { "epoch": 2.76185362907285, "grad_norm": 0.66015625, "learning_rate": 2.093073782627348e-05, "loss": 0.9186, "step": 5144 }, { "epoch": 2.762390523807926, "grad_norm": 0.85546875, "learning_rate": 2.0922405541818555e-05, "loss": 0.8085, "step": 5145 }, { "epoch": 2.762927418543002, "grad_norm": 0.65234375, "learning_rate": 2.0914073722668788e-05, "loss": 0.9689, "step": 5146 }, { "epoch": 2.763464313278078, "grad_norm": 0.63671875, "learning_rate": 2.090574236977496e-05, "loss": 0.7498, "step": 5147 }, { "epoch": 2.7640012080131537, "grad_norm": 0.734375, "learning_rate": 2.089741148408778e-05, "loss": 0.7993, "step": 5148 }, { "epoch": 2.76453810274823, "grad_norm": 0.76953125, "learning_rate": 2.0889081066557902e-05, "loss": 0.7807, "step": 5149 }, { "epoch": 2.765074997483306, "grad_norm": 0.75390625, "learning_rate": 2.088075111813593e-05, "loss": 1.0847, "step": 5150 }, { "epoch": 2.765074997483306, "eval_loss": 0.9028027057647705, "eval_runtime": 45.7427, "eval_samples_per_second": 21.031, "eval_steps_per_second": 21.031, "step": 5150 }, { "epoch": 2.7656118922183817, "grad_norm": 0.6171875, "learning_rate": 2.0872421639772426e-05, "loss": 0.8422, "step": 5151 }, { "epoch": 2.766148786953458, "grad_norm": 0.63671875, "learning_rate": 2.0864092632417892e-05, "loss": 0.8761, "step": 5152 }, { "epoch": 2.766685681688534, "grad_norm": 0.65234375, "learning_rate": 2.0855764097022758e-05, "loss": 0.6654, "step": 5153 }, { "epoch": 2.76722257642361, "grad_norm": 0.5859375, "learning_rate": 2.0847436034537428e-05, "loss": 0.7115, "step": 5154 }, { "epoch": 2.767759471158686, "grad_norm": 0.65234375, "learning_rate": 2.083910844591222e-05, "loss": 0.8087, "step": 5155 }, { "epoch": 2.768296365893762, "grad_norm": 0.91015625, "learning_rate": 2.0830781332097446e-05, "loss": 0.867, "step": 5156 }, { "epoch": 2.768833260628838, "grad_norm": 0.671875, "learning_rate": 2.0822454694043317e-05, "loss": 0.8529, "step": 5157 }, { "epoch": 2.769370155363914, "grad_norm": 0.61328125, "learning_rate": 2.0814128532700004e-05, "loss": 1.0199, "step": 5158 }, { "epoch": 2.7699070500989897, "grad_norm": 0.6875, "learning_rate": 2.0805802849017646e-05, "loss": 0.9396, "step": 5159 }, { "epoch": 2.770443944834066, "grad_norm": 0.71484375, "learning_rate": 2.07974776439463e-05, "loss": 0.8423, "step": 5160 }, { "epoch": 2.770980839569142, "grad_norm": 0.72265625, "learning_rate": 2.078915291843597e-05, "loss": 0.9325, "step": 5161 }, { "epoch": 2.771517734304218, "grad_norm": 0.734375, "learning_rate": 2.078082867343662e-05, "loss": 0.904, "step": 5162 }, { "epoch": 2.7720546290392942, "grad_norm": 0.75, "learning_rate": 2.0772504909898143e-05, "loss": 0.854, "step": 5163 }, { "epoch": 2.77259152377437, "grad_norm": 0.6484375, "learning_rate": 2.07641816287704e-05, "loss": 0.7798, "step": 5164 }, { "epoch": 2.773128418509446, "grad_norm": 0.70703125, "learning_rate": 2.075585883100317e-05, "loss": 0.6959, "step": 5165 }, { "epoch": 2.773665313244522, "grad_norm": 0.76953125, "learning_rate": 2.0747536517546197e-05, "loss": 1.0618, "step": 5166 }, { "epoch": 2.774202207979598, "grad_norm": 0.61328125, "learning_rate": 2.0739214689349163e-05, "loss": 0.7171, "step": 5167 }, { "epoch": 2.774739102714674, "grad_norm": 0.75, "learning_rate": 2.0730893347361695e-05, "loss": 0.8049, "step": 5168 }, { "epoch": 2.77527599744975, "grad_norm": 0.671875, "learning_rate": 2.0722572492533356e-05, "loss": 0.9721, "step": 5169 }, { "epoch": 2.7758128921848257, "grad_norm": 0.62109375, "learning_rate": 2.0714252125813667e-05, "loss": 0.8227, "step": 5170 }, { "epoch": 2.776349786919902, "grad_norm": 0.6328125, "learning_rate": 2.0705932248152073e-05, "loss": 0.8242, "step": 5171 }, { "epoch": 2.776886681654978, "grad_norm": 0.68359375, "learning_rate": 2.0697612860498e-05, "loss": 0.9323, "step": 5172 }, { "epoch": 2.777423576390054, "grad_norm": 0.65234375, "learning_rate": 2.0689293963800785e-05, "loss": 0.8015, "step": 5173 }, { "epoch": 2.7779604711251302, "grad_norm": 0.62109375, "learning_rate": 2.0680975559009713e-05, "loss": 0.784, "step": 5174 }, { "epoch": 2.778497365860206, "grad_norm": 0.8984375, "learning_rate": 2.067265764707403e-05, "loss": 0.9371, "step": 5175 }, { "epoch": 2.779034260595282, "grad_norm": 0.65234375, "learning_rate": 2.0664340228942913e-05, "loss": 0.6046, "step": 5176 }, { "epoch": 2.779571155330358, "grad_norm": 0.69140625, "learning_rate": 2.0656023305565485e-05, "loss": 0.9095, "step": 5177 }, { "epoch": 2.780108050065434, "grad_norm": 0.82421875, "learning_rate": 2.064770687789081e-05, "loss": 0.7477, "step": 5178 }, { "epoch": 2.78064494480051, "grad_norm": 0.625, "learning_rate": 2.0639390946867886e-05, "loss": 0.8375, "step": 5179 }, { "epoch": 2.781181839535586, "grad_norm": 0.6328125, "learning_rate": 2.0631075513445685e-05, "loss": 0.862, "step": 5180 }, { "epoch": 2.781718734270662, "grad_norm": 0.7421875, "learning_rate": 2.0622760578573097e-05, "loss": 1.1194, "step": 5181 }, { "epoch": 2.7822556290057383, "grad_norm": 0.62890625, "learning_rate": 2.0614446143198956e-05, "loss": 0.749, "step": 5182 }, { "epoch": 2.782792523740814, "grad_norm": 0.5703125, "learning_rate": 2.0606132208272052e-05, "loss": 0.7441, "step": 5183 }, { "epoch": 2.78332941847589, "grad_norm": 0.984375, "learning_rate": 2.0597818774741108e-05, "loss": 0.9655, "step": 5184 }, { "epoch": 2.7838663132109662, "grad_norm": 0.8359375, "learning_rate": 2.0589505843554797e-05, "loss": 0.8296, "step": 5185 }, { "epoch": 2.784403207946042, "grad_norm": 0.7734375, "learning_rate": 2.0581193415661716e-05, "loss": 0.8359, "step": 5186 }, { "epoch": 2.784940102681118, "grad_norm": 0.6953125, "learning_rate": 2.057288149201042e-05, "loss": 0.841, "step": 5187 }, { "epoch": 2.785476997416194, "grad_norm": 0.7421875, "learning_rate": 2.0564570073549417e-05, "loss": 0.7948, "step": 5188 }, { "epoch": 2.78601389215127, "grad_norm": 0.58984375, "learning_rate": 2.055625916122714e-05, "loss": 0.6271, "step": 5189 }, { "epoch": 2.786550786886346, "grad_norm": 0.6484375, "learning_rate": 2.0547948755991967e-05, "loss": 0.7313, "step": 5190 }, { "epoch": 2.787087681621422, "grad_norm": 0.91015625, "learning_rate": 2.0539638858792214e-05, "loss": 0.9432, "step": 5191 }, { "epoch": 2.787624576356498, "grad_norm": 0.71875, "learning_rate": 2.0531329470576164e-05, "loss": 0.8472, "step": 5192 }, { "epoch": 2.7881614710915743, "grad_norm": 0.6640625, "learning_rate": 2.0523020592292015e-05, "loss": 0.764, "step": 5193 }, { "epoch": 2.78869836582665, "grad_norm": 0.71875, "learning_rate": 2.0514712224887906e-05, "loss": 0.8106, "step": 5194 }, { "epoch": 2.789235260561726, "grad_norm": 0.73046875, "learning_rate": 2.0506404369311937e-05, "loss": 0.9758, "step": 5195 }, { "epoch": 2.7897721552968022, "grad_norm": 0.67578125, "learning_rate": 2.049809702651213e-05, "loss": 0.9376, "step": 5196 }, { "epoch": 2.790309050031878, "grad_norm": 0.6796875, "learning_rate": 2.0489790197436466e-05, "loss": 0.7836, "step": 5197 }, { "epoch": 2.790845944766954, "grad_norm": 0.625, "learning_rate": 2.0481483883032858e-05, "loss": 0.7543, "step": 5198 }, { "epoch": 2.79138283950203, "grad_norm": 0.734375, "learning_rate": 2.047317808424916e-05, "loss": 0.9031, "step": 5199 }, { "epoch": 2.7919197342371063, "grad_norm": 0.75, "learning_rate": 2.0464872802033175e-05, "loss": 0.7797, "step": 5200 }, { "epoch": 2.7919197342371063, "eval_loss": 0.9027925729751587, "eval_runtime": 46.0281, "eval_samples_per_second": 20.9, "eval_steps_per_second": 20.9, "step": 5200 }, { "epoch": 2.7924566289721824, "grad_norm": 0.65234375, "learning_rate": 2.0456568037332634e-05, "loss": 0.8983, "step": 5201 }, { "epoch": 2.792993523707258, "grad_norm": 0.67578125, "learning_rate": 2.0448263791095224e-05, "loss": 1.1263, "step": 5202 }, { "epoch": 2.793530418442334, "grad_norm": 0.70703125, "learning_rate": 2.0439960064268555e-05, "loss": 0.7796, "step": 5203 }, { "epoch": 2.7940673131774103, "grad_norm": 1.4765625, "learning_rate": 2.043165685780018e-05, "loss": 0.8716, "step": 5204 }, { "epoch": 2.794604207912486, "grad_norm": 0.984375, "learning_rate": 2.042335417263762e-05, "loss": 0.8917, "step": 5205 }, { "epoch": 2.795141102647562, "grad_norm": 0.6171875, "learning_rate": 2.0415052009728308e-05, "loss": 0.7989, "step": 5206 }, { "epoch": 2.7956779973826382, "grad_norm": 0.6484375, "learning_rate": 2.040675037001962e-05, "loss": 0.7786, "step": 5207 }, { "epoch": 2.796214892117714, "grad_norm": 0.69921875, "learning_rate": 2.0398449254458887e-05, "loss": 1.0738, "step": 5208 }, { "epoch": 2.79675178685279, "grad_norm": 0.61328125, "learning_rate": 2.039014866399337e-05, "loss": 0.7783, "step": 5209 }, { "epoch": 2.797288681587866, "grad_norm": 0.66015625, "learning_rate": 2.0381848599570276e-05, "loss": 0.8454, "step": 5210 }, { "epoch": 2.7978255763229423, "grad_norm": 0.828125, "learning_rate": 2.0373549062136732e-05, "loss": 0.9527, "step": 5211 }, { "epoch": 2.7983624710580184, "grad_norm": 0.69921875, "learning_rate": 2.0365250052639824e-05, "loss": 0.8622, "step": 5212 }, { "epoch": 2.798899365793094, "grad_norm": 0.7578125, "learning_rate": 2.0356951572026585e-05, "loss": 0.7361, "step": 5213 }, { "epoch": 2.79943626052817, "grad_norm": 0.6640625, "learning_rate": 2.0348653621243974e-05, "loss": 0.6588, "step": 5214 }, { "epoch": 2.7999731552632463, "grad_norm": 0.67578125, "learning_rate": 2.0340356201238883e-05, "loss": 0.8215, "step": 5215 }, { "epoch": 2.800510049998322, "grad_norm": 0.83203125, "learning_rate": 2.033205931295816e-05, "loss": 0.7829, "step": 5216 }, { "epoch": 2.801046944733398, "grad_norm": 0.7109375, "learning_rate": 2.032376295734859e-05, "loss": 0.7765, "step": 5217 }, { "epoch": 2.8015838394684742, "grad_norm": 0.62890625, "learning_rate": 2.031546713535688e-05, "loss": 0.7968, "step": 5218 }, { "epoch": 2.8021207342035503, "grad_norm": 0.671875, "learning_rate": 2.030717184792971e-05, "loss": 0.8357, "step": 5219 }, { "epoch": 2.8026576289386265, "grad_norm": 0.63671875, "learning_rate": 2.0298877096013648e-05, "loss": 0.9425, "step": 5220 }, { "epoch": 2.803194523673702, "grad_norm": 0.77734375, "learning_rate": 2.0290582880555244e-05, "loss": 0.8235, "step": 5221 }, { "epoch": 2.8037314184087783, "grad_norm": 0.80859375, "learning_rate": 2.0282289202500977e-05, "loss": 1.013, "step": 5222 }, { "epoch": 2.8042683131438544, "grad_norm": 0.703125, "learning_rate": 2.0273996062797252e-05, "loss": 0.7613, "step": 5223 }, { "epoch": 2.80480520787893, "grad_norm": 0.63671875, "learning_rate": 2.0265703462390427e-05, "loss": 0.9224, "step": 5224 }, { "epoch": 2.805342102614006, "grad_norm": 0.7421875, "learning_rate": 2.02574114022268e-05, "loss": 0.8286, "step": 5225 }, { "epoch": 2.8058789973490823, "grad_norm": 0.73046875, "learning_rate": 2.0249119883252585e-05, "loss": 0.9074, "step": 5226 }, { "epoch": 2.806415892084158, "grad_norm": 0.6953125, "learning_rate": 2.0240828906413963e-05, "loss": 1.018, "step": 5227 }, { "epoch": 2.806952786819234, "grad_norm": 0.71484375, "learning_rate": 2.0232538472657026e-05, "loss": 0.7941, "step": 5228 }, { "epoch": 2.8074896815543102, "grad_norm": 0.6796875, "learning_rate": 2.0224248582927827e-05, "loss": 0.6987, "step": 5229 }, { "epoch": 2.8080265762893863, "grad_norm": 0.61328125, "learning_rate": 2.0215959238172345e-05, "loss": 0.8979, "step": 5230 }, { "epoch": 2.8085634710244625, "grad_norm": 0.75, "learning_rate": 2.0207670439336495e-05, "loss": 0.7204, "step": 5231 }, { "epoch": 2.809100365759538, "grad_norm": 0.66015625, "learning_rate": 2.019938218736614e-05, "loss": 0.9128, "step": 5232 }, { "epoch": 2.8096372604946143, "grad_norm": 0.828125, "learning_rate": 2.0191094483207073e-05, "loss": 0.8289, "step": 5233 }, { "epoch": 2.8101741552296904, "grad_norm": 0.68359375, "learning_rate": 2.0182807327805025e-05, "loss": 0.9231, "step": 5234 }, { "epoch": 2.810711049964766, "grad_norm": 0.73828125, "learning_rate": 2.0174520722105673e-05, "loss": 0.9004, "step": 5235 }, { "epoch": 2.811247944699842, "grad_norm": 0.80078125, "learning_rate": 2.016623466705461e-05, "loss": 0.8392, "step": 5236 }, { "epoch": 2.8117848394349183, "grad_norm": 0.7890625, "learning_rate": 2.015794916359738e-05, "loss": 0.9063, "step": 5237 }, { "epoch": 2.8123217341699944, "grad_norm": 0.77734375, "learning_rate": 2.0149664212679476e-05, "loss": 0.8567, "step": 5238 }, { "epoch": 2.8128586289050705, "grad_norm": 0.703125, "learning_rate": 2.014137981524631e-05, "loss": 0.7895, "step": 5239 }, { "epoch": 2.8133955236401462, "grad_norm": 0.83984375, "learning_rate": 2.0133095972243233e-05, "loss": 1.0232, "step": 5240 }, { "epoch": 2.8139324183752223, "grad_norm": 0.65234375, "learning_rate": 2.0124812684615542e-05, "loss": 0.8287, "step": 5241 }, { "epoch": 2.8144693131102985, "grad_norm": 0.6796875, "learning_rate": 2.0116529953308466e-05, "loss": 0.8465, "step": 5242 }, { "epoch": 2.815006207845374, "grad_norm": 0.6328125, "learning_rate": 2.0108247779267157e-05, "loss": 0.9057, "step": 5243 }, { "epoch": 2.8155431025804503, "grad_norm": 0.828125, "learning_rate": 2.0099966163436746e-05, "loss": 0.7271, "step": 5244 }, { "epoch": 2.8160799973155264, "grad_norm": 0.6640625, "learning_rate": 2.009168510676223e-05, "loss": 0.8052, "step": 5245 }, { "epoch": 2.816616892050602, "grad_norm": 0.671875, "learning_rate": 2.0083404610188612e-05, "loss": 0.7657, "step": 5246 }, { "epoch": 2.817153786785678, "grad_norm": 0.765625, "learning_rate": 2.007512467466079e-05, "loss": 0.8261, "step": 5247 }, { "epoch": 2.8176906815207543, "grad_norm": 0.67578125, "learning_rate": 2.0066845301123603e-05, "loss": 0.76, "step": 5248 }, { "epoch": 2.8182275762558304, "grad_norm": 0.6171875, "learning_rate": 2.0058566490521847e-05, "loss": 0.8264, "step": 5249 }, { "epoch": 2.8187644709909065, "grad_norm": 0.64453125, "learning_rate": 2.0050288243800233e-05, "loss": 0.7537, "step": 5250 }, { "epoch": 2.8187644709909065, "eval_loss": 0.9029639959335327, "eval_runtime": 46.1523, "eval_samples_per_second": 20.844, "eval_steps_per_second": 20.844, "step": 5250 }, { "epoch": 2.8193013657259822, "grad_norm": 0.82421875, "learning_rate": 2.0042010561903408e-05, "loss": 0.9437, "step": 5251 }, { "epoch": 2.8198382604610583, "grad_norm": 0.74609375, "learning_rate": 2.0033733445775982e-05, "loss": 0.8539, "step": 5252 }, { "epoch": 2.8203751551961345, "grad_norm": 0.7265625, "learning_rate": 2.0025456896362445e-05, "loss": 0.7846, "step": 5253 }, { "epoch": 2.82091204993121, "grad_norm": 0.59375, "learning_rate": 2.001718091460728e-05, "loss": 0.6701, "step": 5254 }, { "epoch": 2.8214489446662863, "grad_norm": 0.56640625, "learning_rate": 2.0008905501454877e-05, "loss": 0.6474, "step": 5255 }, { "epoch": 2.8219858394013624, "grad_norm": 1.140625, "learning_rate": 2.0000630657849556e-05, "loss": 0.765, "step": 5256 }, { "epoch": 2.8225227341364385, "grad_norm": 0.6484375, "learning_rate": 1.9992356384735593e-05, "loss": 0.6924, "step": 5257 }, { "epoch": 2.8230596288715146, "grad_norm": 0.625, "learning_rate": 1.9984082683057186e-05, "loss": 0.7839, "step": 5258 }, { "epoch": 2.8235965236065903, "grad_norm": 0.57421875, "learning_rate": 1.9975809553758463e-05, "loss": 0.8914, "step": 5259 }, { "epoch": 2.8241334183416664, "grad_norm": 0.77734375, "learning_rate": 1.9967536997783494e-05, "loss": 0.9146, "step": 5260 }, { "epoch": 2.8246703130767425, "grad_norm": 0.828125, "learning_rate": 1.9959265016076294e-05, "loss": 0.9914, "step": 5261 }, { "epoch": 2.825207207811818, "grad_norm": 1.015625, "learning_rate": 1.9950993609580786e-05, "loss": 0.9179, "step": 5262 }, { "epoch": 2.8257441025468943, "grad_norm": 0.640625, "learning_rate": 1.994272277924085e-05, "loss": 0.7413, "step": 5263 }, { "epoch": 2.8262809972819705, "grad_norm": 0.67578125, "learning_rate": 1.9934452526000283e-05, "loss": 0.8451, "step": 5264 }, { "epoch": 2.826817892017046, "grad_norm": 0.78125, "learning_rate": 1.9926182850802837e-05, "loss": 0.8916, "step": 5265 }, { "epoch": 2.8273547867521223, "grad_norm": 0.734375, "learning_rate": 1.9917913754592183e-05, "loss": 0.8789, "step": 5266 }, { "epoch": 2.8278916814871984, "grad_norm": 0.82421875, "learning_rate": 1.9909645238311923e-05, "loss": 1.0534, "step": 5267 }, { "epoch": 2.8284285762222745, "grad_norm": 0.77734375, "learning_rate": 1.9901377302905612e-05, "loss": 0.9451, "step": 5268 }, { "epoch": 2.8289654709573506, "grad_norm": 0.640625, "learning_rate": 1.9893109949316725e-05, "loss": 0.7487, "step": 5269 }, { "epoch": 2.8295023656924263, "grad_norm": 0.79296875, "learning_rate": 1.988484317848866e-05, "loss": 0.9931, "step": 5270 }, { "epoch": 2.8300392604275024, "grad_norm": 0.83203125, "learning_rate": 1.9876576991364766e-05, "loss": 1.1299, "step": 5271 }, { "epoch": 2.8305761551625785, "grad_norm": 0.76171875, "learning_rate": 1.9868311388888318e-05, "loss": 1.0833, "step": 5272 }, { "epoch": 2.831113049897654, "grad_norm": 0.71484375, "learning_rate": 1.986004637200253e-05, "loss": 0.7378, "step": 5273 }, { "epoch": 2.8316499446327303, "grad_norm": 0.87890625, "learning_rate": 1.9851781941650542e-05, "loss": 0.9345, "step": 5274 }, { "epoch": 2.8321868393678065, "grad_norm": 0.62890625, "learning_rate": 1.984351809877544e-05, "loss": 0.8584, "step": 5275 }, { "epoch": 2.8327237341028826, "grad_norm": 0.76171875, "learning_rate": 1.983525484432021e-05, "loss": 0.8483, "step": 5276 }, { "epoch": 2.8332606288379587, "grad_norm": 0.6796875, "learning_rate": 1.982699217922782e-05, "loss": 0.8624, "step": 5277 }, { "epoch": 2.8337975235730344, "grad_norm": 0.765625, "learning_rate": 1.981873010444113e-05, "loss": 0.8226, "step": 5278 }, { "epoch": 2.8343344183081105, "grad_norm": 0.65234375, "learning_rate": 1.9810468620902952e-05, "loss": 0.7661, "step": 5279 }, { "epoch": 2.8348713130431866, "grad_norm": 0.65234375, "learning_rate": 1.980220772955602e-05, "loss": 0.8675, "step": 5280 }, { "epoch": 2.8354082077782623, "grad_norm": 0.81640625, "learning_rate": 1.9793947431343013e-05, "loss": 0.9995, "step": 5281 }, { "epoch": 2.8359451025133384, "grad_norm": 0.828125, "learning_rate": 1.978568772720653e-05, "loss": 0.9801, "step": 5282 }, { "epoch": 2.8364819972484145, "grad_norm": 0.7421875, "learning_rate": 1.9777428618089118e-05, "loss": 0.8755, "step": 5283 }, { "epoch": 2.83701889198349, "grad_norm": 0.5625, "learning_rate": 1.976917010493323e-05, "loss": 0.6681, "step": 5284 }, { "epoch": 2.8375557867185663, "grad_norm": 0.5703125, "learning_rate": 1.9760912188681284e-05, "loss": 0.7422, "step": 5285 }, { "epoch": 2.8380926814536425, "grad_norm": 0.6328125, "learning_rate": 1.9752654870275608e-05, "loss": 0.8308, "step": 5286 }, { "epoch": 2.8386295761887186, "grad_norm": 0.765625, "learning_rate": 1.9744398150658462e-05, "loss": 0.9757, "step": 5287 }, { "epoch": 2.8391664709237947, "grad_norm": 0.68359375, "learning_rate": 1.973614203077204e-05, "loss": 0.7586, "step": 5288 }, { "epoch": 2.8397033656588704, "grad_norm": 0.671875, "learning_rate": 1.972788651155847e-05, "loss": 0.6954, "step": 5289 }, { "epoch": 2.8402402603939465, "grad_norm": 0.640625, "learning_rate": 1.9719631593959816e-05, "loss": 0.869, "step": 5290 }, { "epoch": 2.8407771551290226, "grad_norm": 0.62109375, "learning_rate": 1.9711377278918073e-05, "loss": 0.8526, "step": 5291 }, { "epoch": 2.8413140498640983, "grad_norm": 0.73046875, "learning_rate": 1.970312356737515e-05, "loss": 0.7922, "step": 5292 }, { "epoch": 2.8418509445991744, "grad_norm": 0.6328125, "learning_rate": 1.9694870460272914e-05, "loss": 0.831, "step": 5293 }, { "epoch": 2.8423878393342505, "grad_norm": 0.75390625, "learning_rate": 1.9686617958553148e-05, "loss": 0.9719, "step": 5294 }, { "epoch": 2.8429247340693267, "grad_norm": 0.8125, "learning_rate": 1.9678366063157556e-05, "loss": 0.9645, "step": 5295 }, { "epoch": 2.843461628804403, "grad_norm": 0.7578125, "learning_rate": 1.9670114775027786e-05, "loss": 0.9471, "step": 5296 }, { "epoch": 2.8439985235394785, "grad_norm": 0.9921875, "learning_rate": 1.9661864095105415e-05, "loss": 0.8779, "step": 5297 }, { "epoch": 2.8445354182745546, "grad_norm": 0.64453125, "learning_rate": 1.9653614024331956e-05, "loss": 0.845, "step": 5298 }, { "epoch": 2.8450723130096307, "grad_norm": 0.625, "learning_rate": 1.9645364563648847e-05, "loss": 0.7228, "step": 5299 }, { "epoch": 2.8456092077447064, "grad_norm": 0.57421875, "learning_rate": 1.9637115713997444e-05, "loss": 0.7131, "step": 5300 }, { "epoch": 2.8456092077447064, "eval_loss": 0.9029813408851624, "eval_runtime": 45.6367, "eval_samples_per_second": 21.08, "eval_steps_per_second": 21.08, "step": 5300 }, { "epoch": 2.8461461024797825, "grad_norm": 0.78515625, "learning_rate": 1.962886747631906e-05, "loss": 0.8761, "step": 5301 }, { "epoch": 2.8466829972148586, "grad_norm": 0.75, "learning_rate": 1.9620619851554916e-05, "loss": 0.8243, "step": 5302 }, { "epoch": 2.8472198919499343, "grad_norm": 0.7265625, "learning_rate": 1.961237284064618e-05, "loss": 0.8589, "step": 5303 }, { "epoch": 2.8477567866850104, "grad_norm": 0.66015625, "learning_rate": 1.960412644453392e-05, "loss": 0.8168, "step": 5304 }, { "epoch": 2.8482936814200865, "grad_norm": 0.76953125, "learning_rate": 1.9595880664159166e-05, "loss": 0.9777, "step": 5305 }, { "epoch": 2.8488305761551627, "grad_norm": 0.5625, "learning_rate": 1.958763550046287e-05, "loss": 0.6433, "step": 5306 }, { "epoch": 2.849367470890239, "grad_norm": 0.57421875, "learning_rate": 1.9579390954385907e-05, "loss": 0.6728, "step": 5307 }, { "epoch": 2.8499043656253145, "grad_norm": 0.6328125, "learning_rate": 1.9571147026869076e-05, "loss": 1.0079, "step": 5308 }, { "epoch": 2.8504412603603906, "grad_norm": 0.66796875, "learning_rate": 1.9562903718853122e-05, "loss": 0.8398, "step": 5309 }, { "epoch": 2.8509781550954667, "grad_norm": 0.9375, "learning_rate": 1.9554661031278712e-05, "loss": 0.9488, "step": 5310 }, { "epoch": 2.8515150498305424, "grad_norm": 0.5546875, "learning_rate": 1.9546418965086442e-05, "loss": 0.7689, "step": 5311 }, { "epoch": 2.8520519445656185, "grad_norm": 0.69921875, "learning_rate": 1.9538177521216827e-05, "loss": 1.0181, "step": 5312 }, { "epoch": 2.8525888393006946, "grad_norm": 0.69921875, "learning_rate": 1.952993670061032e-05, "loss": 0.9337, "step": 5313 }, { "epoch": 2.8531257340357707, "grad_norm": 0.62109375, "learning_rate": 1.9521696504207315e-05, "loss": 0.7985, "step": 5314 }, { "epoch": 2.853662628770847, "grad_norm": 0.70703125, "learning_rate": 1.9513456932948114e-05, "loss": 0.7185, "step": 5315 }, { "epoch": 2.8541995235059225, "grad_norm": 0.640625, "learning_rate": 1.950521798777296e-05, "loss": 0.9152, "step": 5316 }, { "epoch": 2.8547364182409987, "grad_norm": 0.6875, "learning_rate": 1.949697966962201e-05, "loss": 0.8969, "step": 5317 }, { "epoch": 2.855273312976075, "grad_norm": 0.9296875, "learning_rate": 1.9488741979435382e-05, "loss": 0.8396, "step": 5318 }, { "epoch": 2.8558102077111505, "grad_norm": 0.67578125, "learning_rate": 1.948050491815309e-05, "loss": 0.797, "step": 5319 }, { "epoch": 2.8563471024462266, "grad_norm": 0.6640625, "learning_rate": 1.947226848671508e-05, "loss": 0.858, "step": 5320 }, { "epoch": 2.8568839971813027, "grad_norm": 0.75390625, "learning_rate": 1.946403268606124e-05, "loss": 0.9204, "step": 5321 }, { "epoch": 2.8574208919163784, "grad_norm": 0.6875, "learning_rate": 1.945579751713137e-05, "loss": 0.8311, "step": 5322 }, { "epoch": 2.8579577866514545, "grad_norm": 0.61328125, "learning_rate": 1.9447562980865226e-05, "loss": 0.717, "step": 5323 }, { "epoch": 2.8584946813865306, "grad_norm": 0.7578125, "learning_rate": 1.9439329078202457e-05, "loss": 0.9078, "step": 5324 }, { "epoch": 2.8590315761216067, "grad_norm": 0.8046875, "learning_rate": 1.9431095810082657e-05, "loss": 0.9643, "step": 5325 }, { "epoch": 2.859568470856683, "grad_norm": 0.71875, "learning_rate": 1.942286317744535e-05, "loss": 0.9782, "step": 5326 }, { "epoch": 2.8601053655917585, "grad_norm": 0.70703125, "learning_rate": 1.941463118122999e-05, "loss": 0.984, "step": 5327 }, { "epoch": 2.8606422603268347, "grad_norm": 0.66015625, "learning_rate": 1.940639982237595e-05, "loss": 0.8385, "step": 5328 }, { "epoch": 2.861179155061911, "grad_norm": 0.65625, "learning_rate": 1.9398169101822517e-05, "loss": 0.9262, "step": 5329 }, { "epoch": 2.8617160497969865, "grad_norm": 0.6640625, "learning_rate": 1.938993902050893e-05, "loss": 0.9427, "step": 5330 }, { "epoch": 2.8622529445320626, "grad_norm": 0.671875, "learning_rate": 1.938170957937435e-05, "loss": 0.7477, "step": 5331 }, { "epoch": 2.8627898392671387, "grad_norm": 0.6953125, "learning_rate": 1.9373480779357856e-05, "loss": 0.8386, "step": 5332 }, { "epoch": 2.863326734002215, "grad_norm": 0.63671875, "learning_rate": 1.9365252621398455e-05, "loss": 0.8151, "step": 5333 }, { "epoch": 2.863863628737291, "grad_norm": 0.64453125, "learning_rate": 1.9357025106435093e-05, "loss": 0.8419, "step": 5334 }, { "epoch": 2.8644005234723666, "grad_norm": 0.65625, "learning_rate": 1.934879823540663e-05, "loss": 0.7782, "step": 5335 }, { "epoch": 2.8649374182074427, "grad_norm": 0.66015625, "learning_rate": 1.934057200925186e-05, "loss": 0.5859, "step": 5336 }, { "epoch": 2.865474312942519, "grad_norm": 0.7109375, "learning_rate": 1.9332346428909485e-05, "loss": 0.8881, "step": 5337 }, { "epoch": 2.8660112076775945, "grad_norm": 0.72265625, "learning_rate": 1.932412149531816e-05, "loss": 0.9106, "step": 5338 }, { "epoch": 2.8665481024126707, "grad_norm": 0.74609375, "learning_rate": 1.9315897209416453e-05, "loss": 0.9241, "step": 5339 }, { "epoch": 2.867084997147747, "grad_norm": 0.82421875, "learning_rate": 1.9307673572142856e-05, "loss": 0.6796, "step": 5340 }, { "epoch": 2.8676218918828225, "grad_norm": 0.80078125, "learning_rate": 1.9299450584435787e-05, "loss": 1.0987, "step": 5341 }, { "epoch": 2.8681587866178986, "grad_norm": 0.76171875, "learning_rate": 1.9291228247233605e-05, "loss": 0.8761, "step": 5342 }, { "epoch": 2.8686956813529747, "grad_norm": 0.69140625, "learning_rate": 1.9283006561474574e-05, "loss": 0.8641, "step": 5343 }, { "epoch": 2.869232576088051, "grad_norm": 0.6640625, "learning_rate": 1.9274785528096893e-05, "loss": 0.8586, "step": 5344 }, { "epoch": 2.869769470823127, "grad_norm": 0.68359375, "learning_rate": 1.92665651480387e-05, "loss": 0.8617, "step": 5345 }, { "epoch": 2.8703063655582026, "grad_norm": 0.66015625, "learning_rate": 1.9258345422238018e-05, "loss": 0.7381, "step": 5346 }, { "epoch": 2.8708432602932787, "grad_norm": 0.6796875, "learning_rate": 1.9250126351632835e-05, "loss": 0.8398, "step": 5347 }, { "epoch": 2.871380155028355, "grad_norm": 0.7109375, "learning_rate": 1.9241907937161056e-05, "loss": 0.6899, "step": 5348 }, { "epoch": 2.8719170497634305, "grad_norm": 0.7421875, "learning_rate": 1.9233690179760498e-05, "loss": 0.7721, "step": 5349 }, { "epoch": 2.8724539444985067, "grad_norm": 0.7109375, "learning_rate": 1.9225473080368916e-05, "loss": 0.8321, "step": 5350 }, { "epoch": 2.8724539444985067, "eval_loss": 0.9029684662818909, "eval_runtime": 45.8297, "eval_samples_per_second": 20.991, "eval_steps_per_second": 20.991, "step": 5350 }, { "epoch": 2.872990839233583, "grad_norm": 0.6328125, "learning_rate": 1.9217256639923986e-05, "loss": 0.7561, "step": 5351 }, { "epoch": 2.873527733968659, "grad_norm": 0.7421875, "learning_rate": 1.9209040859363302e-05, "loss": 0.9548, "step": 5352 }, { "epoch": 2.874064628703735, "grad_norm": 0.69140625, "learning_rate": 1.9200825739624405e-05, "loss": 0.7987, "step": 5353 }, { "epoch": 2.8746015234388107, "grad_norm": 0.71875, "learning_rate": 1.919261128164472e-05, "loss": 0.8068, "step": 5354 }, { "epoch": 2.875138418173887, "grad_norm": 0.61328125, "learning_rate": 1.9184397486361637e-05, "loss": 0.7436, "step": 5355 }, { "epoch": 2.875675312908963, "grad_norm": 0.734375, "learning_rate": 1.9176184354712445e-05, "loss": 0.7345, "step": 5356 }, { "epoch": 2.8762122076440386, "grad_norm": 1.09375, "learning_rate": 1.916797188763437e-05, "loss": 0.7977, "step": 5357 }, { "epoch": 2.8767491023791147, "grad_norm": 0.7890625, "learning_rate": 1.915976008606456e-05, "loss": 0.9964, "step": 5358 }, { "epoch": 2.877285997114191, "grad_norm": 0.70703125, "learning_rate": 1.915154895094009e-05, "loss": 1.0197, "step": 5359 }, { "epoch": 2.8778228918492665, "grad_norm": 0.765625, "learning_rate": 1.914333848319795e-05, "loss": 0.8793, "step": 5360 }, { "epoch": 2.8783597865843427, "grad_norm": 0.78125, "learning_rate": 1.9135128683775057e-05, "loss": 0.9739, "step": 5361 }, { "epoch": 2.878896681319419, "grad_norm": 0.5859375, "learning_rate": 1.9126919553608253e-05, "loss": 0.8764, "step": 5362 }, { "epoch": 2.879433576054495, "grad_norm": 0.63671875, "learning_rate": 1.91187110936343e-05, "loss": 0.8114, "step": 5363 }, { "epoch": 2.879970470789571, "grad_norm": 0.86328125, "learning_rate": 1.9110503304789892e-05, "loss": 0.9944, "step": 5364 }, { "epoch": 2.8805073655246467, "grad_norm": 0.6484375, "learning_rate": 1.9102296188011645e-05, "loss": 0.6921, "step": 5365 }, { "epoch": 2.881044260259723, "grad_norm": 0.765625, "learning_rate": 1.9094089744236084e-05, "loss": 0.8673, "step": 5366 }, { "epoch": 2.881581154994799, "grad_norm": 0.67578125, "learning_rate": 1.9085883974399682e-05, "loss": 0.9318, "step": 5367 }, { "epoch": 2.8821180497298746, "grad_norm": 0.8984375, "learning_rate": 1.9077678879438814e-05, "loss": 0.9419, "step": 5368 }, { "epoch": 2.8826549444649507, "grad_norm": 0.65625, "learning_rate": 1.9069474460289782e-05, "loss": 0.7571, "step": 5369 }, { "epoch": 2.883191839200027, "grad_norm": 0.62890625, "learning_rate": 1.9061270717888834e-05, "loss": 0.6775, "step": 5370 }, { "epoch": 2.883728733935103, "grad_norm": 1.078125, "learning_rate": 1.9053067653172092e-05, "loss": 0.6983, "step": 5371 }, { "epoch": 2.884265628670179, "grad_norm": 0.63671875, "learning_rate": 1.9044865267075643e-05, "loss": 0.8665, "step": 5372 }, { "epoch": 2.884802523405255, "grad_norm": 0.74609375, "learning_rate": 1.9036663560535483e-05, "loss": 0.8226, "step": 5373 }, { "epoch": 2.885339418140331, "grad_norm": 0.6015625, "learning_rate": 1.9028462534487526e-05, "loss": 0.7879, "step": 5374 }, { "epoch": 2.885876312875407, "grad_norm": 0.64453125, "learning_rate": 1.902026218986762e-05, "loss": 0.8318, "step": 5375 }, { "epoch": 2.8864132076104827, "grad_norm": 0.77734375, "learning_rate": 1.901206252761153e-05, "loss": 1.0547, "step": 5376 }, { "epoch": 2.886950102345559, "grad_norm": 0.64453125, "learning_rate": 1.900386354865493e-05, "loss": 0.7827, "step": 5377 }, { "epoch": 2.887486997080635, "grad_norm": 0.91015625, "learning_rate": 1.899566525393345e-05, "loss": 0.9078, "step": 5378 }, { "epoch": 2.8880238918157106, "grad_norm": 0.6875, "learning_rate": 1.8987467644382585e-05, "loss": 0.8543, "step": 5379 }, { "epoch": 2.8885607865507867, "grad_norm": 0.734375, "learning_rate": 1.8979270720937813e-05, "loss": 1.018, "step": 5380 }, { "epoch": 2.889097681285863, "grad_norm": 0.72265625, "learning_rate": 1.8971074484534495e-05, "loss": 0.9453, "step": 5381 }, { "epoch": 2.889634576020939, "grad_norm": 0.71875, "learning_rate": 1.8962878936107926e-05, "loss": 0.7863, "step": 5382 }, { "epoch": 2.890171470756015, "grad_norm": 0.63671875, "learning_rate": 1.895468407659333e-05, "loss": 0.7151, "step": 5383 }, { "epoch": 2.890708365491091, "grad_norm": 0.66796875, "learning_rate": 1.8946489906925836e-05, "loss": 0.8228, "step": 5384 }, { "epoch": 2.891245260226167, "grad_norm": 0.7734375, "learning_rate": 1.89382964280405e-05, "loss": 0.6157, "step": 5385 }, { "epoch": 2.891782154961243, "grad_norm": 0.70703125, "learning_rate": 1.8930103640872313e-05, "loss": 0.8695, "step": 5386 }, { "epoch": 2.8923190496963187, "grad_norm": 0.62890625, "learning_rate": 1.8921911546356176e-05, "loss": 0.7433, "step": 5387 }, { "epoch": 2.892855944431395, "grad_norm": 0.6484375, "learning_rate": 1.8913720145426897e-05, "loss": 0.7066, "step": 5388 }, { "epoch": 2.893392839166471, "grad_norm": 0.72265625, "learning_rate": 1.8905529439019226e-05, "loss": 0.6999, "step": 5389 }, { "epoch": 2.893929733901547, "grad_norm": 0.75390625, "learning_rate": 1.8897339428067822e-05, "loss": 0.6704, "step": 5390 }, { "epoch": 2.894466628636623, "grad_norm": 0.69140625, "learning_rate": 1.888915011350728e-05, "loss": 0.8328, "step": 5391 }, { "epoch": 2.895003523371699, "grad_norm": 0.671875, "learning_rate": 1.8880961496272092e-05, "loss": 0.8337, "step": 5392 }, { "epoch": 2.895540418106775, "grad_norm": 0.625, "learning_rate": 1.8872773577296687e-05, "loss": 0.7285, "step": 5393 }, { "epoch": 2.896077312841851, "grad_norm": 1.1953125, "learning_rate": 1.886458635751542e-05, "loss": 0.7891, "step": 5394 }, { "epoch": 2.896614207576927, "grad_norm": 0.73828125, "learning_rate": 1.885639983786255e-05, "loss": 0.7902, "step": 5395 }, { "epoch": 2.897151102312003, "grad_norm": 0.62109375, "learning_rate": 1.8848214019272255e-05, "loss": 0.698, "step": 5396 }, { "epoch": 2.897687997047079, "grad_norm": 0.72265625, "learning_rate": 1.884002890267865e-05, "loss": 0.9232, "step": 5397 }, { "epoch": 2.8982248917821547, "grad_norm": 0.8125, "learning_rate": 1.8831844489015748e-05, "loss": 0.7353, "step": 5398 }, { "epoch": 2.898761786517231, "grad_norm": 0.6328125, "learning_rate": 1.8823660779217512e-05, "loss": 0.7497, "step": 5399 }, { "epoch": 2.899298681252307, "grad_norm": 0.76953125, "learning_rate": 1.8815477774217797e-05, "loss": 0.7554, "step": 5400 }, { "epoch": 2.899298681252307, "eval_loss": 0.9032303690910339, "eval_runtime": 45.9451, "eval_samples_per_second": 20.938, "eval_steps_per_second": 20.938, "step": 5400 }, { "epoch": 2.899835575987383, "grad_norm": 0.7109375, "learning_rate": 1.8807295474950386e-05, "loss": 0.8578, "step": 5401 }, { "epoch": 2.900372470722459, "grad_norm": 0.5859375, "learning_rate": 1.8799113882348983e-05, "loss": 0.6351, "step": 5402 }, { "epoch": 2.900909365457535, "grad_norm": 0.6796875, "learning_rate": 1.8790932997347225e-05, "loss": 0.8753, "step": 5403 }, { "epoch": 2.901446260192611, "grad_norm": 0.703125, "learning_rate": 1.8782752820878634e-05, "loss": 0.8204, "step": 5404 }, { "epoch": 2.901983154927687, "grad_norm": 0.63671875, "learning_rate": 1.8774573353876685e-05, "loss": 0.8633, "step": 5405 }, { "epoch": 2.902520049662763, "grad_norm": 0.6015625, "learning_rate": 1.8766394597274757e-05, "loss": 0.6699, "step": 5406 }, { "epoch": 2.903056944397839, "grad_norm": 0.65625, "learning_rate": 1.8758216552006137e-05, "loss": 0.9089, "step": 5407 }, { "epoch": 2.903593839132915, "grad_norm": 0.67578125, "learning_rate": 1.8750039219004063e-05, "loss": 0.7657, "step": 5408 }, { "epoch": 2.904130733867991, "grad_norm": 0.66015625, "learning_rate": 1.8741862599201658e-05, "loss": 0.755, "step": 5409 }, { "epoch": 2.9046676286030673, "grad_norm": 0.75, "learning_rate": 1.8733686693531985e-05, "loss": 1.0561, "step": 5410 }, { "epoch": 2.905204523338143, "grad_norm": 0.74609375, "learning_rate": 1.8725511502928017e-05, "loss": 0.8596, "step": 5411 }, { "epoch": 2.905741418073219, "grad_norm": 0.60546875, "learning_rate": 1.8717337028322656e-05, "loss": 0.7712, "step": 5412 }, { "epoch": 2.906278312808295, "grad_norm": 0.63671875, "learning_rate": 1.8709163270648696e-05, "loss": 0.7858, "step": 5413 }, { "epoch": 2.906815207543371, "grad_norm": 0.80078125, "learning_rate": 1.8700990230838876e-05, "loss": 0.8348, "step": 5414 }, { "epoch": 2.907352102278447, "grad_norm": 0.68359375, "learning_rate": 1.8692817909825832e-05, "loss": 0.8343, "step": 5415 }, { "epoch": 2.907888997013523, "grad_norm": 0.671875, "learning_rate": 1.868464630854215e-05, "loss": 0.8145, "step": 5416 }, { "epoch": 2.908425891748599, "grad_norm": 0.65625, "learning_rate": 1.867647542792029e-05, "loss": 0.7907, "step": 5417 }, { "epoch": 2.908962786483675, "grad_norm": 0.6640625, "learning_rate": 1.866830526889267e-05, "loss": 0.8069, "step": 5418 }, { "epoch": 2.909499681218751, "grad_norm": 0.734375, "learning_rate": 1.8660135832391607e-05, "loss": 0.9163, "step": 5419 }, { "epoch": 2.910036575953827, "grad_norm": 0.64453125, "learning_rate": 1.8651967119349338e-05, "loss": 0.7605, "step": 5420 }, { "epoch": 2.9105734706889033, "grad_norm": 0.7890625, "learning_rate": 1.8643799130698004e-05, "loss": 0.8737, "step": 5421 }, { "epoch": 2.911110365423979, "grad_norm": 0.69140625, "learning_rate": 1.8635631867369686e-05, "loss": 0.6631, "step": 5422 }, { "epoch": 2.911647260159055, "grad_norm": 0.7734375, "learning_rate": 1.8627465330296366e-05, "loss": 0.7786, "step": 5423 }, { "epoch": 2.912184154894131, "grad_norm": 0.68359375, "learning_rate": 1.8619299520409955e-05, "loss": 0.6868, "step": 5424 }, { "epoch": 2.912721049629207, "grad_norm": 0.64453125, "learning_rate": 1.8611134438642278e-05, "loss": 1.0167, "step": 5425 }, { "epoch": 2.913257944364283, "grad_norm": 0.75, "learning_rate": 1.860297008592506e-05, "loss": 0.9778, "step": 5426 }, { "epoch": 2.913794839099359, "grad_norm": 0.7265625, "learning_rate": 1.859480646318998e-05, "loss": 0.833, "step": 5427 }, { "epoch": 2.9143317338344352, "grad_norm": 0.71484375, "learning_rate": 1.858664357136859e-05, "loss": 1.0274, "step": 5428 }, { "epoch": 2.9148686285695113, "grad_norm": 0.75, "learning_rate": 1.8578481411392396e-05, "loss": 0.9675, "step": 5429 }, { "epoch": 2.915405523304587, "grad_norm": 0.77734375, "learning_rate": 1.857031998419279e-05, "loss": 1.069, "step": 5430 }, { "epoch": 2.915942418039663, "grad_norm": 0.80078125, "learning_rate": 1.8562159290701092e-05, "loss": 1.1774, "step": 5431 }, { "epoch": 2.9164793127747393, "grad_norm": 0.8984375, "learning_rate": 1.8553999331848555e-05, "loss": 0.9284, "step": 5432 }, { "epoch": 2.917016207509815, "grad_norm": 0.74609375, "learning_rate": 1.854584010856632e-05, "loss": 1.0889, "step": 5433 }, { "epoch": 2.917553102244891, "grad_norm": 0.67578125, "learning_rate": 1.8537681621785465e-05, "loss": 0.7558, "step": 5434 }, { "epoch": 2.918089996979967, "grad_norm": 0.79296875, "learning_rate": 1.852952387243698e-05, "loss": 0.8962, "step": 5435 }, { "epoch": 2.918626891715043, "grad_norm": 0.66015625, "learning_rate": 1.852136686145176e-05, "loss": 0.9257, "step": 5436 }, { "epoch": 2.919163786450119, "grad_norm": 0.70703125, "learning_rate": 1.8513210589760637e-05, "loss": 0.7801, "step": 5437 }, { "epoch": 2.919700681185195, "grad_norm": 0.75, "learning_rate": 1.8505055058294322e-05, "loss": 0.7405, "step": 5438 }, { "epoch": 2.9202375759202712, "grad_norm": 0.640625, "learning_rate": 1.849690026798348e-05, "loss": 0.863, "step": 5439 }, { "epoch": 2.9207744706553473, "grad_norm": 0.6328125, "learning_rate": 1.8488746219758674e-05, "loss": 0.7822, "step": 5440 }, { "epoch": 2.921311365390423, "grad_norm": 0.5859375, "learning_rate": 1.8480592914550383e-05, "loss": 0.6784, "step": 5441 }, { "epoch": 2.921848260125499, "grad_norm": 0.76953125, "learning_rate": 1.8472440353289e-05, "loss": 0.9939, "step": 5442 }, { "epoch": 2.9223851548605753, "grad_norm": 0.64453125, "learning_rate": 1.8464288536904833e-05, "loss": 0.8815, "step": 5443 }, { "epoch": 2.922922049595651, "grad_norm": 0.734375, "learning_rate": 1.8456137466328116e-05, "loss": 0.8956, "step": 5444 }, { "epoch": 2.923458944330727, "grad_norm": 0.77734375, "learning_rate": 1.844798714248899e-05, "loss": 0.8806, "step": 5445 }, { "epoch": 2.923995839065803, "grad_norm": 0.66796875, "learning_rate": 1.8439837566317505e-05, "loss": 0.8622, "step": 5446 }, { "epoch": 2.9245327338008793, "grad_norm": 0.67578125, "learning_rate": 1.843168873874363e-05, "loss": 1.0269, "step": 5447 }, { "epoch": 2.9250696285359554, "grad_norm": 0.65234375, "learning_rate": 1.8423540660697242e-05, "loss": 0.733, "step": 5448 }, { "epoch": 2.925606523271031, "grad_norm": 0.76171875, "learning_rate": 1.841539333310816e-05, "loss": 0.7116, "step": 5449 }, { "epoch": 2.926143418006107, "grad_norm": 0.6796875, "learning_rate": 1.840724675690608e-05, "loss": 0.8003, "step": 5450 }, { "epoch": 2.926143418006107, "eval_loss": 0.9032182693481445, "eval_runtime": 47.7071, "eval_samples_per_second": 20.165, "eval_steps_per_second": 20.165, "step": 5450 }, { "epoch": 2.9266803127411833, "grad_norm": 0.8046875, "learning_rate": 1.8399100933020633e-05, "loss": 0.9192, "step": 5451 }, { "epoch": 2.927217207476259, "grad_norm": 0.77734375, "learning_rate": 1.839095586238137e-05, "loss": 1.0464, "step": 5452 }, { "epoch": 2.927754102211335, "grad_norm": 0.828125, "learning_rate": 1.838281154591774e-05, "loss": 0.8922, "step": 5453 }, { "epoch": 2.9282909969464113, "grad_norm": 0.69140625, "learning_rate": 1.8374667984559112e-05, "loss": 0.8073, "step": 5454 }, { "epoch": 2.928827891681487, "grad_norm": 0.703125, "learning_rate": 1.836652517923477e-05, "loss": 0.9222, "step": 5455 }, { "epoch": 2.929364786416563, "grad_norm": 0.5859375, "learning_rate": 1.8358383130873902e-05, "loss": 0.8636, "step": 5456 }, { "epoch": 2.929901681151639, "grad_norm": 0.7890625, "learning_rate": 1.8350241840405638e-05, "loss": 0.975, "step": 5457 }, { "epoch": 2.9304385758867153, "grad_norm": 0.74609375, "learning_rate": 1.834210130875899e-05, "loss": 0.8365, "step": 5458 }, { "epoch": 2.9309754706217914, "grad_norm": 0.640625, "learning_rate": 1.8333961536862893e-05, "loss": 0.9949, "step": 5459 }, { "epoch": 2.931512365356867, "grad_norm": 0.65234375, "learning_rate": 1.8325822525646208e-05, "loss": 0.6634, "step": 5460 }, { "epoch": 2.932049260091943, "grad_norm": 0.71484375, "learning_rate": 1.8317684276037696e-05, "loss": 1.008, "step": 5461 }, { "epoch": 2.9325861548270193, "grad_norm": 0.6953125, "learning_rate": 1.8309546788966037e-05, "loss": 0.8497, "step": 5462 }, { "epoch": 2.933123049562095, "grad_norm": 0.6953125, "learning_rate": 1.830141006535981e-05, "loss": 0.99, "step": 5463 }, { "epoch": 2.933659944297171, "grad_norm": 0.70703125, "learning_rate": 1.8293274106147525e-05, "loss": 0.8016, "step": 5464 }, { "epoch": 2.9341968390322473, "grad_norm": 0.64453125, "learning_rate": 1.8285138912257597e-05, "loss": 0.7113, "step": 5465 }, { "epoch": 2.9347337337673234, "grad_norm": 0.69921875, "learning_rate": 1.827700448461836e-05, "loss": 0.8979, "step": 5466 }, { "epoch": 2.9352706285023995, "grad_norm": 0.71484375, "learning_rate": 1.8268870824158045e-05, "loss": 0.8675, "step": 5467 }, { "epoch": 2.935807523237475, "grad_norm": 0.6953125, "learning_rate": 1.826073793180482e-05, "loss": 0.9291, "step": 5468 }, { "epoch": 2.9363444179725513, "grad_norm": 0.62109375, "learning_rate": 1.825260580848674e-05, "loss": 0.7463, "step": 5469 }, { "epoch": 2.9368813127076274, "grad_norm": 0.640625, "learning_rate": 1.8244474455131792e-05, "loss": 0.9951, "step": 5470 }, { "epoch": 2.937418207442703, "grad_norm": 0.6484375, "learning_rate": 1.8236343872667855e-05, "loss": 0.8259, "step": 5471 }, { "epoch": 2.937955102177779, "grad_norm": 0.875, "learning_rate": 1.822821406202273e-05, "loss": 1.0088, "step": 5472 }, { "epoch": 2.9384919969128553, "grad_norm": 0.68359375, "learning_rate": 1.8220085024124146e-05, "loss": 0.8069, "step": 5473 }, { "epoch": 2.939028891647931, "grad_norm": 0.8359375, "learning_rate": 1.821195675989972e-05, "loss": 0.8615, "step": 5474 }, { "epoch": 2.939565786383007, "grad_norm": 0.8125, "learning_rate": 1.8203829270276988e-05, "loss": 1.0052, "step": 5475 }, { "epoch": 2.9401026811180833, "grad_norm": 0.74609375, "learning_rate": 1.8195702556183407e-05, "loss": 1.0683, "step": 5476 }, { "epoch": 2.9406395758531594, "grad_norm": 0.70703125, "learning_rate": 1.818757661854633e-05, "loss": 0.9592, "step": 5477 }, { "epoch": 2.9411764705882355, "grad_norm": 0.640625, "learning_rate": 1.8179451458293034e-05, "loss": 0.8372, "step": 5478 }, { "epoch": 2.941713365323311, "grad_norm": 0.734375, "learning_rate": 1.8171327076350714e-05, "loss": 0.8697, "step": 5479 }, { "epoch": 2.9422502600583873, "grad_norm": 0.62890625, "learning_rate": 1.8163203473646435e-05, "loss": 0.7804, "step": 5480 }, { "epoch": 2.9427871547934634, "grad_norm": 0.5859375, "learning_rate": 1.815508065110723e-05, "loss": 0.7558, "step": 5481 }, { "epoch": 2.943324049528539, "grad_norm": 0.6953125, "learning_rate": 1.8146958609660007e-05, "loss": 0.9593, "step": 5482 }, { "epoch": 2.943860944263615, "grad_norm": 0.6640625, "learning_rate": 1.8138837350231584e-05, "loss": 0.8378, "step": 5483 }, { "epoch": 2.9443978389986913, "grad_norm": 0.7421875, "learning_rate": 1.8130716873748718e-05, "loss": 0.8555, "step": 5484 }, { "epoch": 2.9449347337337675, "grad_norm": 0.77734375, "learning_rate": 1.812259718113805e-05, "loss": 0.7301, "step": 5485 }, { "epoch": 2.9454716284688436, "grad_norm": 0.640625, "learning_rate": 1.8114478273326135e-05, "loss": 0.8048, "step": 5486 }, { "epoch": 2.9460085232039193, "grad_norm": 0.82421875, "learning_rate": 1.8106360151239454e-05, "loss": 0.8567, "step": 5487 }, { "epoch": 2.9465454179389954, "grad_norm": 0.61328125, "learning_rate": 1.809824281580437e-05, "loss": 0.7145, "step": 5488 }, { "epoch": 2.9470823126740715, "grad_norm": 0.90234375, "learning_rate": 1.809012626794719e-05, "loss": 0.9999, "step": 5489 }, { "epoch": 2.947619207409147, "grad_norm": 0.73828125, "learning_rate": 1.808201050859411e-05, "loss": 0.9137, "step": 5490 }, { "epoch": 2.9481561021442233, "grad_norm": 0.6796875, "learning_rate": 1.8073895538671236e-05, "loss": 0.8003, "step": 5491 }, { "epoch": 2.9486929968792994, "grad_norm": 0.9375, "learning_rate": 1.8065781359104596e-05, "loss": 0.809, "step": 5492 }, { "epoch": 2.949229891614375, "grad_norm": 0.58203125, "learning_rate": 1.8057667970820115e-05, "loss": 0.7794, "step": 5493 }, { "epoch": 2.949766786349451, "grad_norm": 0.671875, "learning_rate": 1.804955537474364e-05, "loss": 0.886, "step": 5494 }, { "epoch": 2.9503036810845273, "grad_norm": 0.703125, "learning_rate": 1.804144357180091e-05, "loss": 0.9306, "step": 5495 }, { "epoch": 2.9508405758196035, "grad_norm": 0.78515625, "learning_rate": 1.803333256291761e-05, "loss": 0.8701, "step": 5496 }, { "epoch": 2.9513774705546796, "grad_norm": 0.72265625, "learning_rate": 1.802522234901927e-05, "loss": 0.6668, "step": 5497 }, { "epoch": 2.9519143652897553, "grad_norm": 0.63671875, "learning_rate": 1.80171129310314e-05, "loss": 0.6868, "step": 5498 }, { "epoch": 2.9524512600248314, "grad_norm": 0.9296875, "learning_rate": 1.800900430987937e-05, "loss": 1.0572, "step": 5499 }, { "epoch": 2.9529881547599075, "grad_norm": 0.80859375, "learning_rate": 1.800089648648848e-05, "loss": 0.862, "step": 5500 }, { "epoch": 2.9529881547599075, "eval_loss": 0.9033634662628174, "eval_runtime": 46.6055, "eval_samples_per_second": 20.641, "eval_steps_per_second": 20.641, "step": 5500 }, { "epoch": 2.953525049494983, "grad_norm": 0.7265625, "learning_rate": 1.799278946178394e-05, "loss": 0.8409, "step": 5501 }, { "epoch": 2.9540619442300593, "grad_norm": 0.7265625, "learning_rate": 1.7984683236690863e-05, "loss": 0.8949, "step": 5502 }, { "epoch": 2.9545988389651354, "grad_norm": 0.84765625, "learning_rate": 1.7976577812134265e-05, "loss": 0.9617, "step": 5503 }, { "epoch": 2.9551357337002115, "grad_norm": 0.671875, "learning_rate": 1.79684731890391e-05, "loss": 0.7503, "step": 5504 }, { "epoch": 2.9556726284352877, "grad_norm": 0.734375, "learning_rate": 1.7960369368330173e-05, "loss": 0.8546, "step": 5505 }, { "epoch": 2.9562095231703633, "grad_norm": 0.81640625, "learning_rate": 1.795226635093226e-05, "loss": 0.7963, "step": 5506 }, { "epoch": 2.9567464179054395, "grad_norm": 0.66796875, "learning_rate": 1.7944164137770002e-05, "loss": 0.845, "step": 5507 }, { "epoch": 2.9572833126405156, "grad_norm": 0.69140625, "learning_rate": 1.7936062729767973e-05, "loss": 0.9114, "step": 5508 }, { "epoch": 2.9578202073755913, "grad_norm": 0.8046875, "learning_rate": 1.7927962127850646e-05, "loss": 0.9163, "step": 5509 }, { "epoch": 2.9583571021106674, "grad_norm": 0.73046875, "learning_rate": 1.79198623329424e-05, "loss": 0.7591, "step": 5510 }, { "epoch": 2.9588939968457435, "grad_norm": 0.7265625, "learning_rate": 1.791176334596752e-05, "loss": 0.7553, "step": 5511 }, { "epoch": 2.959430891580819, "grad_norm": 0.6796875, "learning_rate": 1.7903665167850224e-05, "loss": 0.9116, "step": 5512 }, { "epoch": 2.9599677863158953, "grad_norm": 0.609375, "learning_rate": 1.7895567799514584e-05, "loss": 0.7526, "step": 5513 }, { "epoch": 2.9605046810509714, "grad_norm": 0.70703125, "learning_rate": 1.7887471241884632e-05, "loss": 0.8126, "step": 5514 }, { "epoch": 2.9610415757860475, "grad_norm": 0.66015625, "learning_rate": 1.7879375495884284e-05, "loss": 0.9015, "step": 5515 }, { "epoch": 2.9615784705211237, "grad_norm": 0.64453125, "learning_rate": 1.7871280562437366e-05, "loss": 0.7862, "step": 5516 }, { "epoch": 2.9621153652561993, "grad_norm": 0.7265625, "learning_rate": 1.7863186442467613e-05, "loss": 1.1442, "step": 5517 }, { "epoch": 2.9626522599912755, "grad_norm": 0.68359375, "learning_rate": 1.785509313689867e-05, "loss": 0.8277, "step": 5518 }, { "epoch": 2.9631891547263516, "grad_norm": 0.74609375, "learning_rate": 1.784700064665408e-05, "loss": 0.9182, "step": 5519 }, { "epoch": 2.9637260494614273, "grad_norm": 0.6328125, "learning_rate": 1.7838908972657303e-05, "loss": 0.8057, "step": 5520 }, { "epoch": 2.9642629441965034, "grad_norm": 0.65625, "learning_rate": 1.783081811583171e-05, "loss": 0.794, "step": 5521 }, { "epoch": 2.9647998389315795, "grad_norm": 0.64453125, "learning_rate": 1.7822728077100552e-05, "loss": 0.9761, "step": 5522 }, { "epoch": 2.9653367336666556, "grad_norm": 0.65234375, "learning_rate": 1.7814638857387015e-05, "loss": 0.8062, "step": 5523 }, { "epoch": 2.9658736284017317, "grad_norm": 0.75390625, "learning_rate": 1.7806550457614174e-05, "loss": 0.9379, "step": 5524 }, { "epoch": 2.9664105231368074, "grad_norm": 0.66015625, "learning_rate": 1.7798462878705025e-05, "loss": 0.8716, "step": 5525 }, { "epoch": 2.9669474178718835, "grad_norm": 0.6484375, "learning_rate": 1.7790376121582465e-05, "loss": 0.7737, "step": 5526 }, { "epoch": 2.9674843126069597, "grad_norm": 1.8828125, "learning_rate": 1.7782290187169294e-05, "loss": 0.9156, "step": 5527 }, { "epoch": 2.9680212073420353, "grad_norm": 0.640625, "learning_rate": 1.7774205076388206e-05, "loss": 0.9634, "step": 5528 }, { "epoch": 2.9685581020771115, "grad_norm": 0.6953125, "learning_rate": 1.7766120790161845e-05, "loss": 0.8445, "step": 5529 }, { "epoch": 2.9690949968121876, "grad_norm": 0.609375, "learning_rate": 1.77580373294127e-05, "loss": 0.726, "step": 5530 }, { "epoch": 2.9696318915472633, "grad_norm": 0.72265625, "learning_rate": 1.774995469506321e-05, "loss": 0.8796, "step": 5531 }, { "epoch": 2.9701687862823394, "grad_norm": 1.0390625, "learning_rate": 1.7741872888035702e-05, "loss": 1.0207, "step": 5532 }, { "epoch": 2.9707056810174155, "grad_norm": 0.7109375, "learning_rate": 1.773379190925241e-05, "loss": 0.8211, "step": 5533 }, { "epoch": 2.9712425757524916, "grad_norm": 0.68359375, "learning_rate": 1.7725711759635487e-05, "loss": 0.8441, "step": 5534 }, { "epoch": 2.9717794704875677, "grad_norm": 0.72265625, "learning_rate": 1.771763244010697e-05, "loss": 1.0759, "step": 5535 }, { "epoch": 2.9723163652226434, "grad_norm": 0.83203125, "learning_rate": 1.7709553951588813e-05, "loss": 0.7086, "step": 5536 }, { "epoch": 2.9728532599577195, "grad_norm": 0.70703125, "learning_rate": 1.770147629500288e-05, "loss": 0.9243, "step": 5537 }, { "epoch": 2.9733901546927957, "grad_norm": 0.6328125, "learning_rate": 1.7693399471270937e-05, "loss": 0.7882, "step": 5538 }, { "epoch": 2.9739270494278713, "grad_norm": 0.6484375, "learning_rate": 1.768532348131464e-05, "loss": 0.8293, "step": 5539 }, { "epoch": 2.9744639441629475, "grad_norm": 0.69921875, "learning_rate": 1.7677248326055564e-05, "loss": 0.7795, "step": 5540 }, { "epoch": 2.9750008388980236, "grad_norm": 0.7421875, "learning_rate": 1.7669174006415186e-05, "loss": 0.8374, "step": 5541 }, { "epoch": 2.9755377336330997, "grad_norm": 0.69921875, "learning_rate": 1.7661100523314894e-05, "loss": 0.8082, "step": 5542 }, { "epoch": 2.976074628368176, "grad_norm": 0.63671875, "learning_rate": 1.7653027877675976e-05, "loss": 0.692, "step": 5543 }, { "epoch": 2.9766115231032515, "grad_norm": 0.8125, "learning_rate": 1.764495607041961e-05, "loss": 1.015, "step": 5544 }, { "epoch": 2.9771484178383276, "grad_norm": 0.64453125, "learning_rate": 1.7636885102466907e-05, "loss": 0.7693, "step": 5545 }, { "epoch": 2.9776853125734037, "grad_norm": 0.55078125, "learning_rate": 1.7628814974738866e-05, "loss": 0.7557, "step": 5546 }, { "epoch": 2.9782222073084794, "grad_norm": 0.69140625, "learning_rate": 1.762074568815638e-05, "loss": 0.9253, "step": 5547 }, { "epoch": 2.9787591020435555, "grad_norm": 0.703125, "learning_rate": 1.761267724364026e-05, "loss": 0.8267, "step": 5548 }, { "epoch": 2.9792959967786317, "grad_norm": 0.6171875, "learning_rate": 1.7604609642111214e-05, "loss": 0.7756, "step": 5549 }, { "epoch": 2.9798328915137073, "grad_norm": 0.70703125, "learning_rate": 1.759654288448987e-05, "loss": 0.9439, "step": 5550 }, { "epoch": 2.9798328915137073, "eval_loss": 0.9031280875205994, "eval_runtime": 46.8345, "eval_samples_per_second": 20.54, "eval_steps_per_second": 20.54, "step": 5550 }, { "epoch": 2.9803697862487835, "grad_norm": 0.66796875, "learning_rate": 1.758847697169674e-05, "loss": 0.882, "step": 5551 }, { "epoch": 2.9809066809838596, "grad_norm": 0.73828125, "learning_rate": 1.758041190465224e-05, "loss": 0.9173, "step": 5552 }, { "epoch": 2.9814435757189357, "grad_norm": 0.7265625, "learning_rate": 1.7572347684276708e-05, "loss": 0.7461, "step": 5553 }, { "epoch": 2.981980470454012, "grad_norm": 0.75390625, "learning_rate": 1.7564284311490376e-05, "loss": 1.1007, "step": 5554 }, { "epoch": 2.9825173651890875, "grad_norm": 0.7421875, "learning_rate": 1.7556221787213363e-05, "loss": 0.7541, "step": 5555 }, { "epoch": 2.9830542599241636, "grad_norm": 0.671875, "learning_rate": 1.7548160112365713e-05, "loss": 0.9148, "step": 5556 }, { "epoch": 2.9835911546592397, "grad_norm": 0.609375, "learning_rate": 1.7540099287867358e-05, "loss": 0.8838, "step": 5557 }, { "epoch": 2.9841280493943154, "grad_norm": 0.68359375, "learning_rate": 1.7532039314638155e-05, "loss": 0.8607, "step": 5558 }, { "epoch": 2.9846649441293915, "grad_norm": 0.7890625, "learning_rate": 1.7523980193597836e-05, "loss": 1.0443, "step": 5559 }, { "epoch": 2.9852018388644677, "grad_norm": 0.81640625, "learning_rate": 1.7515921925666052e-05, "loss": 1.0869, "step": 5560 }, { "epoch": 2.985738733599544, "grad_norm": 0.63671875, "learning_rate": 1.7507864511762355e-05, "loss": 0.7088, "step": 5561 }, { "epoch": 2.98627562833462, "grad_norm": 0.66796875, "learning_rate": 1.74998079528062e-05, "loss": 0.874, "step": 5562 }, { "epoch": 2.9868125230696956, "grad_norm": 0.71484375, "learning_rate": 1.749175224971694e-05, "loss": 0.7606, "step": 5563 }, { "epoch": 2.9873494178047717, "grad_norm": 0.6640625, "learning_rate": 1.7483697403413836e-05, "loss": 0.6888, "step": 5564 }, { "epoch": 2.987886312539848, "grad_norm": 0.70703125, "learning_rate": 1.7475643414816033e-05, "loss": 0.9098, "step": 5565 }, { "epoch": 2.9884232072749235, "grad_norm": 0.9140625, "learning_rate": 1.7467590284842612e-05, "loss": 1.0294, "step": 5566 }, { "epoch": 2.9889601020099996, "grad_norm": 0.66015625, "learning_rate": 1.7459538014412528e-05, "loss": 0.7936, "step": 5567 }, { "epoch": 2.9894969967450757, "grad_norm": 0.6328125, "learning_rate": 1.745148660444465e-05, "loss": 0.8168, "step": 5568 }, { "epoch": 2.9900338914801514, "grad_norm": 0.72265625, "learning_rate": 1.7443436055857742e-05, "loss": 0.774, "step": 5569 }, { "epoch": 2.9905707862152275, "grad_norm": 0.67578125, "learning_rate": 1.7435386369570478e-05, "loss": 0.8802, "step": 5570 }, { "epoch": 2.9911076809503037, "grad_norm": 0.75390625, "learning_rate": 1.7427337546501435e-05, "loss": 0.8458, "step": 5571 }, { "epoch": 2.99164457568538, "grad_norm": 0.5859375, "learning_rate": 1.741928958756907e-05, "loss": 0.648, "step": 5572 }, { "epoch": 2.992181470420456, "grad_norm": 0.74609375, "learning_rate": 1.741124249369177e-05, "loss": 0.944, "step": 5573 }, { "epoch": 2.9927183651555316, "grad_norm": 0.62109375, "learning_rate": 1.7403196265787797e-05, "loss": 0.8706, "step": 5574 }, { "epoch": 2.9932552598906077, "grad_norm": 0.7109375, "learning_rate": 1.7395150904775338e-05, "loss": 0.8605, "step": 5575 }, { "epoch": 2.993792154625684, "grad_norm": 0.765625, "learning_rate": 1.7387106411572474e-05, "loss": 0.9711, "step": 5576 }, { "epoch": 2.9943290493607595, "grad_norm": 0.61328125, "learning_rate": 1.7379062787097167e-05, "loss": 0.8551, "step": 5577 }, { "epoch": 2.9948659440958356, "grad_norm": 0.64453125, "learning_rate": 1.7371020032267315e-05, "loss": 0.8882, "step": 5578 }, { "epoch": 2.9954028388309117, "grad_norm": 0.65625, "learning_rate": 1.736297814800069e-05, "loss": 0.9511, "step": 5579 }, { "epoch": 2.995939733565988, "grad_norm": 0.8046875, "learning_rate": 1.7354937135214983e-05, "loss": 1.235, "step": 5580 }, { "epoch": 2.996476628301064, "grad_norm": 0.9609375, "learning_rate": 1.7346896994827756e-05, "loss": 1.3069, "step": 5581 }, { "epoch": 2.9970135230361397, "grad_norm": 0.73828125, "learning_rate": 1.7338857727756495e-05, "loss": 0.8365, "step": 5582 }, { "epoch": 2.997550417771216, "grad_norm": 0.625, "learning_rate": 1.7330819334918595e-05, "loss": 0.692, "step": 5583 }, { "epoch": 2.998087312506292, "grad_norm": 0.66796875, "learning_rate": 1.732278181723133e-05, "loss": 0.7502, "step": 5584 }, { "epoch": 2.9986242072413676, "grad_norm": 0.6328125, "learning_rate": 1.731474517561188e-05, "loss": 0.7588, "step": 5585 }, { "epoch": 2.9991611019764437, "grad_norm": 0.67578125, "learning_rate": 1.7306709410977333e-05, "loss": 0.6996, "step": 5586 }, { "epoch": 2.99969799671152, "grad_norm": 0.78125, "learning_rate": 1.7298674524244673e-05, "loss": 0.8848, "step": 5587 }, { "epoch": 3.0, "grad_norm": 0.68359375, "learning_rate": 1.7290640516330786e-05, "loss": 0.4614, "step": 5588 }, { "epoch": 3.000536894735076, "grad_norm": 0.6328125, "learning_rate": 1.728260738815244e-05, "loss": 0.7966, "step": 5589 }, { "epoch": 3.001073789470152, "grad_norm": 0.609375, "learning_rate": 1.7274575140626318e-05, "loss": 0.8168, "step": 5590 }, { "epoch": 3.001610684205228, "grad_norm": 0.6328125, "learning_rate": 1.726654377466901e-05, "loss": 0.9058, "step": 5591 }, { "epoch": 3.002147578940304, "grad_norm": 0.76953125, "learning_rate": 1.7258513291196998e-05, "loss": 0.8953, "step": 5592 }, { "epoch": 3.00268447367538, "grad_norm": 0.69921875, "learning_rate": 1.725048369112665e-05, "loss": 0.8061, "step": 5593 }, { "epoch": 3.003221368410456, "grad_norm": 0.66015625, "learning_rate": 1.724245497537426e-05, "loss": 0.9316, "step": 5594 }, { "epoch": 3.003758263145532, "grad_norm": 1.1484375, "learning_rate": 1.7234427144855996e-05, "loss": 0.8271, "step": 5595 }, { "epoch": 3.004295157880608, "grad_norm": 0.63671875, "learning_rate": 1.7226400200487943e-05, "loss": 0.6916, "step": 5596 }, { "epoch": 3.004832052615684, "grad_norm": 0.69140625, "learning_rate": 1.721837414318607e-05, "loss": 0.6548, "step": 5597 }, { "epoch": 3.00536894735076, "grad_norm": 0.62109375, "learning_rate": 1.7210348973866243e-05, "loss": 0.9037, "step": 5598 }, { "epoch": 3.005905842085836, "grad_norm": 0.58984375, "learning_rate": 1.7202324693444256e-05, "loss": 0.8066, "step": 5599 }, { "epoch": 3.006442736820912, "grad_norm": 0.71875, "learning_rate": 1.7194301302835768e-05, "loss": 0.7934, "step": 5600 }, { "epoch": 3.006442736820912, "eval_loss": 0.9029846787452698, "eval_runtime": 47.4911, "eval_samples_per_second": 20.256, "eval_steps_per_second": 20.256, "step": 5600 }, { "epoch": 3.0069796315559882, "grad_norm": 0.6875, "learning_rate": 1.7186278802956353e-05, "loss": 0.7325, "step": 5601 }, { "epoch": 3.007516526291064, "grad_norm": 0.53515625, "learning_rate": 1.7178257194721477e-05, "loss": 0.7257, "step": 5602 }, { "epoch": 3.00805342102614, "grad_norm": 0.6484375, "learning_rate": 1.7170236479046516e-05, "loss": 0.8145, "step": 5603 }, { "epoch": 3.008590315761216, "grad_norm": 0.671875, "learning_rate": 1.716221665684672e-05, "loss": 0.8214, "step": 5604 }, { "epoch": 3.0091272104962923, "grad_norm": 0.640625, "learning_rate": 1.7154197729037274e-05, "loss": 0.6907, "step": 5605 }, { "epoch": 3.009664105231368, "grad_norm": 0.80078125, "learning_rate": 1.7146179696533212e-05, "loss": 0.6981, "step": 5606 }, { "epoch": 3.010200999966444, "grad_norm": 0.61328125, "learning_rate": 1.7138162560249517e-05, "loss": 0.7649, "step": 5607 }, { "epoch": 3.01073789470152, "grad_norm": 0.74609375, "learning_rate": 1.7130146321101032e-05, "loss": 0.7687, "step": 5608 }, { "epoch": 3.011274789436596, "grad_norm": 0.6953125, "learning_rate": 1.712213098000251e-05, "loss": 0.9491, "step": 5609 }, { "epoch": 3.011811684171672, "grad_norm": 0.73046875, "learning_rate": 1.711411653786861e-05, "loss": 0.8883, "step": 5610 }, { "epoch": 3.012348578906748, "grad_norm": 0.7109375, "learning_rate": 1.7106102995613888e-05, "loss": 0.8226, "step": 5611 }, { "epoch": 3.0128854736418242, "grad_norm": 0.6953125, "learning_rate": 1.709809035415278e-05, "loss": 0.6995, "step": 5612 }, { "epoch": 3.0134223683769, "grad_norm": 0.6796875, "learning_rate": 1.7090078614399635e-05, "loss": 0.8278, "step": 5613 }, { "epoch": 3.013959263111976, "grad_norm": 0.62890625, "learning_rate": 1.7082067777268678e-05, "loss": 0.9032, "step": 5614 }, { "epoch": 3.014496157847052, "grad_norm": 0.5234375, "learning_rate": 1.707405784367407e-05, "loss": 0.7106, "step": 5615 }, { "epoch": 3.0150330525821283, "grad_norm": 0.61328125, "learning_rate": 1.7066048814529834e-05, "loss": 0.6835, "step": 5616 }, { "epoch": 3.015569947317204, "grad_norm": 0.59375, "learning_rate": 1.7058040690749905e-05, "loss": 0.6236, "step": 5617 }, { "epoch": 3.01610684205228, "grad_norm": 0.8984375, "learning_rate": 1.7050033473248107e-05, "loss": 1.0277, "step": 5618 }, { "epoch": 3.016643736787356, "grad_norm": 0.625, "learning_rate": 1.704202716293817e-05, "loss": 0.9026, "step": 5619 }, { "epoch": 3.0171806315224323, "grad_norm": 0.76171875, "learning_rate": 1.703402176073371e-05, "loss": 0.78, "step": 5620 }, { "epoch": 3.017717526257508, "grad_norm": 0.63671875, "learning_rate": 1.702601726754825e-05, "loss": 0.8755, "step": 5621 }, { "epoch": 3.018254420992584, "grad_norm": 0.6328125, "learning_rate": 1.7018013684295212e-05, "loss": 0.6694, "step": 5622 }, { "epoch": 3.0187913157276602, "grad_norm": 0.58984375, "learning_rate": 1.7010011011887882e-05, "loss": 0.7758, "step": 5623 }, { "epoch": 3.0193282104627364, "grad_norm": 0.65234375, "learning_rate": 1.7002009251239488e-05, "loss": 0.7435, "step": 5624 }, { "epoch": 3.019865105197812, "grad_norm": 0.6484375, "learning_rate": 1.6994008403263122e-05, "loss": 0.7837, "step": 5625 }, { "epoch": 3.020401999932888, "grad_norm": 0.67578125, "learning_rate": 1.6986008468871783e-05, "loss": 0.904, "step": 5626 }, { "epoch": 3.0209388946679643, "grad_norm": 0.76953125, "learning_rate": 1.697800944897837e-05, "loss": 0.7706, "step": 5627 }, { "epoch": 3.02147578940304, "grad_norm": 0.8125, "learning_rate": 1.6970011344495667e-05, "loss": 0.999, "step": 5628 }, { "epoch": 3.022012684138116, "grad_norm": 0.6015625, "learning_rate": 1.696201415633636e-05, "loss": 0.9001, "step": 5629 }, { "epoch": 3.022549578873192, "grad_norm": 0.7265625, "learning_rate": 1.6954017885413044e-05, "loss": 0.9004, "step": 5630 }, { "epoch": 3.0230864736082683, "grad_norm": 0.6875, "learning_rate": 1.6946022532638163e-05, "loss": 0.8519, "step": 5631 }, { "epoch": 3.023623368343344, "grad_norm": 0.61328125, "learning_rate": 1.6938028098924117e-05, "loss": 0.8961, "step": 5632 }, { "epoch": 3.02416026307842, "grad_norm": 0.62890625, "learning_rate": 1.693003458518316e-05, "loss": 0.8072, "step": 5633 }, { "epoch": 3.0246971578134962, "grad_norm": 0.67578125, "learning_rate": 1.6922041992327447e-05, "loss": 0.8388, "step": 5634 }, { "epoch": 3.0252340525485724, "grad_norm": 0.69921875, "learning_rate": 1.6914050321269047e-05, "loss": 0.7467, "step": 5635 }, { "epoch": 3.025770947283648, "grad_norm": 0.7109375, "learning_rate": 1.6906059572919908e-05, "loss": 0.6684, "step": 5636 }, { "epoch": 3.026307842018724, "grad_norm": 0.70703125, "learning_rate": 1.6898069748191868e-05, "loss": 0.9435, "step": 5637 }, { "epoch": 3.0268447367538003, "grad_norm": 0.6328125, "learning_rate": 1.6890080847996686e-05, "loss": 0.8938, "step": 5638 }, { "epoch": 3.0273816314888764, "grad_norm": 0.9375, "learning_rate": 1.6882092873245968e-05, "loss": 0.7075, "step": 5639 }, { "epoch": 3.027918526223952, "grad_norm": 0.64453125, "learning_rate": 1.6874105824851267e-05, "loss": 0.7713, "step": 5640 }, { "epoch": 3.028455420959028, "grad_norm": 0.6875, "learning_rate": 1.686611970372399e-05, "loss": 0.9216, "step": 5641 }, { "epoch": 3.0289923156941043, "grad_norm": 0.61328125, "learning_rate": 1.6858134510775463e-05, "loss": 0.7349, "step": 5642 }, { "epoch": 3.0295292104291804, "grad_norm": 0.640625, "learning_rate": 1.68501502469169e-05, "loss": 0.8568, "step": 5643 }, { "epoch": 3.030066105164256, "grad_norm": 0.59765625, "learning_rate": 1.6842166913059402e-05, "loss": 0.9279, "step": 5644 }, { "epoch": 3.0306029998993322, "grad_norm": 0.6796875, "learning_rate": 1.683418451011397e-05, "loss": 0.8226, "step": 5645 }, { "epoch": 3.0311398946344084, "grad_norm": 0.6953125, "learning_rate": 1.6826203038991498e-05, "loss": 0.9037, "step": 5646 }, { "epoch": 3.031676789369484, "grad_norm": 0.7109375, "learning_rate": 1.6818222500602775e-05, "loss": 0.6855, "step": 5647 }, { "epoch": 3.03221368410456, "grad_norm": 0.69140625, "learning_rate": 1.681024289585848e-05, "loss": 0.8555, "step": 5648 }, { "epoch": 3.0327505788396363, "grad_norm": 0.5703125, "learning_rate": 1.680226422566918e-05, "loss": 0.732, "step": 5649 }, { "epoch": 3.0332874735747124, "grad_norm": 0.75390625, "learning_rate": 1.6794286490945342e-05, "loss": 0.7656, "step": 5650 }, { "epoch": 3.0332874735747124, "eval_loss": 0.9030352830886841, "eval_runtime": 47.4397, "eval_samples_per_second": 20.278, "eval_steps_per_second": 20.278, "step": 5650 }, { "epoch": 3.033824368309788, "grad_norm": 0.6640625, "learning_rate": 1.678630969259734e-05, "loss": 0.7524, "step": 5651 }, { "epoch": 3.034361263044864, "grad_norm": 0.73046875, "learning_rate": 1.677833383153542e-05, "loss": 0.7811, "step": 5652 }, { "epoch": 3.0348981577799403, "grad_norm": 0.65625, "learning_rate": 1.6770358908669732e-05, "loss": 0.8238, "step": 5653 }, { "epoch": 3.0354350525150164, "grad_norm": 0.90234375, "learning_rate": 1.6762384924910302e-05, "loss": 0.9373, "step": 5654 }, { "epoch": 3.035971947250092, "grad_norm": 0.77734375, "learning_rate": 1.6754411881167084e-05, "loss": 0.9124, "step": 5655 }, { "epoch": 3.0365088419851682, "grad_norm": 0.73828125, "learning_rate": 1.6746439778349888e-05, "loss": 0.8393, "step": 5656 }, { "epoch": 3.0370457367202444, "grad_norm": 0.83984375, "learning_rate": 1.6738468617368436e-05, "loss": 0.9503, "step": 5657 }, { "epoch": 3.0375826314553205, "grad_norm": 0.76171875, "learning_rate": 1.673049839913234e-05, "loss": 0.9281, "step": 5658 }, { "epoch": 3.038119526190396, "grad_norm": 0.56640625, "learning_rate": 1.6722529124551096e-05, "loss": 0.7913, "step": 5659 }, { "epoch": 3.0386564209254723, "grad_norm": 0.60546875, "learning_rate": 1.6714560794534108e-05, "loss": 0.7278, "step": 5660 }, { "epoch": 3.0391933156605484, "grad_norm": 0.71875, "learning_rate": 1.6706593409990663e-05, "loss": 0.9703, "step": 5661 }, { "epoch": 3.0397302103956245, "grad_norm": 0.72265625, "learning_rate": 1.6698626971829927e-05, "loss": 0.9035, "step": 5662 }, { "epoch": 3.0402671051307, "grad_norm": 0.734375, "learning_rate": 1.669066148096099e-05, "loss": 0.954, "step": 5663 }, { "epoch": 3.0408039998657763, "grad_norm": 0.66796875, "learning_rate": 1.6682696938292813e-05, "loss": 0.8916, "step": 5664 }, { "epoch": 3.0413408946008524, "grad_norm": 0.79296875, "learning_rate": 1.667473334473424e-05, "loss": 0.9074, "step": 5665 }, { "epoch": 3.041877789335928, "grad_norm": 0.66796875, "learning_rate": 1.6666770701194022e-05, "loss": 0.7338, "step": 5666 }, { "epoch": 3.0424146840710042, "grad_norm": 0.85546875, "learning_rate": 1.6658809008580795e-05, "loss": 0.9933, "step": 5667 }, { "epoch": 3.0429515788060804, "grad_norm": 0.69140625, "learning_rate": 1.6650848267803098e-05, "loss": 0.7882, "step": 5668 }, { "epoch": 3.0434884735411565, "grad_norm": 0.78125, "learning_rate": 1.6642888479769348e-05, "loss": 0.8334, "step": 5669 }, { "epoch": 3.044025368276232, "grad_norm": 0.7421875, "learning_rate": 1.6634929645387852e-05, "loss": 1.0332, "step": 5670 }, { "epoch": 3.0445622630113083, "grad_norm": 0.54296875, "learning_rate": 1.6626971765566822e-05, "loss": 0.7662, "step": 5671 }, { "epoch": 3.0450991577463844, "grad_norm": 0.71484375, "learning_rate": 1.6619014841214352e-05, "loss": 0.8795, "step": 5672 }, { "epoch": 3.0456360524814605, "grad_norm": 0.734375, "learning_rate": 1.6611058873238425e-05, "loss": 0.802, "step": 5673 }, { "epoch": 3.046172947216536, "grad_norm": 0.63671875, "learning_rate": 1.6603103862546913e-05, "loss": 0.7107, "step": 5674 }, { "epoch": 3.0467098419516123, "grad_norm": 0.6796875, "learning_rate": 1.659514981004759e-05, "loss": 0.7349, "step": 5675 }, { "epoch": 3.0472467366866884, "grad_norm": 0.6953125, "learning_rate": 1.6587196716648114e-05, "loss": 0.7133, "step": 5676 }, { "epoch": 3.0477836314217646, "grad_norm": 0.7109375, "learning_rate": 1.6579244583256028e-05, "loss": 0.7735, "step": 5677 }, { "epoch": 3.0483205261568402, "grad_norm": 0.828125, "learning_rate": 1.6571293410778778e-05, "loss": 0.8824, "step": 5678 }, { "epoch": 3.0488574208919164, "grad_norm": 0.63671875, "learning_rate": 1.656334320012369e-05, "loss": 0.9331, "step": 5679 }, { "epoch": 3.0493943156269925, "grad_norm": 0.625, "learning_rate": 1.6555393952197988e-05, "loss": 0.7424, "step": 5680 }, { "epoch": 3.0499312103620686, "grad_norm": 0.6015625, "learning_rate": 1.654744566790878e-05, "loss": 0.7131, "step": 5681 }, { "epoch": 3.0504681050971443, "grad_norm": 0.765625, "learning_rate": 1.653949834816306e-05, "loss": 0.8564, "step": 5682 }, { "epoch": 3.0510049998322204, "grad_norm": 0.66015625, "learning_rate": 1.6531551993867717e-05, "loss": 0.7673, "step": 5683 }, { "epoch": 3.0515418945672965, "grad_norm": 0.640625, "learning_rate": 1.6523606605929544e-05, "loss": 0.7774, "step": 5684 }, { "epoch": 3.052078789302372, "grad_norm": 0.71484375, "learning_rate": 1.6515662185255197e-05, "loss": 0.724, "step": 5685 }, { "epoch": 3.0526156840374483, "grad_norm": 0.75390625, "learning_rate": 1.650771873275124e-05, "loss": 0.8567, "step": 5686 }, { "epoch": 3.0531525787725244, "grad_norm": 0.71875, "learning_rate": 1.6499776249324123e-05, "loss": 0.892, "step": 5687 }, { "epoch": 3.0536894735076006, "grad_norm": 0.625, "learning_rate": 1.6491834735880185e-05, "loss": 0.8192, "step": 5688 }, { "epoch": 3.0542263682426762, "grad_norm": 0.63671875, "learning_rate": 1.6483894193325655e-05, "loss": 0.852, "step": 5689 }, { "epoch": 3.0547632629777524, "grad_norm": 0.6640625, "learning_rate": 1.6475954622566638e-05, "loss": 0.7537, "step": 5690 }, { "epoch": 3.0553001577128285, "grad_norm": 0.71484375, "learning_rate": 1.6468016024509145e-05, "loss": 0.9256, "step": 5691 }, { "epoch": 3.0558370524479046, "grad_norm": 0.72265625, "learning_rate": 1.6460078400059077e-05, "loss": 0.9278, "step": 5692 }, { "epoch": 3.0563739471829803, "grad_norm": 0.6015625, "learning_rate": 1.645214175012221e-05, "loss": 0.7843, "step": 5693 }, { "epoch": 3.0569108419180564, "grad_norm": 0.78125, "learning_rate": 1.6444206075604223e-05, "loss": 0.8648, "step": 5694 }, { "epoch": 3.0574477366531325, "grad_norm": 0.5546875, "learning_rate": 1.6436271377410666e-05, "loss": 0.6759, "step": 5695 }, { "epoch": 3.0579846313882086, "grad_norm": 0.63671875, "learning_rate": 1.6428337656447e-05, "loss": 0.6953, "step": 5696 }, { "epoch": 3.0585215261232843, "grad_norm": 0.640625, "learning_rate": 1.6420404913618563e-05, "loss": 0.8308, "step": 5697 }, { "epoch": 3.0590584208583604, "grad_norm": 0.64453125, "learning_rate": 1.6412473149830576e-05, "loss": 0.7212, "step": 5698 }, { "epoch": 3.0595953155934366, "grad_norm": 0.64453125, "learning_rate": 1.6404542365988154e-05, "loss": 0.6479, "step": 5699 }, { "epoch": 3.0601322103285127, "grad_norm": 0.75390625, "learning_rate": 1.6396612562996296e-05, "loss": 1.0536, "step": 5700 }, { "epoch": 3.0601322103285127, "eval_loss": 0.9032897353172302, "eval_runtime": 46.7649, "eval_samples_per_second": 20.571, "eval_steps_per_second": 20.571, "step": 5700 }, { "epoch": 3.0606691050635884, "grad_norm": 0.53515625, "learning_rate": 1.63886837417599e-05, "loss": 0.9007, "step": 5701 }, { "epoch": 3.0612059997986645, "grad_norm": 0.65625, "learning_rate": 1.6380755903183745e-05, "loss": 0.7438, "step": 5702 }, { "epoch": 3.0617428945337406, "grad_norm": 0.70703125, "learning_rate": 1.6372829048172488e-05, "loss": 0.9285, "step": 5703 }, { "epoch": 3.0622797892688163, "grad_norm": 0.75, "learning_rate": 1.6364903177630698e-05, "loss": 0.8741, "step": 5704 }, { "epoch": 3.0628166840038924, "grad_norm": 0.67578125, "learning_rate": 1.6356978292462817e-05, "loss": 0.8718, "step": 5705 }, { "epoch": 3.0633535787389685, "grad_norm": 0.6640625, "learning_rate": 1.6349054393573157e-05, "loss": 0.8836, "step": 5706 }, { "epoch": 3.0638904734740446, "grad_norm": 0.76953125, "learning_rate": 1.634113148186595e-05, "loss": 0.7811, "step": 5707 }, { "epoch": 3.0644273682091203, "grad_norm": 0.79296875, "learning_rate": 1.633320955824529e-05, "loss": 0.883, "step": 5708 }, { "epoch": 3.0649642629441964, "grad_norm": 0.6796875, "learning_rate": 1.632528862361518e-05, "loss": 1.0312, "step": 5709 }, { "epoch": 3.0655011576792726, "grad_norm": 0.6640625, "learning_rate": 1.6317368678879495e-05, "loss": 0.7349, "step": 5710 }, { "epoch": 3.0660380524143487, "grad_norm": 0.62890625, "learning_rate": 1.6309449724942e-05, "loss": 0.82, "step": 5711 }, { "epoch": 3.0665749471494244, "grad_norm": 0.7734375, "learning_rate": 1.6301531762706346e-05, "loss": 0.8962, "step": 5712 }, { "epoch": 3.0671118418845005, "grad_norm": 0.69921875, "learning_rate": 1.629361479307608e-05, "loss": 0.8272, "step": 5713 }, { "epoch": 3.0676487366195766, "grad_norm": 0.56640625, "learning_rate": 1.6285698816954624e-05, "loss": 0.7819, "step": 5714 }, { "epoch": 3.0681856313546527, "grad_norm": 0.66015625, "learning_rate": 1.6277783835245293e-05, "loss": 0.7803, "step": 5715 }, { "epoch": 3.0687225260897284, "grad_norm": 0.7109375, "learning_rate": 1.6269869848851273e-05, "loss": 1.0371, "step": 5716 }, { "epoch": 3.0692594208248045, "grad_norm": 0.65234375, "learning_rate": 1.6261956858675667e-05, "loss": 0.7819, "step": 5717 }, { "epoch": 3.0697963155598806, "grad_norm": 0.69140625, "learning_rate": 1.6254044865621442e-05, "loss": 0.8434, "step": 5718 }, { "epoch": 3.0703332102949563, "grad_norm": 0.671875, "learning_rate": 1.6246133870591455e-05, "loss": 0.8229, "step": 5719 }, { "epoch": 3.0708701050300324, "grad_norm": 0.734375, "learning_rate": 1.6238223874488452e-05, "loss": 0.9756, "step": 5720 }, { "epoch": 3.0714069997651086, "grad_norm": 0.72265625, "learning_rate": 1.623031487821507e-05, "loss": 0.9734, "step": 5721 }, { "epoch": 3.0719438945001847, "grad_norm": 0.66015625, "learning_rate": 1.622240688267382e-05, "loss": 0.9777, "step": 5722 }, { "epoch": 3.0724807892352604, "grad_norm": 0.6171875, "learning_rate": 1.62144998887671e-05, "loss": 0.7713, "step": 5723 }, { "epoch": 3.0730176839703365, "grad_norm": 0.57421875, "learning_rate": 1.6206593897397193e-05, "loss": 0.6509, "step": 5724 }, { "epoch": 3.0735545787054126, "grad_norm": 0.6953125, "learning_rate": 1.619868890946629e-05, "loss": 0.926, "step": 5725 }, { "epoch": 3.0740914734404887, "grad_norm": 0.71484375, "learning_rate": 1.6190784925876445e-05, "loss": 0.8084, "step": 5726 }, { "epoch": 3.0746283681755644, "grad_norm": 0.94921875, "learning_rate": 1.6182881947529593e-05, "loss": 0.8113, "step": 5727 }, { "epoch": 3.0751652629106405, "grad_norm": 0.63671875, "learning_rate": 1.6174979975327576e-05, "loss": 0.849, "step": 5728 }, { "epoch": 3.0757021576457166, "grad_norm": 0.71875, "learning_rate": 1.6167079010172102e-05, "loss": 0.8111, "step": 5729 }, { "epoch": 3.0762390523807928, "grad_norm": 0.6640625, "learning_rate": 1.615917905296477e-05, "loss": 0.754, "step": 5730 }, { "epoch": 3.0767759471158684, "grad_norm": 0.64453125, "learning_rate": 1.615128010460708e-05, "loss": 0.8334, "step": 5731 }, { "epoch": 3.0773128418509446, "grad_norm": 0.54296875, "learning_rate": 1.6143382166000375e-05, "loss": 0.6678, "step": 5732 }, { "epoch": 3.0778497365860207, "grad_norm": 0.6328125, "learning_rate": 1.6135485238045932e-05, "loss": 0.7403, "step": 5733 }, { "epoch": 3.078386631321097, "grad_norm": 0.76171875, "learning_rate": 1.6127589321644883e-05, "loss": 0.806, "step": 5734 }, { "epoch": 3.0789235260561725, "grad_norm": 0.71875, "learning_rate": 1.6119694417698246e-05, "loss": 0.8301, "step": 5735 }, { "epoch": 3.0794604207912486, "grad_norm": 0.72265625, "learning_rate": 1.611180052710694e-05, "loss": 0.7907, "step": 5736 }, { "epoch": 3.0799973155263247, "grad_norm": 0.671875, "learning_rate": 1.6103907650771757e-05, "loss": 0.8658, "step": 5737 }, { "epoch": 3.0805342102614004, "grad_norm": 0.65625, "learning_rate": 1.609601578959337e-05, "loss": 0.8728, "step": 5738 }, { "epoch": 3.0810711049964765, "grad_norm": 0.6484375, "learning_rate": 1.6088124944472343e-05, "loss": 0.8796, "step": 5739 }, { "epoch": 3.0816079997315526, "grad_norm": 0.60546875, "learning_rate": 1.6080235116309114e-05, "loss": 0.7863, "step": 5740 }, { "epoch": 3.0821448944666288, "grad_norm": 0.71484375, "learning_rate": 1.6072346306004023e-05, "loss": 0.6969, "step": 5741 }, { "epoch": 3.0826817892017044, "grad_norm": 0.7109375, "learning_rate": 1.6064458514457275e-05, "loss": 0.8227, "step": 5742 }, { "epoch": 3.0832186839367806, "grad_norm": 0.7890625, "learning_rate": 1.6056571742568972e-05, "loss": 0.9655, "step": 5743 }, { "epoch": 3.0837555786718567, "grad_norm": 0.55078125, "learning_rate": 1.6048685991239087e-05, "loss": 0.8102, "step": 5744 }, { "epoch": 3.084292473406933, "grad_norm": 0.63671875, "learning_rate": 1.6040801261367493e-05, "loss": 0.8922, "step": 5745 }, { "epoch": 3.0848293681420085, "grad_norm": 0.671875, "learning_rate": 1.6032917553853936e-05, "loss": 0.8152, "step": 5746 }, { "epoch": 3.0853662628770846, "grad_norm": 0.59765625, "learning_rate": 1.602503486959805e-05, "loss": 0.7547, "step": 5747 }, { "epoch": 3.0859031576121607, "grad_norm": 0.6640625, "learning_rate": 1.601715320949934e-05, "loss": 0.8982, "step": 5748 }, { "epoch": 3.086440052347237, "grad_norm": 0.7109375, "learning_rate": 1.6009272574457202e-05, "loss": 0.8848, "step": 5749 }, { "epoch": 3.0869769470823125, "grad_norm": 0.73828125, "learning_rate": 1.6001392965370925e-05, "loss": 0.7046, "step": 5750 }, { "epoch": 3.0869769470823125, "eval_loss": 0.9032167196273804, "eval_runtime": 46.6926, "eval_samples_per_second": 20.603, "eval_steps_per_second": 20.603, "step": 5750 }, { "epoch": 3.0875138418173886, "grad_norm": 0.69921875, "learning_rate": 1.5993514383139667e-05, "loss": 0.8378, "step": 5751 }, { "epoch": 3.0880507365524648, "grad_norm": 0.89453125, "learning_rate": 1.5985636828662476e-05, "loss": 1.0233, "step": 5752 }, { "epoch": 3.088587631287541, "grad_norm": 0.65625, "learning_rate": 1.5977760302838282e-05, "loss": 1.1289, "step": 5753 }, { "epoch": 3.0891245260226166, "grad_norm": 0.62109375, "learning_rate": 1.5969884806565895e-05, "loss": 0.6874, "step": 5754 }, { "epoch": 3.0896614207576927, "grad_norm": 0.69921875, "learning_rate": 1.5962010340744005e-05, "loss": 0.786, "step": 5755 }, { "epoch": 3.090198315492769, "grad_norm": 0.66796875, "learning_rate": 1.5954136906271205e-05, "loss": 0.7122, "step": 5756 }, { "epoch": 3.0907352102278445, "grad_norm": 0.640625, "learning_rate": 1.5946264504045927e-05, "loss": 1.106, "step": 5757 }, { "epoch": 3.0912721049629206, "grad_norm": 0.62890625, "learning_rate": 1.5938393134966535e-05, "loss": 0.9231, "step": 5758 }, { "epoch": 3.0918089996979967, "grad_norm": 0.6796875, "learning_rate": 1.5930522799931237e-05, "loss": 0.8559, "step": 5759 }, { "epoch": 3.092345894433073, "grad_norm": 0.7578125, "learning_rate": 1.5922653499838137e-05, "loss": 0.9821, "step": 5760 }, { "epoch": 3.0928827891681485, "grad_norm": 0.60546875, "learning_rate": 1.5914785235585234e-05, "loss": 0.8043, "step": 5761 }, { "epoch": 3.0934196839032246, "grad_norm": 0.70703125, "learning_rate": 1.5906918008070392e-05, "loss": 0.8762, "step": 5762 }, { "epoch": 3.0939565786383008, "grad_norm": 0.69140625, "learning_rate": 1.589905181819135e-05, "loss": 0.7984, "step": 5763 }, { "epoch": 3.094493473373377, "grad_norm": 0.6953125, "learning_rate": 1.5891186666845773e-05, "loss": 0.9015, "step": 5764 }, { "epoch": 3.0950303681084526, "grad_norm": 0.71875, "learning_rate": 1.5883322554931133e-05, "loss": 0.8705, "step": 5765 }, { "epoch": 3.0955672628435287, "grad_norm": 0.62109375, "learning_rate": 1.5875459483344845e-05, "loss": 0.6684, "step": 5766 }, { "epoch": 3.096104157578605, "grad_norm": 0.61328125, "learning_rate": 1.5867597452984183e-05, "loss": 0.7554, "step": 5767 }, { "epoch": 3.096641052313681, "grad_norm": 0.859375, "learning_rate": 1.5859736464746306e-05, "loss": 0.8085, "step": 5768 }, { "epoch": 3.0971779470487566, "grad_norm": 0.63671875, "learning_rate": 1.585187651952825e-05, "loss": 0.778, "step": 5769 }, { "epoch": 3.0977148417838327, "grad_norm": 0.6484375, "learning_rate": 1.5844017618226935e-05, "loss": 0.8396, "step": 5770 }, { "epoch": 3.098251736518909, "grad_norm": 0.6953125, "learning_rate": 1.5836159761739156e-05, "loss": 0.8293, "step": 5771 }, { "epoch": 3.098788631253985, "grad_norm": 0.7265625, "learning_rate": 1.58283029509616e-05, "loss": 0.8152, "step": 5772 }, { "epoch": 3.0993255259890606, "grad_norm": 0.70703125, "learning_rate": 1.5820447186790837e-05, "loss": 0.7627, "step": 5773 }, { "epoch": 3.0998624207241368, "grad_norm": 0.65625, "learning_rate": 1.5812592470123296e-05, "loss": 0.7837, "step": 5774 }, { "epoch": 3.100399315459213, "grad_norm": 0.67578125, "learning_rate": 1.5804738801855302e-05, "loss": 0.7875, "step": 5775 }, { "epoch": 3.1009362101942886, "grad_norm": 0.65625, "learning_rate": 1.5796886182883053e-05, "loss": 0.858, "step": 5776 }, { "epoch": 3.1014731049293647, "grad_norm": 0.72265625, "learning_rate": 1.5789034614102645e-05, "loss": 0.8946, "step": 5777 }, { "epoch": 3.102009999664441, "grad_norm": 0.8984375, "learning_rate": 1.5781184096410032e-05, "loss": 1.16, "step": 5778 }, { "epoch": 3.102546894399517, "grad_norm": 0.6640625, "learning_rate": 1.577333463070106e-05, "loss": 0.7316, "step": 5779 }, { "epoch": 3.1030837891345926, "grad_norm": 0.6484375, "learning_rate": 1.576548621787145e-05, "loss": 0.8395, "step": 5780 }, { "epoch": 3.1036206838696687, "grad_norm": 0.7109375, "learning_rate": 1.575763885881682e-05, "loss": 1.0934, "step": 5781 }, { "epoch": 3.104157578604745, "grad_norm": 0.69140625, "learning_rate": 1.5749792554432637e-05, "loss": 0.7515, "step": 5782 }, { "epoch": 3.104694473339821, "grad_norm": 0.68359375, "learning_rate": 1.5741947305614262e-05, "loss": 0.8488, "step": 5783 }, { "epoch": 3.1052313680748966, "grad_norm": 0.60546875, "learning_rate": 1.573410311325695e-05, "loss": 0.915, "step": 5784 }, { "epoch": 3.1057682628099728, "grad_norm": 0.71484375, "learning_rate": 1.572625997825581e-05, "loss": 0.8477, "step": 5785 }, { "epoch": 3.106305157545049, "grad_norm": 0.56640625, "learning_rate": 1.571841790150585e-05, "loss": 0.807, "step": 5786 }, { "epoch": 3.106842052280125, "grad_norm": 0.546875, "learning_rate": 1.571057688390195e-05, "loss": 0.747, "step": 5787 }, { "epoch": 3.1073789470152007, "grad_norm": 0.78125, "learning_rate": 1.5702736926338863e-05, "loss": 1.0878, "step": 5788 }, { "epoch": 3.107915841750277, "grad_norm": 0.70703125, "learning_rate": 1.5694898029711248e-05, "loss": 0.7316, "step": 5789 }, { "epoch": 3.108452736485353, "grad_norm": 0.875, "learning_rate": 1.5687060194913596e-05, "loss": 0.8267, "step": 5790 }, { "epoch": 3.108989631220429, "grad_norm": 0.6875, "learning_rate": 1.5679223422840318e-05, "loss": 0.9023, "step": 5791 }, { "epoch": 3.1095265259555047, "grad_norm": 0.7734375, "learning_rate": 1.5671387714385682e-05, "loss": 1.0515, "step": 5792 }, { "epoch": 3.110063420690581, "grad_norm": 0.609375, "learning_rate": 1.5663553070443846e-05, "loss": 0.6579, "step": 5793 }, { "epoch": 3.110600315425657, "grad_norm": 0.7734375, "learning_rate": 1.5655719491908843e-05, "loss": 0.8048, "step": 5794 }, { "epoch": 3.1111372101607326, "grad_norm": 0.69921875, "learning_rate": 1.564788697967458e-05, "loss": 0.9001, "step": 5795 }, { "epoch": 3.1116741048958088, "grad_norm": 0.71875, "learning_rate": 1.5640055534634846e-05, "loss": 0.9241, "step": 5796 }, { "epoch": 3.112210999630885, "grad_norm": 0.64453125, "learning_rate": 1.5632225157683312e-05, "loss": 0.8496, "step": 5797 }, { "epoch": 3.112747894365961, "grad_norm": 0.67578125, "learning_rate": 1.5624395849713527e-05, "loss": 0.9764, "step": 5798 }, { "epoch": 3.1132847891010367, "grad_norm": 0.75, "learning_rate": 1.5616567611618905e-05, "loss": 0.8557, "step": 5799 }, { "epoch": 3.113821683836113, "grad_norm": 0.70703125, "learning_rate": 1.5608740444292746e-05, "loss": 0.7297, "step": 5800 }, { "epoch": 3.113821683836113, "eval_loss": 0.9028341770172119, "eval_runtime": 46.5022, "eval_samples_per_second": 20.687, "eval_steps_per_second": 20.687, "step": 5800 }, { "epoch": 3.114358578571189, "grad_norm": 1.0078125, "learning_rate": 1.560091434862823e-05, "loss": 0.9834, "step": 5801 }, { "epoch": 3.114895473306265, "grad_norm": 0.62890625, "learning_rate": 1.5593089325518422e-05, "loss": 0.9395, "step": 5802 }, { "epoch": 3.1154323680413407, "grad_norm": 0.62109375, "learning_rate": 1.558526537585625e-05, "loss": 0.8744, "step": 5803 }, { "epoch": 3.115969262776417, "grad_norm": 0.7109375, "learning_rate": 1.5577442500534522e-05, "loss": 0.9399, "step": 5804 }, { "epoch": 3.116506157511493, "grad_norm": 0.71484375, "learning_rate": 1.5569620700445936e-05, "loss": 0.7778, "step": 5805 }, { "epoch": 3.117043052246569, "grad_norm": 0.70703125, "learning_rate": 1.556179997648306e-05, "loss": 0.9556, "step": 5806 }, { "epoch": 3.1175799469816448, "grad_norm": 0.61328125, "learning_rate": 1.5553980329538326e-05, "loss": 0.8476, "step": 5807 }, { "epoch": 3.118116841716721, "grad_norm": 0.78125, "learning_rate": 1.5546161760504063e-05, "loss": 0.8915, "step": 5808 }, { "epoch": 3.118653736451797, "grad_norm": 0.60546875, "learning_rate": 1.553834427027246e-05, "loss": 0.7585, "step": 5809 }, { "epoch": 3.119190631186873, "grad_norm": 0.734375, "learning_rate": 1.55305278597356e-05, "loss": 1.0599, "step": 5810 }, { "epoch": 3.119727525921949, "grad_norm": 0.6171875, "learning_rate": 1.552271252978544e-05, "loss": 0.7973, "step": 5811 }, { "epoch": 3.120264420657025, "grad_norm": 0.56640625, "learning_rate": 1.5514898281313785e-05, "loss": 0.786, "step": 5812 }, { "epoch": 3.120801315392101, "grad_norm": 0.6953125, "learning_rate": 1.5507085115212366e-05, "loss": 0.8865, "step": 5813 }, { "epoch": 3.1213382101271767, "grad_norm": 0.65625, "learning_rate": 1.549927303237275e-05, "loss": 0.9879, "step": 5814 }, { "epoch": 3.121875104862253, "grad_norm": 0.703125, "learning_rate": 1.549146203368641e-05, "loss": 0.7729, "step": 5815 }, { "epoch": 3.122411999597329, "grad_norm": 0.79296875, "learning_rate": 1.5483652120044658e-05, "loss": 1.182, "step": 5816 }, { "epoch": 3.122948894332405, "grad_norm": 0.6953125, "learning_rate": 1.547584329233871e-05, "loss": 0.8433, "step": 5817 }, { "epoch": 3.1234857890674808, "grad_norm": 0.78125, "learning_rate": 1.5468035551459663e-05, "loss": 0.7269, "step": 5818 }, { "epoch": 3.124022683802557, "grad_norm": 0.765625, "learning_rate": 1.546022889829847e-05, "loss": 0.9403, "step": 5819 }, { "epoch": 3.124559578537633, "grad_norm": 0.64453125, "learning_rate": 1.5452423333745964e-05, "loss": 0.6856, "step": 5820 }, { "epoch": 3.125096473272709, "grad_norm": 0.72265625, "learning_rate": 1.544461885869287e-05, "loss": 0.9602, "step": 5821 }, { "epoch": 3.125633368007785, "grad_norm": 0.66015625, "learning_rate": 1.5436815474029777e-05, "loss": 0.9313, "step": 5822 }, { "epoch": 3.126170262742861, "grad_norm": 0.625, "learning_rate": 1.542901318064715e-05, "loss": 0.8657, "step": 5823 }, { "epoch": 3.126707157477937, "grad_norm": 0.77734375, "learning_rate": 1.5421211979435324e-05, "loss": 0.8971, "step": 5824 }, { "epoch": 3.127244052213013, "grad_norm": 0.82421875, "learning_rate": 1.5413411871284514e-05, "loss": 0.9795, "step": 5825 }, { "epoch": 3.127780946948089, "grad_norm": 0.73046875, "learning_rate": 1.540561285708481e-05, "loss": 0.8813, "step": 5826 }, { "epoch": 3.128317841683165, "grad_norm": 0.7578125, "learning_rate": 1.5397814937726186e-05, "loss": 0.8421, "step": 5827 }, { "epoch": 3.128854736418241, "grad_norm": 0.609375, "learning_rate": 1.5390018114098475e-05, "loss": 0.6864, "step": 5828 }, { "epoch": 3.129391631153317, "grad_norm": 0.7734375, "learning_rate": 1.53822223870914e-05, "loss": 0.8245, "step": 5829 }, { "epoch": 3.129928525888393, "grad_norm": 0.76171875, "learning_rate": 1.5374427757594552e-05, "loss": 0.9671, "step": 5830 }, { "epoch": 3.130465420623469, "grad_norm": 0.671875, "learning_rate": 1.5366634226497402e-05, "loss": 0.8747, "step": 5831 }, { "epoch": 3.131002315358545, "grad_norm": 0.61328125, "learning_rate": 1.5358841794689274e-05, "loss": 0.8506, "step": 5832 }, { "epoch": 3.131539210093621, "grad_norm": 0.671875, "learning_rate": 1.5351050463059396e-05, "loss": 0.8789, "step": 5833 }, { "epoch": 3.132076104828697, "grad_norm": 0.66015625, "learning_rate": 1.534326023249685e-05, "loss": 0.8538, "step": 5834 }, { "epoch": 3.132612999563773, "grad_norm": 0.6640625, "learning_rate": 1.5335471103890603e-05, "loss": 0.7922, "step": 5835 }, { "epoch": 3.133149894298849, "grad_norm": 0.65625, "learning_rate": 1.53276830781295e-05, "loss": 0.7255, "step": 5836 }, { "epoch": 3.133686789033925, "grad_norm": 0.83984375, "learning_rate": 1.5319896156102244e-05, "loss": 0.772, "step": 5837 }, { "epoch": 3.134223683769001, "grad_norm": 0.75390625, "learning_rate": 1.5312110338697426e-05, "loss": 0.928, "step": 5838 }, { "epoch": 3.134760578504077, "grad_norm": 0.6328125, "learning_rate": 1.5304325626803507e-05, "loss": 0.6858, "step": 5839 }, { "epoch": 3.135297473239153, "grad_norm": 0.6640625, "learning_rate": 1.5296542021308825e-05, "loss": 0.9071, "step": 5840 }, { "epoch": 3.135834367974229, "grad_norm": 0.65625, "learning_rate": 1.5288759523101578e-05, "loss": 0.9167, "step": 5841 }, { "epoch": 3.136371262709305, "grad_norm": 0.6875, "learning_rate": 1.5280978133069846e-05, "loss": 0.8813, "step": 5842 }, { "epoch": 3.136908157444381, "grad_norm": 0.64453125, "learning_rate": 1.52731978521016e-05, "loss": 0.7688, "step": 5843 }, { "epoch": 3.1374450521794572, "grad_norm": 0.66015625, "learning_rate": 1.5265418681084655e-05, "loss": 0.917, "step": 5844 }, { "epoch": 3.137981946914533, "grad_norm": 0.69140625, "learning_rate": 1.525764062090671e-05, "loss": 0.7868, "step": 5845 }, { "epoch": 3.138518841649609, "grad_norm": 0.703125, "learning_rate": 1.5249863672455355e-05, "loss": 0.7626, "step": 5846 }, { "epoch": 3.139055736384685, "grad_norm": 0.71875, "learning_rate": 1.524208783661803e-05, "loss": 0.8455, "step": 5847 }, { "epoch": 3.1395926311197613, "grad_norm": 0.57421875, "learning_rate": 1.5234313114282061e-05, "loss": 0.7365, "step": 5848 }, { "epoch": 3.140129525854837, "grad_norm": 0.7109375, "learning_rate": 1.5226539506334636e-05, "loss": 0.9351, "step": 5849 }, { "epoch": 3.140666420589913, "grad_norm": 0.71484375, "learning_rate": 1.5218767013662816e-05, "loss": 0.7948, "step": 5850 }, { "epoch": 3.140666420589913, "eval_loss": 0.9028377532958984, "eval_runtime": 46.8928, "eval_samples_per_second": 20.515, "eval_steps_per_second": 20.515, "step": 5850 }, { "epoch": 3.141203315324989, "grad_norm": 0.6015625, "learning_rate": 1.5210995637153558e-05, "loss": 0.7551, "step": 5851 }, { "epoch": 3.141740210060065, "grad_norm": 0.625, "learning_rate": 1.5203225377693662e-05, "loss": 0.8331, "step": 5852 }, { "epoch": 3.142277104795141, "grad_norm": 0.6953125, "learning_rate": 1.5195456236169811e-05, "loss": 0.6731, "step": 5853 }, { "epoch": 3.142813999530217, "grad_norm": 0.67578125, "learning_rate": 1.5187688213468577e-05, "loss": 0.8433, "step": 5854 }, { "epoch": 3.1433508942652932, "grad_norm": 0.80078125, "learning_rate": 1.5179921310476375e-05, "loss": 0.7952, "step": 5855 }, { "epoch": 3.143887789000369, "grad_norm": 0.71484375, "learning_rate": 1.5172155528079512e-05, "loss": 0.8765, "step": 5856 }, { "epoch": 3.144424683735445, "grad_norm": 0.69140625, "learning_rate": 1.5164390867164177e-05, "loss": 0.9628, "step": 5857 }, { "epoch": 3.144961578470521, "grad_norm": 0.9140625, "learning_rate": 1.5156627328616385e-05, "loss": 0.7196, "step": 5858 }, { "epoch": 3.1454984732055973, "grad_norm": 0.66015625, "learning_rate": 1.5148864913322081e-05, "loss": 0.8912, "step": 5859 }, { "epoch": 3.146035367940673, "grad_norm": 0.828125, "learning_rate": 1.5141103622167041e-05, "loss": 1.2283, "step": 5860 }, { "epoch": 3.146572262675749, "grad_norm": 0.64453125, "learning_rate": 1.5133343456036925e-05, "loss": 0.6317, "step": 5861 }, { "epoch": 3.147109157410825, "grad_norm": 0.7265625, "learning_rate": 1.5125584415817279e-05, "loss": 0.7639, "step": 5862 }, { "epoch": 3.1476460521459013, "grad_norm": 0.61328125, "learning_rate": 1.5117826502393499e-05, "loss": 0.8705, "step": 5863 }, { "epoch": 3.148182946880977, "grad_norm": 0.6796875, "learning_rate": 1.511006971665086e-05, "loss": 0.9229, "step": 5864 }, { "epoch": 3.148719841616053, "grad_norm": 0.71875, "learning_rate": 1.5102314059474521e-05, "loss": 0.881, "step": 5865 }, { "epoch": 3.1492567363511292, "grad_norm": 0.6875, "learning_rate": 1.509455953174948e-05, "loss": 0.8269, "step": 5866 }, { "epoch": 3.1497936310862054, "grad_norm": 0.640625, "learning_rate": 1.5086806134360648e-05, "loss": 0.7855, "step": 5867 }, { "epoch": 3.150330525821281, "grad_norm": 0.734375, "learning_rate": 1.5079053868192771e-05, "loss": 0.9143, "step": 5868 }, { "epoch": 3.150867420556357, "grad_norm": 0.65234375, "learning_rate": 1.5071302734130489e-05, "loss": 0.7661, "step": 5869 }, { "epoch": 3.1514043152914333, "grad_norm": 0.625, "learning_rate": 1.5063552733058294e-05, "loss": 0.774, "step": 5870 }, { "epoch": 3.151941210026509, "grad_norm": 0.62890625, "learning_rate": 1.5055803865860576e-05, "loss": 0.7571, "step": 5871 }, { "epoch": 3.152478104761585, "grad_norm": 0.65625, "learning_rate": 1.5048056133421567e-05, "loss": 0.746, "step": 5872 }, { "epoch": 3.153014999496661, "grad_norm": 0.6875, "learning_rate": 1.5040309536625391e-05, "loss": 0.7903, "step": 5873 }, { "epoch": 3.1535518942317373, "grad_norm": 0.5859375, "learning_rate": 1.5032564076356023e-05, "loss": 0.695, "step": 5874 }, { "epoch": 3.154088788966813, "grad_norm": 0.62109375, "learning_rate": 1.5024819753497316e-05, "loss": 0.885, "step": 5875 }, { "epoch": 3.154625683701889, "grad_norm": 0.7890625, "learning_rate": 1.5017076568933003e-05, "loss": 0.97, "step": 5876 }, { "epoch": 3.1551625784369652, "grad_norm": 0.984375, "learning_rate": 1.5009334523546675e-05, "loss": 0.948, "step": 5877 }, { "epoch": 3.1556994731720414, "grad_norm": 0.9296875, "learning_rate": 1.5001593618221799e-05, "loss": 0.9747, "step": 5878 }, { "epoch": 3.156236367907117, "grad_norm": 0.77734375, "learning_rate": 1.4993853853841713e-05, "loss": 0.742, "step": 5879 }, { "epoch": 3.156773262642193, "grad_norm": 0.66015625, "learning_rate": 1.4986115231289622e-05, "loss": 0.7533, "step": 5880 }, { "epoch": 3.1573101573772693, "grad_norm": 0.6484375, "learning_rate": 1.4978377751448592e-05, "loss": 0.8585, "step": 5881 }, { "epoch": 3.1578470521123454, "grad_norm": 0.953125, "learning_rate": 1.4970641415201589e-05, "loss": 0.8325, "step": 5882 }, { "epoch": 3.158383946847421, "grad_norm": 0.80859375, "learning_rate": 1.4962906223431398e-05, "loss": 1.0475, "step": 5883 }, { "epoch": 3.158920841582497, "grad_norm": 0.7421875, "learning_rate": 1.4955172177020722e-05, "loss": 0.8778, "step": 5884 }, { "epoch": 3.1594577363175733, "grad_norm": 0.6484375, "learning_rate": 1.4947439276852104e-05, "loss": 0.71, "step": 5885 }, { "epoch": 3.1599946310526494, "grad_norm": 0.7265625, "learning_rate": 1.493970752380797e-05, "loss": 0.8112, "step": 5886 }, { "epoch": 3.160531525787725, "grad_norm": 0.70703125, "learning_rate": 1.4931976918770613e-05, "loss": 0.7708, "step": 5887 }, { "epoch": 3.1610684205228012, "grad_norm": 0.74609375, "learning_rate": 1.4924247462622193e-05, "loss": 0.8073, "step": 5888 }, { "epoch": 3.1616053152578774, "grad_norm": 0.76953125, "learning_rate": 1.491651915624473e-05, "loss": 0.8567, "step": 5889 }, { "epoch": 3.162142209992953, "grad_norm": 0.71484375, "learning_rate": 1.4908792000520141e-05, "loss": 0.922, "step": 5890 }, { "epoch": 3.162679104728029, "grad_norm": 0.70703125, "learning_rate": 1.4901065996330166e-05, "loss": 0.8179, "step": 5891 }, { "epoch": 3.1632159994631053, "grad_norm": 0.7265625, "learning_rate": 1.4893341144556461e-05, "loss": 0.8959, "step": 5892 }, { "epoch": 3.1637528941981814, "grad_norm": 0.66796875, "learning_rate": 1.4885617446080518e-05, "loss": 0.7958, "step": 5893 }, { "epoch": 3.164289788933257, "grad_norm": 0.71875, "learning_rate": 1.4877894901783712e-05, "loss": 0.7645, "step": 5894 }, { "epoch": 3.164826683668333, "grad_norm": 0.81640625, "learning_rate": 1.4870173512547287e-05, "loss": 1.0871, "step": 5895 }, { "epoch": 3.1653635784034093, "grad_norm": 0.58203125, "learning_rate": 1.4862453279252348e-05, "loss": 0.8, "step": 5896 }, { "epoch": 3.1659004731384854, "grad_norm": 0.640625, "learning_rate": 1.4854734202779865e-05, "loss": 0.7329, "step": 5897 }, { "epoch": 3.166437367873561, "grad_norm": 0.6484375, "learning_rate": 1.4847016284010695e-05, "loss": 0.7705, "step": 5898 }, { "epoch": 3.1669742626086372, "grad_norm": 0.60546875, "learning_rate": 1.483929952382555e-05, "loss": 0.7396, "step": 5899 }, { "epoch": 3.1675111573437134, "grad_norm": 0.83984375, "learning_rate": 1.4831583923104999e-05, "loss": 0.7877, "step": 5900 }, { "epoch": 3.1675111573437134, "eval_loss": 0.9030345678329468, "eval_runtime": 46.1703, "eval_samples_per_second": 20.836, "eval_steps_per_second": 20.836, "step": 5900 }, { "epoch": 3.1680480520787895, "grad_norm": 0.82421875, "learning_rate": 1.4823869482729493e-05, "loss": 0.8308, "step": 5901 }, { "epoch": 3.168584946813865, "grad_norm": 0.5703125, "learning_rate": 1.4816156203579346e-05, "loss": 0.7827, "step": 5902 }, { "epoch": 3.1691218415489413, "grad_norm": 0.66796875, "learning_rate": 1.480844408653475e-05, "loss": 0.7977, "step": 5903 }, { "epoch": 3.1696587362840174, "grad_norm": 0.6015625, "learning_rate": 1.4800733132475747e-05, "loss": 0.8653, "step": 5904 }, { "epoch": 3.1701956310190935, "grad_norm": 0.80078125, "learning_rate": 1.4793023342282258e-05, "loss": 0.7437, "step": 5905 }, { "epoch": 3.170732525754169, "grad_norm": 0.83984375, "learning_rate": 1.4785314716834065e-05, "loss": 0.9841, "step": 5906 }, { "epoch": 3.1712694204892453, "grad_norm": 0.65234375, "learning_rate": 1.4777607257010826e-05, "loss": 0.8282, "step": 5907 }, { "epoch": 3.1718063152243214, "grad_norm": 0.79296875, "learning_rate": 1.4769900963692051e-05, "loss": 0.8326, "step": 5908 }, { "epoch": 3.172343209959397, "grad_norm": 0.69921875, "learning_rate": 1.476219583775713e-05, "loss": 0.9247, "step": 5909 }, { "epoch": 3.1728801046944732, "grad_norm": 0.72265625, "learning_rate": 1.475449188008532e-05, "loss": 1.0606, "step": 5910 }, { "epoch": 3.1734169994295494, "grad_norm": 0.74609375, "learning_rate": 1.4746789091555724e-05, "loss": 0.7653, "step": 5911 }, { "epoch": 3.1739538941646255, "grad_norm": 0.72265625, "learning_rate": 1.4739087473047347e-05, "loss": 0.8635, "step": 5912 }, { "epoch": 3.174490788899701, "grad_norm": 0.7734375, "learning_rate": 1.473138702543903e-05, "loss": 0.8409, "step": 5913 }, { "epoch": 3.1750276836347773, "grad_norm": 0.82421875, "learning_rate": 1.4723687749609493e-05, "loss": 0.8368, "step": 5914 }, { "epoch": 3.1755645783698534, "grad_norm": 0.9609375, "learning_rate": 1.4715989646437338e-05, "loss": 0.9605, "step": 5915 }, { "epoch": 3.1761014731049295, "grad_norm": 0.62109375, "learning_rate": 1.4708292716800987e-05, "loss": 0.7128, "step": 5916 }, { "epoch": 3.176638367840005, "grad_norm": 0.74609375, "learning_rate": 1.4700596961578771e-05, "loss": 0.8462, "step": 5917 }, { "epoch": 3.1771752625750813, "grad_norm": 0.7578125, "learning_rate": 1.4692902381648877e-05, "loss": 0.8852, "step": 5918 }, { "epoch": 3.1777121573101574, "grad_norm": 0.8125, "learning_rate": 1.4685208977889344e-05, "loss": 0.7421, "step": 5919 }, { "epoch": 3.1782490520452336, "grad_norm": 0.66796875, "learning_rate": 1.46775167511781e-05, "loss": 0.7741, "step": 5920 }, { "epoch": 3.1787859467803092, "grad_norm": 0.61328125, "learning_rate": 1.4669825702392915e-05, "loss": 0.7951, "step": 5921 }, { "epoch": 3.1793228415153854, "grad_norm": 0.75, "learning_rate": 1.4662135832411436e-05, "loss": 0.9868, "step": 5922 }, { "epoch": 3.1798597362504615, "grad_norm": 0.703125, "learning_rate": 1.465444714211118e-05, "loss": 0.8983, "step": 5923 }, { "epoch": 3.1803966309855376, "grad_norm": 0.8359375, "learning_rate": 1.4646759632369528e-05, "loss": 0.9055, "step": 5924 }, { "epoch": 3.1809335257206133, "grad_norm": 0.61328125, "learning_rate": 1.4639073304063708e-05, "loss": 0.7099, "step": 5925 }, { "epoch": 3.1814704204556894, "grad_norm": 0.6328125, "learning_rate": 1.4631388158070836e-05, "loss": 0.8191, "step": 5926 }, { "epoch": 3.1820073151907655, "grad_norm": 0.87890625, "learning_rate": 1.4623704195267875e-05, "loss": 0.9476, "step": 5927 }, { "epoch": 3.182544209925841, "grad_norm": 0.703125, "learning_rate": 1.4616021416531678e-05, "loss": 0.8805, "step": 5928 }, { "epoch": 3.1830811046609173, "grad_norm": 0.6875, "learning_rate": 1.4608339822738942e-05, "loss": 0.8201, "step": 5929 }, { "epoch": 3.1836179993959934, "grad_norm": 0.71484375, "learning_rate": 1.4600659414766224e-05, "loss": 0.6948, "step": 5930 }, { "epoch": 3.1841548941310696, "grad_norm": 0.890625, "learning_rate": 1.4592980193489975e-05, "loss": 0.9722, "step": 5931 }, { "epoch": 3.1846917888661452, "grad_norm": 0.86328125, "learning_rate": 1.4585302159786478e-05, "loss": 0.9052, "step": 5932 }, { "epoch": 3.1852286836012214, "grad_norm": 0.71875, "learning_rate": 1.457762531453189e-05, "loss": 1.1633, "step": 5933 }, { "epoch": 3.1857655783362975, "grad_norm": 0.640625, "learning_rate": 1.4569949658602238e-05, "loss": 0.736, "step": 5934 }, { "epoch": 3.1863024730713736, "grad_norm": 0.5703125, "learning_rate": 1.4562275192873428e-05, "loss": 0.7624, "step": 5935 }, { "epoch": 3.1868393678064493, "grad_norm": 0.65234375, "learning_rate": 1.4554601918221192e-05, "loss": 0.7997, "step": 5936 }, { "epoch": 3.1873762625415254, "grad_norm": 0.77734375, "learning_rate": 1.4546929835521155e-05, "loss": 1.0569, "step": 5937 }, { "epoch": 3.1879131572766015, "grad_norm": 0.84375, "learning_rate": 1.4539258945648814e-05, "loss": 0.8691, "step": 5938 }, { "epoch": 3.1884500520116776, "grad_norm": 0.85546875, "learning_rate": 1.453158924947949e-05, "loss": 0.8963, "step": 5939 }, { "epoch": 3.1889869467467533, "grad_norm": 0.6484375, "learning_rate": 1.4523920747888406e-05, "loss": 0.7196, "step": 5940 }, { "epoch": 3.1895238414818294, "grad_norm": 0.609375, "learning_rate": 1.4516253441750655e-05, "loss": 0.7365, "step": 5941 }, { "epoch": 3.1900607362169056, "grad_norm": 0.78125, "learning_rate": 1.450858733194113e-05, "loss": 0.9657, "step": 5942 }, { "epoch": 3.1905976309519817, "grad_norm": 0.72265625, "learning_rate": 1.450092241933465e-05, "loss": 0.8072, "step": 5943 }, { "epoch": 3.1911345256870574, "grad_norm": 0.93359375, "learning_rate": 1.4493258704805896e-05, "loss": 0.8625, "step": 5944 }, { "epoch": 3.1916714204221335, "grad_norm": 0.73046875, "learning_rate": 1.4485596189229373e-05, "loss": 0.7525, "step": 5945 }, { "epoch": 3.1922083151572096, "grad_norm": 0.640625, "learning_rate": 1.4477934873479473e-05, "loss": 0.7938, "step": 5946 }, { "epoch": 3.1927452098922853, "grad_norm": 0.69140625, "learning_rate": 1.4470274758430468e-05, "loss": 0.8503, "step": 5947 }, { "epoch": 3.1932821046273614, "grad_norm": 0.6953125, "learning_rate": 1.4462615844956452e-05, "loss": 0.9595, "step": 5948 }, { "epoch": 3.1938189993624375, "grad_norm": 0.64453125, "learning_rate": 1.4454958133931423e-05, "loss": 0.7076, "step": 5949 }, { "epoch": 3.1943558940975136, "grad_norm": 0.7265625, "learning_rate": 1.4447301626229204e-05, "loss": 0.8918, "step": 5950 }, { "epoch": 3.1943558940975136, "eval_loss": 0.9028350114822388, "eval_runtime": 46.1726, "eval_samples_per_second": 20.835, "eval_steps_per_second": 20.835, "step": 5950 }, { "epoch": 3.1948927888325893, "grad_norm": 0.67578125, "learning_rate": 1.4439646322723516e-05, "loss": 0.895, "step": 5951 }, { "epoch": 3.1954296835676654, "grad_norm": 0.7578125, "learning_rate": 1.443199222428791e-05, "loss": 0.9363, "step": 5952 }, { "epoch": 3.1959665783027416, "grad_norm": 0.69140625, "learning_rate": 1.4424339331795824e-05, "loss": 0.9611, "step": 5953 }, { "epoch": 3.1965034730378177, "grad_norm": 0.671875, "learning_rate": 1.441668764612056e-05, "loss": 0.7588, "step": 5954 }, { "epoch": 3.1970403677728934, "grad_norm": 0.64453125, "learning_rate": 1.4409037168135254e-05, "loss": 0.8395, "step": 5955 }, { "epoch": 3.1975772625079695, "grad_norm": 0.60546875, "learning_rate": 1.4401387898712931e-05, "loss": 1.038, "step": 5956 }, { "epoch": 3.1981141572430456, "grad_norm": 0.7109375, "learning_rate": 1.4393739838726486e-05, "loss": 0.822, "step": 5957 }, { "epoch": 3.1986510519781217, "grad_norm": 0.67578125, "learning_rate": 1.4386092989048625e-05, "loss": 1.0552, "step": 5958 }, { "epoch": 3.1991879467131974, "grad_norm": 0.71484375, "learning_rate": 1.4378447350551968e-05, "loss": 0.789, "step": 5959 }, { "epoch": 3.1997248414482735, "grad_norm": 0.6328125, "learning_rate": 1.437080292410899e-05, "loss": 0.897, "step": 5960 }, { "epoch": 3.2002617361833496, "grad_norm": 0.6328125, "learning_rate": 1.4363159710591995e-05, "loss": 0.7468, "step": 5961 }, { "epoch": 3.2007986309184258, "grad_norm": 0.66796875, "learning_rate": 1.4355517710873184e-05, "loss": 0.8809, "step": 5962 }, { "epoch": 3.2013355256535014, "grad_norm": 0.8984375, "learning_rate": 1.434787692582461e-05, "loss": 0.8277, "step": 5963 }, { "epoch": 3.2018724203885776, "grad_norm": 0.75, "learning_rate": 1.434023735631817e-05, "loss": 0.8399, "step": 5964 }, { "epoch": 3.2024093151236537, "grad_norm": 0.84375, "learning_rate": 1.4332599003225638e-05, "loss": 1.0324, "step": 5965 }, { "epoch": 3.2029462098587294, "grad_norm": 0.734375, "learning_rate": 1.4324961867418668e-05, "loss": 0.7748, "step": 5966 }, { "epoch": 3.2034831045938055, "grad_norm": 0.7421875, "learning_rate": 1.4317325949768734e-05, "loss": 1.0084, "step": 5967 }, { "epoch": 3.2040199993288816, "grad_norm": 1.046875, "learning_rate": 1.4309691251147183e-05, "loss": 0.7897, "step": 5968 }, { "epoch": 3.2045568940639577, "grad_norm": 0.67578125, "learning_rate": 1.4302057772425246e-05, "loss": 0.817, "step": 5969 }, { "epoch": 3.2050937887990334, "grad_norm": 0.69140625, "learning_rate": 1.4294425514474002e-05, "loss": 0.9683, "step": 5970 }, { "epoch": 3.2056306835341095, "grad_norm": 0.63671875, "learning_rate": 1.4286794478164373e-05, "loss": 0.7078, "step": 5971 }, { "epoch": 3.2061675782691856, "grad_norm": 0.68359375, "learning_rate": 1.4279164664367179e-05, "loss": 0.8567, "step": 5972 }, { "epoch": 3.2067044730042618, "grad_norm": 0.67578125, "learning_rate": 1.4271536073953051e-05, "loss": 0.8417, "step": 5973 }, { "epoch": 3.2072413677393374, "grad_norm": 0.7734375, "learning_rate": 1.4263908707792534e-05, "loss": 0.9633, "step": 5974 }, { "epoch": 3.2077782624744136, "grad_norm": 0.59765625, "learning_rate": 1.4256282566755983e-05, "loss": 0.7639, "step": 5975 }, { "epoch": 3.2083151572094897, "grad_norm": 0.69140625, "learning_rate": 1.4248657651713659e-05, "loss": 0.8073, "step": 5976 }, { "epoch": 3.208852051944566, "grad_norm": 0.5859375, "learning_rate": 1.4241033963535644e-05, "loss": 0.567, "step": 5977 }, { "epoch": 3.2093889466796415, "grad_norm": 0.62109375, "learning_rate": 1.4233411503091904e-05, "loss": 0.7537, "step": 5978 }, { "epoch": 3.2099258414147176, "grad_norm": 0.8203125, "learning_rate": 1.422579027125227e-05, "loss": 0.951, "step": 5979 }, { "epoch": 3.2104627361497937, "grad_norm": 0.5390625, "learning_rate": 1.42181702688864e-05, "loss": 0.7827, "step": 5980 }, { "epoch": 3.21099963088487, "grad_norm": 0.63671875, "learning_rate": 1.4210551496863845e-05, "loss": 0.8011, "step": 5981 }, { "epoch": 3.2115365256199455, "grad_norm": 0.625, "learning_rate": 1.4202933956054015e-05, "loss": 0.8767, "step": 5982 }, { "epoch": 3.2120734203550216, "grad_norm": 0.62890625, "learning_rate": 1.419531764732614e-05, "loss": 0.8392, "step": 5983 }, { "epoch": 3.2126103150900978, "grad_norm": 0.82421875, "learning_rate": 1.4187702571549349e-05, "loss": 0.8689, "step": 5984 }, { "epoch": 3.2131472098251734, "grad_norm": 0.6796875, "learning_rate": 1.4180088729592633e-05, "loss": 0.8801, "step": 5985 }, { "epoch": 3.2136841045602496, "grad_norm": 0.76171875, "learning_rate": 1.4172476122324806e-05, "loss": 0.9403, "step": 5986 }, { "epoch": 3.2142209992953257, "grad_norm": 0.625, "learning_rate": 1.4164864750614576e-05, "loss": 0.7572, "step": 5987 }, { "epoch": 3.214757894030402, "grad_norm": 0.671875, "learning_rate": 1.4157254615330501e-05, "loss": 0.8931, "step": 5988 }, { "epoch": 3.2152947887654775, "grad_norm": 0.73828125, "learning_rate": 1.4149645717340975e-05, "loss": 0.9601, "step": 5989 }, { "epoch": 3.2158316835005536, "grad_norm": 0.796875, "learning_rate": 1.4142038057514284e-05, "loss": 1.0164, "step": 5990 }, { "epoch": 3.2163685782356297, "grad_norm": 0.8828125, "learning_rate": 1.4134431636718564e-05, "loss": 0.9013, "step": 5991 }, { "epoch": 3.216905472970706, "grad_norm": 0.67578125, "learning_rate": 1.4126826455821796e-05, "loss": 0.9508, "step": 5992 }, { "epoch": 3.2174423677057815, "grad_norm": 0.6875, "learning_rate": 1.4119222515691816e-05, "loss": 0.8455, "step": 5993 }, { "epoch": 3.2179792624408576, "grad_norm": 0.66015625, "learning_rate": 1.4111619817196341e-05, "loss": 0.7485, "step": 5994 }, { "epoch": 3.2185161571759338, "grad_norm": 0.68359375, "learning_rate": 1.4104018361202948e-05, "loss": 0.8297, "step": 5995 }, { "epoch": 3.21905305191101, "grad_norm": 0.7421875, "learning_rate": 1.4096418148579032e-05, "loss": 0.7641, "step": 5996 }, { "epoch": 3.2195899466460856, "grad_norm": 0.625, "learning_rate": 1.4088819180191889e-05, "loss": 0.7851, "step": 5997 }, { "epoch": 3.2201268413811617, "grad_norm": 0.7734375, "learning_rate": 1.4081221456908666e-05, "loss": 0.6875, "step": 5998 }, { "epoch": 3.220663736116238, "grad_norm": 0.74609375, "learning_rate": 1.4073624979596354e-05, "loss": 0.9539, "step": 5999 }, { "epoch": 3.221200630851314, "grad_norm": 0.7578125, "learning_rate": 1.406602974912179e-05, "loss": 0.8123, "step": 6000 }, { "epoch": 3.221200630851314, "eval_loss": 0.9029889106750488, "eval_runtime": 46.1284, "eval_samples_per_second": 20.855, "eval_steps_per_second": 20.855, "step": 6000 }, { "epoch": 3.2217375255863896, "grad_norm": 0.609375, "learning_rate": 1.405843576635171e-05, "loss": 0.7553, "step": 6001 }, { "epoch": 3.2222744203214657, "grad_norm": 0.6796875, "learning_rate": 1.4050843032152661e-05, "loss": 0.8135, "step": 6002 }, { "epoch": 3.222811315056542, "grad_norm": 0.62109375, "learning_rate": 1.4043251547391084e-05, "loss": 0.8064, "step": 6003 }, { "epoch": 3.2233482097916175, "grad_norm": 0.671875, "learning_rate": 1.4035661312933273e-05, "loss": 0.8289, "step": 6004 }, { "epoch": 3.2238851045266936, "grad_norm": 0.875, "learning_rate": 1.4028072329645345e-05, "loss": 0.8678, "step": 6005 }, { "epoch": 3.2244219992617698, "grad_norm": 0.67578125, "learning_rate": 1.4020484598393314e-05, "loss": 0.8762, "step": 6006 }, { "epoch": 3.224958893996846, "grad_norm": 0.73828125, "learning_rate": 1.4012898120043045e-05, "loss": 0.893, "step": 6007 }, { "epoch": 3.2254957887319216, "grad_norm": 0.69140625, "learning_rate": 1.4005312895460237e-05, "loss": 0.8491, "step": 6008 }, { "epoch": 3.2260326834669977, "grad_norm": 0.73828125, "learning_rate": 1.3997728925510458e-05, "loss": 0.8506, "step": 6009 }, { "epoch": 3.226569578202074, "grad_norm": 0.61328125, "learning_rate": 1.399014621105914e-05, "loss": 0.9122, "step": 6010 }, { "epoch": 3.22710647293715, "grad_norm": 0.7578125, "learning_rate": 1.3982564752971577e-05, "loss": 0.8655, "step": 6011 }, { "epoch": 3.2276433676722256, "grad_norm": 0.625, "learning_rate": 1.3974984552112894e-05, "loss": 0.6847, "step": 6012 }, { "epoch": 3.2281802624073017, "grad_norm": 0.84765625, "learning_rate": 1.3967405609348099e-05, "loss": 0.9188, "step": 6013 }, { "epoch": 3.228717157142378, "grad_norm": 0.66015625, "learning_rate": 1.3959827925542034e-05, "loss": 0.8532, "step": 6014 }, { "epoch": 3.229254051877454, "grad_norm": 0.63671875, "learning_rate": 1.3952251501559416e-05, "loss": 0.6954, "step": 6015 }, { "epoch": 3.2297909466125296, "grad_norm": 0.76171875, "learning_rate": 1.3944676338264816e-05, "loss": 0.83, "step": 6016 }, { "epoch": 3.2303278413476058, "grad_norm": 0.71484375, "learning_rate": 1.3937102436522654e-05, "loss": 0.8042, "step": 6017 }, { "epoch": 3.230864736082682, "grad_norm": 0.66015625, "learning_rate": 1.3929529797197193e-05, "loss": 0.8731, "step": 6018 }, { "epoch": 3.231401630817758, "grad_norm": 0.63671875, "learning_rate": 1.3921958421152576e-05, "loss": 0.8801, "step": 6019 }, { "epoch": 3.2319385255528337, "grad_norm": 1.2265625, "learning_rate": 1.3914388309252807e-05, "loss": 0.8929, "step": 6020 }, { "epoch": 3.23247542028791, "grad_norm": 0.62890625, "learning_rate": 1.3906819462361709e-05, "loss": 0.9557, "step": 6021 }, { "epoch": 3.233012315022986, "grad_norm": 0.73046875, "learning_rate": 1.3899251881342994e-05, "loss": 1.0061, "step": 6022 }, { "epoch": 3.2335492097580616, "grad_norm": 0.6875, "learning_rate": 1.389168556706023e-05, "loss": 0.804, "step": 6023 }, { "epoch": 3.2340861044931377, "grad_norm": 0.67578125, "learning_rate": 1.388412052037682e-05, "loss": 0.7606, "step": 6024 }, { "epoch": 3.234622999228214, "grad_norm": 0.72265625, "learning_rate": 1.387655674215602e-05, "loss": 0.9004, "step": 6025 }, { "epoch": 3.23515989396329, "grad_norm": 0.703125, "learning_rate": 1.3868994233260969e-05, "loss": 0.7433, "step": 6026 }, { "epoch": 3.2356967886983656, "grad_norm": 0.7265625, "learning_rate": 1.386143299455463e-05, "loss": 0.7762, "step": 6027 }, { "epoch": 3.2362336834334418, "grad_norm": 0.69921875, "learning_rate": 1.3853873026899846e-05, "loss": 0.7771, "step": 6028 }, { "epoch": 3.236770578168518, "grad_norm": 0.62109375, "learning_rate": 1.3846314331159313e-05, "loss": 0.8065, "step": 6029 }, { "epoch": 3.237307472903594, "grad_norm": 0.65625, "learning_rate": 1.3838756908195555e-05, "loss": 0.6639, "step": 6030 }, { "epoch": 3.2378443676386697, "grad_norm": 0.79296875, "learning_rate": 1.383120075887098e-05, "loss": 0.991, "step": 6031 }, { "epoch": 3.238381262373746, "grad_norm": 0.6484375, "learning_rate": 1.3823645884047847e-05, "loss": 0.7454, "step": 6032 }, { "epoch": 3.238918157108822, "grad_norm": 0.7109375, "learning_rate": 1.3816092284588262e-05, "loss": 0.8378, "step": 6033 }, { "epoch": 3.239455051843898, "grad_norm": 0.66015625, "learning_rate": 1.3808539961354167e-05, "loss": 0.9516, "step": 6034 }, { "epoch": 3.2399919465789737, "grad_norm": 0.62109375, "learning_rate": 1.3800988915207386e-05, "loss": 0.7173, "step": 6035 }, { "epoch": 3.24052884131405, "grad_norm": 0.6875, "learning_rate": 1.379343914700961e-05, "loss": 0.91, "step": 6036 }, { "epoch": 3.241065736049126, "grad_norm": 0.6875, "learning_rate": 1.3785890657622335e-05, "loss": 0.8349, "step": 6037 }, { "epoch": 3.241602630784202, "grad_norm": 0.6171875, "learning_rate": 1.3778343447906949e-05, "loss": 0.7921, "step": 6038 }, { "epoch": 3.2421395255192778, "grad_norm": 0.75, "learning_rate": 1.3770797518724699e-05, "loss": 0.8189, "step": 6039 }, { "epoch": 3.242676420254354, "grad_norm": 0.703125, "learning_rate": 1.3763252870936649e-05, "loss": 0.9578, "step": 6040 }, { "epoch": 3.24321331498943, "grad_norm": 0.67578125, "learning_rate": 1.3755709505403756e-05, "loss": 0.9725, "step": 6041 }, { "epoch": 3.2437502097245057, "grad_norm": 0.72265625, "learning_rate": 1.3748167422986807e-05, "loss": 1.049, "step": 6042 }, { "epoch": 3.244287104459582, "grad_norm": 0.66015625, "learning_rate": 1.3740626624546438e-05, "loss": 0.9442, "step": 6043 }, { "epoch": 3.244823999194658, "grad_norm": 0.8828125, "learning_rate": 1.3733087110943161e-05, "loss": 0.736, "step": 6044 }, { "epoch": 3.245360893929734, "grad_norm": 0.73828125, "learning_rate": 1.3725548883037334e-05, "loss": 0.7684, "step": 6045 }, { "epoch": 3.2458977886648097, "grad_norm": 0.5625, "learning_rate": 1.3718011941689152e-05, "loss": 0.7714, "step": 6046 }, { "epoch": 3.246434683399886, "grad_norm": 0.6484375, "learning_rate": 1.3710476287758683e-05, "loss": 0.7945, "step": 6047 }, { "epoch": 3.246971578134962, "grad_norm": 1.1171875, "learning_rate": 1.370294192210585e-05, "loss": 0.7305, "step": 6048 }, { "epoch": 3.247508472870038, "grad_norm": 0.64453125, "learning_rate": 1.3695408845590396e-05, "loss": 0.9207, "step": 6049 }, { "epoch": 3.2480453676051138, "grad_norm": 0.625, "learning_rate": 1.3687877059071966e-05, "loss": 0.7079, "step": 6050 }, { "epoch": 3.2480453676051138, "eval_loss": 0.9029369950294495, "eval_runtime": 46.8553, "eval_samples_per_second": 20.531, "eval_steps_per_second": 20.531, "step": 6050 }, { "epoch": 3.24858226234019, "grad_norm": 0.60546875, "learning_rate": 1.368034656341001e-05, "loss": 0.8576, "step": 6051 }, { "epoch": 3.249119157075266, "grad_norm": 0.6640625, "learning_rate": 1.3672817359463875e-05, "loss": 0.8994, "step": 6052 }, { "epoch": 3.249656051810342, "grad_norm": 0.58203125, "learning_rate": 1.366528944809271e-05, "loss": 0.825, "step": 6053 }, { "epoch": 3.250192946545418, "grad_norm": 0.7578125, "learning_rate": 1.3657762830155568e-05, "loss": 0.9276, "step": 6054 }, { "epoch": 3.250729841280494, "grad_norm": 0.6171875, "learning_rate": 1.3650237506511331e-05, "loss": 0.8413, "step": 6055 }, { "epoch": 3.25126673601557, "grad_norm": 0.7265625, "learning_rate": 1.364271347801872e-05, "loss": 0.7741, "step": 6056 }, { "epoch": 3.251803630750646, "grad_norm": 0.6484375, "learning_rate": 1.3635190745536342e-05, "loss": 0.9985, "step": 6057 }, { "epoch": 3.252340525485722, "grad_norm": 0.74609375, "learning_rate": 1.362766930992262e-05, "loss": 1.05, "step": 6058 }, { "epoch": 3.252877420220798, "grad_norm": 0.8125, "learning_rate": 1.3620149172035837e-05, "loss": 0.7583, "step": 6059 }, { "epoch": 3.253414314955874, "grad_norm": 0.63671875, "learning_rate": 1.361263033273415e-05, "loss": 0.9433, "step": 6060 }, { "epoch": 3.2539512096909498, "grad_norm": 0.77734375, "learning_rate": 1.3605112792875562e-05, "loss": 0.9179, "step": 6061 }, { "epoch": 3.254488104426026, "grad_norm": 0.6484375, "learning_rate": 1.3597596553317899e-05, "loss": 0.7118, "step": 6062 }, { "epoch": 3.255024999161102, "grad_norm": 0.7421875, "learning_rate": 1.3590081614918864e-05, "loss": 0.8761, "step": 6063 }, { "epoch": 3.255561893896178, "grad_norm": 0.66796875, "learning_rate": 1.3582567978536026e-05, "loss": 0.8091, "step": 6064 }, { "epoch": 3.256098788631254, "grad_norm": 0.73828125, "learning_rate": 1.3575055645026763e-05, "loss": 0.7951, "step": 6065 }, { "epoch": 3.25663568336633, "grad_norm": 0.65234375, "learning_rate": 1.3567544615248345e-05, "loss": 0.762, "step": 6066 }, { "epoch": 3.257172578101406, "grad_norm": 0.609375, "learning_rate": 1.3560034890057866e-05, "loss": 0.9841, "step": 6067 }, { "epoch": 3.257709472836482, "grad_norm": 0.6484375, "learning_rate": 1.3552526470312271e-05, "loss": 0.8665, "step": 6068 }, { "epoch": 3.258246367571558, "grad_norm": 0.7578125, "learning_rate": 1.3545019356868378e-05, "loss": 0.793, "step": 6069 }, { "epoch": 3.258783262306634, "grad_norm": 0.6640625, "learning_rate": 1.3537513550582853e-05, "loss": 0.6832, "step": 6070 }, { "epoch": 3.25932015704171, "grad_norm": 0.62109375, "learning_rate": 1.353000905231218e-05, "loss": 0.8276, "step": 6071 }, { "epoch": 3.259857051776786, "grad_norm": 0.7421875, "learning_rate": 1.352250586291273e-05, "loss": 1.0, "step": 6072 }, { "epoch": 3.260393946511862, "grad_norm": 0.6640625, "learning_rate": 1.351500398324072e-05, "loss": 0.7929, "step": 6073 }, { "epoch": 3.260930841246938, "grad_norm": 0.7890625, "learning_rate": 1.3507503414152195e-05, "loss": 0.8872, "step": 6074 }, { "epoch": 3.261467735982014, "grad_norm": 0.703125, "learning_rate": 1.350000415650308e-05, "loss": 0.8404, "step": 6075 }, { "epoch": 3.2620046307170902, "grad_norm": 0.69921875, "learning_rate": 1.3492506211149115e-05, "loss": 0.6903, "step": 6076 }, { "epoch": 3.262541525452166, "grad_norm": 0.78515625, "learning_rate": 1.3485009578945934e-05, "loss": 1.026, "step": 6077 }, { "epoch": 3.263078420187242, "grad_norm": 0.77734375, "learning_rate": 1.3477514260748975e-05, "loss": 0.7941, "step": 6078 }, { "epoch": 3.263615314922318, "grad_norm": 0.78125, "learning_rate": 1.3470020257413558e-05, "loss": 1.0685, "step": 6079 }, { "epoch": 3.264152209657394, "grad_norm": 0.59765625, "learning_rate": 1.3462527569794853e-05, "loss": 0.891, "step": 6080 }, { "epoch": 3.26468910439247, "grad_norm": 0.66796875, "learning_rate": 1.3455036198747856e-05, "loss": 0.8868, "step": 6081 }, { "epoch": 3.265225999127546, "grad_norm": 0.62109375, "learning_rate": 1.3447546145127434e-05, "loss": 0.8021, "step": 6082 }, { "epoch": 3.265762893862622, "grad_norm": 0.8125, "learning_rate": 1.3440057409788315e-05, "loss": 1.0257, "step": 6083 }, { "epoch": 3.266299788597698, "grad_norm": 0.62890625, "learning_rate": 1.3432569993585023e-05, "loss": 0.6777, "step": 6084 }, { "epoch": 3.266836683332774, "grad_norm": 0.7265625, "learning_rate": 1.3425083897371981e-05, "loss": 0.7381, "step": 6085 }, { "epoch": 3.26737357806785, "grad_norm": 0.73046875, "learning_rate": 1.3417599122003464e-05, "loss": 0.9388, "step": 6086 }, { "epoch": 3.2679104728029262, "grad_norm": 0.70703125, "learning_rate": 1.3410115668333556e-05, "loss": 0.767, "step": 6087 }, { "epoch": 3.268447367538002, "grad_norm": 0.6796875, "learning_rate": 1.3402633537216221e-05, "loss": 0.8137, "step": 6088 }, { "epoch": 3.268984262273078, "grad_norm": 0.71875, "learning_rate": 1.3395152729505281e-05, "loss": 0.8238, "step": 6089 }, { "epoch": 3.269521157008154, "grad_norm": 0.76953125, "learning_rate": 1.3387673246054367e-05, "loss": 0.8889, "step": 6090 }, { "epoch": 3.2700580517432303, "grad_norm": 0.69140625, "learning_rate": 1.3380195087716996e-05, "loss": 0.8642, "step": 6091 }, { "epoch": 3.270594946478306, "grad_norm": 0.7734375, "learning_rate": 1.3372718255346526e-05, "loss": 0.9319, "step": 6092 }, { "epoch": 3.271131841213382, "grad_norm": 0.66015625, "learning_rate": 1.336524274979615e-05, "loss": 0.9325, "step": 6093 }, { "epoch": 3.271668735948458, "grad_norm": 0.64453125, "learning_rate": 1.3357768571918911e-05, "loss": 0.6935, "step": 6094 }, { "epoch": 3.2722056306835343, "grad_norm": 0.94921875, "learning_rate": 1.3350295722567713e-05, "loss": 0.774, "step": 6095 }, { "epoch": 3.27274252541861, "grad_norm": 0.63671875, "learning_rate": 1.3342824202595317e-05, "loss": 0.6772, "step": 6096 }, { "epoch": 3.273279420153686, "grad_norm": 0.6875, "learning_rate": 1.3335354012854293e-05, "loss": 0.7624, "step": 6097 }, { "epoch": 3.2738163148887622, "grad_norm": 0.6640625, "learning_rate": 1.3327885154197105e-05, "loss": 0.9108, "step": 6098 }, { "epoch": 3.274353209623838, "grad_norm": 0.765625, "learning_rate": 1.3320417627476034e-05, "loss": 0.9702, "step": 6099 }, { "epoch": 3.274890104358914, "grad_norm": 0.6640625, "learning_rate": 1.3312951433543225e-05, "loss": 0.9428, "step": 6100 }, { "epoch": 3.274890104358914, "eval_loss": 0.9030158519744873, "eval_runtime": 46.3139, "eval_samples_per_second": 20.771, "eval_steps_per_second": 20.771, "step": 6100 }, { "epoch": 3.27542699909399, "grad_norm": 0.73828125, "learning_rate": 1.3305486573250653e-05, "loss": 0.8324, "step": 6101 }, { "epoch": 3.2759638938290663, "grad_norm": 0.69921875, "learning_rate": 1.3298023047450176e-05, "loss": 0.861, "step": 6102 }, { "epoch": 3.276500788564142, "grad_norm": 0.72265625, "learning_rate": 1.3290560856993448e-05, "loss": 0.9047, "step": 6103 }, { "epoch": 3.277037683299218, "grad_norm": 0.78125, "learning_rate": 1.3283100002732015e-05, "loss": 0.7606, "step": 6104 }, { "epoch": 3.277574578034294, "grad_norm": 0.625, "learning_rate": 1.3275640485517266e-05, "loss": 0.8246, "step": 6105 }, { "epoch": 3.2781114727693703, "grad_norm": 0.69921875, "learning_rate": 1.3268182306200405e-05, "loss": 0.8, "step": 6106 }, { "epoch": 3.278648367504446, "grad_norm": 0.703125, "learning_rate": 1.3260725465632512e-05, "loss": 0.8437, "step": 6107 }, { "epoch": 3.279185262239522, "grad_norm": 0.7109375, "learning_rate": 1.325326996466453e-05, "loss": 0.6511, "step": 6108 }, { "epoch": 3.2797221569745982, "grad_norm": 0.59765625, "learning_rate": 1.3245815804147185e-05, "loss": 0.8475, "step": 6109 }, { "epoch": 3.2802590517096744, "grad_norm": 0.5859375, "learning_rate": 1.3238362984931113e-05, "loss": 0.686, "step": 6110 }, { "epoch": 3.28079594644475, "grad_norm": 0.625, "learning_rate": 1.3230911507866783e-05, "loss": 0.9385, "step": 6111 }, { "epoch": 3.281332841179826, "grad_norm": 0.6328125, "learning_rate": 1.3223461373804482e-05, "loss": 0.7253, "step": 6112 }, { "epoch": 3.2818697359149023, "grad_norm": 0.66796875, "learning_rate": 1.3216012583594379e-05, "loss": 0.6429, "step": 6113 }, { "epoch": 3.2824066306499784, "grad_norm": 0.671875, "learning_rate": 1.3208565138086479e-05, "loss": 0.7178, "step": 6114 }, { "epoch": 3.282943525385054, "grad_norm": 0.63671875, "learning_rate": 1.3201119038130616e-05, "loss": 0.8726, "step": 6115 }, { "epoch": 3.28348042012013, "grad_norm": 0.609375, "learning_rate": 1.319367428457649e-05, "loss": 0.6606, "step": 6116 }, { "epoch": 3.2840173148552063, "grad_norm": 0.6796875, "learning_rate": 1.3186230878273653e-05, "loss": 0.8512, "step": 6117 }, { "epoch": 3.284554209590282, "grad_norm": 0.76953125, "learning_rate": 1.317878882007148e-05, "loss": 0.9175, "step": 6118 }, { "epoch": 3.285091104325358, "grad_norm": 0.59375, "learning_rate": 1.3171348110819198e-05, "loss": 0.8343, "step": 6119 }, { "epoch": 3.2856279990604342, "grad_norm": 0.75, "learning_rate": 1.3163908751365895e-05, "loss": 0.9887, "step": 6120 }, { "epoch": 3.2861648937955104, "grad_norm": 0.75390625, "learning_rate": 1.3156470742560506e-05, "loss": 0.877, "step": 6121 }, { "epoch": 3.286701788530586, "grad_norm": 0.59765625, "learning_rate": 1.3149034085251783e-05, "loss": 0.9246, "step": 6122 }, { "epoch": 3.287238683265662, "grad_norm": 0.65234375, "learning_rate": 1.3141598780288348e-05, "loss": 0.8138, "step": 6123 }, { "epoch": 3.2877755780007383, "grad_norm": 0.69140625, "learning_rate": 1.3134164828518674e-05, "loss": 0.8826, "step": 6124 }, { "epoch": 3.2883124727358144, "grad_norm": 0.62890625, "learning_rate": 1.3126732230791067e-05, "loss": 0.78, "step": 6125 }, { "epoch": 3.28884936747089, "grad_norm": 0.73046875, "learning_rate": 1.3119300987953664e-05, "loss": 0.9377, "step": 6126 }, { "epoch": 3.289386262205966, "grad_norm": 0.703125, "learning_rate": 1.3111871100854483e-05, "loss": 0.8684, "step": 6127 }, { "epoch": 3.2899231569410423, "grad_norm": 0.76171875, "learning_rate": 1.3104442570341352e-05, "loss": 0.8714, "step": 6128 }, { "epoch": 3.2904600516761184, "grad_norm": 0.6875, "learning_rate": 1.3097015397261969e-05, "loss": 0.7836, "step": 6129 }, { "epoch": 3.290996946411194, "grad_norm": 0.66796875, "learning_rate": 1.3089589582463879e-05, "loss": 0.8938, "step": 6130 }, { "epoch": 3.2915338411462702, "grad_norm": 0.69140625, "learning_rate": 1.3082165126794441e-05, "loss": 0.8421, "step": 6131 }, { "epoch": 3.2920707358813464, "grad_norm": 0.640625, "learning_rate": 1.307474203110089e-05, "loss": 0.7528, "step": 6132 }, { "epoch": 3.2926076306164225, "grad_norm": 0.6640625, "learning_rate": 1.30673202962303e-05, "loss": 0.7538, "step": 6133 }, { "epoch": 3.293144525351498, "grad_norm": 0.60546875, "learning_rate": 1.3059899923029584e-05, "loss": 0.6987, "step": 6134 }, { "epoch": 3.2936814200865743, "grad_norm": 0.57421875, "learning_rate": 1.3052480912345482e-05, "loss": 0.7734, "step": 6135 }, { "epoch": 3.2942183148216504, "grad_norm": 0.69921875, "learning_rate": 1.3045063265024612e-05, "loss": 1.0949, "step": 6136 }, { "epoch": 3.294755209556726, "grad_norm": 0.65625, "learning_rate": 1.3037646981913431e-05, "loss": 0.9803, "step": 6137 }, { "epoch": 3.295292104291802, "grad_norm": 0.7265625, "learning_rate": 1.303023206385821e-05, "loss": 0.9215, "step": 6138 }, { "epoch": 3.2958289990268783, "grad_norm": 0.7734375, "learning_rate": 1.3022818511705107e-05, "loss": 0.718, "step": 6139 }, { "epoch": 3.2963658937619544, "grad_norm": 0.73828125, "learning_rate": 1.3015406326300079e-05, "loss": 0.8945, "step": 6140 }, { "epoch": 3.29690278849703, "grad_norm": 0.890625, "learning_rate": 1.3007995508488963e-05, "loss": 0.7963, "step": 6141 }, { "epoch": 3.2974396832321062, "grad_norm": 0.6640625, "learning_rate": 1.3000586059117437e-05, "loss": 0.8496, "step": 6142 }, { "epoch": 3.2979765779671824, "grad_norm": 0.609375, "learning_rate": 1.2993177979030996e-05, "loss": 1.0048, "step": 6143 }, { "epoch": 3.2985134727022585, "grad_norm": 0.69140625, "learning_rate": 1.2985771269074997e-05, "loss": 1.0115, "step": 6144 }, { "epoch": 3.299050367437334, "grad_norm": 0.8125, "learning_rate": 1.2978365930094643e-05, "loss": 0.7098, "step": 6145 }, { "epoch": 3.2995872621724103, "grad_norm": 0.59375, "learning_rate": 1.2970961962934988e-05, "loss": 0.625, "step": 6146 }, { "epoch": 3.3001241569074864, "grad_norm": 0.85546875, "learning_rate": 1.2963559368440897e-05, "loss": 0.9434, "step": 6147 }, { "epoch": 3.3006610516425625, "grad_norm": 0.6640625, "learning_rate": 1.2956158147457115e-05, "loss": 0.7545, "step": 6148 }, { "epoch": 3.301197946377638, "grad_norm": 0.5703125, "learning_rate": 1.2948758300828217e-05, "loss": 0.7369, "step": 6149 }, { "epoch": 3.3017348411127143, "grad_norm": 0.71875, "learning_rate": 1.2941359829398616e-05, "loss": 0.7774, "step": 6150 }, { "epoch": 3.3017348411127143, "eval_loss": 0.902991533279419, "eval_runtime": 46.0383, "eval_samples_per_second": 20.896, "eval_steps_per_second": 20.896, "step": 6150 }, { "epoch": 3.3022717358477904, "grad_norm": 0.7734375, "learning_rate": 1.2933962734012558e-05, "loss": 0.8595, "step": 6151 }, { "epoch": 3.3028086305828666, "grad_norm": 0.67578125, "learning_rate": 1.2926567015514168e-05, "loss": 0.7515, "step": 6152 }, { "epoch": 3.3033455253179422, "grad_norm": 0.80859375, "learning_rate": 1.2919172674747366e-05, "loss": 0.8776, "step": 6153 }, { "epoch": 3.3038824200530184, "grad_norm": 0.640625, "learning_rate": 1.2911779712555955e-05, "loss": 0.688, "step": 6154 }, { "epoch": 3.3044193147880945, "grad_norm": 0.76171875, "learning_rate": 1.2904388129783574e-05, "loss": 0.9123, "step": 6155 }, { "epoch": 3.30495620952317, "grad_norm": 0.8359375, "learning_rate": 1.2896997927273675e-05, "loss": 0.7135, "step": 6156 }, { "epoch": 3.3054931042582463, "grad_norm": 0.76953125, "learning_rate": 1.2889609105869585e-05, "loss": 0.7934, "step": 6157 }, { "epoch": 3.3060299989933224, "grad_norm": 0.6953125, "learning_rate": 1.288222166641447e-05, "loss": 0.8593, "step": 6158 }, { "epoch": 3.3065668937283985, "grad_norm": 0.83984375, "learning_rate": 1.2874835609751324e-05, "loss": 1.0064, "step": 6159 }, { "epoch": 3.307103788463474, "grad_norm": 0.69921875, "learning_rate": 1.286745093672298e-05, "loss": 0.9235, "step": 6160 }, { "epoch": 3.3076406831985503, "grad_norm": 0.84375, "learning_rate": 1.2860067648172124e-05, "loss": 0.8554, "step": 6161 }, { "epoch": 3.3081775779336264, "grad_norm": 0.765625, "learning_rate": 1.2852685744941307e-05, "loss": 0.7287, "step": 6162 }, { "epoch": 3.3087144726687026, "grad_norm": 0.65234375, "learning_rate": 1.2845305227872865e-05, "loss": 0.8195, "step": 6163 }, { "epoch": 3.3092513674037782, "grad_norm": 0.671875, "learning_rate": 1.2837926097809023e-05, "loss": 0.9491, "step": 6164 }, { "epoch": 3.3097882621388544, "grad_norm": 0.73828125, "learning_rate": 1.2830548355591843e-05, "loss": 0.8297, "step": 6165 }, { "epoch": 3.3103251568739305, "grad_norm": 0.78515625, "learning_rate": 1.2823172002063202e-05, "loss": 0.97, "step": 6166 }, { "epoch": 3.3108620516090066, "grad_norm": 0.8359375, "learning_rate": 1.2815797038064853e-05, "loss": 0.8428, "step": 6167 }, { "epoch": 3.3113989463440823, "grad_norm": 0.6328125, "learning_rate": 1.2808423464438357e-05, "loss": 0.796, "step": 6168 }, { "epoch": 3.3119358410791584, "grad_norm": 0.69921875, "learning_rate": 1.2801051282025133e-05, "loss": 0.7423, "step": 6169 }, { "epoch": 3.3124727358142345, "grad_norm": 0.7421875, "learning_rate": 1.2793680491666443e-05, "loss": 1.0473, "step": 6170 }, { "epoch": 3.3130096305493106, "grad_norm": 0.78125, "learning_rate": 1.2786311094203401e-05, "loss": 0.813, "step": 6171 }, { "epoch": 3.3135465252843863, "grad_norm": 0.66015625, "learning_rate": 1.2778943090476925e-05, "loss": 0.8596, "step": 6172 }, { "epoch": 3.3140834200194624, "grad_norm": 0.66015625, "learning_rate": 1.277157648132781e-05, "loss": 0.7721, "step": 6173 }, { "epoch": 3.3146203147545386, "grad_norm": 0.6171875, "learning_rate": 1.2764211267596687e-05, "loss": 0.8599, "step": 6174 }, { "epoch": 3.3151572094896142, "grad_norm": 0.65234375, "learning_rate": 1.2756847450124004e-05, "loss": 0.6867, "step": 6175 }, { "epoch": 3.3156941042246904, "grad_norm": 0.61328125, "learning_rate": 1.2749485029750085e-05, "loss": 0.9036, "step": 6176 }, { "epoch": 3.3162309989597665, "grad_norm": 0.7578125, "learning_rate": 1.2742124007315055e-05, "loss": 1.0004, "step": 6177 }, { "epoch": 3.3167678936948426, "grad_norm": 0.68359375, "learning_rate": 1.2734764383658917e-05, "loss": 0.7567, "step": 6178 }, { "epoch": 3.3173047884299183, "grad_norm": 0.67578125, "learning_rate": 1.272740615962148e-05, "loss": 0.8134, "step": 6179 }, { "epoch": 3.3178416831649944, "grad_norm": 0.71484375, "learning_rate": 1.2720049336042422e-05, "loss": 0.8148, "step": 6180 }, { "epoch": 3.3183785779000705, "grad_norm": 0.703125, "learning_rate": 1.2712693913761253e-05, "loss": 0.9834, "step": 6181 }, { "epoch": 3.3189154726351466, "grad_norm": 0.63671875, "learning_rate": 1.2705339893617308e-05, "loss": 0.8446, "step": 6182 }, { "epoch": 3.3194523673702223, "grad_norm": 0.8046875, "learning_rate": 1.269798727644979e-05, "loss": 0.9565, "step": 6183 }, { "epoch": 3.3199892621052984, "grad_norm": 0.7109375, "learning_rate": 1.2690636063097716e-05, "loss": 0.9704, "step": 6184 }, { "epoch": 3.3205261568403746, "grad_norm": 0.72265625, "learning_rate": 1.2683286254399943e-05, "loss": 0.7425, "step": 6185 }, { "epoch": 3.3210630515754507, "grad_norm": 0.71484375, "learning_rate": 1.2675937851195185e-05, "loss": 1.0241, "step": 6186 }, { "epoch": 3.3215999463105264, "grad_norm": 0.77734375, "learning_rate": 1.2668590854322005e-05, "loss": 0.8633, "step": 6187 }, { "epoch": 3.3221368410456025, "grad_norm": 0.61328125, "learning_rate": 1.2661245264618759e-05, "loss": 0.7208, "step": 6188 }, { "epoch": 3.3226737357806786, "grad_norm": 0.5390625, "learning_rate": 1.2653901082923686e-05, "loss": 0.6294, "step": 6189 }, { "epoch": 3.3232106305157547, "grad_norm": 0.80859375, "learning_rate": 1.264655831007486e-05, "loss": 0.9833, "step": 6190 }, { "epoch": 3.3237475252508304, "grad_norm": 0.703125, "learning_rate": 1.2639216946910164e-05, "loss": 0.837, "step": 6191 }, { "epoch": 3.3242844199859065, "grad_norm": 0.73828125, "learning_rate": 1.2631876994267363e-05, "loss": 0.8409, "step": 6192 }, { "epoch": 3.3248213147209826, "grad_norm": 0.7109375, "learning_rate": 1.2624538452984025e-05, "loss": 0.7585, "step": 6193 }, { "epoch": 3.3253582094560583, "grad_norm": 0.703125, "learning_rate": 1.2617201323897565e-05, "loss": 0.8939, "step": 6194 }, { "epoch": 3.3258951041911344, "grad_norm": 0.57421875, "learning_rate": 1.2609865607845244e-05, "loss": 0.8083, "step": 6195 }, { "epoch": 3.3264319989262106, "grad_norm": 0.58203125, "learning_rate": 1.260253130566418e-05, "loss": 0.7417, "step": 6196 }, { "epoch": 3.3269688936612867, "grad_norm": 0.7421875, "learning_rate": 1.2595198418191285e-05, "loss": 0.8626, "step": 6197 }, { "epoch": 3.3275057883963624, "grad_norm": 0.6484375, "learning_rate": 1.2587866946263344e-05, "loss": 0.7583, "step": 6198 }, { "epoch": 3.3280426831314385, "grad_norm": 0.66796875, "learning_rate": 1.2580536890716982e-05, "loss": 0.7866, "step": 6199 }, { "epoch": 3.3285795778665146, "grad_norm": 0.65234375, "learning_rate": 1.2573208252388629e-05, "loss": 0.8418, "step": 6200 }, { "epoch": 3.3285795778665146, "eval_loss": 0.9033068418502808, "eval_runtime": 46.3129, "eval_samples_per_second": 20.772, "eval_steps_per_second": 20.772, "step": 6200 }, { "epoch": 3.3291164726015907, "grad_norm": 0.640625, "learning_rate": 1.2565881032114596e-05, "loss": 0.8344, "step": 6201 }, { "epoch": 3.3296533673366664, "grad_norm": 0.59375, "learning_rate": 1.2558555230730994e-05, "loss": 0.7096, "step": 6202 }, { "epoch": 3.3301902620717425, "grad_norm": 0.75390625, "learning_rate": 1.2551230849073809e-05, "loss": 0.9951, "step": 6203 }, { "epoch": 3.3307271568068186, "grad_norm": 0.6328125, "learning_rate": 1.2543907887978823e-05, "loss": 0.7308, "step": 6204 }, { "epoch": 3.3312640515418948, "grad_norm": 0.65234375, "learning_rate": 1.2536586348281692e-05, "loss": 1.0247, "step": 6205 }, { "epoch": 3.3318009462769704, "grad_norm": 0.74609375, "learning_rate": 1.2529266230817907e-05, "loss": 0.9658, "step": 6206 }, { "epoch": 3.3323378410120466, "grad_norm": 0.80078125, "learning_rate": 1.2521947536422762e-05, "loss": 0.8694, "step": 6207 }, { "epoch": 3.3328747357471227, "grad_norm": 0.75, "learning_rate": 1.2514630265931426e-05, "loss": 0.6699, "step": 6208 }, { "epoch": 3.333411630482199, "grad_norm": 0.7265625, "learning_rate": 1.250731442017891e-05, "loss": 0.7852, "step": 6209 }, { "epoch": 3.3339485252172745, "grad_norm": 0.62890625, "learning_rate": 1.2500000000000006e-05, "loss": 0.9484, "step": 6210 }, { "epoch": 3.3344854199523506, "grad_norm": 0.7265625, "learning_rate": 1.24926870062294e-05, "loss": 0.9421, "step": 6211 }, { "epoch": 3.3350223146874267, "grad_norm": 0.71484375, "learning_rate": 1.248537543970161e-05, "loss": 0.9167, "step": 6212 }, { "epoch": 3.3355592094225024, "grad_norm": 0.71875, "learning_rate": 1.247806530125096e-05, "loss": 1.0036, "step": 6213 }, { "epoch": 3.3360961041575785, "grad_norm": 0.6953125, "learning_rate": 1.2470756591711635e-05, "loss": 0.8089, "step": 6214 }, { "epoch": 3.3366329988926546, "grad_norm": 0.76171875, "learning_rate": 1.2463449311917661e-05, "loss": 1.0078, "step": 6215 }, { "epoch": 3.3371698936277308, "grad_norm": 0.68359375, "learning_rate": 1.2456143462702874e-05, "loss": 0.9643, "step": 6216 }, { "epoch": 3.3377067883628064, "grad_norm": 0.66796875, "learning_rate": 1.2448839044900985e-05, "loss": 0.8913, "step": 6217 }, { "epoch": 3.3382436830978826, "grad_norm": 0.6953125, "learning_rate": 1.2441536059345496e-05, "loss": 0.8453, "step": 6218 }, { "epoch": 3.3387805778329587, "grad_norm": 0.6328125, "learning_rate": 1.2434234506869794e-05, "loss": 0.7513, "step": 6219 }, { "epoch": 3.339317472568035, "grad_norm": 0.66015625, "learning_rate": 1.2426934388307056e-05, "loss": 0.8628, "step": 6220 }, { "epoch": 3.3398543673031105, "grad_norm": 0.70703125, "learning_rate": 1.241963570449033e-05, "loss": 0.7863, "step": 6221 }, { "epoch": 3.3403912620381866, "grad_norm": 0.66015625, "learning_rate": 1.2412338456252498e-05, "loss": 0.805, "step": 6222 }, { "epoch": 3.3409281567732627, "grad_norm": 0.69140625, "learning_rate": 1.2405042644426248e-05, "loss": 0.9499, "step": 6223 }, { "epoch": 3.341465051508339, "grad_norm": 0.74609375, "learning_rate": 1.2397748269844144e-05, "loss": 0.8158, "step": 6224 }, { "epoch": 3.3420019462434145, "grad_norm": 0.7421875, "learning_rate": 1.2390455333338547e-05, "loss": 1.0434, "step": 6225 }, { "epoch": 3.3425388409784906, "grad_norm": 0.7890625, "learning_rate": 1.2383163835741692e-05, "loss": 0.8421, "step": 6226 }, { "epoch": 3.3430757357135668, "grad_norm": 0.70703125, "learning_rate": 1.2375873777885617e-05, "loss": 0.8171, "step": 6227 }, { "epoch": 3.343612630448643, "grad_norm": 0.73828125, "learning_rate": 1.2368585160602222e-05, "loss": 0.9369, "step": 6228 }, { "epoch": 3.3441495251837186, "grad_norm": 0.74609375, "learning_rate": 1.2361297984723213e-05, "loss": 0.8971, "step": 6229 }, { "epoch": 3.3446864199187947, "grad_norm": 0.78125, "learning_rate": 1.2354012251080163e-05, "loss": 1.0165, "step": 6230 }, { "epoch": 3.345223314653871, "grad_norm": 0.7109375, "learning_rate": 1.2346727960504473e-05, "loss": 0.8555, "step": 6231 }, { "epoch": 3.3457602093889465, "grad_norm": 0.62109375, "learning_rate": 1.2339445113827356e-05, "loss": 0.7551, "step": 6232 }, { "epoch": 3.3462971041240226, "grad_norm": 0.73828125, "learning_rate": 1.233216371187988e-05, "loss": 0.8144, "step": 6233 }, { "epoch": 3.3468339988590987, "grad_norm": 0.7109375, "learning_rate": 1.2324883755492971e-05, "loss": 0.7931, "step": 6234 }, { "epoch": 3.347370893594175, "grad_norm": 0.78515625, "learning_rate": 1.2317605245497323e-05, "loss": 0.7344, "step": 6235 }, { "epoch": 3.3479077883292505, "grad_norm": 0.69140625, "learning_rate": 1.2310328182723528e-05, "loss": 0.8355, "step": 6236 }, { "epoch": 3.3484446830643266, "grad_norm": 0.76953125, "learning_rate": 1.2303052568001996e-05, "loss": 0.7781, "step": 6237 }, { "epoch": 3.3489815777994028, "grad_norm": 0.66796875, "learning_rate": 1.2295778402162949e-05, "loss": 0.7674, "step": 6238 }, { "epoch": 3.349518472534479, "grad_norm": 0.734375, "learning_rate": 1.2288505686036475e-05, "loss": 0.7675, "step": 6239 }, { "epoch": 3.3500553672695546, "grad_norm": 0.6640625, "learning_rate": 1.228123442045249e-05, "loss": 0.8565, "step": 6240 }, { "epoch": 3.3505922620046307, "grad_norm": 0.8515625, "learning_rate": 1.2273964606240718e-05, "loss": 0.9623, "step": 6241 }, { "epoch": 3.351129156739707, "grad_norm": 0.83984375, "learning_rate": 1.2266696244230747e-05, "loss": 0.7674, "step": 6242 }, { "epoch": 3.351666051474783, "grad_norm": 0.7421875, "learning_rate": 1.2259429335251996e-05, "loss": 0.8609, "step": 6243 }, { "epoch": 3.3522029462098586, "grad_norm": 0.6640625, "learning_rate": 1.2252163880133707e-05, "loss": 0.8189, "step": 6244 }, { "epoch": 3.3527398409449347, "grad_norm": 0.60546875, "learning_rate": 1.2244899879704947e-05, "loss": 0.8592, "step": 6245 }, { "epoch": 3.353276735680011, "grad_norm": 0.58984375, "learning_rate": 1.2237637334794639e-05, "loss": 0.9223, "step": 6246 }, { "epoch": 3.353813630415087, "grad_norm": 0.69140625, "learning_rate": 1.2230376246231546e-05, "loss": 0.9225, "step": 6247 }, { "epoch": 3.3543505251501626, "grad_norm": 0.6640625, "learning_rate": 1.2223116614844227e-05, "loss": 0.9264, "step": 6248 }, { "epoch": 3.3548874198852388, "grad_norm": 0.703125, "learning_rate": 1.2215858441461107e-05, "loss": 0.8881, "step": 6249 }, { "epoch": 3.355424314620315, "grad_norm": 0.75390625, "learning_rate": 1.2208601726910446e-05, "loss": 1.0364, "step": 6250 }, { "epoch": 3.355424314620315, "eval_loss": 0.9032419919967651, "eval_runtime": 46.3679, "eval_samples_per_second": 20.747, "eval_steps_per_second": 20.747, "step": 6250 }, { "epoch": 3.3559612093553906, "grad_norm": 0.87109375, "learning_rate": 1.220134647202032e-05, "loss": 0.9675, "step": 6251 }, { "epoch": 3.3564981040904667, "grad_norm": 0.68359375, "learning_rate": 1.2194092677618632e-05, "loss": 0.8381, "step": 6252 }, { "epoch": 3.357034998825543, "grad_norm": 0.640625, "learning_rate": 1.2186840344533154e-05, "loss": 0.8053, "step": 6253 }, { "epoch": 3.357571893560619, "grad_norm": 0.6796875, "learning_rate": 1.217958947359145e-05, "loss": 0.6974, "step": 6254 }, { "epoch": 3.3581087882956946, "grad_norm": 0.734375, "learning_rate": 1.2172340065620942e-05, "loss": 0.8768, "step": 6255 }, { "epoch": 3.3586456830307707, "grad_norm": 0.7578125, "learning_rate": 1.2165092121448893e-05, "loss": 0.8516, "step": 6256 }, { "epoch": 3.359182577765847, "grad_norm": 0.65625, "learning_rate": 1.2157845641902363e-05, "loss": 0.9253, "step": 6257 }, { "epoch": 3.359719472500923, "grad_norm": 0.66796875, "learning_rate": 1.2150600627808278e-05, "loss": 0.9366, "step": 6258 }, { "epoch": 3.3602563672359986, "grad_norm": 0.7265625, "learning_rate": 1.2143357079993395e-05, "loss": 0.9551, "step": 6259 }, { "epoch": 3.3607932619710748, "grad_norm": 0.72265625, "learning_rate": 1.2136114999284288e-05, "loss": 0.8466, "step": 6260 }, { "epoch": 3.361330156706151, "grad_norm": 0.71875, "learning_rate": 1.2128874386507355e-05, "loss": 1.0039, "step": 6261 }, { "epoch": 3.361867051441227, "grad_norm": 0.734375, "learning_rate": 1.2121635242488855e-05, "loss": 0.8287, "step": 6262 }, { "epoch": 3.3624039461763027, "grad_norm": 0.73046875, "learning_rate": 1.2114397568054874e-05, "loss": 0.8006, "step": 6263 }, { "epoch": 3.362940840911379, "grad_norm": 0.78515625, "learning_rate": 1.2107161364031305e-05, "loss": 0.8989, "step": 6264 }, { "epoch": 3.363477735646455, "grad_norm": 0.62109375, "learning_rate": 1.209992663124391e-05, "loss": 1.1105, "step": 6265 }, { "epoch": 3.364014630381531, "grad_norm": 0.6953125, "learning_rate": 1.209269337051824e-05, "loss": 0.9349, "step": 6266 }, { "epoch": 3.3645515251166067, "grad_norm": 0.53515625, "learning_rate": 1.2085461582679715e-05, "loss": 0.7596, "step": 6267 }, { "epoch": 3.365088419851683, "grad_norm": 0.890625, "learning_rate": 1.2078231268553585e-05, "loss": 0.9907, "step": 6268 }, { "epoch": 3.365625314586759, "grad_norm": 0.7109375, "learning_rate": 1.2071002428964905e-05, "loss": 0.9681, "step": 6269 }, { "epoch": 3.3661622093218346, "grad_norm": 0.67578125, "learning_rate": 1.2063775064738572e-05, "loss": 0.695, "step": 6270 }, { "epoch": 3.3666991040569108, "grad_norm": 0.6953125, "learning_rate": 1.2056549176699331e-05, "loss": 0.8351, "step": 6271 }, { "epoch": 3.367235998791987, "grad_norm": 0.66796875, "learning_rate": 1.2049324765671749e-05, "loss": 0.9598, "step": 6272 }, { "epoch": 3.367772893527063, "grad_norm": 0.7265625, "learning_rate": 1.2042101832480212e-05, "loss": 0.8427, "step": 6273 }, { "epoch": 3.3683097882621387, "grad_norm": 0.76953125, "learning_rate": 1.2034880377948953e-05, "loss": 0.7668, "step": 6274 }, { "epoch": 3.368846682997215, "grad_norm": 0.59375, "learning_rate": 1.2027660402902044e-05, "loss": 0.6131, "step": 6275 }, { "epoch": 3.369383577732291, "grad_norm": 0.625, "learning_rate": 1.2020441908163362e-05, "loss": 0.8759, "step": 6276 }, { "epoch": 3.369920472467367, "grad_norm": 0.69921875, "learning_rate": 1.2013224894556625e-05, "loss": 1.0069, "step": 6277 }, { "epoch": 3.3704573672024427, "grad_norm": 0.69921875, "learning_rate": 1.2006009362905396e-05, "loss": 0.7265, "step": 6278 }, { "epoch": 3.370994261937519, "grad_norm": 0.64453125, "learning_rate": 1.1998795314033048e-05, "loss": 0.7634, "step": 6279 }, { "epoch": 3.371531156672595, "grad_norm": 0.71875, "learning_rate": 1.19915827487628e-05, "loss": 0.768, "step": 6280 }, { "epoch": 3.372068051407671, "grad_norm": 0.65234375, "learning_rate": 1.198437166791771e-05, "loss": 0.7861, "step": 6281 }, { "epoch": 3.3726049461427468, "grad_norm": 0.640625, "learning_rate": 1.1977162072320632e-05, "loss": 0.6923, "step": 6282 }, { "epoch": 3.373141840877823, "grad_norm": 0.69140625, "learning_rate": 1.1969953962794284e-05, "loss": 0.9575, "step": 6283 }, { "epoch": 3.373678735612899, "grad_norm": 0.6640625, "learning_rate": 1.196274734016121e-05, "loss": 0.848, "step": 6284 }, { "epoch": 3.374215630347975, "grad_norm": 0.81640625, "learning_rate": 1.1955542205243767e-05, "loss": 0.8451, "step": 6285 }, { "epoch": 3.374752525083051, "grad_norm": 0.6953125, "learning_rate": 1.1948338558864144e-05, "loss": 0.7274, "step": 6286 }, { "epoch": 3.375289419818127, "grad_norm": 0.65234375, "learning_rate": 1.1941136401844377e-05, "loss": 0.6763, "step": 6287 }, { "epoch": 3.375826314553203, "grad_norm": 0.6171875, "learning_rate": 1.1933935735006333e-05, "loss": 0.8213, "step": 6288 }, { "epoch": 3.3763632092882787, "grad_norm": 0.71484375, "learning_rate": 1.192673655917168e-05, "loss": 0.989, "step": 6289 }, { "epoch": 3.376900104023355, "grad_norm": 0.6796875, "learning_rate": 1.1919538875161945e-05, "loss": 0.8226, "step": 6290 }, { "epoch": 3.377436998758431, "grad_norm": 0.78125, "learning_rate": 1.1912342683798486e-05, "loss": 0.859, "step": 6291 }, { "epoch": 3.377973893493507, "grad_norm": 0.6953125, "learning_rate": 1.190514798590246e-05, "loss": 0.8144, "step": 6292 }, { "epoch": 3.3785107882285828, "grad_norm": 0.57421875, "learning_rate": 1.1897954782294888e-05, "loss": 0.8307, "step": 6293 }, { "epoch": 3.379047682963659, "grad_norm": 0.859375, "learning_rate": 1.18907630737966e-05, "loss": 0.7958, "step": 6294 }, { "epoch": 3.379584577698735, "grad_norm": 0.60546875, "learning_rate": 1.1883572861228253e-05, "loss": 0.8073, "step": 6295 }, { "epoch": 3.380121472433811, "grad_norm": 0.6484375, "learning_rate": 1.1876384145410346e-05, "loss": 0.8633, "step": 6296 }, { "epoch": 3.380658367168887, "grad_norm": 0.64453125, "learning_rate": 1.1869196927163218e-05, "loss": 0.8272, "step": 6297 }, { "epoch": 3.381195261903963, "grad_norm": 0.7890625, "learning_rate": 1.1862011207306997e-05, "loss": 0.8016, "step": 6298 }, { "epoch": 3.381732156639039, "grad_norm": 0.77734375, "learning_rate": 1.1854826986661677e-05, "loss": 0.8162, "step": 6299 }, { "epoch": 3.382269051374115, "grad_norm": 0.86328125, "learning_rate": 1.1847644266047076e-05, "loss": 0.7611, "step": 6300 }, { "epoch": 3.382269051374115, "eval_loss": 0.9031023383140564, "eval_runtime": 46.2991, "eval_samples_per_second": 20.778, "eval_steps_per_second": 20.778, "step": 6300 }, { "epoch": 3.382805946109191, "grad_norm": 0.69140625, "learning_rate": 1.1840463046282827e-05, "loss": 0.8548, "step": 6301 }, { "epoch": 3.383342840844267, "grad_norm": 0.76953125, "learning_rate": 1.1833283328188385e-05, "loss": 0.8113, "step": 6302 }, { "epoch": 3.383879735579343, "grad_norm": 0.6875, "learning_rate": 1.1826105112583061e-05, "loss": 0.6172, "step": 6303 }, { "epoch": 3.384416630314419, "grad_norm": 0.7734375, "learning_rate": 1.1818928400285986e-05, "loss": 1.1469, "step": 6304 }, { "epoch": 3.384953525049495, "grad_norm": 0.68359375, "learning_rate": 1.1811753192116096e-05, "loss": 0.9609, "step": 6305 }, { "epoch": 3.385490419784571, "grad_norm": 0.703125, "learning_rate": 1.1804579488892182e-05, "loss": 0.8386, "step": 6306 }, { "epoch": 3.386027314519647, "grad_norm": 0.77734375, "learning_rate": 1.1797407291432864e-05, "loss": 1.017, "step": 6307 }, { "epoch": 3.386564209254723, "grad_norm": 0.6796875, "learning_rate": 1.1790236600556562e-05, "loss": 0.7762, "step": 6308 }, { "epoch": 3.387101103989799, "grad_norm": 0.765625, "learning_rate": 1.1783067417081561e-05, "loss": 0.865, "step": 6309 }, { "epoch": 3.387637998724875, "grad_norm": 0.69921875, "learning_rate": 1.1775899741825947e-05, "loss": 0.8347, "step": 6310 }, { "epoch": 3.388174893459951, "grad_norm": 0.69921875, "learning_rate": 1.1768733575607633e-05, "loss": 0.7458, "step": 6311 }, { "epoch": 3.388711788195027, "grad_norm": 0.62109375, "learning_rate": 1.1761568919244378e-05, "loss": 0.9071, "step": 6312 }, { "epoch": 3.389248682930103, "grad_norm": 0.72265625, "learning_rate": 1.175440577355377e-05, "loss": 1.0371, "step": 6313 }, { "epoch": 3.389785577665179, "grad_norm": 0.6796875, "learning_rate": 1.1747244139353197e-05, "loss": 0.8916, "step": 6314 }, { "epoch": 3.390322472400255, "grad_norm": 0.60546875, "learning_rate": 1.1740084017459898e-05, "loss": 0.8818, "step": 6315 }, { "epoch": 3.390859367135331, "grad_norm": 0.7421875, "learning_rate": 1.1732925408690948e-05, "loss": 0.7905, "step": 6316 }, { "epoch": 3.391396261870407, "grad_norm": 0.69140625, "learning_rate": 1.1725768313863211e-05, "loss": 0.875, "step": 6317 }, { "epoch": 3.391933156605483, "grad_norm": 0.734375, "learning_rate": 1.1718612733793424e-05, "loss": 0.7773, "step": 6318 }, { "epoch": 3.3924700513405592, "grad_norm": 0.7421875, "learning_rate": 1.171145866929812e-05, "loss": 1.0167, "step": 6319 }, { "epoch": 3.393006946075635, "grad_norm": 0.75, "learning_rate": 1.170430612119366e-05, "loss": 0.768, "step": 6320 }, { "epoch": 3.393543840810711, "grad_norm": 0.64453125, "learning_rate": 1.1697155090296244e-05, "loss": 0.7505, "step": 6321 }, { "epoch": 3.394080735545787, "grad_norm": 0.73046875, "learning_rate": 1.1690005577421908e-05, "loss": 0.9494, "step": 6322 }, { "epoch": 3.3946176302808633, "grad_norm": 0.609375, "learning_rate": 1.1682857583386485e-05, "loss": 0.9592, "step": 6323 }, { "epoch": 3.395154525015939, "grad_norm": 0.8359375, "learning_rate": 1.1675711109005664e-05, "loss": 0.8081, "step": 6324 }, { "epoch": 3.395691419751015, "grad_norm": 0.59375, "learning_rate": 1.1668566155094949e-05, "loss": 0.7074, "step": 6325 }, { "epoch": 3.396228314486091, "grad_norm": 0.64453125, "learning_rate": 1.1661422722469656e-05, "loss": 0.779, "step": 6326 }, { "epoch": 3.396765209221167, "grad_norm": 0.85546875, "learning_rate": 1.165428081194496e-05, "loss": 0.8498, "step": 6327 }, { "epoch": 3.397302103956243, "grad_norm": 0.78125, "learning_rate": 1.164714042433582e-05, "loss": 0.763, "step": 6328 }, { "epoch": 3.397838998691319, "grad_norm": 0.6640625, "learning_rate": 1.164000156045707e-05, "loss": 0.8777, "step": 6329 }, { "epoch": 3.3983758934263952, "grad_norm": 1.5078125, "learning_rate": 1.1632864221123324e-05, "loss": 1.0502, "step": 6330 }, { "epoch": 3.398912788161471, "grad_norm": 0.75, "learning_rate": 1.1625728407149054e-05, "loss": 0.8756, "step": 6331 }, { "epoch": 3.399449682896547, "grad_norm": 0.72265625, "learning_rate": 1.1618594119348548e-05, "loss": 0.9094, "step": 6332 }, { "epoch": 3.399986577631623, "grad_norm": 0.7109375, "learning_rate": 1.1611461358535908e-05, "loss": 0.957, "step": 6333 }, { "epoch": 3.4005234723666993, "grad_norm": 0.6640625, "learning_rate": 1.1604330125525079e-05, "loss": 0.8582, "step": 6334 }, { "epoch": 3.401060367101775, "grad_norm": 0.67578125, "learning_rate": 1.1597200421129844e-05, "loss": 0.9101, "step": 6335 }, { "epoch": 3.401597261836851, "grad_norm": 0.75, "learning_rate": 1.1590072246163756e-05, "loss": 0.809, "step": 6336 }, { "epoch": 3.402134156571927, "grad_norm": 0.65625, "learning_rate": 1.1582945601440246e-05, "loss": 0.7188, "step": 6337 }, { "epoch": 3.4026710513070033, "grad_norm": 0.65625, "learning_rate": 1.1575820487772566e-05, "loss": 0.768, "step": 6338 }, { "epoch": 3.403207946042079, "grad_norm": 0.96484375, "learning_rate": 1.1568696905973761e-05, "loss": 1.112, "step": 6339 }, { "epoch": 3.403744840777155, "grad_norm": 0.6796875, "learning_rate": 1.1561574856856736e-05, "loss": 0.8483, "step": 6340 }, { "epoch": 3.4042817355122312, "grad_norm": 0.7421875, "learning_rate": 1.1554454341234211e-05, "loss": 0.8113, "step": 6341 }, { "epoch": 3.4048186302473074, "grad_norm": 0.6875, "learning_rate": 1.154733535991871e-05, "loss": 0.7565, "step": 6342 }, { "epoch": 3.405355524982383, "grad_norm": 0.65234375, "learning_rate": 1.1540217913722621e-05, "loss": 0.7865, "step": 6343 }, { "epoch": 3.405892419717459, "grad_norm": 0.61328125, "learning_rate": 1.153310200345811e-05, "loss": 0.6929, "step": 6344 }, { "epoch": 3.4064293144525353, "grad_norm": 0.765625, "learning_rate": 1.1525987629937212e-05, "loss": 0.9575, "step": 6345 }, { "epoch": 3.406966209187611, "grad_norm": 0.671875, "learning_rate": 1.1518874793971754e-05, "loss": 0.9249, "step": 6346 }, { "epoch": 3.407503103922687, "grad_norm": 0.734375, "learning_rate": 1.1511763496373407e-05, "loss": 0.8732, "step": 6347 }, { "epoch": 3.408039998657763, "grad_norm": 0.70703125, "learning_rate": 1.1504653737953669e-05, "loss": 0.9182, "step": 6348 }, { "epoch": 3.4085768933928393, "grad_norm": 0.65625, "learning_rate": 1.1497545519523834e-05, "loss": 0.811, "step": 6349 }, { "epoch": 3.409113788127915, "grad_norm": 0.671875, "learning_rate": 1.149043884189506e-05, "loss": 0.8938, "step": 6350 }, { "epoch": 3.409113788127915, "eval_loss": 0.9030389189720154, "eval_runtime": 45.9493, "eval_samples_per_second": 20.936, "eval_steps_per_second": 20.936, "step": 6350 }, { "epoch": 3.409650682862991, "grad_norm": 0.6015625, "learning_rate": 1.1483333705878289e-05, "loss": 0.7719, "step": 6351 }, { "epoch": 3.4101875775980672, "grad_norm": 0.671875, "learning_rate": 1.1476230112284325e-05, "loss": 0.7383, "step": 6352 }, { "epoch": 3.4107244723331434, "grad_norm": 0.71484375, "learning_rate": 1.1469128061923761e-05, "loss": 0.8725, "step": 6353 }, { "epoch": 3.411261367068219, "grad_norm": 0.69921875, "learning_rate": 1.1462027555607052e-05, "loss": 1.072, "step": 6354 }, { "epoch": 3.411798261803295, "grad_norm": 0.69140625, "learning_rate": 1.1454928594144431e-05, "loss": 0.8541, "step": 6355 }, { "epoch": 3.4123351565383713, "grad_norm": 0.734375, "learning_rate": 1.1447831178345994e-05, "loss": 0.8959, "step": 6356 }, { "epoch": 3.4128720512734474, "grad_norm": 0.57421875, "learning_rate": 1.1440735309021652e-05, "loss": 0.7549, "step": 6357 }, { "epoch": 3.413408946008523, "grad_norm": 0.80859375, "learning_rate": 1.1433640986981114e-05, "loss": 1.0197, "step": 6358 }, { "epoch": 3.413945840743599, "grad_norm": 0.7109375, "learning_rate": 1.1426548213033947e-05, "loss": 0.7, "step": 6359 }, { "epoch": 3.4144827354786753, "grad_norm": 0.73046875, "learning_rate": 1.141945698798954e-05, "loss": 0.9265, "step": 6360 }, { "epoch": 3.4150196302137514, "grad_norm": 0.58203125, "learning_rate": 1.1412367312657058e-05, "loss": 0.7631, "step": 6361 }, { "epoch": 3.415556524948827, "grad_norm": 0.75, "learning_rate": 1.1405279187845536e-05, "loss": 0.8483, "step": 6362 }, { "epoch": 3.4160934196839032, "grad_norm": 0.93359375, "learning_rate": 1.1398192614363832e-05, "loss": 0.9273, "step": 6363 }, { "epoch": 3.4166303144189794, "grad_norm": 0.765625, "learning_rate": 1.1391107593020595e-05, "loss": 0.8118, "step": 6364 }, { "epoch": 3.417167209154055, "grad_norm": 0.58203125, "learning_rate": 1.1384024124624324e-05, "loss": 0.8016, "step": 6365 }, { "epoch": 3.417704103889131, "grad_norm": 0.69140625, "learning_rate": 1.1376942209983341e-05, "loss": 0.8617, "step": 6366 }, { "epoch": 3.4182409986242073, "grad_norm": 0.53515625, "learning_rate": 1.1369861849905764e-05, "loss": 0.6418, "step": 6367 }, { "epoch": 3.4187778933592834, "grad_norm": 0.6640625, "learning_rate": 1.1362783045199563e-05, "loss": 0.9412, "step": 6368 }, { "epoch": 3.419314788094359, "grad_norm": 0.58203125, "learning_rate": 1.1355705796672528e-05, "loss": 0.7868, "step": 6369 }, { "epoch": 3.419851682829435, "grad_norm": 0.8125, "learning_rate": 1.1348630105132253e-05, "loss": 0.8543, "step": 6370 }, { "epoch": 3.4203885775645113, "grad_norm": 0.73046875, "learning_rate": 1.1341555971386151e-05, "loss": 0.7752, "step": 6371 }, { "epoch": 3.4209254722995874, "grad_norm": 0.69140625, "learning_rate": 1.1334483396241486e-05, "loss": 0.8137, "step": 6372 }, { "epoch": 3.421462367034663, "grad_norm": 0.734375, "learning_rate": 1.1327412380505334e-05, "loss": 0.9258, "step": 6373 }, { "epoch": 3.4219992617697392, "grad_norm": 0.64453125, "learning_rate": 1.1320342924984567e-05, "loss": 0.9591, "step": 6374 }, { "epoch": 3.4225361565048154, "grad_norm": 0.72265625, "learning_rate": 1.1313275030485914e-05, "loss": 0.8314, "step": 6375 }, { "epoch": 3.423073051239891, "grad_norm": 0.71875, "learning_rate": 1.1306208697815917e-05, "loss": 0.9276, "step": 6376 }, { "epoch": 3.423609945974967, "grad_norm": 0.72265625, "learning_rate": 1.1299143927780926e-05, "loss": 0.8003, "step": 6377 }, { "epoch": 3.4241468407100433, "grad_norm": 0.89453125, "learning_rate": 1.129208072118711e-05, "loss": 0.8508, "step": 6378 }, { "epoch": 3.4246837354451194, "grad_norm": 0.70703125, "learning_rate": 1.128501907884049e-05, "loss": 0.8766, "step": 6379 }, { "epoch": 3.4252206301801955, "grad_norm": 0.66796875, "learning_rate": 1.1277959001546867e-05, "loss": 0.9165, "step": 6380 }, { "epoch": 3.425757524915271, "grad_norm": 0.69140625, "learning_rate": 1.1270900490111902e-05, "loss": 0.9921, "step": 6381 }, { "epoch": 3.4262944196503473, "grad_norm": 0.7578125, "learning_rate": 1.1263843545341063e-05, "loss": 1.0443, "step": 6382 }, { "epoch": 3.4268313143854234, "grad_norm": 0.69921875, "learning_rate": 1.1256788168039623e-05, "loss": 0.8628, "step": 6383 }, { "epoch": 3.427368209120499, "grad_norm": 0.765625, "learning_rate": 1.1249734359012692e-05, "loss": 0.8578, "step": 6384 }, { "epoch": 3.4279051038555752, "grad_norm": 0.71875, "learning_rate": 1.1242682119065218e-05, "loss": 1.0051, "step": 6385 }, { "epoch": 3.4284419985906514, "grad_norm": 0.71484375, "learning_rate": 1.1235631449001934e-05, "loss": 0.7941, "step": 6386 }, { "epoch": 3.4289788933257275, "grad_norm": 0.6015625, "learning_rate": 1.1228582349627403e-05, "loss": 0.932, "step": 6387 }, { "epoch": 3.429515788060803, "grad_norm": 0.625, "learning_rate": 1.122153482174603e-05, "loss": 0.7477, "step": 6388 }, { "epoch": 3.4300526827958793, "grad_norm": 0.6875, "learning_rate": 1.1214488866162032e-05, "loss": 0.9474, "step": 6389 }, { "epoch": 3.4305895775309554, "grad_norm": 0.640625, "learning_rate": 1.1207444483679422e-05, "loss": 0.8235, "step": 6390 }, { "epoch": 3.4311264722660315, "grad_norm": 0.65234375, "learning_rate": 1.1200401675102076e-05, "loss": 0.7388, "step": 6391 }, { "epoch": 3.431663367001107, "grad_norm": 0.7578125, "learning_rate": 1.119336044123365e-05, "loss": 0.8663, "step": 6392 }, { "epoch": 3.4322002617361833, "grad_norm": 0.69140625, "learning_rate": 1.118632078287764e-05, "loss": 0.7474, "step": 6393 }, { "epoch": 3.4327371564712594, "grad_norm": 0.73828125, "learning_rate": 1.1179282700837377e-05, "loss": 0.9538, "step": 6394 }, { "epoch": 3.433274051206335, "grad_norm": 0.79296875, "learning_rate": 1.1172246195915983e-05, "loss": 0.9978, "step": 6395 }, { "epoch": 3.4338109459414112, "grad_norm": 0.60546875, "learning_rate": 1.11652112689164e-05, "loss": 0.9181, "step": 6396 }, { "epoch": 3.4343478406764874, "grad_norm": 0.69921875, "learning_rate": 1.115817792064142e-05, "loss": 0.7215, "step": 6397 }, { "epoch": 3.4348847354115635, "grad_norm": 0.71875, "learning_rate": 1.1151146151893638e-05, "loss": 0.9043, "step": 6398 }, { "epoch": 3.4354216301466396, "grad_norm": 0.7109375, "learning_rate": 1.1144115963475452e-05, "loss": 0.7626, "step": 6399 }, { "epoch": 3.4359585248817153, "grad_norm": 0.68359375, "learning_rate": 1.1137087356189104e-05, "loss": 0.9085, "step": 6400 }, { "epoch": 3.4359585248817153, "eval_loss": 0.902956485748291, "eval_runtime": 45.9769, "eval_samples_per_second": 20.924, "eval_steps_per_second": 20.924, "step": 6400 }, { "epoch": 3.4364954196167914, "grad_norm": 0.62890625, "learning_rate": 1.1130060330836659e-05, "loss": 0.8724, "step": 6401 }, { "epoch": 3.4370323143518675, "grad_norm": 0.85546875, "learning_rate": 1.112303488821998e-05, "loss": 0.7619, "step": 6402 }, { "epoch": 3.437569209086943, "grad_norm": 0.7109375, "learning_rate": 1.1116011029140747e-05, "loss": 0.783, "step": 6403 }, { "epoch": 3.4381061038220193, "grad_norm": 0.78515625, "learning_rate": 1.1108988754400493e-05, "loss": 1.0795, "step": 6404 }, { "epoch": 3.4386429985570954, "grad_norm": 0.72265625, "learning_rate": 1.1101968064800525e-05, "loss": 0.9267, "step": 6405 }, { "epoch": 3.4391798932921716, "grad_norm": 1.4453125, "learning_rate": 1.1094948961142008e-05, "loss": 0.947, "step": 6406 }, { "epoch": 3.4397167880272472, "grad_norm": 0.7421875, "learning_rate": 1.1087931444225915e-05, "loss": 0.8666, "step": 6407 }, { "epoch": 3.4402536827623234, "grad_norm": 0.6953125, "learning_rate": 1.108091551485302e-05, "loss": 0.926, "step": 6408 }, { "epoch": 3.4407905774973995, "grad_norm": 0.9296875, "learning_rate": 1.1073901173823934e-05, "loss": 1.0292, "step": 6409 }, { "epoch": 3.4413274722324756, "grad_norm": 0.796875, "learning_rate": 1.1066888421939093e-05, "loss": 0.7754, "step": 6410 }, { "epoch": 3.4418643669675513, "grad_norm": 0.58203125, "learning_rate": 1.105987725999873e-05, "loss": 0.6972, "step": 6411 }, { "epoch": 3.4424012617026274, "grad_norm": 0.67578125, "learning_rate": 1.10528676888029e-05, "loss": 0.7376, "step": 6412 }, { "epoch": 3.4429381564377035, "grad_norm": 0.6953125, "learning_rate": 1.1045859709151496e-05, "loss": 0.9169, "step": 6413 }, { "epoch": 3.443475051172779, "grad_norm": 0.703125, "learning_rate": 1.1038853321844222e-05, "loss": 0.8072, "step": 6414 }, { "epoch": 3.4440119459078553, "grad_norm": 0.765625, "learning_rate": 1.1031848527680577e-05, "loss": 0.8713, "step": 6415 }, { "epoch": 3.4445488406429314, "grad_norm": 0.79296875, "learning_rate": 1.102484532745991e-05, "loss": 0.8295, "step": 6416 }, { "epoch": 3.4450857353780076, "grad_norm": 0.6484375, "learning_rate": 1.101784372198138e-05, "loss": 0.7872, "step": 6417 }, { "epoch": 3.4456226301130837, "grad_norm": 0.68359375, "learning_rate": 1.1010843712043941e-05, "loss": 0.8727, "step": 6418 }, { "epoch": 3.4461595248481594, "grad_norm": 0.84375, "learning_rate": 1.1003845298446402e-05, "loss": 0.753, "step": 6419 }, { "epoch": 3.4466964195832355, "grad_norm": 0.734375, "learning_rate": 1.0996848481987362e-05, "loss": 0.9506, "step": 6420 }, { "epoch": 3.4472333143183116, "grad_norm": 0.78515625, "learning_rate": 1.0989853263465235e-05, "loss": 0.9232, "step": 6421 }, { "epoch": 3.4477702090533873, "grad_norm": 1.0703125, "learning_rate": 1.0982859643678277e-05, "loss": 0.873, "step": 6422 }, { "epoch": 3.4483071037884634, "grad_norm": 0.640625, "learning_rate": 1.0975867623424554e-05, "loss": 0.8102, "step": 6423 }, { "epoch": 3.4488439985235395, "grad_norm": 0.70703125, "learning_rate": 1.0968877203501929e-05, "loss": 0.7943, "step": 6424 }, { "epoch": 3.4493808932586156, "grad_norm": 0.70703125, "learning_rate": 1.0961888384708105e-05, "loss": 0.8762, "step": 6425 }, { "epoch": 3.4499177879936913, "grad_norm": 0.59765625, "learning_rate": 1.09549011678406e-05, "loss": 0.5962, "step": 6426 }, { "epoch": 3.4504546827287674, "grad_norm": 0.65625, "learning_rate": 1.0947915553696742e-05, "loss": 0.6815, "step": 6427 }, { "epoch": 3.4509915774638436, "grad_norm": 0.86328125, "learning_rate": 1.0940931543073665e-05, "loss": 0.9124, "step": 6428 }, { "epoch": 3.4515284721989197, "grad_norm": 0.75390625, "learning_rate": 1.0933949136768343e-05, "loss": 0.9224, "step": 6429 }, { "epoch": 3.4520653669339953, "grad_norm": 0.703125, "learning_rate": 1.0926968335577564e-05, "loss": 0.9123, "step": 6430 }, { "epoch": 3.4526022616690715, "grad_norm": 0.73046875, "learning_rate": 1.0919989140297911e-05, "loss": 0.7516, "step": 6431 }, { "epoch": 3.4531391564041476, "grad_norm": 0.6328125, "learning_rate": 1.0913011551725805e-05, "loss": 0.7664, "step": 6432 }, { "epoch": 3.4536760511392233, "grad_norm": 0.92578125, "learning_rate": 1.0906035570657488e-05, "loss": 0.7161, "step": 6433 }, { "epoch": 3.4542129458742994, "grad_norm": 0.59375, "learning_rate": 1.0899061197888987e-05, "loss": 1.035, "step": 6434 }, { "epoch": 3.4547498406093755, "grad_norm": 0.66796875, "learning_rate": 1.0892088434216188e-05, "loss": 0.823, "step": 6435 }, { "epoch": 3.4552867353444516, "grad_norm": 0.76953125, "learning_rate": 1.0885117280434762e-05, "loss": 0.9475, "step": 6436 }, { "epoch": 3.4558236300795278, "grad_norm": 0.81640625, "learning_rate": 1.0878147737340194e-05, "loss": 0.9158, "step": 6437 }, { "epoch": 3.4563605248146034, "grad_norm": 0.64453125, "learning_rate": 1.087117980572781e-05, "loss": 0.9155, "step": 6438 }, { "epoch": 3.4568974195496796, "grad_norm": 0.765625, "learning_rate": 1.0864213486392743e-05, "loss": 0.8528, "step": 6439 }, { "epoch": 3.4574343142847557, "grad_norm": 0.77734375, "learning_rate": 1.0857248780129928e-05, "loss": 0.808, "step": 6440 }, { "epoch": 3.4579712090198313, "grad_norm": 0.6015625, "learning_rate": 1.0850285687734126e-05, "loss": 0.8141, "step": 6441 }, { "epoch": 3.4585081037549075, "grad_norm": 1.8125, "learning_rate": 1.0843324209999931e-05, "loss": 0.8891, "step": 6442 }, { "epoch": 3.4590449984899836, "grad_norm": 0.65234375, "learning_rate": 1.083636434772171e-05, "loss": 0.7788, "step": 6443 }, { "epoch": 3.4595818932250597, "grad_norm": 0.75, "learning_rate": 1.0829406101693698e-05, "loss": 0.8557, "step": 6444 }, { "epoch": 3.4601187879601354, "grad_norm": 0.63671875, "learning_rate": 1.0822449472709905e-05, "loss": 0.7947, "step": 6445 }, { "epoch": 3.4606556826952115, "grad_norm": 0.7421875, "learning_rate": 1.0815494461564162e-05, "loss": 0.8934, "step": 6446 }, { "epoch": 3.4611925774302876, "grad_norm": 0.6171875, "learning_rate": 1.0808541069050131e-05, "loss": 0.7493, "step": 6447 }, { "epoch": 3.4617294721653638, "grad_norm": 0.7109375, "learning_rate": 1.0801589295961298e-05, "loss": 0.9463, "step": 6448 }, { "epoch": 3.4622663669004394, "grad_norm": 0.69921875, "learning_rate": 1.0794639143090924e-05, "loss": 0.904, "step": 6449 }, { "epoch": 3.4628032616355156, "grad_norm": 0.63671875, "learning_rate": 1.0787690611232118e-05, "loss": 0.8015, "step": 6450 }, { "epoch": 3.4628032616355156, "eval_loss": 0.9030494093894958, "eval_runtime": 46.6563, "eval_samples_per_second": 20.619, "eval_steps_per_second": 20.619, "step": 6450 }, { "epoch": 3.4633401563705917, "grad_norm": 0.73046875, "learning_rate": 1.0780743701177808e-05, "loss": 0.683, "step": 6451 }, { "epoch": 3.4638770511056673, "grad_norm": 0.59765625, "learning_rate": 1.0773798413720703e-05, "loss": 0.6482, "step": 6452 }, { "epoch": 3.4644139458407435, "grad_norm": 0.7265625, "learning_rate": 1.0766854749653372e-05, "loss": 0.7797, "step": 6453 }, { "epoch": 3.4649508405758196, "grad_norm": 0.73046875, "learning_rate": 1.0759912709768153e-05, "loss": 0.903, "step": 6454 }, { "epoch": 3.4654877353108957, "grad_norm": 0.57421875, "learning_rate": 1.0752972294857236e-05, "loss": 0.752, "step": 6455 }, { "epoch": 3.466024630045972, "grad_norm": 0.7109375, "learning_rate": 1.0746033505712597e-05, "loss": 0.6841, "step": 6456 }, { "epoch": 3.4665615247810475, "grad_norm": 0.66796875, "learning_rate": 1.0739096343126046e-05, "loss": 1.0571, "step": 6457 }, { "epoch": 3.4670984195161236, "grad_norm": 0.625, "learning_rate": 1.0732160807889211e-05, "loss": 0.7725, "step": 6458 }, { "epoch": 3.4676353142511998, "grad_norm": 0.6484375, "learning_rate": 1.0725226900793506e-05, "loss": 0.7736, "step": 6459 }, { "epoch": 3.4681722089862754, "grad_norm": 0.67578125, "learning_rate": 1.0718294622630188e-05, "loss": 0.9082, "step": 6460 }, { "epoch": 3.4687091037213515, "grad_norm": 0.6484375, "learning_rate": 1.0711363974190334e-05, "loss": 0.9389, "step": 6461 }, { "epoch": 3.4692459984564277, "grad_norm": 0.73046875, "learning_rate": 1.0704434956264783e-05, "loss": 0.808, "step": 6462 }, { "epoch": 3.469782893191504, "grad_norm": 0.74609375, "learning_rate": 1.0697507569644244e-05, "loss": 0.927, "step": 6463 }, { "epoch": 3.4703197879265795, "grad_norm": 0.9140625, "learning_rate": 1.0690581815119228e-05, "loss": 0.8797, "step": 6464 }, { "epoch": 3.4708566826616556, "grad_norm": 0.65234375, "learning_rate": 1.068365769348003e-05, "loss": 1.0282, "step": 6465 }, { "epoch": 3.4713935773967317, "grad_norm": 0.6015625, "learning_rate": 1.0676735205516788e-05, "loss": 0.7311, "step": 6466 }, { "epoch": 3.471930472131808, "grad_norm": 0.74609375, "learning_rate": 1.0669814352019462e-05, "loss": 0.7908, "step": 6467 }, { "epoch": 3.4724673668668835, "grad_norm": 0.734375, "learning_rate": 1.0662895133777784e-05, "loss": 0.8146, "step": 6468 }, { "epoch": 3.4730042616019596, "grad_norm": 0.65625, "learning_rate": 1.0655977551581345e-05, "loss": 0.9094, "step": 6469 }, { "epoch": 3.4735411563370358, "grad_norm": 0.69140625, "learning_rate": 1.0649061606219513e-05, "loss": 0.8647, "step": 6470 }, { "epoch": 3.4740780510721114, "grad_norm": 0.6953125, "learning_rate": 1.0642147298481498e-05, "loss": 0.7078, "step": 6471 }, { "epoch": 3.4746149458071875, "grad_norm": 0.7265625, "learning_rate": 1.0635234629156296e-05, "loss": 0.8195, "step": 6472 }, { "epoch": 3.4751518405422637, "grad_norm": 0.6875, "learning_rate": 1.0628323599032738e-05, "loss": 0.6716, "step": 6473 }, { "epoch": 3.47568873527734, "grad_norm": 0.76171875, "learning_rate": 1.062141420889947e-05, "loss": 0.8325, "step": 6474 }, { "epoch": 3.476225630012416, "grad_norm": 0.73046875, "learning_rate": 1.0614506459544921e-05, "loss": 0.8454, "step": 6475 }, { "epoch": 3.4767625247474916, "grad_norm": 0.62109375, "learning_rate": 1.0607600351757369e-05, "loss": 0.7848, "step": 6476 }, { "epoch": 3.4772994194825677, "grad_norm": 0.6875, "learning_rate": 1.0600695886324877e-05, "loss": 0.869, "step": 6477 }, { "epoch": 3.477836314217644, "grad_norm": 0.6953125, "learning_rate": 1.0593793064035343e-05, "loss": 0.8325, "step": 6478 }, { "epoch": 3.4783732089527195, "grad_norm": 0.62890625, "learning_rate": 1.0586891885676451e-05, "loss": 0.8221, "step": 6479 }, { "epoch": 3.4789101036877956, "grad_norm": 0.68359375, "learning_rate": 1.0579992352035734e-05, "loss": 0.8656, "step": 6480 }, { "epoch": 3.4794469984228718, "grad_norm": 0.72265625, "learning_rate": 1.057309446390049e-05, "loss": 0.7419, "step": 6481 }, { "epoch": 3.479983893157948, "grad_norm": 0.5234375, "learning_rate": 1.0566198222057875e-05, "loss": 0.7012, "step": 6482 }, { "epoch": 3.4805207878930235, "grad_norm": 0.7265625, "learning_rate": 1.0559303627294839e-05, "loss": 0.861, "step": 6483 }, { "epoch": 3.4810576826280997, "grad_norm": 0.640625, "learning_rate": 1.0552410680398126e-05, "loss": 0.8149, "step": 6484 }, { "epoch": 3.481594577363176, "grad_norm": 0.69140625, "learning_rate": 1.054551938215432e-05, "loss": 0.787, "step": 6485 }, { "epoch": 3.482131472098252, "grad_norm": 0.71875, "learning_rate": 1.053862973334982e-05, "loss": 0.9327, "step": 6486 }, { "epoch": 3.4826683668333276, "grad_norm": 0.77734375, "learning_rate": 1.0531741734770788e-05, "loss": 0.7855, "step": 6487 }, { "epoch": 3.4832052615684037, "grad_norm": 0.6875, "learning_rate": 1.0524855387203253e-05, "loss": 0.6862, "step": 6488 }, { "epoch": 3.48374215630348, "grad_norm": 0.65625, "learning_rate": 1.0517970691433035e-05, "loss": 0.8203, "step": 6489 }, { "epoch": 3.4842790510385555, "grad_norm": 0.71484375, "learning_rate": 1.0511087648245757e-05, "loss": 0.9173, "step": 6490 }, { "epoch": 3.4848159457736316, "grad_norm": 0.6640625, "learning_rate": 1.0504206258426864e-05, "loss": 0.8391, "step": 6491 }, { "epoch": 3.4853528405087078, "grad_norm": 0.6953125, "learning_rate": 1.049732652276162e-05, "loss": 0.7, "step": 6492 }, { "epoch": 3.485889735243784, "grad_norm": 0.7109375, "learning_rate": 1.049044844203507e-05, "loss": 0.837, "step": 6493 }, { "epoch": 3.48642662997886, "grad_norm": 0.84375, "learning_rate": 1.0483572017032112e-05, "loss": 0.945, "step": 6494 }, { "epoch": 3.4869635247139357, "grad_norm": 0.640625, "learning_rate": 1.0476697248537415e-05, "loss": 0.716, "step": 6495 }, { "epoch": 3.487500419449012, "grad_norm": 0.69921875, "learning_rate": 1.0469824137335491e-05, "loss": 0.8421, "step": 6496 }, { "epoch": 3.488037314184088, "grad_norm": 0.76171875, "learning_rate": 1.0462952684210636e-05, "loss": 0.9197, "step": 6497 }, { "epoch": 3.4885742089191636, "grad_norm": 0.62890625, "learning_rate": 1.0456082889946974e-05, "loss": 0.7482, "step": 6498 }, { "epoch": 3.4891111036542397, "grad_norm": 0.6015625, "learning_rate": 1.044921475532845e-05, "loss": 0.8445, "step": 6499 }, { "epoch": 3.489647998389316, "grad_norm": 0.62109375, "learning_rate": 1.0442348281138783e-05, "loss": 0.7286, "step": 6500 }, { "epoch": 3.489647998389316, "eval_loss": 0.903020441532135, "eval_runtime": 46.0667, "eval_samples_per_second": 20.883, "eval_steps_per_second": 20.883, "step": 6500 }, { "epoch": 3.490184893124392, "grad_norm": 0.80078125, "learning_rate": 1.0435483468161536e-05, "loss": 0.9734, "step": 6501 }, { "epoch": 3.4907217878594676, "grad_norm": 0.78125, "learning_rate": 1.0428620317180076e-05, "loss": 0.8584, "step": 6502 }, { "epoch": 3.4912586825945437, "grad_norm": 0.734375, "learning_rate": 1.0421758828977573e-05, "loss": 0.9207, "step": 6503 }, { "epoch": 3.49179557732962, "grad_norm": 0.63671875, "learning_rate": 1.0414899004336995e-05, "loss": 0.757, "step": 6504 }, { "epoch": 3.492332472064696, "grad_norm": 0.83203125, "learning_rate": 1.0408040844041156e-05, "loss": 0.936, "step": 6505 }, { "epoch": 3.4928693667997717, "grad_norm": 0.76953125, "learning_rate": 1.040118434887264e-05, "loss": 0.857, "step": 6506 }, { "epoch": 3.493406261534848, "grad_norm": 0.6796875, "learning_rate": 1.0394329519613865e-05, "loss": 0.8578, "step": 6507 }, { "epoch": 3.493943156269924, "grad_norm": 0.69921875, "learning_rate": 1.0387476357047072e-05, "loss": 0.7709, "step": 6508 }, { "epoch": 3.4944800510049996, "grad_norm": 0.72265625, "learning_rate": 1.0380624861954267e-05, "loss": 0.8426, "step": 6509 }, { "epoch": 3.4950169457400757, "grad_norm": 0.76171875, "learning_rate": 1.0373775035117305e-05, "loss": 1.0863, "step": 6510 }, { "epoch": 3.495553840475152, "grad_norm": 0.78125, "learning_rate": 1.0366926877317848e-05, "loss": 1.0062, "step": 6511 }, { "epoch": 3.496090735210228, "grad_norm": 0.71484375, "learning_rate": 1.0360080389337345e-05, "loss": 1.0122, "step": 6512 }, { "epoch": 3.496627629945304, "grad_norm": 0.68359375, "learning_rate": 1.0353235571957061e-05, "loss": 0.7241, "step": 6513 }, { "epoch": 3.4971645246803797, "grad_norm": 0.62890625, "learning_rate": 1.0346392425958088e-05, "loss": 0.7633, "step": 6514 }, { "epoch": 3.497701419415456, "grad_norm": 0.7578125, "learning_rate": 1.0339550952121318e-05, "loss": 0.9641, "step": 6515 }, { "epoch": 3.498238314150532, "grad_norm": 0.6015625, "learning_rate": 1.0332711151227433e-05, "loss": 0.7376, "step": 6516 }, { "epoch": 3.4987752088856077, "grad_norm": 0.62109375, "learning_rate": 1.0325873024056955e-05, "loss": 0.8517, "step": 6517 }, { "epoch": 3.499312103620684, "grad_norm": 0.796875, "learning_rate": 1.0319036571390206e-05, "loss": 0.7533, "step": 6518 }, { "epoch": 3.49984899835576, "grad_norm": 1.296875, "learning_rate": 1.0312201794007294e-05, "loss": 0.7301, "step": 6519 }, { "epoch": 3.500385893090836, "grad_norm": 0.66796875, "learning_rate": 1.0305368692688174e-05, "loss": 0.8539, "step": 6520 }, { "epoch": 3.500922787825912, "grad_norm": 0.765625, "learning_rate": 1.0298537268212577e-05, "loss": 0.8464, "step": 6521 }, { "epoch": 3.501459682560988, "grad_norm": 0.6875, "learning_rate": 1.0291707521360047e-05, "loss": 0.9696, "step": 6522 }, { "epoch": 3.501996577296064, "grad_norm": 0.60546875, "learning_rate": 1.0284879452909956e-05, "loss": 0.9041, "step": 6523 }, { "epoch": 3.50253347203114, "grad_norm": 0.69140625, "learning_rate": 1.0278053063641479e-05, "loss": 0.8197, "step": 6524 }, { "epoch": 3.5030703667662157, "grad_norm": 0.58203125, "learning_rate": 1.0271228354333578e-05, "loss": 0.856, "step": 6525 }, { "epoch": 3.503607261501292, "grad_norm": 0.72265625, "learning_rate": 1.0264405325765044e-05, "loss": 0.7948, "step": 6526 }, { "epoch": 3.504144156236368, "grad_norm": 0.71875, "learning_rate": 1.0257583978714485e-05, "loss": 0.7216, "step": 6527 }, { "epoch": 3.5046810509714437, "grad_norm": 0.625, "learning_rate": 1.0250764313960292e-05, "loss": 0.7046, "step": 6528 }, { "epoch": 3.50521794570652, "grad_norm": 0.73046875, "learning_rate": 1.0243946332280665e-05, "loss": 0.8455, "step": 6529 }, { "epoch": 3.505754840441596, "grad_norm": 0.6953125, "learning_rate": 1.023713003445364e-05, "loss": 0.7219, "step": 6530 }, { "epoch": 3.506291735176672, "grad_norm": 0.6484375, "learning_rate": 1.0230315421257027e-05, "loss": 0.7459, "step": 6531 }, { "epoch": 3.506828629911748, "grad_norm": 0.61328125, "learning_rate": 1.0223502493468468e-05, "loss": 0.7541, "step": 6532 }, { "epoch": 3.507365524646824, "grad_norm": 0.74609375, "learning_rate": 1.021669125186541e-05, "loss": 0.7977, "step": 6533 }, { "epoch": 3.5079024193819, "grad_norm": 0.734375, "learning_rate": 1.0209881697225087e-05, "loss": 0.806, "step": 6534 }, { "epoch": 3.508439314116976, "grad_norm": 0.7265625, "learning_rate": 1.0203073830324567e-05, "loss": 0.9033, "step": 6535 }, { "epoch": 3.5089762088520517, "grad_norm": 0.734375, "learning_rate": 1.0196267651940716e-05, "loss": 0.9589, "step": 6536 }, { "epoch": 3.509513103587128, "grad_norm": 0.71875, "learning_rate": 1.0189463162850202e-05, "loss": 0.7795, "step": 6537 }, { "epoch": 3.510049998322204, "grad_norm": 0.71484375, "learning_rate": 1.0182660363829495e-05, "loss": 0.7505, "step": 6538 }, { "epoch": 3.51058689305728, "grad_norm": 0.5703125, "learning_rate": 1.0175859255654882e-05, "loss": 0.6832, "step": 6539 }, { "epoch": 3.5111237877923562, "grad_norm": 0.75, "learning_rate": 1.0169059839102471e-05, "loss": 1.062, "step": 6540 }, { "epoch": 3.511660682527432, "grad_norm": 0.69921875, "learning_rate": 1.0162262114948143e-05, "loss": 0.7038, "step": 6541 }, { "epoch": 3.512197577262508, "grad_norm": 0.64453125, "learning_rate": 1.0155466083967613e-05, "loss": 0.7424, "step": 6542 }, { "epoch": 3.512734471997584, "grad_norm": 0.78125, "learning_rate": 1.01486717469364e-05, "loss": 0.8705, "step": 6543 }, { "epoch": 3.51327136673266, "grad_norm": 0.71484375, "learning_rate": 1.0141879104629812e-05, "loss": 0.8432, "step": 6544 }, { "epoch": 3.513808261467736, "grad_norm": 0.63671875, "learning_rate": 1.0135088157822986e-05, "loss": 0.698, "step": 6545 }, { "epoch": 3.514345156202812, "grad_norm": 0.5859375, "learning_rate": 1.0128298907290853e-05, "loss": 0.7488, "step": 6546 }, { "epoch": 3.5148820509378877, "grad_norm": 0.72265625, "learning_rate": 1.0121511353808142e-05, "loss": 0.7864, "step": 6547 }, { "epoch": 3.515418945672964, "grad_norm": 0.734375, "learning_rate": 1.0114725498149402e-05, "loss": 0.8029, "step": 6548 }, { "epoch": 3.51595584040804, "grad_norm": 0.75390625, "learning_rate": 1.0107941341089003e-05, "loss": 0.9244, "step": 6549 }, { "epoch": 3.516492735143116, "grad_norm": 0.625, "learning_rate": 1.0101158883401077e-05, "loss": 0.7203, "step": 6550 }, { "epoch": 3.516492735143116, "eval_loss": 0.9030448794364929, "eval_runtime": 45.747, "eval_samples_per_second": 21.029, "eval_steps_per_second": 21.029, "step": 6550 }, { "epoch": 3.5170296298781922, "grad_norm": 0.671875, "learning_rate": 1.0094378125859602e-05, "loss": 0.7902, "step": 6551 }, { "epoch": 3.517566524613268, "grad_norm": 0.8046875, "learning_rate": 1.0087599069238357e-05, "loss": 0.9235, "step": 6552 }, { "epoch": 3.518103419348344, "grad_norm": 0.66796875, "learning_rate": 1.0080821714310908e-05, "loss": 0.8724, "step": 6553 }, { "epoch": 3.51864031408342, "grad_norm": 0.671875, "learning_rate": 1.0074046061850628e-05, "loss": 1.1629, "step": 6554 }, { "epoch": 3.519177208818496, "grad_norm": 0.76171875, "learning_rate": 1.0067272112630716e-05, "loss": 0.9107, "step": 6555 }, { "epoch": 3.519714103553572, "grad_norm": 0.640625, "learning_rate": 1.006049986742417e-05, "loss": 0.7408, "step": 6556 }, { "epoch": 3.520250998288648, "grad_norm": 0.71484375, "learning_rate": 1.0053729327003775e-05, "loss": 0.7296, "step": 6557 }, { "epoch": 3.520787893023724, "grad_norm": 0.65234375, "learning_rate": 1.0046960492142144e-05, "loss": 0.7931, "step": 6558 }, { "epoch": 3.5213247877588003, "grad_norm": 0.609375, "learning_rate": 1.0040193363611695e-05, "loss": 0.7758, "step": 6559 }, { "epoch": 3.521861682493876, "grad_norm": 0.67578125, "learning_rate": 1.0033427942184622e-05, "loss": 0.7793, "step": 6560 }, { "epoch": 3.522398577228952, "grad_norm": 0.83203125, "learning_rate": 1.0026664228632968e-05, "loss": 0.8087, "step": 6561 }, { "epoch": 3.5229354719640282, "grad_norm": 0.64453125, "learning_rate": 1.0019902223728548e-05, "loss": 0.7792, "step": 6562 }, { "epoch": 3.523472366699104, "grad_norm": 0.7265625, "learning_rate": 1.0013141928242981e-05, "loss": 0.7958, "step": 6563 }, { "epoch": 3.52400926143418, "grad_norm": 0.6328125, "learning_rate": 1.0006383342947714e-05, "loss": 0.7844, "step": 6564 }, { "epoch": 3.524546156169256, "grad_norm": 0.69921875, "learning_rate": 9.999626468613996e-06, "loss": 0.883, "step": 6565 }, { "epoch": 3.525083050904332, "grad_norm": 0.859375, "learning_rate": 9.992871306012852e-06, "loss": 1.0475, "step": 6566 }, { "epoch": 3.525619945639408, "grad_norm": 0.73828125, "learning_rate": 9.986117855915145e-06, "loss": 0.8462, "step": 6567 }, { "epoch": 3.526156840374484, "grad_norm": 0.78125, "learning_rate": 9.979366119091531e-06, "loss": 0.8852, "step": 6568 }, { "epoch": 3.52669373510956, "grad_norm": 0.62109375, "learning_rate": 9.972616096312456e-06, "loss": 0.7959, "step": 6569 }, { "epoch": 3.5272306298446363, "grad_norm": 0.60546875, "learning_rate": 9.965867788348201e-06, "loss": 0.9117, "step": 6570 }, { "epoch": 3.527767524579712, "grad_norm": 0.70703125, "learning_rate": 9.959121195968823e-06, "loss": 0.8907, "step": 6571 }, { "epoch": 3.528304419314788, "grad_norm": 0.6328125, "learning_rate": 9.952376319944184e-06, "loss": 0.7078, "step": 6572 }, { "epoch": 3.5288413140498642, "grad_norm": 0.73046875, "learning_rate": 9.94563316104397e-06, "loss": 0.9385, "step": 6573 }, { "epoch": 3.52937820878494, "grad_norm": 1.0390625, "learning_rate": 9.938891720037671e-06, "loss": 0.9329, "step": 6574 }, { "epoch": 3.529915103520016, "grad_norm": 0.640625, "learning_rate": 9.932151997694549e-06, "loss": 0.7381, "step": 6575 }, { "epoch": 3.530451998255092, "grad_norm": 0.63671875, "learning_rate": 9.925413994783703e-06, "loss": 0.8606, "step": 6576 }, { "epoch": 3.5309888929901683, "grad_norm": 0.6953125, "learning_rate": 9.918677712074035e-06, "loss": 0.9541, "step": 6577 }, { "epoch": 3.5315257877252444, "grad_norm": 0.671875, "learning_rate": 9.91194315033423e-06, "loss": 0.7419, "step": 6578 }, { "epoch": 3.53206268246032, "grad_norm": 0.671875, "learning_rate": 9.905210310332774e-06, "loss": 0.7282, "step": 6579 }, { "epoch": 3.532599577195396, "grad_norm": 0.60546875, "learning_rate": 9.898479192837983e-06, "loss": 0.8543, "step": 6580 }, { "epoch": 3.5331364719304723, "grad_norm": 0.72265625, "learning_rate": 9.891749798617972e-06, "loss": 0.8682, "step": 6581 }, { "epoch": 3.533673366665548, "grad_norm": 0.67578125, "learning_rate": 9.88502212844063e-06, "loss": 0.9144, "step": 6582 }, { "epoch": 3.534210261400624, "grad_norm": 0.72265625, "learning_rate": 9.878296183073678e-06, "loss": 0.8391, "step": 6583 }, { "epoch": 3.5347471561357002, "grad_norm": 0.66015625, "learning_rate": 9.871571963284643e-06, "loss": 0.7431, "step": 6584 }, { "epoch": 3.535284050870776, "grad_norm": 0.6171875, "learning_rate": 9.864849469840822e-06, "loss": 0.7317, "step": 6585 }, { "epoch": 3.535820945605852, "grad_norm": 0.625, "learning_rate": 9.858128703509348e-06, "loss": 0.7143, "step": 6586 }, { "epoch": 3.536357840340928, "grad_norm": 0.6484375, "learning_rate": 9.851409665057165e-06, "loss": 0.7855, "step": 6587 }, { "epoch": 3.5368947350760043, "grad_norm": 0.6875, "learning_rate": 9.844692355250962e-06, "loss": 1.0114, "step": 6588 }, { "epoch": 3.5374316298110804, "grad_norm": 0.671875, "learning_rate": 9.837976774857286e-06, "loss": 0.9623, "step": 6589 }, { "epoch": 3.537968524546156, "grad_norm": 0.72265625, "learning_rate": 9.831262924642481e-06, "loss": 0.9195, "step": 6590 }, { "epoch": 3.538505419281232, "grad_norm": 0.83203125, "learning_rate": 9.824550805372664e-06, "loss": 0.982, "step": 6591 }, { "epoch": 3.5390423140163083, "grad_norm": 0.72265625, "learning_rate": 9.817840417813784e-06, "loss": 0.8192, "step": 6592 }, { "epoch": 3.539579208751384, "grad_norm": 0.68359375, "learning_rate": 9.811131762731585e-06, "loss": 0.7611, "step": 6593 }, { "epoch": 3.54011610348646, "grad_norm": 0.6171875, "learning_rate": 9.804424840891594e-06, "loss": 0.821, "step": 6594 }, { "epoch": 3.5406529982215362, "grad_norm": 0.66015625, "learning_rate": 9.797719653059177e-06, "loss": 0.7946, "step": 6595 }, { "epoch": 3.5411898929566124, "grad_norm": 0.83203125, "learning_rate": 9.791016199999456e-06, "loss": 0.8176, "step": 6596 }, { "epoch": 3.5417267876916885, "grad_norm": 0.6875, "learning_rate": 9.784314482477406e-06, "loss": 0.7749, "step": 6597 }, { "epoch": 3.542263682426764, "grad_norm": 0.80078125, "learning_rate": 9.777614501257751e-06, "loss": 1.0866, "step": 6598 }, { "epoch": 3.5428005771618403, "grad_norm": 0.81640625, "learning_rate": 9.770916257105062e-06, "loss": 1.0468, "step": 6599 }, { "epoch": 3.5433374718969164, "grad_norm": 0.765625, "learning_rate": 9.764219750783693e-06, "loss": 0.8212, "step": 6600 }, { "epoch": 3.5433374718969164, "eval_loss": 0.9029698967933655, "eval_runtime": 45.9736, "eval_samples_per_second": 20.925, "eval_steps_per_second": 20.925, "step": 6600 }, { "epoch": 3.543874366631992, "grad_norm": 0.6640625, "learning_rate": 9.757524983057789e-06, "loss": 0.7546, "step": 6601 }, { "epoch": 3.544411261367068, "grad_norm": 0.6484375, "learning_rate": 9.750831954691326e-06, "loss": 0.9804, "step": 6602 }, { "epoch": 3.5449481561021443, "grad_norm": 0.8359375, "learning_rate": 9.74414066644804e-06, "loss": 0.882, "step": 6603 }, { "epoch": 3.54548505083722, "grad_norm": 0.75390625, "learning_rate": 9.737451119091515e-06, "loss": 0.7966, "step": 6604 }, { "epoch": 3.546021945572296, "grad_norm": 0.61328125, "learning_rate": 9.730763313385091e-06, "loss": 0.8061, "step": 6605 }, { "epoch": 3.5465588403073722, "grad_norm": 0.68359375, "learning_rate": 9.724077250091954e-06, "loss": 0.8407, "step": 6606 }, { "epoch": 3.5470957350424484, "grad_norm": 0.5859375, "learning_rate": 9.717392929975045e-06, "loss": 0.9106, "step": 6607 }, { "epoch": 3.5476326297775245, "grad_norm": 0.7890625, "learning_rate": 9.710710353797142e-06, "loss": 0.9718, "step": 6608 }, { "epoch": 3.5481695245126, "grad_norm": 0.7734375, "learning_rate": 9.70402952232082e-06, "loss": 0.8036, "step": 6609 }, { "epoch": 3.5487064192476763, "grad_norm": 0.8125, "learning_rate": 9.697350436308427e-06, "loss": 0.8215, "step": 6610 }, { "epoch": 3.5492433139827524, "grad_norm": 0.7578125, "learning_rate": 9.690673096522143e-06, "loss": 0.8967, "step": 6611 }, { "epoch": 3.549780208717828, "grad_norm": 0.63671875, "learning_rate": 9.683997503723954e-06, "loss": 0.857, "step": 6612 }, { "epoch": 3.550317103452904, "grad_norm": 0.6484375, "learning_rate": 9.677323658675594e-06, "loss": 0.7265, "step": 6613 }, { "epoch": 3.5508539981879803, "grad_norm": 0.63671875, "learning_rate": 9.67065156213865e-06, "loss": 0.6556, "step": 6614 }, { "epoch": 3.5513908929230564, "grad_norm": 0.76953125, "learning_rate": 9.6639812148745e-06, "loss": 0.9328, "step": 6615 }, { "epoch": 3.5519277876581326, "grad_norm": 0.6484375, "learning_rate": 9.657312617644302e-06, "loss": 0.819, "step": 6616 }, { "epoch": 3.5524646823932082, "grad_norm": 0.828125, "learning_rate": 9.650645771209033e-06, "loss": 0.807, "step": 6617 }, { "epoch": 3.5530015771282843, "grad_norm": 0.76171875, "learning_rate": 9.643980676329472e-06, "loss": 0.8549, "step": 6618 }, { "epoch": 3.5535384718633605, "grad_norm": 0.71484375, "learning_rate": 9.637317333766177e-06, "loss": 0.9689, "step": 6619 }, { "epoch": 3.554075366598436, "grad_norm": 0.75390625, "learning_rate": 9.630655744279537e-06, "loss": 0.7918, "step": 6620 }, { "epoch": 3.5546122613335123, "grad_norm": 0.96484375, "learning_rate": 9.623995908629704e-06, "loss": 1.0044, "step": 6621 }, { "epoch": 3.5551491560685884, "grad_norm": 0.96484375, "learning_rate": 9.617337827576666e-06, "loss": 0.9225, "step": 6622 }, { "epoch": 3.555686050803664, "grad_norm": 0.7109375, "learning_rate": 9.610681501880181e-06, "loss": 0.8674, "step": 6623 }, { "epoch": 3.55622294553874, "grad_norm": 0.6640625, "learning_rate": 9.604026932299825e-06, "loss": 0.7579, "step": 6624 }, { "epoch": 3.5567598402738163, "grad_norm": 0.64453125, "learning_rate": 9.597374119594982e-06, "loss": 0.6607, "step": 6625 }, { "epoch": 3.5572967350088924, "grad_norm": 0.70703125, "learning_rate": 9.590723064524799e-06, "loss": 0.7935, "step": 6626 }, { "epoch": 3.5578336297439686, "grad_norm": 0.65234375, "learning_rate": 9.584073767848257e-06, "loss": 0.8069, "step": 6627 }, { "epoch": 3.5583705244790442, "grad_norm": 0.61328125, "learning_rate": 9.577426230324135e-06, "loss": 0.7998, "step": 6628 }, { "epoch": 3.5589074192141203, "grad_norm": 0.63671875, "learning_rate": 9.570780452710989e-06, "loss": 0.642, "step": 6629 }, { "epoch": 3.5594443139491965, "grad_norm": 0.734375, "learning_rate": 9.564136435767181e-06, "loss": 0.9416, "step": 6630 }, { "epoch": 3.559981208684272, "grad_norm": 0.73828125, "learning_rate": 9.557494180250895e-06, "loss": 0.7998, "step": 6631 }, { "epoch": 3.5605181034193483, "grad_norm": 0.6796875, "learning_rate": 9.550853686920078e-06, "loss": 0.8869, "step": 6632 }, { "epoch": 3.5610549981544244, "grad_norm": 0.59765625, "learning_rate": 9.544214956532502e-06, "loss": 0.6988, "step": 6633 }, { "epoch": 3.5615918928895005, "grad_norm": 0.609375, "learning_rate": 9.537577989845738e-06, "loss": 0.719, "step": 6634 }, { "epoch": 3.5621287876245766, "grad_norm": 0.58984375, "learning_rate": 9.530942787617137e-06, "loss": 0.867, "step": 6635 }, { "epoch": 3.5626656823596523, "grad_norm": 0.7421875, "learning_rate": 9.52430935060386e-06, "loss": 0.7868, "step": 6636 }, { "epoch": 3.5632025770947284, "grad_norm": 0.65625, "learning_rate": 9.517677679562878e-06, "loss": 0.7565, "step": 6637 }, { "epoch": 3.5637394718298046, "grad_norm": 0.73046875, "learning_rate": 9.511047775250941e-06, "loss": 0.8856, "step": 6638 }, { "epoch": 3.5642763665648802, "grad_norm": 0.71875, "learning_rate": 9.5044196384246e-06, "loss": 0.8864, "step": 6639 }, { "epoch": 3.5648132612999563, "grad_norm": 0.625, "learning_rate": 9.497793269840211e-06, "loss": 0.6957, "step": 6640 }, { "epoch": 3.5653501560350325, "grad_norm": 0.62109375, "learning_rate": 9.491168670253936e-06, "loss": 0.7793, "step": 6641 }, { "epoch": 3.565887050770108, "grad_norm": 0.95703125, "learning_rate": 9.484545840421713e-06, "loss": 0.947, "step": 6642 }, { "epoch": 3.5664239455051843, "grad_norm": 0.65234375, "learning_rate": 9.477924781099296e-06, "loss": 0.7158, "step": 6643 }, { "epoch": 3.5669608402402604, "grad_norm": 0.6796875, "learning_rate": 9.471305493042243e-06, "loss": 0.805, "step": 6644 }, { "epoch": 3.5674977349753365, "grad_norm": 0.69921875, "learning_rate": 9.464687977005876e-06, "loss": 0.9085, "step": 6645 }, { "epoch": 3.5680346297104126, "grad_norm": 0.5703125, "learning_rate": 9.458072233745362e-06, "loss": 0.8781, "step": 6646 }, { "epoch": 3.5685715244454883, "grad_norm": 0.7578125, "learning_rate": 9.451458264015627e-06, "loss": 0.999, "step": 6647 }, { "epoch": 3.5691084191805644, "grad_norm": 0.66015625, "learning_rate": 9.444846068571404e-06, "loss": 1.0313, "step": 6648 }, { "epoch": 3.5696453139156405, "grad_norm": 0.84375, "learning_rate": 9.438235648167233e-06, "loss": 0.7287, "step": 6649 }, { "epoch": 3.5701822086507162, "grad_norm": 0.55859375, "learning_rate": 9.431627003557456e-06, "loss": 0.7335, "step": 6650 }, { "epoch": 3.5701822086507162, "eval_loss": 0.902814507484436, "eval_runtime": 46.0505, "eval_samples_per_second": 20.89, "eval_steps_per_second": 20.89, "step": 6650 }, { "epoch": 3.5707191033857923, "grad_norm": 0.76953125, "learning_rate": 9.425020135496185e-06, "loss": 0.9489, "step": 6651 }, { "epoch": 3.5712559981208685, "grad_norm": 0.703125, "learning_rate": 9.41841504473736e-06, "loss": 0.8986, "step": 6652 }, { "epoch": 3.5717928928559446, "grad_norm": 0.71484375, "learning_rate": 9.411811732034715e-06, "loss": 0.7683, "step": 6653 }, { "epoch": 3.5723297875910207, "grad_norm": 0.7578125, "learning_rate": 9.405210198141757e-06, "loss": 0.889, "step": 6654 }, { "epoch": 3.5728666823260964, "grad_norm": 1.0859375, "learning_rate": 9.398610443811797e-06, "loss": 0.7695, "step": 6655 }, { "epoch": 3.5734035770611725, "grad_norm": 0.70703125, "learning_rate": 9.39201246979797e-06, "loss": 0.8382, "step": 6656 }, { "epoch": 3.5739404717962486, "grad_norm": 0.8515625, "learning_rate": 9.385416276853173e-06, "loss": 0.8171, "step": 6657 }, { "epoch": 3.5744773665313243, "grad_norm": 0.765625, "learning_rate": 9.37882186573012e-06, "loss": 0.7891, "step": 6658 }, { "epoch": 3.5750142612664004, "grad_norm": 0.6796875, "learning_rate": 9.372229237181327e-06, "loss": 0.8629, "step": 6659 }, { "epoch": 3.5755511560014765, "grad_norm": 0.58203125, "learning_rate": 9.36563839195908e-06, "loss": 0.7094, "step": 6660 }, { "epoch": 3.5760880507365522, "grad_norm": 0.79296875, "learning_rate": 9.359049330815486e-06, "loss": 0.8081, "step": 6661 }, { "epoch": 3.5766249454716283, "grad_norm": 0.56640625, "learning_rate": 9.352462054502449e-06, "loss": 0.6752, "step": 6662 }, { "epoch": 3.5771618402067045, "grad_norm": 0.58203125, "learning_rate": 9.345876563771649e-06, "loss": 0.6144, "step": 6663 }, { "epoch": 3.5776987349417806, "grad_norm": 0.8671875, "learning_rate": 9.339292859374568e-06, "loss": 0.8694, "step": 6664 }, { "epoch": 3.5782356296768567, "grad_norm": 0.8359375, "learning_rate": 9.3327109420625e-06, "loss": 0.944, "step": 6665 }, { "epoch": 3.5787725244119324, "grad_norm": 0.72265625, "learning_rate": 9.326130812586534e-06, "loss": 0.9569, "step": 6666 }, { "epoch": 3.5793094191470085, "grad_norm": 0.75, "learning_rate": 9.319552471697524e-06, "loss": 1.0054, "step": 6667 }, { "epoch": 3.5798463138820846, "grad_norm": 0.68359375, "learning_rate": 9.312975920146155e-06, "loss": 0.749, "step": 6668 }, { "epoch": 3.5803832086171603, "grad_norm": 0.66015625, "learning_rate": 9.306401158682904e-06, "loss": 0.7842, "step": 6669 }, { "epoch": 3.5809201033522364, "grad_norm": 0.72265625, "learning_rate": 9.299828188058013e-06, "loss": 0.917, "step": 6670 }, { "epoch": 3.5814569980873125, "grad_norm": 0.63671875, "learning_rate": 9.29325700902156e-06, "loss": 0.7266, "step": 6671 }, { "epoch": 3.5819938928223887, "grad_norm": 0.703125, "learning_rate": 9.286687622323395e-06, "loss": 0.8447, "step": 6672 }, { "epoch": 3.582530787557465, "grad_norm": 0.64453125, "learning_rate": 9.280120028713154e-06, "loss": 0.9104, "step": 6673 }, { "epoch": 3.5830676822925405, "grad_norm": 0.703125, "learning_rate": 9.273554228940298e-06, "loss": 0.9141, "step": 6674 }, { "epoch": 3.5836045770276166, "grad_norm": 0.80078125, "learning_rate": 9.266990223754069e-06, "loss": 0.8476, "step": 6675 }, { "epoch": 3.5841414717626927, "grad_norm": 0.71875, "learning_rate": 9.26042801390349e-06, "loss": 0.8628, "step": 6676 }, { "epoch": 3.5846783664977684, "grad_norm": 0.5859375, "learning_rate": 9.2538676001374e-06, "loss": 0.7839, "step": 6677 }, { "epoch": 3.5852152612328445, "grad_norm": 0.70703125, "learning_rate": 9.247308983204433e-06, "loss": 0.8638, "step": 6678 }, { "epoch": 3.5857521559679206, "grad_norm": 0.66796875, "learning_rate": 9.240752163853006e-06, "loss": 0.7267, "step": 6679 }, { "epoch": 3.5862890507029963, "grad_norm": 0.66796875, "learning_rate": 9.234197142831321e-06, "loss": 0.7609, "step": 6680 }, { "epoch": 3.5868259454380724, "grad_norm": 0.8359375, "learning_rate": 9.227643920887399e-06, "loss": 0.6968, "step": 6681 }, { "epoch": 3.5873628401731485, "grad_norm": 0.7109375, "learning_rate": 9.221092498769058e-06, "loss": 0.7684, "step": 6682 }, { "epoch": 3.5878997349082247, "grad_norm": 0.75, "learning_rate": 9.214542877223876e-06, "loss": 0.959, "step": 6683 }, { "epoch": 3.588436629643301, "grad_norm": 0.76953125, "learning_rate": 9.207995056999258e-06, "loss": 0.8972, "step": 6684 }, { "epoch": 3.5889735243783765, "grad_norm": 0.63671875, "learning_rate": 9.201449038842402e-06, "loss": 0.7983, "step": 6685 }, { "epoch": 3.5895104191134526, "grad_norm": 0.58984375, "learning_rate": 9.194904823500275e-06, "loss": 0.7311, "step": 6686 }, { "epoch": 3.5900473138485287, "grad_norm": 0.625, "learning_rate": 9.188362411719672e-06, "loss": 0.7303, "step": 6687 }, { "epoch": 3.5905842085836044, "grad_norm": 0.7109375, "learning_rate": 9.181821804247156e-06, "loss": 0.8449, "step": 6688 }, { "epoch": 3.5911211033186805, "grad_norm": 0.64453125, "learning_rate": 9.175283001829083e-06, "loss": 0.6908, "step": 6689 }, { "epoch": 3.5916579980537566, "grad_norm": 0.6484375, "learning_rate": 9.168746005211626e-06, "loss": 0.7831, "step": 6690 }, { "epoch": 3.5921948927888327, "grad_norm": 0.75, "learning_rate": 9.162210815140743e-06, "loss": 0.9573, "step": 6691 }, { "epoch": 3.592731787523909, "grad_norm": 0.73046875, "learning_rate": 9.155677432362167e-06, "loss": 0.861, "step": 6692 }, { "epoch": 3.5932686822589845, "grad_norm": 0.71875, "learning_rate": 9.149145857621447e-06, "loss": 0.7768, "step": 6693 }, { "epoch": 3.5938055769940607, "grad_norm": 0.83203125, "learning_rate": 9.14261609166393e-06, "loss": 1.0587, "step": 6694 }, { "epoch": 3.594342471729137, "grad_norm": 0.69921875, "learning_rate": 9.136088135234727e-06, "loss": 0.8209, "step": 6695 }, { "epoch": 3.5948793664642125, "grad_norm": 0.671875, "learning_rate": 9.129561989078775e-06, "loss": 0.9221, "step": 6696 }, { "epoch": 3.5954162611992886, "grad_norm": 0.7109375, "learning_rate": 9.12303765394078e-06, "loss": 0.7608, "step": 6697 }, { "epoch": 3.5959531559343647, "grad_norm": 0.74609375, "learning_rate": 9.11651513056525e-06, "loss": 0.8028, "step": 6698 }, { "epoch": 3.5964900506694404, "grad_norm": 0.69140625, "learning_rate": 9.109994419696492e-06, "loss": 0.9619, "step": 6699 }, { "epoch": 3.5970269454045165, "grad_norm": 0.6328125, "learning_rate": 9.103475522078605e-06, "loss": 0.7196, "step": 6700 }, { "epoch": 3.5970269454045165, "eval_loss": 0.9028509259223938, "eval_runtime": 45.7648, "eval_samples_per_second": 21.021, "eval_steps_per_second": 21.021, "step": 6700 }, { "epoch": 3.5975638401395926, "grad_norm": 0.796875, "learning_rate": 9.09695843845547e-06, "loss": 0.8642, "step": 6701 }, { "epoch": 3.5981007348746687, "grad_norm": 0.7421875, "learning_rate": 9.09044316957077e-06, "loss": 0.8311, "step": 6702 }, { "epoch": 3.598637629609745, "grad_norm": 0.640625, "learning_rate": 9.083929716167994e-06, "loss": 0.9032, "step": 6703 }, { "epoch": 3.5991745243448205, "grad_norm": 0.6875, "learning_rate": 9.077418078990396e-06, "loss": 0.994, "step": 6704 }, { "epoch": 3.5997114190798967, "grad_norm": 0.62109375, "learning_rate": 9.07090825878103e-06, "loss": 0.9162, "step": 6705 }, { "epoch": 3.600248313814973, "grad_norm": 0.7109375, "learning_rate": 9.064400256282757e-06, "loss": 0.7996, "step": 6706 }, { "epoch": 3.6007852085500485, "grad_norm": 0.640625, "learning_rate": 9.057894072238233e-06, "loss": 0.9468, "step": 6707 }, { "epoch": 3.6013221032851246, "grad_norm": 0.8203125, "learning_rate": 9.051389707389877e-06, "loss": 0.7284, "step": 6708 }, { "epoch": 3.6018589980202007, "grad_norm": 0.6875, "learning_rate": 9.044887162479927e-06, "loss": 0.8251, "step": 6709 }, { "epoch": 3.602395892755277, "grad_norm": 0.7578125, "learning_rate": 9.038386438250415e-06, "loss": 0.7943, "step": 6710 }, { "epoch": 3.6029327874903525, "grad_norm": 0.671875, "learning_rate": 9.03188753544314e-06, "loss": 0.7361, "step": 6711 }, { "epoch": 3.6034696822254286, "grad_norm": 0.54296875, "learning_rate": 9.025390454799717e-06, "loss": 0.6254, "step": 6712 }, { "epoch": 3.6040065769605047, "grad_norm": 0.66015625, "learning_rate": 9.018895197061564e-06, "loss": 0.8518, "step": 6713 }, { "epoch": 3.604543471695581, "grad_norm": 0.609375, "learning_rate": 9.012401762969835e-06, "loss": 0.9286, "step": 6714 }, { "epoch": 3.6050803664306565, "grad_norm": 1.0390625, "learning_rate": 9.005910153265532e-06, "loss": 0.9985, "step": 6715 }, { "epoch": 3.6056172611657327, "grad_norm": 0.609375, "learning_rate": 8.999420368689434e-06, "loss": 0.8161, "step": 6716 }, { "epoch": 3.606154155900809, "grad_norm": 0.76171875, "learning_rate": 8.9929324099821e-06, "loss": 0.8656, "step": 6717 }, { "epoch": 3.6066910506358845, "grad_norm": 0.609375, "learning_rate": 8.986446277883887e-06, "loss": 0.831, "step": 6718 }, { "epoch": 3.6072279453709606, "grad_norm": 0.69140625, "learning_rate": 8.979961973134959e-06, "loss": 0.8525, "step": 6719 }, { "epoch": 3.6077648401060367, "grad_norm": 1.140625, "learning_rate": 8.97347949647524e-06, "loss": 0.9703, "step": 6720 }, { "epoch": 3.608301734841113, "grad_norm": 0.69140625, "learning_rate": 8.966998848644475e-06, "loss": 0.773, "step": 6721 }, { "epoch": 3.608838629576189, "grad_norm": 0.6171875, "learning_rate": 8.960520030382177e-06, "loss": 0.8466, "step": 6722 }, { "epoch": 3.6093755243112646, "grad_norm": 0.78125, "learning_rate": 8.954043042427676e-06, "loss": 0.7705, "step": 6723 }, { "epoch": 3.6099124190463407, "grad_norm": 0.73828125, "learning_rate": 8.947567885520061e-06, "loss": 0.8059, "step": 6724 }, { "epoch": 3.610449313781417, "grad_norm": 0.65625, "learning_rate": 8.941094560398236e-06, "loss": 0.8899, "step": 6725 }, { "epoch": 3.6109862085164925, "grad_norm": 0.68359375, "learning_rate": 8.934623067800902e-06, "loss": 0.7614, "step": 6726 }, { "epoch": 3.6115231032515687, "grad_norm": 0.58984375, "learning_rate": 8.928153408466516e-06, "loss": 0.7743, "step": 6727 }, { "epoch": 3.612059997986645, "grad_norm": 0.734375, "learning_rate": 8.921685583133372e-06, "loss": 0.8473, "step": 6728 }, { "epoch": 3.612596892721721, "grad_norm": 0.56640625, "learning_rate": 8.915219592539514e-06, "loss": 0.7977, "step": 6729 }, { "epoch": 3.6131337874567966, "grad_norm": 0.7109375, "learning_rate": 8.908755437422792e-06, "loss": 0.9503, "step": 6730 }, { "epoch": 3.6136706821918727, "grad_norm": 0.91015625, "learning_rate": 8.902293118520852e-06, "loss": 0.8522, "step": 6731 }, { "epoch": 3.614207576926949, "grad_norm": 0.60546875, "learning_rate": 8.895832636571138e-06, "loss": 0.7979, "step": 6732 }, { "epoch": 3.614744471662025, "grad_norm": 0.70703125, "learning_rate": 8.889373992310854e-06, "loss": 0.8807, "step": 6733 }, { "epoch": 3.6152813663971006, "grad_norm": 0.73046875, "learning_rate": 8.882917186477021e-06, "loss": 0.7953, "step": 6734 }, { "epoch": 3.6158182611321767, "grad_norm": 0.71875, "learning_rate": 8.876462219806456e-06, "loss": 0.7354, "step": 6735 }, { "epoch": 3.616355155867253, "grad_norm": 0.65234375, "learning_rate": 8.870009093035728e-06, "loss": 0.8158, "step": 6736 }, { "epoch": 3.6168920506023285, "grad_norm": 0.78515625, "learning_rate": 8.863557806901233e-06, "loss": 0.8887, "step": 6737 }, { "epoch": 3.6174289453374047, "grad_norm": 0.69921875, "learning_rate": 8.85710836213916e-06, "loss": 0.7492, "step": 6738 }, { "epoch": 3.617965840072481, "grad_norm": 0.78515625, "learning_rate": 8.850660759485438e-06, "loss": 0.8628, "step": 6739 }, { "epoch": 3.618502734807557, "grad_norm": 0.6328125, "learning_rate": 8.84421499967584e-06, "loss": 0.7729, "step": 6740 }, { "epoch": 3.619039629542633, "grad_norm": 0.7109375, "learning_rate": 8.837771083445918e-06, "loss": 0.8656, "step": 6741 }, { "epoch": 3.6195765242777087, "grad_norm": 0.7578125, "learning_rate": 8.83132901153098e-06, "loss": 0.9877, "step": 6742 }, { "epoch": 3.620113419012785, "grad_norm": 0.62890625, "learning_rate": 8.824888784666163e-06, "loss": 0.8382, "step": 6743 }, { "epoch": 3.620650313747861, "grad_norm": 0.94921875, "learning_rate": 8.818450403586387e-06, "loss": 1.0092, "step": 6744 }, { "epoch": 3.6211872084829366, "grad_norm": 0.80078125, "learning_rate": 8.812013869026334e-06, "loss": 0.9753, "step": 6745 }, { "epoch": 3.6217241032180127, "grad_norm": 0.703125, "learning_rate": 8.805579181720507e-06, "loss": 0.7735, "step": 6746 }, { "epoch": 3.622260997953089, "grad_norm": 0.65625, "learning_rate": 8.799146342403178e-06, "loss": 0.7595, "step": 6747 }, { "epoch": 3.622797892688165, "grad_norm": 0.625, "learning_rate": 8.792715351808425e-06, "loss": 0.8046, "step": 6748 }, { "epoch": 3.6233347874232407, "grad_norm": 0.74609375, "learning_rate": 8.786286210670092e-06, "loss": 0.8783, "step": 6749 }, { "epoch": 3.623871682158317, "grad_norm": 0.62109375, "learning_rate": 8.77985891972183e-06, "loss": 0.6572, "step": 6750 }, { "epoch": 3.623871682158317, "eval_loss": 0.9029676914215088, "eval_runtime": 46.3208, "eval_samples_per_second": 20.768, "eval_steps_per_second": 20.768, "step": 6750 }, { "epoch": 3.624408576893393, "grad_norm": 0.734375, "learning_rate": 8.773433479697086e-06, "loss": 0.8281, "step": 6751 }, { "epoch": 3.624945471628469, "grad_norm": 0.7421875, "learning_rate": 8.767009891329068e-06, "loss": 0.8847, "step": 6752 }, { "epoch": 3.6254823663635447, "grad_norm": 0.7109375, "learning_rate": 8.760588155350794e-06, "loss": 0.884, "step": 6753 }, { "epoch": 3.626019261098621, "grad_norm": 0.5703125, "learning_rate": 8.754168272495076e-06, "loss": 0.7977, "step": 6754 }, { "epoch": 3.626556155833697, "grad_norm": 0.6015625, "learning_rate": 8.747750243494492e-06, "loss": 0.7783, "step": 6755 }, { "epoch": 3.6270930505687726, "grad_norm": 0.65625, "learning_rate": 8.741334069081419e-06, "loss": 0.6992, "step": 6756 }, { "epoch": 3.6276299453038487, "grad_norm": 0.6640625, "learning_rate": 8.734919749988036e-06, "loss": 0.901, "step": 6757 }, { "epoch": 3.628166840038925, "grad_norm": 0.71484375, "learning_rate": 8.728507286946277e-06, "loss": 0.9386, "step": 6758 }, { "epoch": 3.628703734774001, "grad_norm": 0.703125, "learning_rate": 8.722096680687906e-06, "loss": 0.7702, "step": 6759 }, { "epoch": 3.629240629509077, "grad_norm": 0.74609375, "learning_rate": 8.715687931944449e-06, "loss": 0.8265, "step": 6760 }, { "epoch": 3.629777524244153, "grad_norm": 0.671875, "learning_rate": 8.709281041447217e-06, "loss": 0.9102, "step": 6761 }, { "epoch": 3.630314418979229, "grad_norm": 0.6484375, "learning_rate": 8.702876009927321e-06, "loss": 0.7795, "step": 6762 }, { "epoch": 3.630851313714305, "grad_norm": 0.83984375, "learning_rate": 8.696472838115671e-06, "loss": 1.0495, "step": 6763 }, { "epoch": 3.6313882084493807, "grad_norm": 0.625, "learning_rate": 8.690071526742935e-06, "loss": 0.7564, "step": 6764 }, { "epoch": 3.631925103184457, "grad_norm": 0.68359375, "learning_rate": 8.683672076539579e-06, "loss": 1.0103, "step": 6765 }, { "epoch": 3.632461997919533, "grad_norm": 0.7265625, "learning_rate": 8.67727448823587e-06, "loss": 0.6964, "step": 6766 }, { "epoch": 3.632998892654609, "grad_norm": 0.828125, "learning_rate": 8.670878762561857e-06, "loss": 0.9035, "step": 6767 }, { "epoch": 3.6335357873896847, "grad_norm": 0.69140625, "learning_rate": 8.664484900247363e-06, "loss": 0.8722, "step": 6768 }, { "epoch": 3.634072682124761, "grad_norm": 0.890625, "learning_rate": 8.658092902022017e-06, "loss": 0.8307, "step": 6769 }, { "epoch": 3.634609576859837, "grad_norm": 0.6953125, "learning_rate": 8.651702768615228e-06, "loss": 0.8419, "step": 6770 }, { "epoch": 3.635146471594913, "grad_norm": 0.70703125, "learning_rate": 8.645314500756188e-06, "loss": 0.6992, "step": 6771 }, { "epoch": 3.635683366329989, "grad_norm": 0.6015625, "learning_rate": 8.638928099173873e-06, "loss": 0.7571, "step": 6772 }, { "epoch": 3.636220261065065, "grad_norm": 0.65625, "learning_rate": 8.632543564597065e-06, "loss": 0.7757, "step": 6773 }, { "epoch": 3.636757155800141, "grad_norm": 0.68359375, "learning_rate": 8.626160897754306e-06, "loss": 1.0464, "step": 6774 }, { "epoch": 3.6372940505352167, "grad_norm": 0.84765625, "learning_rate": 8.619780099373945e-06, "loss": 0.9449, "step": 6775 }, { "epoch": 3.637830945270293, "grad_norm": 0.59765625, "learning_rate": 8.613401170184124e-06, "loss": 0.7641, "step": 6776 }, { "epoch": 3.638367840005369, "grad_norm": 0.671875, "learning_rate": 8.607024110912737e-06, "loss": 0.9775, "step": 6777 }, { "epoch": 3.638904734740445, "grad_norm": 0.546875, "learning_rate": 8.600648922287501e-06, "loss": 0.6239, "step": 6778 }, { "epoch": 3.639441629475521, "grad_norm": 0.7265625, "learning_rate": 8.594275605035915e-06, "loss": 0.7701, "step": 6779 }, { "epoch": 3.639978524210597, "grad_norm": 0.6328125, "learning_rate": 8.58790415988524e-06, "loss": 0.7851, "step": 6780 }, { "epoch": 3.640515418945673, "grad_norm": 0.65234375, "learning_rate": 8.581534587562538e-06, "loss": 0.8495, "step": 6781 }, { "epoch": 3.641052313680749, "grad_norm": 0.6484375, "learning_rate": 8.57516688879467e-06, "loss": 0.6921, "step": 6782 }, { "epoch": 3.641589208415825, "grad_norm": 0.83203125, "learning_rate": 8.568801064308252e-06, "loss": 1.1063, "step": 6783 }, { "epoch": 3.642126103150901, "grad_norm": 0.80078125, "learning_rate": 8.56243711482972e-06, "loss": 1.1216, "step": 6784 }, { "epoch": 3.642662997885977, "grad_norm": 0.69140625, "learning_rate": 8.556075041085285e-06, "loss": 0.9382, "step": 6785 }, { "epoch": 3.643199892621053, "grad_norm": 0.9296875, "learning_rate": 8.549714843800927e-06, "loss": 0.9093, "step": 6786 }, { "epoch": 3.643736787356129, "grad_norm": 0.7109375, "learning_rate": 8.543356523702428e-06, "loss": 0.7754, "step": 6787 }, { "epoch": 3.644273682091205, "grad_norm": 0.65234375, "learning_rate": 8.537000081515367e-06, "loss": 0.9546, "step": 6788 }, { "epoch": 3.644810576826281, "grad_norm": 0.62109375, "learning_rate": 8.530645517965083e-06, "loss": 0.7963, "step": 6789 }, { "epoch": 3.645347471561357, "grad_norm": 0.68359375, "learning_rate": 8.524292833776706e-06, "loss": 0.9665, "step": 6790 }, { "epoch": 3.645884366296433, "grad_norm": 0.703125, "learning_rate": 8.517942029675164e-06, "loss": 0.7845, "step": 6791 }, { "epoch": 3.646421261031509, "grad_norm": 0.54296875, "learning_rate": 8.51159310638517e-06, "loss": 0.687, "step": 6792 }, { "epoch": 3.646958155766585, "grad_norm": 0.703125, "learning_rate": 8.505246064631206e-06, "loss": 0.8961, "step": 6793 }, { "epoch": 3.647495050501661, "grad_norm": 0.75, "learning_rate": 8.498900905137555e-06, "loss": 0.828, "step": 6794 }, { "epoch": 3.648031945236737, "grad_norm": 0.72265625, "learning_rate": 8.49255762862829e-06, "loss": 0.8958, "step": 6795 }, { "epoch": 3.648568839971813, "grad_norm": 0.7734375, "learning_rate": 8.486216235827238e-06, "loss": 1.0279, "step": 6796 }, { "epoch": 3.649105734706889, "grad_norm": 0.74609375, "learning_rate": 8.479876727458052e-06, "loss": 0.8547, "step": 6797 }, { "epoch": 3.6496426294419653, "grad_norm": 0.63671875, "learning_rate": 8.473539104244146e-06, "loss": 0.9223, "step": 6798 }, { "epoch": 3.650179524177041, "grad_norm": 0.75390625, "learning_rate": 8.467203366908707e-06, "loss": 0.9157, "step": 6799 }, { "epoch": 3.650716418912117, "grad_norm": 0.6875, "learning_rate": 8.460869516174735e-06, "loss": 0.8649, "step": 6800 }, { "epoch": 3.650716418912117, "eval_loss": 0.9029330611228943, "eval_runtime": 46.3498, "eval_samples_per_second": 20.755, "eval_steps_per_second": 20.755, "step": 6800 }, { "epoch": 3.651253313647193, "grad_norm": 0.72265625, "learning_rate": 8.454537552765012e-06, "loss": 0.8005, "step": 6801 }, { "epoch": 3.651790208382269, "grad_norm": 0.66796875, "learning_rate": 8.448207477402076e-06, "loss": 0.8917, "step": 6802 }, { "epoch": 3.652327103117345, "grad_norm": 0.69140625, "learning_rate": 8.44187929080828e-06, "loss": 0.982, "step": 6803 }, { "epoch": 3.652863997852421, "grad_norm": 0.69921875, "learning_rate": 8.435552993705753e-06, "loss": 0.8059, "step": 6804 }, { "epoch": 3.6534008925874972, "grad_norm": 0.66015625, "learning_rate": 8.429228586816403e-06, "loss": 0.9227, "step": 6805 }, { "epoch": 3.653937787322573, "grad_norm": 0.671875, "learning_rate": 8.42290607086192e-06, "loss": 0.8203, "step": 6806 }, { "epoch": 3.654474682057649, "grad_norm": 0.890625, "learning_rate": 8.41658544656378e-06, "loss": 1.0071, "step": 6807 }, { "epoch": 3.655011576792725, "grad_norm": 0.6953125, "learning_rate": 8.410266714643261e-06, "loss": 0.7722, "step": 6808 }, { "epoch": 3.6555484715278013, "grad_norm": 0.7109375, "learning_rate": 8.403949875821397e-06, "loss": 1.0494, "step": 6809 }, { "epoch": 3.656085366262877, "grad_norm": 0.62890625, "learning_rate": 8.397634930819021e-06, "loss": 0.9386, "step": 6810 }, { "epoch": 3.656622260997953, "grad_norm": 0.66796875, "learning_rate": 8.391321880356761e-06, "loss": 0.8423, "step": 6811 }, { "epoch": 3.657159155733029, "grad_norm": 0.625, "learning_rate": 8.385010725154999e-06, "loss": 0.697, "step": 6812 }, { "epoch": 3.657696050468105, "grad_norm": 0.62890625, "learning_rate": 8.37870146593393e-06, "loss": 0.8463, "step": 6813 }, { "epoch": 3.658232945203181, "grad_norm": 0.6953125, "learning_rate": 8.372394103413519e-06, "loss": 0.8213, "step": 6814 }, { "epoch": 3.658769839938257, "grad_norm": 0.65625, "learning_rate": 8.366088638313504e-06, "loss": 0.6919, "step": 6815 }, { "epoch": 3.6593067346733332, "grad_norm": 0.6171875, "learning_rate": 8.359785071353423e-06, "loss": 0.6444, "step": 6816 }, { "epoch": 3.6598436294084093, "grad_norm": 0.65234375, "learning_rate": 8.35348340325261e-06, "loss": 0.7761, "step": 6817 }, { "epoch": 3.660380524143485, "grad_norm": 0.65234375, "learning_rate": 8.347183634730142e-06, "loss": 0.9443, "step": 6818 }, { "epoch": 3.660917418878561, "grad_norm": 0.65625, "learning_rate": 8.34088576650491e-06, "loss": 0.7114, "step": 6819 }, { "epoch": 3.6614543136136373, "grad_norm": 0.68359375, "learning_rate": 8.334589799295591e-06, "loss": 1.1432, "step": 6820 }, { "epoch": 3.661991208348713, "grad_norm": 0.66015625, "learning_rate": 8.32829573382062e-06, "loss": 0.7016, "step": 6821 }, { "epoch": 3.662528103083789, "grad_norm": 0.66015625, "learning_rate": 8.322003570798243e-06, "loss": 0.876, "step": 6822 }, { "epoch": 3.663064997818865, "grad_norm": 0.6875, "learning_rate": 8.315713310946469e-06, "loss": 0.7872, "step": 6823 }, { "epoch": 3.6636018925539413, "grad_norm": 0.65625, "learning_rate": 8.309424954983086e-06, "loss": 0.7113, "step": 6824 }, { "epoch": 3.664138787289017, "grad_norm": 0.70703125, "learning_rate": 8.303138503625688e-06, "loss": 0.8455, "step": 6825 }, { "epoch": 3.664675682024093, "grad_norm": 0.7890625, "learning_rate": 8.296853957591638e-06, "loss": 1.0172, "step": 6826 }, { "epoch": 3.6652125767591692, "grad_norm": 0.58203125, "learning_rate": 8.290571317598075e-06, "loss": 0.6483, "step": 6827 }, { "epoch": 3.6657494714942453, "grad_norm": 0.6796875, "learning_rate": 8.284290584361934e-06, "loss": 0.8555, "step": 6828 }, { "epoch": 3.666286366229321, "grad_norm": 0.8203125, "learning_rate": 8.278011758599929e-06, "loss": 0.8592, "step": 6829 }, { "epoch": 3.666823260964397, "grad_norm": 0.78125, "learning_rate": 8.271734841028553e-06, "loss": 0.7839, "step": 6830 }, { "epoch": 3.6673601556994733, "grad_norm": 0.63671875, "learning_rate": 8.265459832364066e-06, "loss": 0.8585, "step": 6831 }, { "epoch": 3.667897050434549, "grad_norm": 0.6953125, "learning_rate": 8.25918673332254e-06, "loss": 0.8377, "step": 6832 }, { "epoch": 3.668433945169625, "grad_norm": 0.70703125, "learning_rate": 8.252915544619821e-06, "loss": 0.8478, "step": 6833 }, { "epoch": 3.668970839904701, "grad_norm": 0.69921875, "learning_rate": 8.246646266971516e-06, "loss": 0.7837, "step": 6834 }, { "epoch": 3.6695077346397773, "grad_norm": 0.71875, "learning_rate": 8.240378901093034e-06, "loss": 0.8574, "step": 6835 }, { "epoch": 3.6700446293748534, "grad_norm": 0.6875, "learning_rate": 8.234113447699573e-06, "loss": 0.9252, "step": 6836 }, { "epoch": 3.670581524109929, "grad_norm": 0.65234375, "learning_rate": 8.22784990750608e-06, "loss": 0.8353, "step": 6837 }, { "epoch": 3.6711184188450052, "grad_norm": 0.7421875, "learning_rate": 8.221588281227315e-06, "loss": 0.9691, "step": 6838 }, { "epoch": 3.6716553135800813, "grad_norm": 0.68359375, "learning_rate": 8.215328569577826e-06, "loss": 0.7536, "step": 6839 }, { "epoch": 3.672192208315157, "grad_norm": 0.61328125, "learning_rate": 8.209070773271894e-06, "loss": 0.7848, "step": 6840 }, { "epoch": 3.672729103050233, "grad_norm": 0.734375, "learning_rate": 8.202814893023621e-06, "loss": 0.7424, "step": 6841 }, { "epoch": 3.6732659977853093, "grad_norm": 0.78515625, "learning_rate": 8.196560929546898e-06, "loss": 1.1235, "step": 6842 }, { "epoch": 3.6738028925203854, "grad_norm": 0.73828125, "learning_rate": 8.190308883555365e-06, "loss": 0.8499, "step": 6843 }, { "epoch": 3.674339787255461, "grad_norm": 0.6640625, "learning_rate": 8.184058755762466e-06, "loss": 0.8288, "step": 6844 }, { "epoch": 3.674876681990537, "grad_norm": 0.640625, "learning_rate": 8.177810546881426e-06, "loss": 0.7531, "step": 6845 }, { "epoch": 3.6754135767256133, "grad_norm": 0.6640625, "learning_rate": 8.17156425762523e-06, "loss": 0.7914, "step": 6846 }, { "epoch": 3.6759504714606894, "grad_norm": 0.6484375, "learning_rate": 8.165319888706677e-06, "loss": 0.8858, "step": 6847 }, { "epoch": 3.676487366195765, "grad_norm": 0.86328125, "learning_rate": 8.15907744083831e-06, "loss": 0.8575, "step": 6848 }, { "epoch": 3.6770242609308412, "grad_norm": 0.79296875, "learning_rate": 8.152836914732487e-06, "loss": 0.7521, "step": 6849 }, { "epoch": 3.6775611556659173, "grad_norm": 0.67578125, "learning_rate": 8.146598311101317e-06, "loss": 0.805, "step": 6850 }, { "epoch": 3.6775611556659173, "eval_loss": 0.9028520584106445, "eval_runtime": 46.2082, "eval_samples_per_second": 20.819, "eval_steps_per_second": 20.819, "step": 6850 }, { "epoch": 3.678098050400993, "grad_norm": 0.7578125, "learning_rate": 8.14036163065671e-06, "loss": 0.9158, "step": 6851 }, { "epoch": 3.678634945136069, "grad_norm": 0.67578125, "learning_rate": 8.13412687411036e-06, "loss": 0.7099, "step": 6852 }, { "epoch": 3.6791718398711453, "grad_norm": 0.6640625, "learning_rate": 8.127894042173714e-06, "loss": 0.7832, "step": 6853 }, { "epoch": 3.6797087346062214, "grad_norm": 0.73046875, "learning_rate": 8.121663135558032e-06, "loss": 0.8488, "step": 6854 }, { "epoch": 3.6802456293412975, "grad_norm": 0.703125, "learning_rate": 8.115434154974339e-06, "loss": 0.9325, "step": 6855 }, { "epoch": 3.680782524076373, "grad_norm": 0.71875, "learning_rate": 8.109207101133421e-06, "loss": 0.8746, "step": 6856 }, { "epoch": 3.6813194188114493, "grad_norm": 0.6015625, "learning_rate": 8.10298197474588e-06, "loss": 0.9481, "step": 6857 }, { "epoch": 3.6818563135465254, "grad_norm": 0.71875, "learning_rate": 8.096758776522085e-06, "loss": 1.0148, "step": 6858 }, { "epoch": 3.682393208281601, "grad_norm": 0.78515625, "learning_rate": 8.09053750717217e-06, "loss": 0.7556, "step": 6859 }, { "epoch": 3.6829301030166772, "grad_norm": 0.75, "learning_rate": 8.084318167406063e-06, "loss": 0.7653, "step": 6860 }, { "epoch": 3.6834669977517533, "grad_norm": 0.984375, "learning_rate": 8.078100757933485e-06, "loss": 0.8995, "step": 6861 }, { "epoch": 3.684003892486829, "grad_norm": 0.81640625, "learning_rate": 8.071885279463898e-06, "loss": 0.87, "step": 6862 }, { "epoch": 3.684540787221905, "grad_norm": 0.7265625, "learning_rate": 8.065671732706581e-06, "loss": 0.7983, "step": 6863 }, { "epoch": 3.6850776819569813, "grad_norm": 0.6640625, "learning_rate": 8.059460118370588e-06, "loss": 0.7964, "step": 6864 }, { "epoch": 3.6856145766920574, "grad_norm": 0.70703125, "learning_rate": 8.053250437164717e-06, "loss": 0.8647, "step": 6865 }, { "epoch": 3.6861514714271335, "grad_norm": 0.77734375, "learning_rate": 8.047042689797582e-06, "loss": 0.8245, "step": 6866 }, { "epoch": 3.686688366162209, "grad_norm": 0.69140625, "learning_rate": 8.040836876977576e-06, "loss": 0.8216, "step": 6867 }, { "epoch": 3.6872252608972853, "grad_norm": 0.78515625, "learning_rate": 8.034632999412846e-06, "loss": 0.8123, "step": 6868 }, { "epoch": 3.6877621556323614, "grad_norm": 0.79296875, "learning_rate": 8.02843105781134e-06, "loss": 1.2279, "step": 6869 }, { "epoch": 3.688299050367437, "grad_norm": 0.75390625, "learning_rate": 8.022231052880786e-06, "loss": 0.882, "step": 6870 }, { "epoch": 3.6888359451025132, "grad_norm": 0.6875, "learning_rate": 8.016032985328664e-06, "loss": 0.8362, "step": 6871 }, { "epoch": 3.6893728398375893, "grad_norm": 0.7578125, "learning_rate": 8.009836855862274e-06, "loss": 0.829, "step": 6872 }, { "epoch": 3.6899097345726655, "grad_norm": 0.7109375, "learning_rate": 8.003642665188652e-06, "loss": 0.7817, "step": 6873 }, { "epoch": 3.6904466293077416, "grad_norm": 1.203125, "learning_rate": 7.99745041401465e-06, "loss": 0.8557, "step": 6874 }, { "epoch": 3.6909835240428173, "grad_norm": 0.625, "learning_rate": 7.991260103046872e-06, "loss": 0.7789, "step": 6875 }, { "epoch": 3.6915204187778934, "grad_norm": 0.67578125, "learning_rate": 7.98507173299171e-06, "loss": 0.7369, "step": 6876 }, { "epoch": 3.6920573135129695, "grad_norm": 0.6875, "learning_rate": 7.978885304555353e-06, "loss": 1.0639, "step": 6877 }, { "epoch": 3.692594208248045, "grad_norm": 0.6953125, "learning_rate": 7.972700818443726e-06, "loss": 0.8837, "step": 6878 }, { "epoch": 3.6931311029831213, "grad_norm": 0.7421875, "learning_rate": 7.96651827536257e-06, "loss": 0.7951, "step": 6879 }, { "epoch": 3.6936679977181974, "grad_norm": 0.703125, "learning_rate": 7.960337676017395e-06, "loss": 0.7642, "step": 6880 }, { "epoch": 3.694204892453273, "grad_norm": 0.6953125, "learning_rate": 7.954159021113486e-06, "loss": 0.9309, "step": 6881 }, { "epoch": 3.6947417871883492, "grad_norm": 0.62890625, "learning_rate": 7.947982311355893e-06, "loss": 0.7926, "step": 6882 }, { "epoch": 3.6952786819234253, "grad_norm": 0.71484375, "learning_rate": 7.94180754744947e-06, "loss": 0.8497, "step": 6883 }, { "epoch": 3.6958155766585015, "grad_norm": 0.69140625, "learning_rate": 7.935634730098821e-06, "loss": 0.785, "step": 6884 }, { "epoch": 3.6963524713935776, "grad_norm": 0.640625, "learning_rate": 7.929463860008355e-06, "loss": 0.7934, "step": 6885 }, { "epoch": 3.6968893661286533, "grad_norm": 0.63671875, "learning_rate": 7.92329493788225e-06, "loss": 0.6657, "step": 6886 }, { "epoch": 3.6974262608637294, "grad_norm": 0.7421875, "learning_rate": 7.917127964424446e-06, "loss": 0.8921, "step": 6887 }, { "epoch": 3.6979631555988055, "grad_norm": 0.6640625, "learning_rate": 7.910962940338673e-06, "loss": 0.7757, "step": 6888 }, { "epoch": 3.698500050333881, "grad_norm": 0.6875, "learning_rate": 7.904799866328454e-06, "loss": 0.9523, "step": 6889 }, { "epoch": 3.6990369450689573, "grad_norm": 0.76171875, "learning_rate": 7.898638743097066e-06, "loss": 0.7095, "step": 6890 }, { "epoch": 3.6995738398040334, "grad_norm": 0.72265625, "learning_rate": 7.892479571347559e-06, "loss": 1.0438, "step": 6891 }, { "epoch": 3.7001107345391095, "grad_norm": 0.62890625, "learning_rate": 7.886322351782783e-06, "loss": 0.7061, "step": 6892 }, { "epoch": 3.7006476292741857, "grad_norm": 0.71875, "learning_rate": 7.880167085105362e-06, "loss": 0.7912, "step": 6893 }, { "epoch": 3.7011845240092613, "grad_norm": 0.71484375, "learning_rate": 7.874013772017672e-06, "loss": 0.7393, "step": 6894 }, { "epoch": 3.7017214187443375, "grad_norm": 0.9296875, "learning_rate": 7.867862413221893e-06, "loss": 0.9521, "step": 6895 }, { "epoch": 3.7022583134794136, "grad_norm": 0.7734375, "learning_rate": 7.861713009419982e-06, "loss": 0.9284, "step": 6896 }, { "epoch": 3.7027952082144893, "grad_norm": 0.71484375, "learning_rate": 7.855565561313658e-06, "loss": 0.7724, "step": 6897 }, { "epoch": 3.7033321029495654, "grad_norm": 0.6875, "learning_rate": 7.84942006960441e-06, "loss": 0.742, "step": 6898 }, { "epoch": 3.7038689976846415, "grad_norm": 0.796875, "learning_rate": 7.843276534993535e-06, "loss": 0.9588, "step": 6899 }, { "epoch": 3.704405892419717, "grad_norm": 0.6640625, "learning_rate": 7.837134958182072e-06, "loss": 0.8108, "step": 6900 }, { "epoch": 3.704405892419717, "eval_loss": 0.9026853442192078, "eval_runtime": 46.02, "eval_samples_per_second": 20.904, "eval_steps_per_second": 20.904, "step": 6900 }, { "epoch": 3.7049427871547933, "grad_norm": 0.73046875, "learning_rate": 7.830995339870858e-06, "loss": 0.9668, "step": 6901 }, { "epoch": 3.7054796818898694, "grad_norm": 0.671875, "learning_rate": 7.824857680760516e-06, "loss": 0.9215, "step": 6902 }, { "epoch": 3.7060165766249455, "grad_norm": 0.62109375, "learning_rate": 7.818721981551408e-06, "loss": 0.7267, "step": 6903 }, { "epoch": 3.7065534713600217, "grad_norm": 0.71875, "learning_rate": 7.812588242943706e-06, "loss": 0.9404, "step": 6904 }, { "epoch": 3.7070903660950973, "grad_norm": 0.62109375, "learning_rate": 7.806456465637352e-06, "loss": 0.7769, "step": 6905 }, { "epoch": 3.7076272608301735, "grad_norm": 0.734375, "learning_rate": 7.800326650332058e-06, "loss": 0.9568, "step": 6906 }, { "epoch": 3.7081641555652496, "grad_norm": 0.6796875, "learning_rate": 7.794198797727299e-06, "loss": 0.7476, "step": 6907 }, { "epoch": 3.7087010503003253, "grad_norm": 0.6875, "learning_rate": 7.788072908522356e-06, "loss": 0.8911, "step": 6908 }, { "epoch": 3.7092379450354014, "grad_norm": 0.64453125, "learning_rate": 7.781948983416262e-06, "loss": 0.8081, "step": 6909 }, { "epoch": 3.7097748397704775, "grad_norm": 0.734375, "learning_rate": 7.775827023107835e-06, "loss": 0.8794, "step": 6910 }, { "epoch": 3.7103117345055536, "grad_norm": 0.828125, "learning_rate": 7.769707028295681e-06, "loss": 0.8441, "step": 6911 }, { "epoch": 3.7108486292406297, "grad_norm": 0.6484375, "learning_rate": 7.76358899967815e-06, "loss": 0.9243, "step": 6912 }, { "epoch": 3.7113855239757054, "grad_norm": 0.7109375, "learning_rate": 7.757472937953398e-06, "loss": 0.7727, "step": 6913 }, { "epoch": 3.7119224187107815, "grad_norm": 0.71484375, "learning_rate": 7.751358843819348e-06, "loss": 0.7647, "step": 6914 }, { "epoch": 3.7124593134458577, "grad_norm": 0.69140625, "learning_rate": 7.745246717973692e-06, "loss": 0.6932, "step": 6915 }, { "epoch": 3.7129962081809333, "grad_norm": 0.703125, "learning_rate": 7.739136561113891e-06, "loss": 0.7426, "step": 6916 }, { "epoch": 3.7135331029160095, "grad_norm": 0.6328125, "learning_rate": 7.733028373937198e-06, "loss": 0.6847, "step": 6917 }, { "epoch": 3.7140699976510856, "grad_norm": 0.828125, "learning_rate": 7.726922157140645e-06, "loss": 0.9353, "step": 6918 }, { "epoch": 3.7146068923861613, "grad_norm": 0.671875, "learning_rate": 7.720817911421013e-06, "loss": 0.8364, "step": 6919 }, { "epoch": 3.7151437871212374, "grad_norm": 0.6953125, "learning_rate": 7.714715637474879e-06, "loss": 0.8889, "step": 6920 }, { "epoch": 3.7156806818563135, "grad_norm": 0.71875, "learning_rate": 7.7086153359986e-06, "loss": 0.9004, "step": 6921 }, { "epoch": 3.7162175765913896, "grad_norm": 0.6171875, "learning_rate": 7.702517007688279e-06, "loss": 0.6917, "step": 6922 }, { "epoch": 3.7167544713264657, "grad_norm": 0.6875, "learning_rate": 7.696420653239833e-06, "loss": 0.9131, "step": 6923 }, { "epoch": 3.7172913660615414, "grad_norm": 0.6796875, "learning_rate": 7.690326273348922e-06, "loss": 0.8444, "step": 6924 }, { "epoch": 3.7178282607966175, "grad_norm": 0.70703125, "learning_rate": 7.684233868710982e-06, "loss": 0.6928, "step": 6925 }, { "epoch": 3.7183651555316937, "grad_norm": 0.66015625, "learning_rate": 7.678143440021246e-06, "loss": 0.7861, "step": 6926 }, { "epoch": 3.7189020502667693, "grad_norm": 0.6640625, "learning_rate": 7.672054987974713e-06, "loss": 0.9232, "step": 6927 }, { "epoch": 3.7194389450018455, "grad_norm": 0.71875, "learning_rate": 7.665968513266142e-06, "loss": 0.7922, "step": 6928 }, { "epoch": 3.7199758397369216, "grad_norm": 0.7890625, "learning_rate": 7.65988401659008e-06, "loss": 0.9256, "step": 6929 }, { "epoch": 3.7205127344719977, "grad_norm": 0.66796875, "learning_rate": 7.653801498640853e-06, "loss": 0.8112, "step": 6930 }, { "epoch": 3.721049629207074, "grad_norm": 0.6875, "learning_rate": 7.64772096011255e-06, "loss": 0.7897, "step": 6931 }, { "epoch": 3.7215865239421495, "grad_norm": 0.66015625, "learning_rate": 7.641642401699022e-06, "loss": 0.8193, "step": 6932 }, { "epoch": 3.7221234186772256, "grad_norm": 0.74609375, "learning_rate": 7.635565824093926e-06, "loss": 0.8484, "step": 6933 }, { "epoch": 3.7226603134123017, "grad_norm": 0.6640625, "learning_rate": 7.629491227990679e-06, "loss": 0.8381, "step": 6934 }, { "epoch": 3.7231972081473774, "grad_norm": 0.578125, "learning_rate": 7.623418614082453e-06, "loss": 0.7133, "step": 6935 }, { "epoch": 3.7237341028824535, "grad_norm": 0.64453125, "learning_rate": 7.6173479830622205e-06, "loss": 0.7596, "step": 6936 }, { "epoch": 3.7242709976175297, "grad_norm": 0.65625, "learning_rate": 7.6112793356227255e-06, "loss": 0.7272, "step": 6937 }, { "epoch": 3.7248078923526053, "grad_norm": 0.6953125, "learning_rate": 7.605212672456461e-06, "loss": 1.0197, "step": 6938 }, { "epoch": 3.7253447870876815, "grad_norm": 0.7890625, "learning_rate": 7.599147994255723e-06, "loss": 0.9029, "step": 6939 }, { "epoch": 3.7258816818227576, "grad_norm": 0.9375, "learning_rate": 7.593085301712566e-06, "loss": 0.8767, "step": 6940 }, { "epoch": 3.7264185765578337, "grad_norm": 0.60546875, "learning_rate": 7.587024595518804e-06, "loss": 0.8026, "step": 6941 }, { "epoch": 3.72695547129291, "grad_norm": 0.6796875, "learning_rate": 7.580965876366058e-06, "loss": 0.7873, "step": 6942 }, { "epoch": 3.7274923660279855, "grad_norm": 0.69140625, "learning_rate": 7.5749091449457074e-06, "loss": 0.8802, "step": 6943 }, { "epoch": 3.7280292607630616, "grad_norm": 0.68359375, "learning_rate": 7.568854401948886e-06, "loss": 0.6855, "step": 6944 }, { "epoch": 3.7285661554981377, "grad_norm": 0.76171875, "learning_rate": 7.5628016480665225e-06, "loss": 0.7593, "step": 6945 }, { "epoch": 3.7291030502332134, "grad_norm": 0.71484375, "learning_rate": 7.556750883989324e-06, "loss": 0.9939, "step": 6946 }, { "epoch": 3.7296399449682895, "grad_norm": 0.68359375, "learning_rate": 7.550702110407743e-06, "loss": 0.9394, "step": 6947 }, { "epoch": 3.7301768397033657, "grad_norm": 0.609375, "learning_rate": 7.544655328012038e-06, "loss": 0.7396, "step": 6948 }, { "epoch": 3.730713734438442, "grad_norm": 0.71484375, "learning_rate": 7.538610537492213e-06, "loss": 0.8867, "step": 6949 }, { "epoch": 3.731250629173518, "grad_norm": 0.7578125, "learning_rate": 7.532567739538049e-06, "loss": 0.8756, "step": 6950 }, { "epoch": 3.731250629173518, "eval_loss": 0.9027646780014038, "eval_runtime": 45.7834, "eval_samples_per_second": 21.012, "eval_steps_per_second": 21.012, "step": 6950 }, { "epoch": 3.7317875239085936, "grad_norm": 0.76171875, "learning_rate": 7.526526934839112e-06, "loss": 0.8987, "step": 6951 }, { "epoch": 3.7323244186436697, "grad_norm": 0.859375, "learning_rate": 7.520488124084743e-06, "loss": 0.8377, "step": 6952 }, { "epoch": 3.732861313378746, "grad_norm": 0.66015625, "learning_rate": 7.514451307964032e-06, "loss": 0.6978, "step": 6953 }, { "epoch": 3.7333982081138215, "grad_norm": 0.6953125, "learning_rate": 7.508416487165862e-06, "loss": 0.7056, "step": 6954 }, { "epoch": 3.7339351028488976, "grad_norm": 0.71484375, "learning_rate": 7.502383662378895e-06, "loss": 0.8986, "step": 6955 }, { "epoch": 3.7344719975839737, "grad_norm": 0.6875, "learning_rate": 7.49635283429154e-06, "loss": 0.85, "step": 6956 }, { "epoch": 3.7350088923190494, "grad_norm": 0.7578125, "learning_rate": 7.490324003591984e-06, "loss": 0.843, "step": 6957 }, { "epoch": 3.7355457870541255, "grad_norm": 0.7421875, "learning_rate": 7.4842971709681995e-06, "loss": 0.7462, "step": 6958 }, { "epoch": 3.7360826817892017, "grad_norm": 0.68359375, "learning_rate": 7.478272337107936e-06, "loss": 0.8335, "step": 6959 }, { "epoch": 3.736619576524278, "grad_norm": 0.69140625, "learning_rate": 7.472249502698686e-06, "loss": 0.8435, "step": 6960 }, { "epoch": 3.737156471259354, "grad_norm": 0.62109375, "learning_rate": 7.466228668427738e-06, "loss": 0.8474, "step": 6961 }, { "epoch": 3.7376933659944296, "grad_norm": 0.66015625, "learning_rate": 7.460209834982154e-06, "loss": 0.9112, "step": 6962 }, { "epoch": 3.7382302607295057, "grad_norm": 0.68359375, "learning_rate": 7.454193003048743e-06, "loss": 0.6978, "step": 6963 }, { "epoch": 3.738767155464582, "grad_norm": 0.69921875, "learning_rate": 7.448178173314111e-06, "loss": 0.8502, "step": 6964 }, { "epoch": 3.7393040501996575, "grad_norm": 0.7109375, "learning_rate": 7.4421653464646416e-06, "loss": 0.8533, "step": 6965 }, { "epoch": 3.7398409449347336, "grad_norm": 0.72265625, "learning_rate": 7.4361545231864425e-06, "loss": 0.9364, "step": 6966 }, { "epoch": 3.7403778396698097, "grad_norm": 0.6328125, "learning_rate": 7.430145704165439e-06, "loss": 0.5971, "step": 6967 }, { "epoch": 3.740914734404886, "grad_norm": 0.765625, "learning_rate": 7.424138890087323e-06, "loss": 0.8235, "step": 6968 }, { "epoch": 3.741451629139962, "grad_norm": 0.75390625, "learning_rate": 7.418134081637532e-06, "loss": 0.8988, "step": 6969 }, { "epoch": 3.7419885238750377, "grad_norm": 0.77734375, "learning_rate": 7.412131279501297e-06, "loss": 0.9451, "step": 6970 }, { "epoch": 3.742525418610114, "grad_norm": 0.7109375, "learning_rate": 7.406130484363627e-06, "loss": 0.9615, "step": 6971 }, { "epoch": 3.74306231334519, "grad_norm": 0.59765625, "learning_rate": 7.400131696909266e-06, "loss": 0.9225, "step": 6972 }, { "epoch": 3.7435992080802656, "grad_norm": 0.71484375, "learning_rate": 7.3941349178227735e-06, "loss": 1.036, "step": 6973 }, { "epoch": 3.7441361028153417, "grad_norm": 0.62109375, "learning_rate": 7.388140147788436e-06, "loss": 0.7147, "step": 6974 }, { "epoch": 3.744672997550418, "grad_norm": 0.73046875, "learning_rate": 7.382147387490354e-06, "loss": 0.8636, "step": 6975 }, { "epoch": 3.7452098922854935, "grad_norm": 0.671875, "learning_rate": 7.376156637612358e-06, "loss": 0.7157, "step": 6976 }, { "epoch": 3.7457467870205696, "grad_norm": 0.734375, "learning_rate": 7.370167898838079e-06, "loss": 0.9783, "step": 6977 }, { "epoch": 3.7462836817556457, "grad_norm": 0.73046875, "learning_rate": 7.364181171850912e-06, "loss": 0.9377, "step": 6978 }, { "epoch": 3.746820576490722, "grad_norm": 0.66015625, "learning_rate": 7.358196457334007e-06, "loss": 0.8323, "step": 6979 }, { "epoch": 3.747357471225798, "grad_norm": 0.703125, "learning_rate": 7.352213755970302e-06, "loss": 0.9374, "step": 6980 }, { "epoch": 3.7478943659608737, "grad_norm": 0.8125, "learning_rate": 7.346233068442512e-06, "loss": 0.8611, "step": 6981 }, { "epoch": 3.74843126069595, "grad_norm": 0.7109375, "learning_rate": 7.3402543954330825e-06, "loss": 1.0845, "step": 6982 }, { "epoch": 3.748968155431026, "grad_norm": 0.65625, "learning_rate": 7.334277737624268e-06, "loss": 0.6241, "step": 6983 }, { "epoch": 3.7495050501661016, "grad_norm": 0.65234375, "learning_rate": 7.328303095698089e-06, "loss": 0.7232, "step": 6984 }, { "epoch": 3.7500419449011777, "grad_norm": 0.609375, "learning_rate": 7.3223304703363135e-06, "loss": 1.0548, "step": 6985 }, { "epoch": 3.750578839636254, "grad_norm": 0.8671875, "learning_rate": 7.316359862220501e-06, "loss": 1.0695, "step": 6986 }, { "epoch": 3.75111573437133, "grad_norm": 0.7109375, "learning_rate": 7.310391272031983e-06, "loss": 0.7739, "step": 6987 }, { "epoch": 3.751652629106406, "grad_norm": 0.703125, "learning_rate": 7.304424700451831e-06, "loss": 0.7856, "step": 6988 }, { "epoch": 3.7521895238414817, "grad_norm": 0.75, "learning_rate": 7.298460148160918e-06, "loss": 0.9983, "step": 6989 }, { "epoch": 3.752726418576558, "grad_norm": 0.7109375, "learning_rate": 7.292497615839889e-06, "loss": 0.7296, "step": 6990 }, { "epoch": 3.753263313311634, "grad_norm": 0.703125, "learning_rate": 7.286537104169114e-06, "loss": 0.8161, "step": 6991 }, { "epoch": 3.7538002080467097, "grad_norm": 0.671875, "learning_rate": 7.280578613828778e-06, "loss": 0.7749, "step": 6992 }, { "epoch": 3.754337102781786, "grad_norm": 0.703125, "learning_rate": 7.27462214549883e-06, "loss": 0.8895, "step": 6993 }, { "epoch": 3.754873997516862, "grad_norm": 0.81640625, "learning_rate": 7.268667699858961e-06, "loss": 0.9168, "step": 6994 }, { "epoch": 3.7554108922519376, "grad_norm": 0.7265625, "learning_rate": 7.262715277588655e-06, "loss": 0.745, "step": 6995 }, { "epoch": 3.7559477869870137, "grad_norm": 0.71484375, "learning_rate": 7.25676487936717e-06, "loss": 1.0201, "step": 6996 }, { "epoch": 3.75648468172209, "grad_norm": 0.765625, "learning_rate": 7.250816505873504e-06, "loss": 0.9542, "step": 6997 }, { "epoch": 3.757021576457166, "grad_norm": 0.68359375, "learning_rate": 7.244870157786454e-06, "loss": 0.8022, "step": 6998 }, { "epoch": 3.757558471192242, "grad_norm": 0.7265625, "learning_rate": 7.238925835784566e-06, "loss": 1.0832, "step": 6999 }, { "epoch": 3.7580953659273177, "grad_norm": 0.82421875, "learning_rate": 7.232983540546173e-06, "loss": 0.895, "step": 7000 }, { "epoch": 3.7580953659273177, "eval_loss": 0.9025610089302063, "eval_runtime": 46.3144, "eval_samples_per_second": 20.771, "eval_steps_per_second": 20.771, "step": 7000 }, { "epoch": 3.758632260662394, "grad_norm": 0.6953125, "learning_rate": 7.227043272749351e-06, "loss": 0.7568, "step": 7001 }, { "epoch": 3.75916915539747, "grad_norm": 0.71875, "learning_rate": 7.2211050330719674e-06, "loss": 0.9158, "step": 7002 }, { "epoch": 3.7597060501325457, "grad_norm": 0.65625, "learning_rate": 7.215168822191659e-06, "loss": 0.6484, "step": 7003 }, { "epoch": 3.760242944867622, "grad_norm": 0.80078125, "learning_rate": 7.209234640785806e-06, "loss": 1.013, "step": 7004 }, { "epoch": 3.760779839602698, "grad_norm": 0.6796875, "learning_rate": 7.203302489531583e-06, "loss": 0.6657, "step": 7005 }, { "epoch": 3.761316734337774, "grad_norm": 0.7265625, "learning_rate": 7.19737236910594e-06, "loss": 0.8123, "step": 7006 }, { "epoch": 3.76185362907285, "grad_norm": 0.73828125, "learning_rate": 7.191444280185544e-06, "loss": 0.8483, "step": 7007 }, { "epoch": 3.762390523807926, "grad_norm": 0.58203125, "learning_rate": 7.1855182234468825e-06, "loss": 0.9083, "step": 7008 }, { "epoch": 3.762927418543002, "grad_norm": 0.69140625, "learning_rate": 7.179594199566203e-06, "loss": 1.0029, "step": 7009 }, { "epoch": 3.763464313278078, "grad_norm": 0.859375, "learning_rate": 7.173672209219495e-06, "loss": 0.8219, "step": 7010 }, { "epoch": 3.7640012080131537, "grad_norm": 0.67578125, "learning_rate": 7.167752253082538e-06, "loss": 0.8656, "step": 7011 }, { "epoch": 3.76453810274823, "grad_norm": 0.74609375, "learning_rate": 7.161834331830886e-06, "loss": 0.7697, "step": 7012 }, { "epoch": 3.765074997483306, "grad_norm": 0.66015625, "learning_rate": 7.155918446139831e-06, "loss": 0.7004, "step": 7013 }, { "epoch": 3.7656118922183817, "grad_norm": 0.73828125, "learning_rate": 7.150004596684456e-06, "loss": 0.8632, "step": 7014 }, { "epoch": 3.766148786953458, "grad_norm": 0.7578125, "learning_rate": 7.144092784139619e-06, "loss": 0.856, "step": 7015 }, { "epoch": 3.766685681688534, "grad_norm": 0.703125, "learning_rate": 7.138183009179922e-06, "loss": 0.7863, "step": 7016 }, { "epoch": 3.76722257642361, "grad_norm": 0.7109375, "learning_rate": 7.132275272479735e-06, "loss": 0.8785, "step": 7017 }, { "epoch": 3.767759471158686, "grad_norm": 0.62890625, "learning_rate": 7.126369574713218e-06, "loss": 0.8522, "step": 7018 }, { "epoch": 3.768296365893762, "grad_norm": 0.734375, "learning_rate": 7.120465916554292e-06, "loss": 0.8985, "step": 7019 }, { "epoch": 3.768833260628838, "grad_norm": 0.73828125, "learning_rate": 7.114564298676621e-06, "loss": 0.8856, "step": 7020 }, { "epoch": 3.769370155363914, "grad_norm": 0.9296875, "learning_rate": 7.108664721753666e-06, "loss": 0.8923, "step": 7021 }, { "epoch": 3.7699070500989897, "grad_norm": 0.74609375, "learning_rate": 7.102767186458647e-06, "loss": 0.6679, "step": 7022 }, { "epoch": 3.770443944834066, "grad_norm": 0.6640625, "learning_rate": 7.096871693464544e-06, "loss": 0.7724, "step": 7023 }, { "epoch": 3.770980839569142, "grad_norm": 0.73046875, "learning_rate": 7.090978243444099e-06, "loss": 0.7654, "step": 7024 }, { "epoch": 3.771517734304218, "grad_norm": 0.66796875, "learning_rate": 7.08508683706984e-06, "loss": 0.7234, "step": 7025 }, { "epoch": 3.7720546290392942, "grad_norm": 0.734375, "learning_rate": 7.079197475014043e-06, "loss": 0.6981, "step": 7026 }, { "epoch": 3.77259152377437, "grad_norm": 0.70703125, "learning_rate": 7.073310157948759e-06, "loss": 0.7775, "step": 7027 }, { "epoch": 3.773128418509446, "grad_norm": 0.58984375, "learning_rate": 7.0674248865458206e-06, "loss": 0.6624, "step": 7028 }, { "epoch": 3.773665313244522, "grad_norm": 0.6640625, "learning_rate": 7.061541661476792e-06, "loss": 0.8177, "step": 7029 }, { "epoch": 3.774202207979598, "grad_norm": 0.74609375, "learning_rate": 7.055660483413029e-06, "loss": 0.9344, "step": 7030 }, { "epoch": 3.774739102714674, "grad_norm": 0.6875, "learning_rate": 7.049781353025661e-06, "loss": 0.7877, "step": 7031 }, { "epoch": 3.77527599744975, "grad_norm": 0.65234375, "learning_rate": 7.043904270985563e-06, "loss": 0.7633, "step": 7032 }, { "epoch": 3.7758128921848257, "grad_norm": 0.71875, "learning_rate": 7.038029237963376e-06, "loss": 1.0485, "step": 7033 }, { "epoch": 3.776349786919902, "grad_norm": 0.81640625, "learning_rate": 7.032156254629527e-06, "loss": 0.9159, "step": 7034 }, { "epoch": 3.776886681654978, "grad_norm": 0.55078125, "learning_rate": 7.0262853216541885e-06, "loss": 0.6439, "step": 7035 }, { "epoch": 3.777423576390054, "grad_norm": 0.671875, "learning_rate": 7.020416439707314e-06, "loss": 0.955, "step": 7036 }, { "epoch": 3.7779604711251302, "grad_norm": 0.74609375, "learning_rate": 7.014549609458624e-06, "loss": 1.1313, "step": 7037 }, { "epoch": 3.778497365860206, "grad_norm": 0.6484375, "learning_rate": 7.008684831577581e-06, "loss": 0.7717, "step": 7038 }, { "epoch": 3.779034260595282, "grad_norm": 0.69921875, "learning_rate": 7.002822106733442e-06, "loss": 0.7269, "step": 7039 }, { "epoch": 3.779571155330358, "grad_norm": 0.6328125, "learning_rate": 6.996961435595223e-06, "loss": 0.9413, "step": 7040 }, { "epoch": 3.780108050065434, "grad_norm": 0.76953125, "learning_rate": 6.991102818831696e-06, "loss": 0.9704, "step": 7041 }, { "epoch": 3.78064494480051, "grad_norm": 0.76953125, "learning_rate": 6.985246257111394e-06, "loss": 1.008, "step": 7042 }, { "epoch": 3.781181839535586, "grad_norm": 0.63671875, "learning_rate": 6.9793917511026315e-06, "loss": 0.8338, "step": 7043 }, { "epoch": 3.781718734270662, "grad_norm": 0.65625, "learning_rate": 6.973539301473489e-06, "loss": 0.7488, "step": 7044 }, { "epoch": 3.7822556290057383, "grad_norm": 0.73828125, "learning_rate": 6.967688908891792e-06, "loss": 0.8088, "step": 7045 }, { "epoch": 3.782792523740814, "grad_norm": 1.15625, "learning_rate": 6.961840574025153e-06, "loss": 1.0319, "step": 7046 }, { "epoch": 3.78332941847589, "grad_norm": 0.83984375, "learning_rate": 6.9559942975409465e-06, "loss": 0.8648, "step": 7047 }, { "epoch": 3.7838663132109662, "grad_norm": 0.58203125, "learning_rate": 6.950150080106299e-06, "loss": 0.6366, "step": 7048 }, { "epoch": 3.784403207946042, "grad_norm": 0.8359375, "learning_rate": 6.9443079223881015e-06, "loss": 0.8789, "step": 7049 }, { "epoch": 3.784940102681118, "grad_norm": 0.6796875, "learning_rate": 6.9384678250530355e-06, "loss": 0.8497, "step": 7050 }, { "epoch": 3.784940102681118, "eval_loss": 0.9027681946754456, "eval_runtime": 46.2488, "eval_samples_per_second": 20.801, "eval_steps_per_second": 20.801, "step": 7050 }, { "epoch": 3.785476997416194, "grad_norm": 1.046875, "learning_rate": 6.932629788767514e-06, "loss": 0.7524, "step": 7051 }, { "epoch": 3.78601389215127, "grad_norm": 0.7890625, "learning_rate": 6.926793814197741e-06, "loss": 0.913, "step": 7052 }, { "epoch": 3.786550786886346, "grad_norm": 0.66015625, "learning_rate": 6.920959902009675e-06, "loss": 0.8766, "step": 7053 }, { "epoch": 3.787087681621422, "grad_norm": 0.76953125, "learning_rate": 6.915128052869033e-06, "loss": 0.9065, "step": 7054 }, { "epoch": 3.787624576356498, "grad_norm": 0.65625, "learning_rate": 6.909298267441302e-06, "loss": 0.7086, "step": 7055 }, { "epoch": 3.7881614710915743, "grad_norm": 0.66796875, "learning_rate": 6.9034705463917525e-06, "loss": 0.8337, "step": 7056 }, { "epoch": 3.78869836582665, "grad_norm": 0.7265625, "learning_rate": 6.897644890385382e-06, "loss": 0.8837, "step": 7057 }, { "epoch": 3.789235260561726, "grad_norm": 0.8203125, "learning_rate": 6.891821300086973e-06, "loss": 0.8759, "step": 7058 }, { "epoch": 3.7897721552968022, "grad_norm": 0.70703125, "learning_rate": 6.885999776161076e-06, "loss": 1.0047, "step": 7059 }, { "epoch": 3.790309050031878, "grad_norm": 0.6171875, "learning_rate": 6.880180319272006e-06, "loss": 0.7059, "step": 7060 }, { "epoch": 3.790845944766954, "grad_norm": 0.68359375, "learning_rate": 6.874362930083822e-06, "loss": 0.8963, "step": 7061 }, { "epoch": 3.79138283950203, "grad_norm": 0.60546875, "learning_rate": 6.86854760926037e-06, "loss": 0.7088, "step": 7062 }, { "epoch": 3.7919197342371063, "grad_norm": 0.5703125, "learning_rate": 6.862734357465261e-06, "loss": 0.6758, "step": 7063 }, { "epoch": 3.7924566289721824, "grad_norm": 0.76953125, "learning_rate": 6.8569231753618426e-06, "loss": 0.9725, "step": 7064 }, { "epoch": 3.792993523707258, "grad_norm": 0.64453125, "learning_rate": 6.851114063613265e-06, "loss": 0.6546, "step": 7065 }, { "epoch": 3.793530418442334, "grad_norm": 0.6875, "learning_rate": 6.845307022882405e-06, "loss": 0.8853, "step": 7066 }, { "epoch": 3.7940673131774103, "grad_norm": 0.6640625, "learning_rate": 6.83950205383192e-06, "loss": 0.7502, "step": 7067 }, { "epoch": 3.794604207912486, "grad_norm": 0.64453125, "learning_rate": 6.833699157124232e-06, "loss": 1.0217, "step": 7068 }, { "epoch": 3.795141102647562, "grad_norm": 0.7265625, "learning_rate": 6.827898333421537e-06, "loss": 0.9204, "step": 7069 }, { "epoch": 3.7956779973826382, "grad_norm": 0.82421875, "learning_rate": 6.822099583385766e-06, "loss": 0.9674, "step": 7070 }, { "epoch": 3.796214892117714, "grad_norm": 0.75390625, "learning_rate": 6.816302907678638e-06, "loss": 0.9384, "step": 7071 }, { "epoch": 3.79675178685279, "grad_norm": 0.86328125, "learning_rate": 6.81050830696163e-06, "loss": 0.9101, "step": 7072 }, { "epoch": 3.797288681587866, "grad_norm": 0.6953125, "learning_rate": 6.804715781895971e-06, "loss": 0.8452, "step": 7073 }, { "epoch": 3.7978255763229423, "grad_norm": 0.6640625, "learning_rate": 6.798925333142672e-06, "loss": 0.8919, "step": 7074 }, { "epoch": 3.7983624710580184, "grad_norm": 0.7109375, "learning_rate": 6.793136961362489e-06, "loss": 0.8072, "step": 7075 }, { "epoch": 3.798899365793094, "grad_norm": 0.578125, "learning_rate": 6.787350667215947e-06, "loss": 0.7227, "step": 7076 }, { "epoch": 3.79943626052817, "grad_norm": 0.68359375, "learning_rate": 6.781566451363336e-06, "loss": 0.8098, "step": 7077 }, { "epoch": 3.7999731552632463, "grad_norm": 0.7265625, "learning_rate": 6.775784314464717e-06, "loss": 1.0118, "step": 7078 }, { "epoch": 3.800510049998322, "grad_norm": 1.09375, "learning_rate": 6.770004257179893e-06, "loss": 1.1284, "step": 7079 }, { "epoch": 3.801046944733398, "grad_norm": 0.6484375, "learning_rate": 6.764226280168445e-06, "loss": 0.7417, "step": 7080 }, { "epoch": 3.8015838394684742, "grad_norm": 0.63671875, "learning_rate": 6.758450384089726e-06, "loss": 0.8914, "step": 7081 }, { "epoch": 3.8021207342035503, "grad_norm": 0.73046875, "learning_rate": 6.752676569602828e-06, "loss": 0.8329, "step": 7082 }, { "epoch": 3.8026576289386265, "grad_norm": 0.671875, "learning_rate": 6.746904837366608e-06, "loss": 0.7742, "step": 7083 }, { "epoch": 3.803194523673702, "grad_norm": 0.625, "learning_rate": 6.7411351880397045e-06, "loss": 0.8232, "step": 7084 }, { "epoch": 3.8037314184087783, "grad_norm": 0.6875, "learning_rate": 6.735367622280514e-06, "loss": 0.9152, "step": 7085 }, { "epoch": 3.8042683131438544, "grad_norm": 0.6484375, "learning_rate": 6.729602140747171e-06, "loss": 0.7321, "step": 7086 }, { "epoch": 3.80480520787893, "grad_norm": 0.57421875, "learning_rate": 6.723838744097602e-06, "loss": 0.7819, "step": 7087 }, { "epoch": 3.805342102614006, "grad_norm": 0.6796875, "learning_rate": 6.7180774329894865e-06, "loss": 0.7493, "step": 7088 }, { "epoch": 3.8058789973490823, "grad_norm": 0.95703125, "learning_rate": 6.712318208080251e-06, "loss": 0.8014, "step": 7089 }, { "epoch": 3.806415892084158, "grad_norm": 0.70703125, "learning_rate": 6.706561070027109e-06, "loss": 1.0279, "step": 7090 }, { "epoch": 3.806952786819234, "grad_norm": 0.75, "learning_rate": 6.700806019487019e-06, "loss": 0.9602, "step": 7091 }, { "epoch": 3.8074896815543102, "grad_norm": 0.69921875, "learning_rate": 6.695053057116693e-06, "loss": 0.8572, "step": 7092 }, { "epoch": 3.8080265762893863, "grad_norm": 0.8203125, "learning_rate": 6.6893021835726295e-06, "loss": 0.8794, "step": 7093 }, { "epoch": 3.8085634710244625, "grad_norm": 0.640625, "learning_rate": 6.6835533995110775e-06, "loss": 0.7571, "step": 7094 }, { "epoch": 3.809100365759538, "grad_norm": 0.6484375, "learning_rate": 6.677806705588036e-06, "loss": 0.9336, "step": 7095 }, { "epoch": 3.8096372604946143, "grad_norm": 0.73046875, "learning_rate": 6.672062102459281e-06, "loss": 0.9442, "step": 7096 }, { "epoch": 3.8101741552296904, "grad_norm": 0.765625, "learning_rate": 6.666319590780351e-06, "loss": 0.643, "step": 7097 }, { "epoch": 3.810711049964766, "grad_norm": 0.6171875, "learning_rate": 6.660579171206527e-06, "loss": 0.7477, "step": 7098 }, { "epoch": 3.811247944699842, "grad_norm": 0.66796875, "learning_rate": 6.6548408443928765e-06, "loss": 0.9367, "step": 7099 }, { "epoch": 3.8117848394349183, "grad_norm": 0.7890625, "learning_rate": 6.6491046109942e-06, "loss": 0.9445, "step": 7100 }, { "epoch": 3.8117848394349183, "eval_loss": 0.9026427268981934, "eval_runtime": 46.0333, "eval_samples_per_second": 20.898, "eval_steps_per_second": 20.898, "step": 7100 }, { "epoch": 3.8123217341699944, "grad_norm": 0.65625, "learning_rate": 6.643370471665092e-06, "loss": 0.7945, "step": 7101 }, { "epoch": 3.8128586289050705, "grad_norm": 0.6796875, "learning_rate": 6.6376384270598695e-06, "loss": 0.7124, "step": 7102 }, { "epoch": 3.8133955236401462, "grad_norm": 0.6015625, "learning_rate": 6.631908477832646e-06, "loss": 0.6983, "step": 7103 }, { "epoch": 3.8139324183752223, "grad_norm": 0.68359375, "learning_rate": 6.6261806246372875e-06, "loss": 0.8853, "step": 7104 }, { "epoch": 3.8144693131102985, "grad_norm": 0.80078125, "learning_rate": 6.620454868127396e-06, "loss": 1.0888, "step": 7105 }, { "epoch": 3.815006207845374, "grad_norm": 0.69921875, "learning_rate": 6.6147312089563596e-06, "loss": 0.7172, "step": 7106 }, { "epoch": 3.8155431025804503, "grad_norm": 0.76953125, "learning_rate": 6.609009647777342e-06, "loss": 0.768, "step": 7107 }, { "epoch": 3.8160799973155264, "grad_norm": 0.67578125, "learning_rate": 6.6032901852432105e-06, "loss": 0.9998, "step": 7108 }, { "epoch": 3.816616892050602, "grad_norm": 0.7890625, "learning_rate": 6.5975728220066425e-06, "loss": 0.9766, "step": 7109 }, { "epoch": 3.817153786785678, "grad_norm": 1.2734375, "learning_rate": 6.591857558720071e-06, "loss": 1.0564, "step": 7110 }, { "epoch": 3.8176906815207543, "grad_norm": 0.62890625, "learning_rate": 6.586144396035668e-06, "loss": 0.6909, "step": 7111 }, { "epoch": 3.8182275762558304, "grad_norm": 0.640625, "learning_rate": 6.580433334605379e-06, "loss": 0.822, "step": 7112 }, { "epoch": 3.8187644709909065, "grad_norm": 0.7421875, "learning_rate": 6.57472437508092e-06, "loss": 1.0486, "step": 7113 }, { "epoch": 3.8193013657259822, "grad_norm": 0.68359375, "learning_rate": 6.56901751811374e-06, "loss": 0.7288, "step": 7114 }, { "epoch": 3.8198382604610583, "grad_norm": 0.78125, "learning_rate": 6.56331276435507e-06, "loss": 0.9105, "step": 7115 }, { "epoch": 3.8203751551961345, "grad_norm": 0.69921875, "learning_rate": 6.557610114455903e-06, "loss": 0.8362, "step": 7116 }, { "epoch": 3.82091204993121, "grad_norm": 0.82421875, "learning_rate": 6.551909569066974e-06, "loss": 0.9469, "step": 7117 }, { "epoch": 3.8214489446662863, "grad_norm": 0.75390625, "learning_rate": 6.546211128838786e-06, "loss": 0.9123, "step": 7118 }, { "epoch": 3.8219858394013624, "grad_norm": 0.671875, "learning_rate": 6.540514794421612e-06, "loss": 1.0064, "step": 7119 }, { "epoch": 3.8225227341364385, "grad_norm": 0.62890625, "learning_rate": 6.534820566465463e-06, "loss": 0.7518, "step": 7120 }, { "epoch": 3.8230596288715146, "grad_norm": 0.7265625, "learning_rate": 6.529128445620128e-06, "loss": 0.7842, "step": 7121 }, { "epoch": 3.8235965236065903, "grad_norm": 0.64453125, "learning_rate": 6.523438432535162e-06, "loss": 0.8895, "step": 7122 }, { "epoch": 3.8241334183416664, "grad_norm": 0.70703125, "learning_rate": 6.5177505278598506e-06, "loss": 0.8676, "step": 7123 }, { "epoch": 3.8246703130767425, "grad_norm": 0.7421875, "learning_rate": 6.512064732243267e-06, "loss": 0.856, "step": 7124 }, { "epoch": 3.825207207811818, "grad_norm": 0.6015625, "learning_rate": 6.506381046334223e-06, "loss": 0.6957, "step": 7125 }, { "epoch": 3.8257441025468943, "grad_norm": 0.6640625, "learning_rate": 6.50069947078131e-06, "loss": 0.8614, "step": 7126 }, { "epoch": 3.8262809972819705, "grad_norm": 0.6875, "learning_rate": 6.495020006232857e-06, "loss": 0.8632, "step": 7127 }, { "epoch": 3.826817892017046, "grad_norm": 0.671875, "learning_rate": 6.489342653336966e-06, "loss": 0.9827, "step": 7128 }, { "epoch": 3.8273547867521223, "grad_norm": 0.875, "learning_rate": 6.4836674127415064e-06, "loss": 0.8821, "step": 7129 }, { "epoch": 3.8278916814871984, "grad_norm": 0.63671875, "learning_rate": 6.477994285094077e-06, "loss": 0.8037, "step": 7130 }, { "epoch": 3.8284285762222745, "grad_norm": 0.58984375, "learning_rate": 6.47232327104206e-06, "loss": 0.8041, "step": 7131 }, { "epoch": 3.8289654709573506, "grad_norm": 0.7421875, "learning_rate": 6.466654371232608e-06, "loss": 0.7116, "step": 7132 }, { "epoch": 3.8295023656924263, "grad_norm": 0.6328125, "learning_rate": 6.460987586312587e-06, "loss": 0.7953, "step": 7133 }, { "epoch": 3.8300392604275024, "grad_norm": 0.734375, "learning_rate": 6.455322916928655e-06, "loss": 0.7814, "step": 7134 }, { "epoch": 3.8305761551625785, "grad_norm": 0.6328125, "learning_rate": 6.449660363727236e-06, "loss": 0.9174, "step": 7135 }, { "epoch": 3.831113049897654, "grad_norm": 0.703125, "learning_rate": 6.443999927354482e-06, "loss": 0.9298, "step": 7136 }, { "epoch": 3.8316499446327303, "grad_norm": 0.60546875, "learning_rate": 6.438341608456333e-06, "loss": 0.7381, "step": 7137 }, { "epoch": 3.8321868393678065, "grad_norm": 0.75, "learning_rate": 6.432685407678474e-06, "loss": 0.7947, "step": 7138 }, { "epoch": 3.8327237341028826, "grad_norm": 0.75, "learning_rate": 6.427031325666341e-06, "loss": 0.8098, "step": 7139 }, { "epoch": 3.8332606288379587, "grad_norm": 0.734375, "learning_rate": 6.421379363065142e-06, "loss": 0.9963, "step": 7140 }, { "epoch": 3.8337975235730344, "grad_norm": 0.6640625, "learning_rate": 6.415729520519842e-06, "loss": 0.7359, "step": 7141 }, { "epoch": 3.8343344183081105, "grad_norm": 0.65625, "learning_rate": 6.410081798675155e-06, "loss": 0.7311, "step": 7142 }, { "epoch": 3.8348713130431866, "grad_norm": 0.7734375, "learning_rate": 6.40443619817555e-06, "loss": 1.0181, "step": 7143 }, { "epoch": 3.8354082077782623, "grad_norm": 0.70703125, "learning_rate": 6.39879271966527e-06, "loss": 0.8381, "step": 7144 }, { "epoch": 3.8359451025133384, "grad_norm": 0.69921875, "learning_rate": 6.393151363788311e-06, "loss": 0.9254, "step": 7145 }, { "epoch": 3.8364819972484145, "grad_norm": 0.64453125, "learning_rate": 6.387512131188411e-06, "loss": 0.9559, "step": 7146 }, { "epoch": 3.83701889198349, "grad_norm": 0.5859375, "learning_rate": 6.381875022509087e-06, "loss": 0.5784, "step": 7147 }, { "epoch": 3.8375557867185663, "grad_norm": 0.609375, "learning_rate": 6.3762400383936075e-06, "loss": 0.8213, "step": 7148 }, { "epoch": 3.8380926814536425, "grad_norm": 0.71484375, "learning_rate": 6.370607179484991e-06, "loss": 0.9033, "step": 7149 }, { "epoch": 3.8386295761887186, "grad_norm": 0.640625, "learning_rate": 6.3649764464260105e-06, "loss": 0.7153, "step": 7150 }, { "epoch": 3.8386295761887186, "eval_loss": 0.9026381969451904, "eval_runtime": 47.0361, "eval_samples_per_second": 20.452, "eval_steps_per_second": 20.452, "step": 7150 }, { "epoch": 3.8391664709237947, "grad_norm": 0.6171875, "learning_rate": 6.35934783985922e-06, "loss": 0.7916, "step": 7151 }, { "epoch": 3.8397033656588704, "grad_norm": 0.734375, "learning_rate": 6.3537213604268955e-06, "loss": 1.0401, "step": 7152 }, { "epoch": 3.8402402603939465, "grad_norm": 0.75390625, "learning_rate": 6.3480970087711e-06, "loss": 0.8586, "step": 7153 }, { "epoch": 3.8407771551290226, "grad_norm": 0.61328125, "learning_rate": 6.342474785533653e-06, "loss": 0.6723, "step": 7154 }, { "epoch": 3.8413140498640983, "grad_norm": 0.640625, "learning_rate": 6.336854691356103e-06, "loss": 0.8799, "step": 7155 }, { "epoch": 3.8418509445991744, "grad_norm": 0.74609375, "learning_rate": 6.33123672687978e-06, "loss": 0.9772, "step": 7156 }, { "epoch": 3.8423878393342505, "grad_norm": 0.67578125, "learning_rate": 6.325620892745776e-06, "loss": 0.8332, "step": 7157 }, { "epoch": 3.8429247340693267, "grad_norm": 0.6015625, "learning_rate": 6.3200071895949206e-06, "loss": 0.8175, "step": 7158 }, { "epoch": 3.843461628804403, "grad_norm": 0.7578125, "learning_rate": 6.314395618067801e-06, "loss": 0.8394, "step": 7159 }, { "epoch": 3.8439985235394785, "grad_norm": 0.64453125, "learning_rate": 6.308786178804782e-06, "loss": 0.9289, "step": 7160 }, { "epoch": 3.8445354182745546, "grad_norm": 0.80078125, "learning_rate": 6.303178872445958e-06, "loss": 0.9521, "step": 7161 }, { "epoch": 3.8450723130096307, "grad_norm": 0.56640625, "learning_rate": 6.297573699631201e-06, "loss": 0.8274, "step": 7162 }, { "epoch": 3.8456092077447064, "grad_norm": 0.73828125, "learning_rate": 6.2919706610001375e-06, "loss": 0.947, "step": 7163 }, { "epoch": 3.8461461024797825, "grad_norm": 0.64453125, "learning_rate": 6.286369757192134e-06, "loss": 0.8731, "step": 7164 }, { "epoch": 3.8466829972148586, "grad_norm": 0.65234375, "learning_rate": 6.280770988846329e-06, "loss": 0.7913, "step": 7165 }, { "epoch": 3.8472198919499343, "grad_norm": 0.6640625, "learning_rate": 6.275174356601623e-06, "loss": 0.8454, "step": 7166 }, { "epoch": 3.8477567866850104, "grad_norm": 0.765625, "learning_rate": 6.269579861096653e-06, "loss": 0.814, "step": 7167 }, { "epoch": 3.8482936814200865, "grad_norm": 0.74609375, "learning_rate": 6.263987502969815e-06, "loss": 1.1018, "step": 7168 }, { "epoch": 3.8488305761551627, "grad_norm": 0.62890625, "learning_rate": 6.2583972828592744e-06, "loss": 0.7504, "step": 7169 }, { "epoch": 3.849367470890239, "grad_norm": 0.65625, "learning_rate": 6.252809201402953e-06, "loss": 0.7289, "step": 7170 }, { "epoch": 3.8499043656253145, "grad_norm": 0.6796875, "learning_rate": 6.247223259238511e-06, "loss": 0.7782, "step": 7171 }, { "epoch": 3.8504412603603906, "grad_norm": 0.6484375, "learning_rate": 6.241639457003379e-06, "loss": 0.7569, "step": 7172 }, { "epoch": 3.8509781550954667, "grad_norm": 0.7421875, "learning_rate": 6.236057795334749e-06, "loss": 0.8635, "step": 7173 }, { "epoch": 3.8515150498305424, "grad_norm": 0.609375, "learning_rate": 6.23047827486955e-06, "loss": 0.8061, "step": 7174 }, { "epoch": 3.8520519445656185, "grad_norm": 0.69140625, "learning_rate": 6.2249008962444674e-06, "loss": 0.9643, "step": 7175 }, { "epoch": 3.8525888393006946, "grad_norm": 0.6015625, "learning_rate": 6.219325660095971e-06, "loss": 0.6542, "step": 7176 }, { "epoch": 3.8531257340357707, "grad_norm": 0.609375, "learning_rate": 6.213752567060244e-06, "loss": 0.7527, "step": 7177 }, { "epoch": 3.853662628770847, "grad_norm": 0.71875, "learning_rate": 6.20818161777326e-06, "loss": 0.8118, "step": 7178 }, { "epoch": 3.8541995235059225, "grad_norm": 0.66796875, "learning_rate": 6.202612812870736e-06, "loss": 0.7214, "step": 7179 }, { "epoch": 3.8547364182409987, "grad_norm": 0.6796875, "learning_rate": 6.197046152988137e-06, "loss": 0.7413, "step": 7180 }, { "epoch": 3.855273312976075, "grad_norm": 0.83203125, "learning_rate": 6.19148163876069e-06, "loss": 0.8139, "step": 7181 }, { "epoch": 3.8558102077111505, "grad_norm": 0.61328125, "learning_rate": 6.185919270823384e-06, "loss": 0.8034, "step": 7182 }, { "epoch": 3.8563471024462266, "grad_norm": 0.609375, "learning_rate": 6.180359049810955e-06, "loss": 0.8297, "step": 7183 }, { "epoch": 3.8568839971813027, "grad_norm": 0.6171875, "learning_rate": 6.174800976357878e-06, "loss": 0.6517, "step": 7184 }, { "epoch": 3.8574208919163784, "grad_norm": 0.734375, "learning_rate": 6.169245051098413e-06, "loss": 0.9675, "step": 7185 }, { "epoch": 3.8579577866514545, "grad_norm": 0.68359375, "learning_rate": 6.1636912746665695e-06, "loss": 0.7807, "step": 7186 }, { "epoch": 3.8584946813865306, "grad_norm": 0.890625, "learning_rate": 6.158139647696085e-06, "loss": 1.0128, "step": 7187 }, { "epoch": 3.8590315761216067, "grad_norm": 0.75, "learning_rate": 6.15259017082048e-06, "loss": 0.8042, "step": 7188 }, { "epoch": 3.859568470856683, "grad_norm": 0.81640625, "learning_rate": 6.147042844673026e-06, "loss": 0.9115, "step": 7189 }, { "epoch": 3.8601053655917585, "grad_norm": 0.57421875, "learning_rate": 6.141497669886729e-06, "loss": 0.6325, "step": 7190 }, { "epoch": 3.8606422603268347, "grad_norm": 0.7109375, "learning_rate": 6.13595464709438e-06, "loss": 0.8846, "step": 7191 }, { "epoch": 3.861179155061911, "grad_norm": 0.76171875, "learning_rate": 6.130413776928498e-06, "loss": 0.8668, "step": 7192 }, { "epoch": 3.8617160497969865, "grad_norm": 0.59375, "learning_rate": 6.124875060021362e-06, "loss": 0.7045, "step": 7193 }, { "epoch": 3.8622529445320626, "grad_norm": 0.69921875, "learning_rate": 6.119338497005017e-06, "loss": 0.7714, "step": 7194 }, { "epoch": 3.8627898392671387, "grad_norm": 0.7109375, "learning_rate": 6.11380408851126e-06, "loss": 0.978, "step": 7195 }, { "epoch": 3.863326734002215, "grad_norm": 0.6640625, "learning_rate": 6.1082718351716225e-06, "loss": 0.8781, "step": 7196 }, { "epoch": 3.863863628737291, "grad_norm": 0.65234375, "learning_rate": 6.102741737617413e-06, "loss": 0.8758, "step": 7197 }, { "epoch": 3.8644005234723666, "grad_norm": 0.64453125, "learning_rate": 6.0972137964796946e-06, "loss": 0.9519, "step": 7198 }, { "epoch": 3.8649374182074427, "grad_norm": 0.70703125, "learning_rate": 6.091688012389257e-06, "loss": 0.9811, "step": 7199 }, { "epoch": 3.865474312942519, "grad_norm": 0.6328125, "learning_rate": 6.086164385976681e-06, "loss": 0.7897, "step": 7200 }, { "epoch": 3.865474312942519, "eval_loss": 0.902592658996582, "eval_runtime": 46.4544, "eval_samples_per_second": 20.708, "eval_steps_per_second": 20.708, "step": 7200 }, { "epoch": 3.8660112076775945, "grad_norm": 0.6875, "learning_rate": 6.080642917872273e-06, "loss": 0.8944, "step": 7201 }, { "epoch": 3.8665481024126707, "grad_norm": 0.6328125, "learning_rate": 6.075123608706093e-06, "loss": 0.7142, "step": 7202 }, { "epoch": 3.867084997147747, "grad_norm": 0.73828125, "learning_rate": 6.069606459107974e-06, "loss": 1.0613, "step": 7203 }, { "epoch": 3.8676218918828225, "grad_norm": 0.625, "learning_rate": 6.0640914697075e-06, "loss": 0.7551, "step": 7204 }, { "epoch": 3.8681587866178986, "grad_norm": 1.0234375, "learning_rate": 6.058578641133986e-06, "loss": 0.8401, "step": 7205 }, { "epoch": 3.8686956813529747, "grad_norm": 0.6953125, "learning_rate": 6.0530679740165245e-06, "loss": 0.9745, "step": 7206 }, { "epoch": 3.869232576088051, "grad_norm": 0.55859375, "learning_rate": 6.047559468983954e-06, "loss": 0.7397, "step": 7207 }, { "epoch": 3.869769470823127, "grad_norm": 0.60546875, "learning_rate": 6.042053126664862e-06, "loss": 0.8461, "step": 7208 }, { "epoch": 3.8703063655582026, "grad_norm": 0.71875, "learning_rate": 6.036548947687587e-06, "loss": 0.7573, "step": 7209 }, { "epoch": 3.8708432602932787, "grad_norm": 0.65625, "learning_rate": 6.031046932680229e-06, "loss": 0.8045, "step": 7210 }, { "epoch": 3.871380155028355, "grad_norm": 0.64453125, "learning_rate": 6.025547082270644e-06, "loss": 0.7573, "step": 7211 }, { "epoch": 3.8719170497634305, "grad_norm": 0.7265625, "learning_rate": 6.020049397086422e-06, "loss": 0.9924, "step": 7212 }, { "epoch": 3.8724539444985067, "grad_norm": 0.9375, "learning_rate": 6.014553877754925e-06, "loss": 0.8774, "step": 7213 }, { "epoch": 3.872990839233583, "grad_norm": 0.73828125, "learning_rate": 6.00906052490327e-06, "loss": 0.9509, "step": 7214 }, { "epoch": 3.873527733968659, "grad_norm": 0.8671875, "learning_rate": 6.003569339158302e-06, "loss": 0.8107, "step": 7215 }, { "epoch": 3.874064628703735, "grad_norm": 0.7734375, "learning_rate": 5.998080321146651e-06, "loss": 0.9037, "step": 7216 }, { "epoch": 3.8746015234388107, "grad_norm": 0.62890625, "learning_rate": 5.992593471494673e-06, "loss": 0.7999, "step": 7217 }, { "epoch": 3.875138418173887, "grad_norm": 0.6796875, "learning_rate": 5.987108790828486e-06, "loss": 0.8981, "step": 7218 }, { "epoch": 3.875675312908963, "grad_norm": 0.765625, "learning_rate": 5.981626279773961e-06, "loss": 0.9565, "step": 7219 }, { "epoch": 3.8762122076440386, "grad_norm": 0.64453125, "learning_rate": 5.9761459389567346e-06, "loss": 0.7808, "step": 7220 }, { "epoch": 3.8767491023791147, "grad_norm": 0.71875, "learning_rate": 5.9706677690021664e-06, "loss": 1.0312, "step": 7221 }, { "epoch": 3.877285997114191, "grad_norm": 0.6015625, "learning_rate": 5.965191770535391e-06, "loss": 0.712, "step": 7222 }, { "epoch": 3.8778228918492665, "grad_norm": 0.8828125, "learning_rate": 5.9597179441813e-06, "loss": 0.9597, "step": 7223 }, { "epoch": 3.8783597865843427, "grad_norm": 0.78125, "learning_rate": 5.954246290564511e-06, "loss": 0.983, "step": 7224 }, { "epoch": 3.878896681319419, "grad_norm": 0.6875, "learning_rate": 5.948776810309423e-06, "loss": 0.8047, "step": 7225 }, { "epoch": 3.879433576054495, "grad_norm": 0.7109375, "learning_rate": 5.943309504040154e-06, "loss": 0.8208, "step": 7226 }, { "epoch": 3.879970470789571, "grad_norm": 0.58984375, "learning_rate": 5.937844372380616e-06, "loss": 0.7954, "step": 7227 }, { "epoch": 3.8805073655246467, "grad_norm": 0.73046875, "learning_rate": 5.932381415954428e-06, "loss": 0.9025, "step": 7228 }, { "epoch": 3.881044260259723, "grad_norm": 0.640625, "learning_rate": 5.926920635384991e-06, "loss": 0.7858, "step": 7229 }, { "epoch": 3.881581154994799, "grad_norm": 0.6953125, "learning_rate": 5.92146203129546e-06, "loss": 0.7611, "step": 7230 }, { "epoch": 3.8821180497298746, "grad_norm": 0.7890625, "learning_rate": 5.916005604308714e-06, "loss": 0.8577, "step": 7231 }, { "epoch": 3.8826549444649507, "grad_norm": 0.90234375, "learning_rate": 5.9105513550474055e-06, "loss": 0.9093, "step": 7232 }, { "epoch": 3.883191839200027, "grad_norm": 0.6875, "learning_rate": 5.905099284133952e-06, "loss": 0.7829, "step": 7233 }, { "epoch": 3.883728733935103, "grad_norm": 0.875, "learning_rate": 5.899649392190473e-06, "loss": 0.8519, "step": 7234 }, { "epoch": 3.884265628670179, "grad_norm": 0.8203125, "learning_rate": 5.894201679838885e-06, "loss": 0.8464, "step": 7235 }, { "epoch": 3.884802523405255, "grad_norm": 0.703125, "learning_rate": 5.888756147700847e-06, "loss": 0.8117, "step": 7236 }, { "epoch": 3.885339418140331, "grad_norm": 0.67578125, "learning_rate": 5.88331279639775e-06, "loss": 0.8754, "step": 7237 }, { "epoch": 3.885876312875407, "grad_norm": 0.9140625, "learning_rate": 5.877871626550757e-06, "loss": 0.7996, "step": 7238 }, { "epoch": 3.8864132076104827, "grad_norm": 0.6171875, "learning_rate": 5.872432638780781e-06, "loss": 0.9268, "step": 7239 }, { "epoch": 3.886950102345559, "grad_norm": 0.65234375, "learning_rate": 5.866995833708464e-06, "loss": 0.7662, "step": 7240 }, { "epoch": 3.887486997080635, "grad_norm": 0.625, "learning_rate": 5.861561211954222e-06, "loss": 0.7394, "step": 7241 }, { "epoch": 3.8880238918157106, "grad_norm": 0.8046875, "learning_rate": 5.856128774138222e-06, "loss": 0.8664, "step": 7242 }, { "epoch": 3.8885607865507867, "grad_norm": 0.61328125, "learning_rate": 5.850698520880366e-06, "loss": 0.7173, "step": 7243 }, { "epoch": 3.889097681285863, "grad_norm": 0.6484375, "learning_rate": 5.845270452800309e-06, "loss": 0.8258, "step": 7244 }, { "epoch": 3.889634576020939, "grad_norm": 0.68359375, "learning_rate": 5.8398445705174745e-06, "loss": 0.9874, "step": 7245 }, { "epoch": 3.890171470756015, "grad_norm": 0.609375, "learning_rate": 5.8344208746510114e-06, "loss": 0.8062, "step": 7246 }, { "epoch": 3.890708365491091, "grad_norm": 0.75390625, "learning_rate": 5.828999365819837e-06, "loss": 0.8254, "step": 7247 }, { "epoch": 3.891245260226167, "grad_norm": 0.6640625, "learning_rate": 5.823580044642627e-06, "loss": 0.9911, "step": 7248 }, { "epoch": 3.891782154961243, "grad_norm": 0.6328125, "learning_rate": 5.818162911737774e-06, "loss": 0.7894, "step": 7249 }, { "epoch": 3.8923190496963187, "grad_norm": 0.6875, "learning_rate": 5.812747967723459e-06, "loss": 0.858, "step": 7250 }, { "epoch": 3.8923190496963187, "eval_loss": 0.9027091860771179, "eval_runtime": 45.9386, "eval_samples_per_second": 20.941, "eval_steps_per_second": 20.941, "step": 7250 }, { "epoch": 3.892855944431395, "grad_norm": 0.62890625, "learning_rate": 5.80733521321758e-06, "loss": 0.7453, "step": 7251 }, { "epoch": 3.893392839166471, "grad_norm": 0.72265625, "learning_rate": 5.801924648837815e-06, "loss": 0.8787, "step": 7252 }, { "epoch": 3.893929733901547, "grad_norm": 0.6875, "learning_rate": 5.796516275201566e-06, "loss": 0.9252, "step": 7253 }, { "epoch": 3.894466628636623, "grad_norm": 0.6875, "learning_rate": 5.791110092926e-06, "loss": 0.8662, "step": 7254 }, { "epoch": 3.895003523371699, "grad_norm": 0.66796875, "learning_rate": 5.785706102628044e-06, "loss": 0.7687, "step": 7255 }, { "epoch": 3.895540418106775, "grad_norm": 0.78515625, "learning_rate": 5.7803043049243415e-06, "loss": 1.0045, "step": 7256 }, { "epoch": 3.896077312841851, "grad_norm": 0.85546875, "learning_rate": 5.774904700431316e-06, "loss": 0.7697, "step": 7257 }, { "epoch": 3.896614207576927, "grad_norm": 0.60546875, "learning_rate": 5.769507289765144e-06, "loss": 0.9009, "step": 7258 }, { "epoch": 3.897151102312003, "grad_norm": 0.66015625, "learning_rate": 5.764112073541711e-06, "loss": 0.8913, "step": 7259 }, { "epoch": 3.897687997047079, "grad_norm": 0.5859375, "learning_rate": 5.758719052376693e-06, "loss": 0.6348, "step": 7260 }, { "epoch": 3.8982248917821547, "grad_norm": 0.60546875, "learning_rate": 5.753328226885513e-06, "loss": 0.744, "step": 7261 }, { "epoch": 3.898761786517231, "grad_norm": 0.75, "learning_rate": 5.747939597683311e-06, "loss": 0.9436, "step": 7262 }, { "epoch": 3.899298681252307, "grad_norm": 0.57421875, "learning_rate": 5.742553165385009e-06, "loss": 0.6824, "step": 7263 }, { "epoch": 3.899835575987383, "grad_norm": 0.6171875, "learning_rate": 5.737168930605272e-06, "loss": 0.9219, "step": 7264 }, { "epoch": 3.900372470722459, "grad_norm": 0.58203125, "learning_rate": 5.731786893958499e-06, "loss": 0.6955, "step": 7265 }, { "epoch": 3.900909365457535, "grad_norm": 0.69921875, "learning_rate": 5.726407056058852e-06, "loss": 0.7208, "step": 7266 }, { "epoch": 3.901446260192611, "grad_norm": 0.6640625, "learning_rate": 5.721029417520249e-06, "loss": 0.8116, "step": 7267 }, { "epoch": 3.901983154927687, "grad_norm": 0.67578125, "learning_rate": 5.71565397895634e-06, "loss": 0.8814, "step": 7268 }, { "epoch": 3.902520049662763, "grad_norm": 0.94921875, "learning_rate": 5.710280740980517e-06, "loss": 0.9441, "step": 7269 }, { "epoch": 3.903056944397839, "grad_norm": 0.72265625, "learning_rate": 5.704909704205949e-06, "loss": 0.9297, "step": 7270 }, { "epoch": 3.903593839132915, "grad_norm": 0.609375, "learning_rate": 5.69954086924554e-06, "loss": 0.8212, "step": 7271 }, { "epoch": 3.904130733867991, "grad_norm": 0.67578125, "learning_rate": 5.694174236711935e-06, "loss": 0.7893, "step": 7272 }, { "epoch": 3.9046676286030673, "grad_norm": 0.6875, "learning_rate": 5.688809807217535e-06, "loss": 0.861, "step": 7273 }, { "epoch": 3.905204523338143, "grad_norm": 0.64453125, "learning_rate": 5.6834475813745014e-06, "loss": 0.7909, "step": 7274 }, { "epoch": 3.905741418073219, "grad_norm": 0.76171875, "learning_rate": 5.678087559794723e-06, "loss": 0.8263, "step": 7275 }, { "epoch": 3.906278312808295, "grad_norm": 0.81640625, "learning_rate": 5.672729743089839e-06, "loss": 0.9041, "step": 7276 }, { "epoch": 3.906815207543371, "grad_norm": 0.578125, "learning_rate": 5.6673741318712585e-06, "loss": 0.697, "step": 7277 }, { "epoch": 3.907352102278447, "grad_norm": 0.76171875, "learning_rate": 5.662020726750111e-06, "loss": 1.0148, "step": 7278 }, { "epoch": 3.907888997013523, "grad_norm": 0.61328125, "learning_rate": 5.656669528337294e-06, "loss": 0.6864, "step": 7279 }, { "epoch": 3.908425891748599, "grad_norm": 0.796875, "learning_rate": 5.651320537243454e-06, "loss": 0.9782, "step": 7280 }, { "epoch": 3.908962786483675, "grad_norm": 0.6875, "learning_rate": 5.645973754078965e-06, "loss": 0.8902, "step": 7281 }, { "epoch": 3.909499681218751, "grad_norm": 0.66796875, "learning_rate": 5.640629179453968e-06, "loss": 0.8695, "step": 7282 }, { "epoch": 3.910036575953827, "grad_norm": 0.66015625, "learning_rate": 5.635286813978357e-06, "loss": 0.6493, "step": 7283 }, { "epoch": 3.9105734706889033, "grad_norm": 0.6171875, "learning_rate": 5.629946658261753e-06, "loss": 0.8964, "step": 7284 }, { "epoch": 3.911110365423979, "grad_norm": 0.73828125, "learning_rate": 5.6246087129135315e-06, "loss": 1.0302, "step": 7285 }, { "epoch": 3.911647260159055, "grad_norm": 0.66796875, "learning_rate": 5.6192729785428335e-06, "loss": 0.908, "step": 7286 }, { "epoch": 3.912184154894131, "grad_norm": 0.73046875, "learning_rate": 5.613939455758518e-06, "loss": 0.9243, "step": 7287 }, { "epoch": 3.912721049629207, "grad_norm": 0.7109375, "learning_rate": 5.608608145169214e-06, "loss": 0.8276, "step": 7288 }, { "epoch": 3.913257944364283, "grad_norm": 0.734375, "learning_rate": 5.6032790473833e-06, "loss": 0.9719, "step": 7289 }, { "epoch": 3.913794839099359, "grad_norm": 0.74609375, "learning_rate": 5.597952163008879e-06, "loss": 0.793, "step": 7290 }, { "epoch": 3.9143317338344352, "grad_norm": 0.6796875, "learning_rate": 5.592627492653826e-06, "loss": 0.8368, "step": 7291 }, { "epoch": 3.9148686285695113, "grad_norm": 0.6796875, "learning_rate": 5.587305036925755e-06, "loss": 0.9008, "step": 7292 }, { "epoch": 3.915405523304587, "grad_norm": 0.640625, "learning_rate": 5.581984796432022e-06, "loss": 0.7587, "step": 7293 }, { "epoch": 3.915942418039663, "grad_norm": 0.70703125, "learning_rate": 5.576666771779726e-06, "loss": 0.7886, "step": 7294 }, { "epoch": 3.9164793127747393, "grad_norm": 0.875, "learning_rate": 5.571350963575728e-06, "loss": 1.0384, "step": 7295 }, { "epoch": 3.917016207509815, "grad_norm": 0.6875, "learning_rate": 5.5660373724266354e-06, "loss": 0.7161, "step": 7296 }, { "epoch": 3.917553102244891, "grad_norm": 0.67578125, "learning_rate": 5.5607259989387815e-06, "loss": 0.7293, "step": 7297 }, { "epoch": 3.918089996979967, "grad_norm": 0.65625, "learning_rate": 5.5554168437182725e-06, "loss": 0.7379, "step": 7298 }, { "epoch": 3.918626891715043, "grad_norm": 0.609375, "learning_rate": 5.550109907370954e-06, "loss": 0.7603, "step": 7299 }, { "epoch": 3.919163786450119, "grad_norm": 0.796875, "learning_rate": 5.544805190502406e-06, "loss": 0.9963, "step": 7300 }, { "epoch": 3.919163786450119, "eval_loss": 0.9028392434120178, "eval_runtime": 45.9722, "eval_samples_per_second": 20.926, "eval_steps_per_second": 20.926, "step": 7300 }, { "epoch": 3.919700681185195, "grad_norm": 0.8515625, "learning_rate": 5.53950269371796e-06, "loss": 0.8868, "step": 7301 }, { "epoch": 3.9202375759202712, "grad_norm": 0.75390625, "learning_rate": 5.534202417622711e-06, "loss": 0.8868, "step": 7302 }, { "epoch": 3.9207744706553473, "grad_norm": 0.703125, "learning_rate": 5.528904362821475e-06, "loss": 0.9035, "step": 7303 }, { "epoch": 3.921311365390423, "grad_norm": 0.6171875, "learning_rate": 5.523608529918831e-06, "loss": 0.7714, "step": 7304 }, { "epoch": 3.921848260125499, "grad_norm": 0.59375, "learning_rate": 5.518314919519107e-06, "loss": 0.7216, "step": 7305 }, { "epoch": 3.9223851548605753, "grad_norm": 0.6484375, "learning_rate": 5.5130235322263634e-06, "loss": 0.8625, "step": 7306 }, { "epoch": 3.922922049595651, "grad_norm": 0.61328125, "learning_rate": 5.507734368644416e-06, "loss": 0.7752, "step": 7307 }, { "epoch": 3.923458944330727, "grad_norm": 0.6796875, "learning_rate": 5.502447429376833e-06, "loss": 0.84, "step": 7308 }, { "epoch": 3.923995839065803, "grad_norm": 0.69921875, "learning_rate": 5.497162715026915e-06, "loss": 0.7138, "step": 7309 }, { "epoch": 3.9245327338008793, "grad_norm": 0.640625, "learning_rate": 5.491880226197707e-06, "loss": 1.1213, "step": 7310 }, { "epoch": 3.9250696285359554, "grad_norm": 0.53125, "learning_rate": 5.486599963492017e-06, "loss": 0.6998, "step": 7311 }, { "epoch": 3.925606523271031, "grad_norm": 0.72265625, "learning_rate": 5.481321927512395e-06, "loss": 0.8277, "step": 7312 }, { "epoch": 3.926143418006107, "grad_norm": 0.65625, "learning_rate": 5.476046118861117e-06, "loss": 0.8081, "step": 7313 }, { "epoch": 3.9266803127411833, "grad_norm": 0.66015625, "learning_rate": 5.470772538140228e-06, "loss": 0.9648, "step": 7314 }, { "epoch": 3.927217207476259, "grad_norm": 0.65625, "learning_rate": 5.465501185951519e-06, "loss": 0.8677, "step": 7315 }, { "epoch": 3.927754102211335, "grad_norm": 0.640625, "learning_rate": 5.4602320628965e-06, "loss": 0.756, "step": 7316 }, { "epoch": 3.9282909969464113, "grad_norm": 0.671875, "learning_rate": 5.454965169576462e-06, "loss": 0.8091, "step": 7317 }, { "epoch": 3.928827891681487, "grad_norm": 0.76171875, "learning_rate": 5.4497005065924145e-06, "loss": 0.8709, "step": 7318 }, { "epoch": 3.929364786416563, "grad_norm": 0.70703125, "learning_rate": 5.444438074545119e-06, "loss": 0.8644, "step": 7319 }, { "epoch": 3.929901681151639, "grad_norm": 0.70703125, "learning_rate": 5.4391778740350896e-06, "loss": 0.9834, "step": 7320 }, { "epoch": 3.9304385758867153, "grad_norm": 0.6875, "learning_rate": 5.4339199056625876e-06, "loss": 0.8151, "step": 7321 }, { "epoch": 3.9309754706217914, "grad_norm": 0.76171875, "learning_rate": 5.428664170027603e-06, "loss": 0.8584, "step": 7322 }, { "epoch": 3.931512365356867, "grad_norm": 0.6953125, "learning_rate": 5.4234106677298894e-06, "loss": 0.8024, "step": 7323 }, { "epoch": 3.932049260091943, "grad_norm": 0.80859375, "learning_rate": 5.418159399368941e-06, "loss": 0.8314, "step": 7324 }, { "epoch": 3.9325861548270193, "grad_norm": 1.421875, "learning_rate": 5.412910365543988e-06, "loss": 0.7789, "step": 7325 }, { "epoch": 3.933123049562095, "grad_norm": 0.90234375, "learning_rate": 5.4076635668540075e-06, "loss": 0.967, "step": 7326 }, { "epoch": 3.933659944297171, "grad_norm": 0.7890625, "learning_rate": 5.402419003897738e-06, "loss": 0.8204, "step": 7327 }, { "epoch": 3.9341968390322473, "grad_norm": 0.68359375, "learning_rate": 5.3971766772736354e-06, "loss": 0.7443, "step": 7328 }, { "epoch": 3.9347337337673234, "grad_norm": 0.79296875, "learning_rate": 5.3919365875799235e-06, "loss": 0.8301, "step": 7329 }, { "epoch": 3.9352706285023995, "grad_norm": 0.65625, "learning_rate": 5.3866987354145724e-06, "loss": 0.7601, "step": 7330 }, { "epoch": 3.935807523237475, "grad_norm": 0.73828125, "learning_rate": 5.381463121375269e-06, "loss": 1.0165, "step": 7331 }, { "epoch": 3.9363444179725513, "grad_norm": 0.98828125, "learning_rate": 5.3762297460594714e-06, "loss": 0.8752, "step": 7332 }, { "epoch": 3.9368813127076274, "grad_norm": 0.796875, "learning_rate": 5.370998610064382e-06, "loss": 0.9076, "step": 7333 }, { "epoch": 3.937418207442703, "grad_norm": 0.76171875, "learning_rate": 5.365769713986935e-06, "loss": 0.9709, "step": 7334 }, { "epoch": 3.937955102177779, "grad_norm": 0.66796875, "learning_rate": 5.3605430584238e-06, "loss": 0.7881, "step": 7335 }, { "epoch": 3.9384919969128553, "grad_norm": 0.66796875, "learning_rate": 5.355318643971419e-06, "loss": 0.7656, "step": 7336 }, { "epoch": 3.939028891647931, "grad_norm": 0.63671875, "learning_rate": 5.350096471225966e-06, "loss": 0.7514, "step": 7337 }, { "epoch": 3.939565786383007, "grad_norm": 0.6953125, "learning_rate": 5.344876540783345e-06, "loss": 0.8952, "step": 7338 }, { "epoch": 3.9401026811180833, "grad_norm": 0.6328125, "learning_rate": 5.339658853239221e-06, "loss": 0.7225, "step": 7339 }, { "epoch": 3.9406395758531594, "grad_norm": 0.640625, "learning_rate": 5.334443409189011e-06, "loss": 0.8777, "step": 7340 }, { "epoch": 3.9411764705882355, "grad_norm": 0.5859375, "learning_rate": 5.3292302092278395e-06, "loss": 0.765, "step": 7341 }, { "epoch": 3.941713365323311, "grad_norm": 0.6953125, "learning_rate": 5.324019253950621e-06, "loss": 0.9015, "step": 7342 }, { "epoch": 3.9422502600583873, "grad_norm": 0.671875, "learning_rate": 5.3188105439519825e-06, "loss": 0.8368, "step": 7343 }, { "epoch": 3.9427871547934634, "grad_norm": 0.66796875, "learning_rate": 5.313604079826295e-06, "loss": 0.8367, "step": 7344 }, { "epoch": 3.943324049528539, "grad_norm": 0.6953125, "learning_rate": 5.308399862167693e-06, "loss": 0.9607, "step": 7345 }, { "epoch": 3.943860944263615, "grad_norm": 0.703125, "learning_rate": 5.303197891570044e-06, "loss": 0.811, "step": 7346 }, { "epoch": 3.9443978389986913, "grad_norm": 0.6328125, "learning_rate": 5.29799816862695e-06, "loss": 0.8934, "step": 7347 }, { "epoch": 3.9449347337337675, "grad_norm": 0.5546875, "learning_rate": 5.292800693931771e-06, "loss": 0.7024, "step": 7348 }, { "epoch": 3.9454716284688436, "grad_norm": 0.89453125, "learning_rate": 5.2876054680776085e-06, "loss": 0.9306, "step": 7349 }, { "epoch": 3.9460085232039193, "grad_norm": 0.73046875, "learning_rate": 5.282412491657296e-06, "loss": 0.7619, "step": 7350 }, { "epoch": 3.9460085232039193, "eval_loss": 0.902666449546814, "eval_runtime": 46.036, "eval_samples_per_second": 20.897, "eval_steps_per_second": 20.897, "step": 7350 }, { "epoch": 3.9465454179389954, "grad_norm": 1.1640625, "learning_rate": 5.2772217652634265e-06, "loss": 0.9595, "step": 7351 }, { "epoch": 3.9470823126740715, "grad_norm": 0.66796875, "learning_rate": 5.272033289488315e-06, "loss": 0.8536, "step": 7352 }, { "epoch": 3.947619207409147, "grad_norm": 0.9140625, "learning_rate": 5.266847064924047e-06, "loss": 0.7785, "step": 7353 }, { "epoch": 3.9481561021442233, "grad_norm": 0.640625, "learning_rate": 5.261663092162422e-06, "loss": 0.7009, "step": 7354 }, { "epoch": 3.9486929968792994, "grad_norm": 0.6484375, "learning_rate": 5.2564813717950055e-06, "loss": 0.8897, "step": 7355 }, { "epoch": 3.949229891614375, "grad_norm": 0.54296875, "learning_rate": 5.2513019044131e-06, "loss": 0.6483, "step": 7356 }, { "epoch": 3.949766786349451, "grad_norm": 0.6015625, "learning_rate": 5.24612469060774e-06, "loss": 0.7246, "step": 7357 }, { "epoch": 3.9503036810845273, "grad_norm": 0.7109375, "learning_rate": 5.240949730969713e-06, "loss": 0.9832, "step": 7358 }, { "epoch": 3.9508405758196035, "grad_norm": 0.66796875, "learning_rate": 5.235777026089566e-06, "loss": 0.7768, "step": 7359 }, { "epoch": 3.9513774705546796, "grad_norm": 0.78515625, "learning_rate": 5.23060657655754e-06, "loss": 0.8202, "step": 7360 }, { "epoch": 3.9519143652897553, "grad_norm": 0.671875, "learning_rate": 5.22543838296366e-06, "loss": 0.9459, "step": 7361 }, { "epoch": 3.9524512600248314, "grad_norm": 0.6953125, "learning_rate": 5.2202724458976945e-06, "loss": 0.8082, "step": 7362 }, { "epoch": 3.9529881547599075, "grad_norm": 0.6796875, "learning_rate": 5.215108765949126e-06, "loss": 0.7899, "step": 7363 }, { "epoch": 3.953525049494983, "grad_norm": 0.66015625, "learning_rate": 5.209947343707202e-06, "loss": 1.0224, "step": 7364 }, { "epoch": 3.9540619442300593, "grad_norm": 0.59375, "learning_rate": 5.2047881797609135e-06, "loss": 0.8479, "step": 7365 }, { "epoch": 3.9545988389651354, "grad_norm": 0.6796875, "learning_rate": 5.199631274698974e-06, "loss": 0.7561, "step": 7366 }, { "epoch": 3.9551357337002115, "grad_norm": 0.75, "learning_rate": 5.1944766291098625e-06, "loss": 0.7702, "step": 7367 }, { "epoch": 3.9556726284352877, "grad_norm": 0.75, "learning_rate": 5.189324243581778e-06, "loss": 0.9321, "step": 7368 }, { "epoch": 3.9562095231703633, "grad_norm": 0.66015625, "learning_rate": 5.1841741187026864e-06, "loss": 0.7873, "step": 7369 }, { "epoch": 3.9567464179054395, "grad_norm": 0.7109375, "learning_rate": 5.179026255060268e-06, "loss": 0.8677, "step": 7370 }, { "epoch": 3.9572833126405156, "grad_norm": 0.69921875, "learning_rate": 5.173880653241972e-06, "loss": 1.0095, "step": 7371 }, { "epoch": 3.9578202073755913, "grad_norm": 0.6796875, "learning_rate": 5.168737313834967e-06, "loss": 0.8071, "step": 7372 }, { "epoch": 3.9583571021106674, "grad_norm": 0.7421875, "learning_rate": 5.163596237426174e-06, "loss": 0.7922, "step": 7373 }, { "epoch": 3.9588939968457435, "grad_norm": 0.703125, "learning_rate": 5.158457424602267e-06, "loss": 0.776, "step": 7374 }, { "epoch": 3.959430891580819, "grad_norm": 0.61328125, "learning_rate": 5.153320875949632e-06, "loss": 0.9226, "step": 7375 }, { "epoch": 3.9599677863158953, "grad_norm": 0.67578125, "learning_rate": 5.148186592054433e-06, "loss": 0.9211, "step": 7376 }, { "epoch": 3.9605046810509714, "grad_norm": 0.65234375, "learning_rate": 5.143054573502537e-06, "loss": 0.746, "step": 7377 }, { "epoch": 3.9610415757860475, "grad_norm": 1.1484375, "learning_rate": 5.137924820879589e-06, "loss": 0.8041, "step": 7378 }, { "epoch": 3.9615784705211237, "grad_norm": 0.8203125, "learning_rate": 5.1327973347709474e-06, "loss": 0.8759, "step": 7379 }, { "epoch": 3.9621153652561993, "grad_norm": 0.69140625, "learning_rate": 5.1276721157617255e-06, "loss": 0.6991, "step": 7380 }, { "epoch": 3.9626522599912755, "grad_norm": 0.70703125, "learning_rate": 5.122549164436785e-06, "loss": 0.7255, "step": 7381 }, { "epoch": 3.9631891547263516, "grad_norm": 0.6640625, "learning_rate": 5.117428481380704e-06, "loss": 0.8164, "step": 7382 }, { "epoch": 3.9637260494614273, "grad_norm": 0.77734375, "learning_rate": 5.112310067177828e-06, "loss": 0.762, "step": 7383 }, { "epoch": 3.9642629441965034, "grad_norm": 0.73046875, "learning_rate": 5.107193922412243e-06, "loss": 0.8531, "step": 7384 }, { "epoch": 3.9647998389315795, "grad_norm": 0.64453125, "learning_rate": 5.10208004766774e-06, "loss": 0.8792, "step": 7385 }, { "epoch": 3.9653367336666556, "grad_norm": 0.59375, "learning_rate": 5.096968443527892e-06, "loss": 0.6783, "step": 7386 }, { "epoch": 3.9658736284017317, "grad_norm": 0.7421875, "learning_rate": 5.091859110576e-06, "loss": 0.7629, "step": 7387 }, { "epoch": 3.9664105231368074, "grad_norm": 0.6640625, "learning_rate": 5.086752049395094e-06, "loss": 0.8086, "step": 7388 }, { "epoch": 3.9669474178718835, "grad_norm": 0.6328125, "learning_rate": 5.081647260567962e-06, "loss": 0.8446, "step": 7389 }, { "epoch": 3.9674843126069597, "grad_norm": 0.75, "learning_rate": 5.076544744677128e-06, "loss": 0.877, "step": 7390 }, { "epoch": 3.9680212073420353, "grad_norm": 0.80078125, "learning_rate": 5.071444502304842e-06, "loss": 0.8079, "step": 7391 }, { "epoch": 3.9685581020771115, "grad_norm": 0.8125, "learning_rate": 5.0663465340331145e-06, "loss": 0.6302, "step": 7392 }, { "epoch": 3.9690949968121876, "grad_norm": 0.6875, "learning_rate": 5.06125084044369e-06, "loss": 0.9266, "step": 7393 }, { "epoch": 3.9696318915472633, "grad_norm": 0.921875, "learning_rate": 5.056157422118052e-06, "loss": 1.2568, "step": 7394 }, { "epoch": 3.9701687862823394, "grad_norm": 0.6796875, "learning_rate": 5.051066279637412e-06, "loss": 0.8446, "step": 7395 }, { "epoch": 3.9707056810174155, "grad_norm": 0.66796875, "learning_rate": 5.045977413582745e-06, "loss": 0.8624, "step": 7396 }, { "epoch": 3.9712425757524916, "grad_norm": 0.68359375, "learning_rate": 5.040890824534756e-06, "loss": 0.8896, "step": 7397 }, { "epoch": 3.9717794704875677, "grad_norm": 0.71484375, "learning_rate": 5.035806513073879e-06, "loss": 0.8596, "step": 7398 }, { "epoch": 3.9723163652226434, "grad_norm": 0.6953125, "learning_rate": 5.030724479780305e-06, "loss": 0.8962, "step": 7399 }, { "epoch": 3.9728532599577195, "grad_norm": 0.671875, "learning_rate": 5.025644725233966e-06, "loss": 0.8844, "step": 7400 }, { "epoch": 3.9728532599577195, "eval_loss": 0.9027516841888428, "eval_runtime": 46.1077, "eval_samples_per_second": 20.864, "eval_steps_per_second": 20.864, "step": 7400 }, { "epoch": 3.9733901546927957, "grad_norm": 0.796875, "learning_rate": 5.020567250014518e-06, "loss": 1.0648, "step": 7401 }, { "epoch": 3.9739270494278713, "grad_norm": 0.7578125, "learning_rate": 5.015492054701359e-06, "loss": 0.9729, "step": 7402 }, { "epoch": 3.9744639441629475, "grad_norm": 0.80859375, "learning_rate": 5.010419139873648e-06, "loss": 0.7904, "step": 7403 }, { "epoch": 3.9750008388980236, "grad_norm": 0.63671875, "learning_rate": 5.005348506110252e-06, "loss": 0.664, "step": 7404 }, { "epoch": 3.9755377336330997, "grad_norm": 0.7109375, "learning_rate": 5.000280153989806e-06, "loss": 0.7639, "step": 7405 }, { "epoch": 3.976074628368176, "grad_norm": 0.6328125, "learning_rate": 4.995214084090674e-06, "loss": 0.7974, "step": 7406 }, { "epoch": 3.9766115231032515, "grad_norm": 0.8203125, "learning_rate": 4.99015029699095e-06, "loss": 1.073, "step": 7407 }, { "epoch": 3.9771484178383276, "grad_norm": 0.60546875, "learning_rate": 4.98508879326848e-06, "loss": 0.73, "step": 7408 }, { "epoch": 3.9776853125734037, "grad_norm": 0.578125, "learning_rate": 4.980029573500855e-06, "loss": 0.6907, "step": 7409 }, { "epoch": 3.9782222073084794, "grad_norm": 0.67578125, "learning_rate": 4.9749726382653905e-06, "loss": 0.8197, "step": 7410 }, { "epoch": 3.9787591020435555, "grad_norm": 0.6953125, "learning_rate": 4.969917988139134e-06, "loss": 0.7788, "step": 7411 }, { "epoch": 3.9792959967786317, "grad_norm": 0.703125, "learning_rate": 4.964865623698903e-06, "loss": 0.8202, "step": 7412 }, { "epoch": 3.9798328915137073, "grad_norm": 0.6796875, "learning_rate": 4.959815545521224e-06, "loss": 0.9292, "step": 7413 }, { "epoch": 3.9803697862487835, "grad_norm": 0.6484375, "learning_rate": 4.9547677541823765e-06, "loss": 0.6873, "step": 7414 }, { "epoch": 3.9809066809838596, "grad_norm": 0.65234375, "learning_rate": 4.94972225025839e-06, "loss": 0.9967, "step": 7415 }, { "epoch": 3.9814435757189357, "grad_norm": 0.56640625, "learning_rate": 4.944679034325003e-06, "loss": 0.7426, "step": 7416 }, { "epoch": 3.981980470454012, "grad_norm": 0.59375, "learning_rate": 4.9396381069577165e-06, "loss": 0.9072, "step": 7417 }, { "epoch": 3.9825173651890875, "grad_norm": 0.79296875, "learning_rate": 4.934599468731777e-06, "loss": 0.9, "step": 7418 }, { "epoch": 3.9830542599241636, "grad_norm": 0.65234375, "learning_rate": 4.929563120222141e-06, "loss": 0.907, "step": 7419 }, { "epoch": 3.9835911546592397, "grad_norm": 0.58984375, "learning_rate": 4.924529062003522e-06, "loss": 1.0865, "step": 7420 }, { "epoch": 3.9841280493943154, "grad_norm": 0.6484375, "learning_rate": 4.919497294650369e-06, "loss": 0.8952, "step": 7421 }, { "epoch": 3.9846649441293915, "grad_norm": 0.65234375, "learning_rate": 4.9144678187368815e-06, "loss": 0.7385, "step": 7422 }, { "epoch": 3.9852018388644677, "grad_norm": 0.78125, "learning_rate": 4.909440634836973e-06, "loss": 0.9948, "step": 7423 }, { "epoch": 3.985738733599544, "grad_norm": 0.61328125, "learning_rate": 4.904415743524313e-06, "loss": 0.8738, "step": 7424 }, { "epoch": 3.98627562833462, "grad_norm": 0.72265625, "learning_rate": 4.899393145372313e-06, "loss": 0.9451, "step": 7425 }, { "epoch": 3.9868125230696956, "grad_norm": 0.84375, "learning_rate": 4.894372840954112e-06, "loss": 0.9289, "step": 7426 }, { "epoch": 3.9873494178047717, "grad_norm": 0.703125, "learning_rate": 4.889354830842577e-06, "loss": 0.9262, "step": 7427 }, { "epoch": 3.987886312539848, "grad_norm": 0.59765625, "learning_rate": 4.884339115610345e-06, "loss": 0.8329, "step": 7428 }, { "epoch": 3.9884232072749235, "grad_norm": 0.60546875, "learning_rate": 4.879325695829757e-06, "loss": 0.7719, "step": 7429 }, { "epoch": 3.9889601020099996, "grad_norm": 0.64453125, "learning_rate": 4.8743145720729134e-06, "loss": 0.677, "step": 7430 }, { "epoch": 3.9894969967450757, "grad_norm": 0.7421875, "learning_rate": 4.869305744911656e-06, "loss": 0.9912, "step": 7431 }, { "epoch": 3.9900338914801514, "grad_norm": 0.66796875, "learning_rate": 4.864299214917542e-06, "loss": 0.9451, "step": 7432 }, { "epoch": 3.9905707862152275, "grad_norm": 0.77734375, "learning_rate": 4.859294982661886e-06, "loss": 0.9915, "step": 7433 }, { "epoch": 3.9911076809503037, "grad_norm": 0.6875, "learning_rate": 4.854293048715738e-06, "loss": 0.7315, "step": 7434 }, { "epoch": 3.99164457568538, "grad_norm": 0.80859375, "learning_rate": 4.84929341364988e-06, "loss": 0.9712, "step": 7435 }, { "epoch": 3.992181470420456, "grad_norm": 0.83984375, "learning_rate": 4.844296078034824e-06, "loss": 0.9264, "step": 7436 }, { "epoch": 3.9927183651555316, "grad_norm": 0.80078125, "learning_rate": 4.8393010424408355e-06, "loss": 0.9062, "step": 7437 }, { "epoch": 3.9932552598906077, "grad_norm": 0.5859375, "learning_rate": 4.8343083074379205e-06, "loss": 0.5828, "step": 7438 }, { "epoch": 3.993792154625684, "grad_norm": 0.62890625, "learning_rate": 4.829317873595798e-06, "loss": 0.8912, "step": 7439 }, { "epoch": 3.9943290493607595, "grad_norm": 0.60546875, "learning_rate": 4.824329741483949e-06, "loss": 0.7507, "step": 7440 }, { "epoch": 3.9948659440958356, "grad_norm": 0.8359375, "learning_rate": 4.8193439116715865e-06, "loss": 0.9012, "step": 7441 }, { "epoch": 3.9954028388309117, "grad_norm": 0.6640625, "learning_rate": 4.814360384727646e-06, "loss": 0.8761, "step": 7442 }, { "epoch": 3.995939733565988, "grad_norm": 0.68359375, "learning_rate": 4.80937916122082e-06, "loss": 0.6817, "step": 7443 }, { "epoch": 3.996476628301064, "grad_norm": 0.69140625, "learning_rate": 4.804400241719525e-06, "loss": 0.752, "step": 7444 }, { "epoch": 3.9970135230361397, "grad_norm": 0.7265625, "learning_rate": 4.799423626791916e-06, "loss": 0.8484, "step": 7445 }, { "epoch": 3.997550417771216, "grad_norm": 0.77734375, "learning_rate": 4.794449317005886e-06, "loss": 0.9062, "step": 7446 }, { "epoch": 3.998087312506292, "grad_norm": 0.6640625, "learning_rate": 4.789477312929083e-06, "loss": 0.8681, "step": 7447 }, { "epoch": 3.9986242072413676, "grad_norm": 0.75390625, "learning_rate": 4.784507615128856e-06, "loss": 0.8729, "step": 7448 }, { "epoch": 3.9991611019764437, "grad_norm": 0.62890625, "learning_rate": 4.779540224172316e-06, "loss": 0.8511, "step": 7449 }, { "epoch": 3.99969799671152, "grad_norm": 0.609375, "learning_rate": 4.7745751406263165e-06, "loss": 0.8264, "step": 7450 }, { "epoch": 3.99969799671152, "eval_loss": 0.9027752876281738, "eval_runtime": 45.8409, "eval_samples_per_second": 20.986, "eval_steps_per_second": 20.986, "step": 7450 }, { "epoch": 4.0, "grad_norm": 0.62109375, "learning_rate": 4.769612365057427e-06, "loss": 0.4663, "step": 7451 }, { "epoch": 4.000536894735076, "grad_norm": 0.6796875, "learning_rate": 4.764651898031958e-06, "loss": 0.8877, "step": 7452 }, { "epoch": 4.001073789470152, "grad_norm": 0.7578125, "learning_rate": 4.7596937401159725e-06, "loss": 0.9608, "step": 7453 }, { "epoch": 4.001610684205228, "grad_norm": 0.78515625, "learning_rate": 4.754737891875249e-06, "loss": 0.8652, "step": 7454 }, { "epoch": 4.002147578940304, "grad_norm": 0.7578125, "learning_rate": 4.749784353875316e-06, "loss": 0.9188, "step": 7455 }, { "epoch": 4.00268447367538, "grad_norm": 0.6484375, "learning_rate": 4.744833126681445e-06, "loss": 0.8245, "step": 7456 }, { "epoch": 4.003221368410456, "grad_norm": 0.734375, "learning_rate": 4.739884210858616e-06, "loss": 0.7953, "step": 7457 }, { "epoch": 4.003758263145532, "grad_norm": 0.76953125, "learning_rate": 4.734937606971576e-06, "loss": 0.9617, "step": 7458 }, { "epoch": 4.004295157880608, "grad_norm": 0.70703125, "learning_rate": 4.729993315584797e-06, "loss": 0.903, "step": 7459 }, { "epoch": 4.004832052615684, "grad_norm": 0.7265625, "learning_rate": 4.725051337262476e-06, "loss": 1.019, "step": 7460 }, { "epoch": 4.00536894735076, "grad_norm": 0.6171875, "learning_rate": 4.7201116725685555e-06, "loss": 0.9115, "step": 7461 }, { "epoch": 4.005905842085836, "grad_norm": 0.625, "learning_rate": 4.715174322066715e-06, "loss": 0.6203, "step": 7462 }, { "epoch": 4.006442736820912, "grad_norm": 0.73046875, "learning_rate": 4.710239286320378e-06, "loss": 0.9211, "step": 7463 }, { "epoch": 4.006979631555988, "grad_norm": 0.65625, "learning_rate": 4.7053065658926834e-06, "loss": 0.7785, "step": 7464 }, { "epoch": 4.007516526291064, "grad_norm": 0.62890625, "learning_rate": 4.700376161346517e-06, "loss": 0.9612, "step": 7465 }, { "epoch": 4.0080534210261405, "grad_norm": 0.66796875, "learning_rate": 4.695448073244516e-06, "loss": 0.9449, "step": 7466 }, { "epoch": 4.008590315761216, "grad_norm": 0.5859375, "learning_rate": 4.690522302149017e-06, "loss": 0.8617, "step": 7467 }, { "epoch": 4.009127210496292, "grad_norm": 0.6875, "learning_rate": 4.685598848622128e-06, "loss": 0.8689, "step": 7468 }, { "epoch": 4.009664105231368, "grad_norm": 0.5625, "learning_rate": 4.68067771322567e-06, "loss": 0.5935, "step": 7469 }, { "epoch": 4.010200999966444, "grad_norm": 0.80078125, "learning_rate": 4.675758896521204e-06, "loss": 0.9286, "step": 7470 }, { "epoch": 4.01073789470152, "grad_norm": 0.69140625, "learning_rate": 4.670842399070036e-06, "loss": 0.8348, "step": 7471 }, { "epoch": 4.011274789436596, "grad_norm": 0.69140625, "learning_rate": 4.665928221433203e-06, "loss": 0.7317, "step": 7472 }, { "epoch": 4.011811684171672, "grad_norm": 0.6796875, "learning_rate": 4.661016364171461e-06, "loss": 0.8828, "step": 7473 }, { "epoch": 4.012348578906748, "grad_norm": 0.64453125, "learning_rate": 4.656106827845325e-06, "loss": 0.8191, "step": 7474 }, { "epoch": 4.012885473641824, "grad_norm": 0.87890625, "learning_rate": 4.651199613015042e-06, "loss": 0.8596, "step": 7475 }, { "epoch": 4.0134223683769, "grad_norm": 0.640625, "learning_rate": 4.646294720240574e-06, "loss": 0.7695, "step": 7476 }, { "epoch": 4.0139592631119765, "grad_norm": 0.84375, "learning_rate": 4.6413921500816445e-06, "loss": 0.8243, "step": 7477 }, { "epoch": 4.014496157847052, "grad_norm": 0.7734375, "learning_rate": 4.636491903097681e-06, "loss": 0.9723, "step": 7478 }, { "epoch": 4.015033052582128, "grad_norm": 0.66796875, "learning_rate": 4.631593979847884e-06, "loss": 0.7722, "step": 7479 }, { "epoch": 4.015569947317204, "grad_norm": 0.73828125, "learning_rate": 4.626698380891154e-06, "loss": 0.8505, "step": 7480 }, { "epoch": 4.01610684205228, "grad_norm": 0.6484375, "learning_rate": 4.621805106786142e-06, "loss": 0.8458, "step": 7481 }, { "epoch": 4.016643736787356, "grad_norm": 0.703125, "learning_rate": 4.6169141580912455e-06, "loss": 0.8673, "step": 7482 }, { "epoch": 4.017180631522432, "grad_norm": 0.91796875, "learning_rate": 4.612025535364567e-06, "loss": 0.9364, "step": 7483 }, { "epoch": 4.017717526257508, "grad_norm": 0.64453125, "learning_rate": 4.607139239163969e-06, "loss": 0.8494, "step": 7484 }, { "epoch": 4.018254420992585, "grad_norm": 0.7421875, "learning_rate": 4.602255270047048e-06, "loss": 0.694, "step": 7485 }, { "epoch": 4.01879131572766, "grad_norm": 0.87890625, "learning_rate": 4.597373628571106e-06, "loss": 0.8932, "step": 7486 }, { "epoch": 4.019328210462736, "grad_norm": 0.65625, "learning_rate": 4.592494315293211e-06, "loss": 0.7776, "step": 7487 }, { "epoch": 4.0198651051978125, "grad_norm": 0.80078125, "learning_rate": 4.587617330770164e-06, "loss": 0.7525, "step": 7488 }, { "epoch": 4.020401999932888, "grad_norm": 0.7578125, "learning_rate": 4.582742675558471e-06, "loss": 0.9899, "step": 7489 }, { "epoch": 4.020938894667964, "grad_norm": 0.57421875, "learning_rate": 4.577870350214405e-06, "loss": 0.806, "step": 7490 }, { "epoch": 4.02147578940304, "grad_norm": 0.66796875, "learning_rate": 4.5730003552939634e-06, "loss": 0.7843, "step": 7491 }, { "epoch": 4.022012684138116, "grad_norm": 0.765625, "learning_rate": 4.568132691352861e-06, "loss": 0.8928, "step": 7492 }, { "epoch": 4.022549578873192, "grad_norm": 0.7734375, "learning_rate": 4.563267358946574e-06, "loss": 0.6967, "step": 7493 }, { "epoch": 4.023086473608268, "grad_norm": 0.87890625, "learning_rate": 4.558404358630286e-06, "loss": 0.734, "step": 7494 }, { "epoch": 4.023623368343344, "grad_norm": 0.671875, "learning_rate": 4.553543690958939e-06, "loss": 0.8173, "step": 7495 }, { "epoch": 4.024160263078421, "grad_norm": 0.6640625, "learning_rate": 4.548685356487184e-06, "loss": 0.794, "step": 7496 }, { "epoch": 4.024697157813496, "grad_norm": 0.703125, "learning_rate": 4.54382935576943e-06, "loss": 0.954, "step": 7497 }, { "epoch": 4.025234052548572, "grad_norm": 0.79296875, "learning_rate": 4.538975689359795e-06, "loss": 0.8888, "step": 7498 }, { "epoch": 4.0257709472836485, "grad_norm": 0.734375, "learning_rate": 4.534124357812156e-06, "loss": 0.9902, "step": 7499 }, { "epoch": 4.026307842018724, "grad_norm": 0.6953125, "learning_rate": 4.5292753616801106e-06, "loss": 0.9657, "step": 7500 }, { "epoch": 4.026307842018724, "eval_loss": 0.9026344418525696, "eval_runtime": 46.3739, "eval_samples_per_second": 20.744, "eval_steps_per_second": 20.744, "step": 7500 }, { "epoch": 4.0268447367538, "grad_norm": 0.65234375, "learning_rate": 4.524428701516981e-06, "loss": 0.7382, "step": 7501 }, { "epoch": 4.027381631488876, "grad_norm": 0.65625, "learning_rate": 4.519584377875846e-06, "loss": 1.0347, "step": 7502 }, { "epoch": 4.027918526223952, "grad_norm": 0.72265625, "learning_rate": 4.51474239130949e-06, "loss": 0.8116, "step": 7503 }, { "epoch": 4.028455420959029, "grad_norm": 0.69140625, "learning_rate": 4.509902742370459e-06, "loss": 0.89, "step": 7504 }, { "epoch": 4.028992315694104, "grad_norm": 0.890625, "learning_rate": 4.505065431611002e-06, "loss": 0.915, "step": 7505 }, { "epoch": 4.02952921042918, "grad_norm": 0.875, "learning_rate": 4.50023045958313e-06, "loss": 0.7722, "step": 7506 }, { "epoch": 4.030066105164257, "grad_norm": 0.73046875, "learning_rate": 4.495397826838577e-06, "loss": 0.6443, "step": 7507 }, { "epoch": 4.030602999899332, "grad_norm": 0.87890625, "learning_rate": 4.490567533928794e-06, "loss": 0.7557, "step": 7508 }, { "epoch": 4.031139894634408, "grad_norm": 0.7109375, "learning_rate": 4.485739581404988e-06, "loss": 0.9773, "step": 7509 }, { "epoch": 4.0316767893694845, "grad_norm": 0.671875, "learning_rate": 4.480913969818098e-06, "loss": 0.7444, "step": 7510 }, { "epoch": 4.03221368410456, "grad_norm": 0.765625, "learning_rate": 4.4760906997187655e-06, "loss": 0.8161, "step": 7511 }, { "epoch": 4.032750578839636, "grad_norm": 0.75390625, "learning_rate": 4.4712697716574e-06, "loss": 0.8348, "step": 7512 }, { "epoch": 4.033287473574712, "grad_norm": 0.65234375, "learning_rate": 4.466451186184134e-06, "loss": 0.7544, "step": 7513 }, { "epoch": 4.033824368309788, "grad_norm": 0.55859375, "learning_rate": 4.461634943848819e-06, "loss": 0.6289, "step": 7514 }, { "epoch": 4.034361263044865, "grad_norm": 0.703125, "learning_rate": 4.456821045201054e-06, "loss": 1.0013, "step": 7515 }, { "epoch": 4.03489815777994, "grad_norm": 0.7421875, "learning_rate": 4.452009490790171e-06, "loss": 0.9217, "step": 7516 }, { "epoch": 4.035435052515016, "grad_norm": 0.73046875, "learning_rate": 4.447200281165217e-06, "loss": 0.7167, "step": 7517 }, { "epoch": 4.035971947250093, "grad_norm": 0.96875, "learning_rate": 4.442393416875002e-06, "loss": 0.9273, "step": 7518 }, { "epoch": 4.036508841985168, "grad_norm": 0.68359375, "learning_rate": 4.437588898468029e-06, "loss": 0.8502, "step": 7519 }, { "epoch": 4.037045736720244, "grad_norm": 0.76953125, "learning_rate": 4.432786726492571e-06, "loss": 1.0083, "step": 7520 }, { "epoch": 4.0375826314553205, "grad_norm": 0.64453125, "learning_rate": 4.427986901496603e-06, "loss": 0.9081, "step": 7521 }, { "epoch": 4.038119526190396, "grad_norm": 0.9453125, "learning_rate": 4.423189424027857e-06, "loss": 0.8164, "step": 7522 }, { "epoch": 4.038656420925473, "grad_norm": 0.82421875, "learning_rate": 4.4183942946337845e-06, "loss": 0.9482, "step": 7523 }, { "epoch": 4.039193315660548, "grad_norm": 0.7265625, "learning_rate": 4.413601513861562e-06, "loss": 0.8399, "step": 7524 }, { "epoch": 4.039730210395624, "grad_norm": 0.8046875, "learning_rate": 4.408811082258116e-06, "loss": 0.7816, "step": 7525 }, { "epoch": 4.040267105130701, "grad_norm": 0.67578125, "learning_rate": 4.404023000370094e-06, "loss": 0.7986, "step": 7526 }, { "epoch": 4.040803999865776, "grad_norm": 0.65625, "learning_rate": 4.399237268743877e-06, "loss": 0.8299, "step": 7527 }, { "epoch": 4.041340894600852, "grad_norm": 0.703125, "learning_rate": 4.3944538879255674e-06, "loss": 0.8953, "step": 7528 }, { "epoch": 4.041877789335929, "grad_norm": 0.671875, "learning_rate": 4.389672858461025e-06, "loss": 0.6611, "step": 7529 }, { "epoch": 4.042414684071004, "grad_norm": 0.78515625, "learning_rate": 4.38489418089581e-06, "loss": 0.8634, "step": 7530 }, { "epoch": 4.04295157880608, "grad_norm": 0.7578125, "learning_rate": 4.38011785577524e-06, "loss": 0.7427, "step": 7531 }, { "epoch": 4.0434884735411565, "grad_norm": 0.671875, "learning_rate": 4.375343883644359e-06, "loss": 0.7341, "step": 7532 }, { "epoch": 4.044025368276232, "grad_norm": 0.703125, "learning_rate": 4.370572265047923e-06, "loss": 0.9833, "step": 7533 }, { "epoch": 4.044562263011309, "grad_norm": 1.03125, "learning_rate": 4.365803000530441e-06, "loss": 1.0359, "step": 7534 }, { "epoch": 4.045099157746384, "grad_norm": 0.734375, "learning_rate": 4.361036090636156e-06, "loss": 0.892, "step": 7535 }, { "epoch": 4.04563605248146, "grad_norm": 0.70703125, "learning_rate": 4.356271535909024e-06, "loss": 0.8823, "step": 7536 }, { "epoch": 4.046172947216537, "grad_norm": 0.58984375, "learning_rate": 4.351509336892737e-06, "loss": 0.7737, "step": 7537 }, { "epoch": 4.046709841951612, "grad_norm": 0.625, "learning_rate": 4.346749494130731e-06, "loss": 0.9118, "step": 7538 }, { "epoch": 4.047246736686688, "grad_norm": 0.92578125, "learning_rate": 4.341992008166151e-06, "loss": 0.7746, "step": 7539 }, { "epoch": 4.047783631421765, "grad_norm": 0.63671875, "learning_rate": 4.3372368795419006e-06, "loss": 0.845, "step": 7540 }, { "epoch": 4.04832052615684, "grad_norm": 0.671875, "learning_rate": 4.332484108800597e-06, "loss": 0.9345, "step": 7541 }, { "epoch": 4.048857420891917, "grad_norm": 0.765625, "learning_rate": 4.327733696484587e-06, "loss": 0.9044, "step": 7542 }, { "epoch": 4.0493943156269925, "grad_norm": 0.6484375, "learning_rate": 4.322985643135952e-06, "loss": 0.8434, "step": 7543 }, { "epoch": 4.049931210362068, "grad_norm": 0.75, "learning_rate": 4.318239949296515e-06, "loss": 0.7593, "step": 7544 }, { "epoch": 4.050468105097145, "grad_norm": 0.6484375, "learning_rate": 4.3134966155078144e-06, "loss": 0.7988, "step": 7545 }, { "epoch": 4.05100499983222, "grad_norm": 0.85546875, "learning_rate": 4.3087556423111186e-06, "loss": 1.0619, "step": 7546 }, { "epoch": 4.051541894567296, "grad_norm": 0.703125, "learning_rate": 4.304017030247434e-06, "loss": 1.0173, "step": 7547 }, { "epoch": 4.052078789302373, "grad_norm": 0.71875, "learning_rate": 4.299280779857506e-06, "loss": 0.855, "step": 7548 }, { "epoch": 4.052615684037448, "grad_norm": 0.7734375, "learning_rate": 4.294546891681791e-06, "loss": 0.8628, "step": 7549 }, { "epoch": 4.053152578772524, "grad_norm": 0.68359375, "learning_rate": 4.289815366260488e-06, "loss": 0.7688, "step": 7550 }, { "epoch": 4.053152578772524, "eval_loss": 0.9027820825576782, "eval_runtime": 46.7155, "eval_samples_per_second": 20.593, "eval_steps_per_second": 20.593, "step": 7550 }, { "epoch": 4.053689473507601, "grad_norm": 0.71484375, "learning_rate": 4.2850862041335316e-06, "loss": 0.9689, "step": 7551 }, { "epoch": 4.054226368242676, "grad_norm": 0.6875, "learning_rate": 4.28035940584057e-06, "loss": 0.8382, "step": 7552 }, { "epoch": 4.054763262977753, "grad_norm": 0.5703125, "learning_rate": 4.275634971920988e-06, "loss": 0.8201, "step": 7553 }, { "epoch": 4.0553001577128285, "grad_norm": 0.578125, "learning_rate": 4.270912902913918e-06, "loss": 0.9135, "step": 7554 }, { "epoch": 4.055837052447904, "grad_norm": 0.74609375, "learning_rate": 4.266193199358187e-06, "loss": 0.785, "step": 7555 }, { "epoch": 4.056373947182981, "grad_norm": 0.64453125, "learning_rate": 4.2614758617923885e-06, "loss": 0.7262, "step": 7556 }, { "epoch": 4.056910841918056, "grad_norm": 0.7421875, "learning_rate": 4.256760890754832e-06, "loss": 0.8601, "step": 7557 }, { "epoch": 4.057447736653132, "grad_norm": 0.64453125, "learning_rate": 4.252048286783541e-06, "loss": 0.7742, "step": 7558 }, { "epoch": 4.057984631388209, "grad_norm": 0.6875, "learning_rate": 4.247338050416294e-06, "loss": 0.7661, "step": 7559 }, { "epoch": 4.058521526123284, "grad_norm": 0.65234375, "learning_rate": 4.242630182190594e-06, "loss": 0.7695, "step": 7560 }, { "epoch": 4.059058420858361, "grad_norm": 0.8046875, "learning_rate": 4.237924682643657e-06, "loss": 0.8121, "step": 7561 }, { "epoch": 4.059595315593437, "grad_norm": 0.703125, "learning_rate": 4.2332215523124415e-06, "loss": 0.9495, "step": 7562 }, { "epoch": 4.060132210328512, "grad_norm": 0.70703125, "learning_rate": 4.228520791733636e-06, "loss": 0.8866, "step": 7563 }, { "epoch": 4.060669105063589, "grad_norm": 0.66015625, "learning_rate": 4.2238224014436635e-06, "loss": 0.7356, "step": 7564 }, { "epoch": 4.0612059997986645, "grad_norm": 0.68359375, "learning_rate": 4.219126381978658e-06, "loss": 0.8362, "step": 7565 }, { "epoch": 4.06174289453374, "grad_norm": 0.6953125, "learning_rate": 4.214432733874501e-06, "loss": 0.8295, "step": 7566 }, { "epoch": 4.062279789268817, "grad_norm": 0.546875, "learning_rate": 4.209741457666802e-06, "loss": 0.6949, "step": 7567 }, { "epoch": 4.062816684003892, "grad_norm": 0.703125, "learning_rate": 4.205052553890884e-06, "loss": 0.8235, "step": 7568 }, { "epoch": 4.063353578738968, "grad_norm": 0.59375, "learning_rate": 4.200366023081826e-06, "loss": 0.6671, "step": 7569 }, { "epoch": 4.063890473474045, "grad_norm": 0.66015625, "learning_rate": 4.195681865774406e-06, "loss": 0.7975, "step": 7570 }, { "epoch": 4.06442736820912, "grad_norm": 0.6953125, "learning_rate": 4.191000082503146e-06, "loss": 0.8361, "step": 7571 }, { "epoch": 4.064964262944197, "grad_norm": 0.68359375, "learning_rate": 4.186320673802302e-06, "loss": 0.7287, "step": 7572 }, { "epoch": 4.065501157679273, "grad_norm": 0.80859375, "learning_rate": 4.181643640205859e-06, "loss": 0.8191, "step": 7573 }, { "epoch": 4.066038052414348, "grad_norm": 0.60546875, "learning_rate": 4.176968982247514e-06, "loss": 0.8589, "step": 7574 }, { "epoch": 4.066574947149425, "grad_norm": 0.6953125, "learning_rate": 4.172296700460709e-06, "loss": 0.8424, "step": 7575 }, { "epoch": 4.0671118418845005, "grad_norm": 0.75, "learning_rate": 4.167626795378621e-06, "loss": 0.9578, "step": 7576 }, { "epoch": 4.067648736619576, "grad_norm": 0.6875, "learning_rate": 4.162959267534133e-06, "loss": 0.7207, "step": 7577 }, { "epoch": 4.068185631354653, "grad_norm": 0.59765625, "learning_rate": 4.158294117459868e-06, "loss": 0.7443, "step": 7578 }, { "epoch": 4.068722526089728, "grad_norm": 0.828125, "learning_rate": 4.153631345688183e-06, "loss": 0.8085, "step": 7579 }, { "epoch": 4.069259420824805, "grad_norm": 0.75390625, "learning_rate": 4.148970952751163e-06, "loss": 1.0337, "step": 7580 }, { "epoch": 4.069796315559881, "grad_norm": 0.671875, "learning_rate": 4.144312939180611e-06, "loss": 0.7783, "step": 7581 }, { "epoch": 4.070333210294956, "grad_norm": 0.7109375, "learning_rate": 4.139657305508074e-06, "loss": 0.879, "step": 7582 }, { "epoch": 4.070870105030033, "grad_norm": 0.7421875, "learning_rate": 4.135004052264807e-06, "loss": 0.7828, "step": 7583 }, { "epoch": 4.071406999765109, "grad_norm": 0.7109375, "learning_rate": 4.130353179981811e-06, "loss": 0.9673, "step": 7584 }, { "epoch": 4.071943894500184, "grad_norm": 0.7734375, "learning_rate": 4.125704689189819e-06, "loss": 0.8643, "step": 7585 }, { "epoch": 4.072480789235261, "grad_norm": 0.6171875, "learning_rate": 4.121058580419271e-06, "loss": 0.8345, "step": 7586 }, { "epoch": 4.0730176839703365, "grad_norm": 0.609375, "learning_rate": 4.116414854200343e-06, "loss": 0.7553, "step": 7587 }, { "epoch": 4.073554578705412, "grad_norm": 0.78125, "learning_rate": 4.111773511062949e-06, "loss": 0.909, "step": 7588 }, { "epoch": 4.074091473440489, "grad_norm": 0.73828125, "learning_rate": 4.1071345515367314e-06, "loss": 0.8026, "step": 7589 }, { "epoch": 4.074628368175564, "grad_norm": 0.6875, "learning_rate": 4.102497976151043e-06, "loss": 0.8353, "step": 7590 }, { "epoch": 4.075165262910641, "grad_norm": 0.65234375, "learning_rate": 4.09786378543498e-06, "loss": 0.9353, "step": 7591 }, { "epoch": 4.075702157645717, "grad_norm": 0.625, "learning_rate": 4.09323197991737e-06, "loss": 0.8854, "step": 7592 }, { "epoch": 4.076239052380792, "grad_norm": 0.84765625, "learning_rate": 4.0886025601267476e-06, "loss": 0.7122, "step": 7593 }, { "epoch": 4.076775947115869, "grad_norm": 0.73046875, "learning_rate": 4.083975526591402e-06, "loss": 0.8168, "step": 7594 }, { "epoch": 4.077312841850945, "grad_norm": 0.72265625, "learning_rate": 4.079350879839328e-06, "loss": 0.9631, "step": 7595 }, { "epoch": 4.07784973658602, "grad_norm": 0.59765625, "learning_rate": 4.07472862039825e-06, "loss": 0.8483, "step": 7596 }, { "epoch": 4.078386631321097, "grad_norm": 0.66796875, "learning_rate": 4.070108748795634e-06, "loss": 0.728, "step": 7597 }, { "epoch": 4.0789235260561725, "grad_norm": 0.6171875, "learning_rate": 4.065491265558671e-06, "loss": 0.666, "step": 7598 }, { "epoch": 4.079460420791249, "grad_norm": 0.640625, "learning_rate": 4.060876171214264e-06, "loss": 0.9069, "step": 7599 }, { "epoch": 4.079997315526325, "grad_norm": 0.7578125, "learning_rate": 4.056263466289059e-06, "loss": 0.9613, "step": 7600 }, { "epoch": 4.079997315526325, "eval_loss": 0.9027480483055115, "eval_runtime": 46.1054, "eval_samples_per_second": 20.865, "eval_steps_per_second": 20.865, "step": 7600 }, { "epoch": 4.0805342102614, "grad_norm": 0.6640625, "learning_rate": 4.0516531513094284e-06, "loss": 0.8165, "step": 7601 }, { "epoch": 4.081071104996477, "grad_norm": 0.6953125, "learning_rate": 4.047045226801463e-06, "loss": 0.9922, "step": 7602 }, { "epoch": 4.081607999731553, "grad_norm": 0.76171875, "learning_rate": 4.042439693290978e-06, "loss": 0.8781, "step": 7603 }, { "epoch": 4.082144894466628, "grad_norm": 0.796875, "learning_rate": 4.037836551303531e-06, "loss": 0.8919, "step": 7604 }, { "epoch": 4.082681789201705, "grad_norm": 0.76953125, "learning_rate": 4.0332358013644016e-06, "loss": 0.8596, "step": 7605 }, { "epoch": 4.083218683936781, "grad_norm": 0.55859375, "learning_rate": 4.028637443998587e-06, "loss": 0.731, "step": 7606 }, { "epoch": 4.083755578671856, "grad_norm": 0.75390625, "learning_rate": 4.024041479730819e-06, "loss": 0.7704, "step": 7607 }, { "epoch": 4.084292473406933, "grad_norm": 0.765625, "learning_rate": 4.019447909085564e-06, "loss": 0.8146, "step": 7608 }, { "epoch": 4.0848293681420085, "grad_norm": 1.03125, "learning_rate": 4.014856732586994e-06, "loss": 0.8638, "step": 7609 }, { "epoch": 4.085366262877085, "grad_norm": 0.6484375, "learning_rate": 4.010267950759025e-06, "loss": 0.7923, "step": 7610 }, { "epoch": 4.085903157612161, "grad_norm": 0.63671875, "learning_rate": 4.005681564125308e-06, "loss": 0.8261, "step": 7611 }, { "epoch": 4.086440052347236, "grad_norm": 0.74609375, "learning_rate": 4.001097573209187e-06, "loss": 0.8509, "step": 7612 }, { "epoch": 4.086976947082313, "grad_norm": 0.640625, "learning_rate": 3.996515978533758e-06, "loss": 0.7094, "step": 7613 }, { "epoch": 4.087513841817389, "grad_norm": 0.953125, "learning_rate": 3.9919367806218525e-06, "loss": 0.9203, "step": 7614 }, { "epoch": 4.088050736552464, "grad_norm": 0.62890625, "learning_rate": 3.987359979996e-06, "loss": 0.9241, "step": 7615 }, { "epoch": 4.088587631287541, "grad_norm": 0.6328125, "learning_rate": 3.982785577178477e-06, "loss": 0.8996, "step": 7616 }, { "epoch": 4.089124526022617, "grad_norm": 0.5625, "learning_rate": 3.9782135726912864e-06, "loss": 0.7987, "step": 7617 }, { "epoch": 4.089661420757693, "grad_norm": 0.6796875, "learning_rate": 3.97364396705614e-06, "loss": 0.8265, "step": 7618 }, { "epoch": 4.090198315492769, "grad_norm": 0.73046875, "learning_rate": 3.969076760794502e-06, "loss": 0.829, "step": 7619 }, { "epoch": 4.0907352102278445, "grad_norm": 0.6875, "learning_rate": 3.964511954427533e-06, "loss": 0.8665, "step": 7620 }, { "epoch": 4.091272104962921, "grad_norm": 0.71875, "learning_rate": 3.959949548476149e-06, "loss": 0.8798, "step": 7621 }, { "epoch": 4.091808999697997, "grad_norm": 0.69921875, "learning_rate": 3.955389543460969e-06, "loss": 0.7219, "step": 7622 }, { "epoch": 4.092345894433072, "grad_norm": 0.6015625, "learning_rate": 3.950831939902352e-06, "loss": 0.6613, "step": 7623 }, { "epoch": 4.092882789168149, "grad_norm": 0.83984375, "learning_rate": 3.946276738320373e-06, "loss": 0.906, "step": 7624 }, { "epoch": 4.093419683903225, "grad_norm": 0.71484375, "learning_rate": 3.941723939234843e-06, "loss": 0.7586, "step": 7625 }, { "epoch": 4.0939565786383, "grad_norm": 0.8515625, "learning_rate": 3.937173543165298e-06, "loss": 1.0252, "step": 7626 }, { "epoch": 4.094493473373377, "grad_norm": 0.69921875, "learning_rate": 3.932625550630986e-06, "loss": 1.0482, "step": 7627 }, { "epoch": 4.095030368108453, "grad_norm": 0.68359375, "learning_rate": 3.928079962150899e-06, "loss": 0.8752, "step": 7628 }, { "epoch": 4.095567262843529, "grad_norm": 0.7421875, "learning_rate": 3.923536778243739e-06, "loss": 0.921, "step": 7629 }, { "epoch": 4.096104157578605, "grad_norm": 0.73046875, "learning_rate": 3.918995999427949e-06, "loss": 0.8037, "step": 7630 }, { "epoch": 4.0966410523136805, "grad_norm": 0.75, "learning_rate": 3.914457626221679e-06, "loss": 0.7936, "step": 7631 }, { "epoch": 4.097177947048757, "grad_norm": 0.67578125, "learning_rate": 3.90992165914282e-06, "loss": 0.9538, "step": 7632 }, { "epoch": 4.097714841783833, "grad_norm": 0.69140625, "learning_rate": 3.9053880987089926e-06, "loss": 0.8611, "step": 7633 }, { "epoch": 4.098251736518908, "grad_norm": 0.6953125, "learning_rate": 3.900856945437517e-06, "loss": 0.87, "step": 7634 }, { "epoch": 4.098788631253985, "grad_norm": 0.6015625, "learning_rate": 3.8963281998454615e-06, "loss": 0.7026, "step": 7635 }, { "epoch": 4.099325525989061, "grad_norm": 0.640625, "learning_rate": 3.891801862449629e-06, "loss": 0.7265, "step": 7636 }, { "epoch": 4.099862420724137, "grad_norm": 0.9375, "learning_rate": 3.887277933766506e-06, "loss": 0.8535, "step": 7637 }, { "epoch": 4.100399315459213, "grad_norm": 0.76171875, "learning_rate": 3.882756414312342e-06, "loss": 0.837, "step": 7638 }, { "epoch": 4.1009362101942886, "grad_norm": 0.60546875, "learning_rate": 3.878237304603102e-06, "loss": 0.705, "step": 7639 }, { "epoch": 4.101473104929365, "grad_norm": 0.6875, "learning_rate": 3.873720605154468e-06, "loss": 1.023, "step": 7640 }, { "epoch": 4.102009999664441, "grad_norm": 0.71875, "learning_rate": 3.869206316481855e-06, "loss": 1.0272, "step": 7641 }, { "epoch": 4.1025468943995165, "grad_norm": 0.70703125, "learning_rate": 3.864694439100405e-06, "loss": 0.8219, "step": 7642 }, { "epoch": 4.103083789134593, "grad_norm": 0.7109375, "learning_rate": 3.860184973524972e-06, "loss": 0.7704, "step": 7643 }, { "epoch": 4.103620683869669, "grad_norm": 0.796875, "learning_rate": 3.855677920270154e-06, "loss": 0.9095, "step": 7644 }, { "epoch": 4.104157578604744, "grad_norm": 0.625, "learning_rate": 3.851173279850251e-06, "loss": 0.7077, "step": 7645 }, { "epoch": 4.104694473339821, "grad_norm": 0.76171875, "learning_rate": 3.8466710527793105e-06, "loss": 0.6791, "step": 7646 }, { "epoch": 4.105231368074897, "grad_norm": 0.765625, "learning_rate": 3.842171239571082e-06, "loss": 0.8216, "step": 7647 }, { "epoch": 4.105768262809973, "grad_norm": 0.6875, "learning_rate": 3.837673840739059e-06, "loss": 0.8582, "step": 7648 }, { "epoch": 4.106305157545049, "grad_norm": 0.6953125, "learning_rate": 3.833178856796452e-06, "loss": 0.8844, "step": 7649 }, { "epoch": 4.1068420522801246, "grad_norm": 0.7109375, "learning_rate": 3.8286862882561916e-06, "loss": 0.7074, "step": 7650 }, { "epoch": 4.1068420522801246, "eval_loss": 0.9025216698646545, "eval_runtime": 46.5886, "eval_samples_per_second": 20.649, "eval_steps_per_second": 20.649, "step": 7650 }, { "epoch": 4.107378947015201, "grad_norm": 0.6953125, "learning_rate": 3.824196135630936e-06, "loss": 0.6797, "step": 7651 }, { "epoch": 4.107915841750277, "grad_norm": 0.671875, "learning_rate": 3.819708399433078e-06, "loss": 0.8142, "step": 7652 }, { "epoch": 4.1084527364853525, "grad_norm": 0.671875, "learning_rate": 3.815223080174718e-06, "loss": 0.7858, "step": 7653 }, { "epoch": 4.108989631220429, "grad_norm": 0.765625, "learning_rate": 3.810740178367683e-06, "loss": 0.9258, "step": 7654 }, { "epoch": 4.109526525955505, "grad_norm": 0.64453125, "learning_rate": 3.8062596945235396e-06, "loss": 0.6984, "step": 7655 }, { "epoch": 4.110063420690581, "grad_norm": 0.6875, "learning_rate": 3.8017816291535556e-06, "loss": 0.8492, "step": 7656 }, { "epoch": 4.110600315425657, "grad_norm": 0.69921875, "learning_rate": 3.7973059827687425e-06, "loss": 0.8895, "step": 7657 }, { "epoch": 4.111137210160733, "grad_norm": 0.74609375, "learning_rate": 3.792832755879833e-06, "loss": 0.8753, "step": 7658 }, { "epoch": 4.111674104895809, "grad_norm": 0.63671875, "learning_rate": 3.7883619489972653e-06, "loss": 0.7956, "step": 7659 }, { "epoch": 4.112210999630885, "grad_norm": 0.6953125, "learning_rate": 3.7838935626312242e-06, "loss": 0.8918, "step": 7660 }, { "epoch": 4.1127478943659606, "grad_norm": 0.75, "learning_rate": 3.7794275972916125e-06, "loss": 0.9082, "step": 7661 }, { "epoch": 4.113284789101037, "grad_norm": 0.703125, "learning_rate": 3.7749640534880504e-06, "loss": 0.8224, "step": 7662 }, { "epoch": 4.113821683836113, "grad_norm": 0.89453125, "learning_rate": 3.770502931729877e-06, "loss": 0.9054, "step": 7663 }, { "epoch": 4.1143585785711885, "grad_norm": 0.6328125, "learning_rate": 3.766044232526175e-06, "loss": 0.7902, "step": 7664 }, { "epoch": 4.114895473306265, "grad_norm": 0.61328125, "learning_rate": 3.7615879563857257e-06, "loss": 0.8439, "step": 7665 }, { "epoch": 4.115432368041341, "grad_norm": 0.73046875, "learning_rate": 3.757134103817053e-06, "loss": 0.941, "step": 7666 }, { "epoch": 4.115969262776417, "grad_norm": 0.8515625, "learning_rate": 3.752682675328406e-06, "loss": 0.6705, "step": 7667 }, { "epoch": 4.116506157511493, "grad_norm": 0.703125, "learning_rate": 3.7482336714277337e-06, "loss": 0.8981, "step": 7668 }, { "epoch": 4.117043052246569, "grad_norm": 0.8203125, "learning_rate": 3.7437870926227304e-06, "loss": 0.885, "step": 7669 }, { "epoch": 4.117579946981645, "grad_norm": 0.8359375, "learning_rate": 3.739342939420815e-06, "loss": 0.8488, "step": 7670 }, { "epoch": 4.118116841716721, "grad_norm": 0.734375, "learning_rate": 3.7349012123291144e-06, "loss": 0.7188, "step": 7671 }, { "epoch": 4.1186537364517966, "grad_norm": 0.6875, "learning_rate": 3.7304619118544827e-06, "loss": 0.8741, "step": 7672 }, { "epoch": 4.119190631186873, "grad_norm": 0.69140625, "learning_rate": 3.7260250385035e-06, "loss": 0.855, "step": 7673 }, { "epoch": 4.119727525921949, "grad_norm": 0.640625, "learning_rate": 3.7215905927824812e-06, "loss": 0.8032, "step": 7674 }, { "epoch": 4.120264420657025, "grad_norm": 0.9609375, "learning_rate": 3.7171585751974408e-06, "loss": 0.8138, "step": 7675 }, { "epoch": 4.120801315392101, "grad_norm": 0.796875, "learning_rate": 3.7127289862541297e-06, "loss": 0.8487, "step": 7676 }, { "epoch": 4.121338210127177, "grad_norm": 0.71875, "learning_rate": 3.70830182645803e-06, "loss": 0.9556, "step": 7677 }, { "epoch": 4.121875104862253, "grad_norm": 0.671875, "learning_rate": 3.7038770963143286e-06, "loss": 0.718, "step": 7678 }, { "epoch": 4.122411999597329, "grad_norm": 0.8984375, "learning_rate": 3.6994547963279392e-06, "loss": 0.7813, "step": 7679 }, { "epoch": 4.122948894332405, "grad_norm": 0.61328125, "learning_rate": 3.695034927003513e-06, "loss": 0.9341, "step": 7680 }, { "epoch": 4.123485789067481, "grad_norm": 0.7890625, "learning_rate": 3.6906174888453992e-06, "loss": 0.692, "step": 7681 }, { "epoch": 4.124022683802557, "grad_norm": 1.140625, "learning_rate": 3.6862024823576926e-06, "loss": 0.8553, "step": 7682 }, { "epoch": 4.1245595785376326, "grad_norm": 0.63671875, "learning_rate": 3.6817899080442057e-06, "loss": 0.806, "step": 7683 }, { "epoch": 4.125096473272709, "grad_norm": 0.60546875, "learning_rate": 3.677379766408459e-06, "loss": 0.6916, "step": 7684 }, { "epoch": 4.125633368007785, "grad_norm": 0.71875, "learning_rate": 3.672972057953711e-06, "loss": 0.8773, "step": 7685 }, { "epoch": 4.126170262742861, "grad_norm": 0.7265625, "learning_rate": 3.668566783182939e-06, "loss": 0.8851, "step": 7686 }, { "epoch": 4.126707157477937, "grad_norm": 0.7421875, "learning_rate": 3.664163942598839e-06, "loss": 0.9127, "step": 7687 }, { "epoch": 4.127244052213013, "grad_norm": 0.62890625, "learning_rate": 3.659763536703825e-06, "loss": 0.7889, "step": 7688 }, { "epoch": 4.127780946948089, "grad_norm": 0.703125, "learning_rate": 3.6553655660000436e-06, "loss": 0.9252, "step": 7689 }, { "epoch": 4.128317841683165, "grad_norm": 0.8125, "learning_rate": 3.6509700309893618e-06, "loss": 0.8603, "step": 7690 }, { "epoch": 4.128854736418241, "grad_norm": 0.7265625, "learning_rate": 3.6465769321733594e-06, "loss": 0.7195, "step": 7691 }, { "epoch": 4.129391631153317, "grad_norm": 0.7265625, "learning_rate": 3.6421862700533495e-06, "loss": 0.9742, "step": 7692 }, { "epoch": 4.129928525888393, "grad_norm": 0.9765625, "learning_rate": 3.6377980451303666e-06, "loss": 0.8072, "step": 7693 }, { "epoch": 4.130465420623469, "grad_norm": 0.609375, "learning_rate": 3.63341225790515e-06, "loss": 0.9739, "step": 7694 }, { "epoch": 4.131002315358545, "grad_norm": 0.828125, "learning_rate": 3.6290289088781874e-06, "loss": 0.7622, "step": 7695 }, { "epoch": 4.131539210093621, "grad_norm": 0.5859375, "learning_rate": 3.6246479985496658e-06, "loss": 0.6696, "step": 7696 }, { "epoch": 4.132076104828697, "grad_norm": 0.75, "learning_rate": 3.6202695274195007e-06, "loss": 0.8104, "step": 7697 }, { "epoch": 4.132612999563773, "grad_norm": 0.66015625, "learning_rate": 3.6158934959873353e-06, "loss": 0.8761, "step": 7698 }, { "epoch": 4.133149894298849, "grad_norm": 0.72265625, "learning_rate": 3.611519904752533e-06, "loss": 0.8477, "step": 7699 }, { "epoch": 4.133686789033925, "grad_norm": 0.6796875, "learning_rate": 3.607148754214168e-06, "loss": 0.7589, "step": 7700 }, { "epoch": 4.133686789033925, "eval_loss": 0.9027608036994934, "eval_runtime": 46.5247, "eval_samples_per_second": 20.677, "eval_steps_per_second": 20.677, "step": 7700 }, { "epoch": 4.134223683769001, "grad_norm": 0.7890625, "learning_rate": 3.6027800448710515e-06, "loss": 0.857, "step": 7701 }, { "epoch": 4.134760578504077, "grad_norm": 0.6171875, "learning_rate": 3.5984137772217096e-06, "loss": 0.896, "step": 7702 }, { "epoch": 4.135297473239153, "grad_norm": 0.62109375, "learning_rate": 3.594049951764383e-06, "loss": 0.8503, "step": 7703 }, { "epoch": 4.135834367974229, "grad_norm": 0.76171875, "learning_rate": 3.5896885689970376e-06, "loss": 0.7745, "step": 7704 }, { "epoch": 4.136371262709305, "grad_norm": 0.609375, "learning_rate": 3.5853296294173666e-06, "loss": 0.6519, "step": 7705 }, { "epoch": 4.136908157444381, "grad_norm": 0.73046875, "learning_rate": 3.580973133522783e-06, "loss": 0.7922, "step": 7706 }, { "epoch": 4.137445052179457, "grad_norm": 0.640625, "learning_rate": 3.5766190818104118e-06, "loss": 0.7362, "step": 7707 }, { "epoch": 4.137981946914533, "grad_norm": 0.6953125, "learning_rate": 3.5722674747771134e-06, "loss": 0.7191, "step": 7708 }, { "epoch": 4.138518841649609, "grad_norm": 0.61328125, "learning_rate": 3.5679183129194493e-06, "loss": 0.7361, "step": 7709 }, { "epoch": 4.139055736384685, "grad_norm": 0.6953125, "learning_rate": 3.5635715967337223e-06, "loss": 0.7144, "step": 7710 }, { "epoch": 4.139592631119761, "grad_norm": 0.71484375, "learning_rate": 3.5592273267159525e-06, "loss": 0.8846, "step": 7711 }, { "epoch": 4.140129525854837, "grad_norm": 0.73828125, "learning_rate": 3.554885503361871e-06, "loss": 0.8658, "step": 7712 }, { "epoch": 4.140666420589913, "grad_norm": 0.74609375, "learning_rate": 3.550546127166926e-06, "loss": 0.8837, "step": 7713 }, { "epoch": 4.141203315324989, "grad_norm": 0.6640625, "learning_rate": 3.5462091986263048e-06, "loss": 0.7683, "step": 7714 }, { "epoch": 4.141740210060065, "grad_norm": 0.703125, "learning_rate": 3.5418747182349084e-06, "loss": 0.8164, "step": 7715 }, { "epoch": 4.142277104795141, "grad_norm": 0.67578125, "learning_rate": 3.5375426864873494e-06, "loss": 0.8122, "step": 7716 }, { "epoch": 4.142813999530217, "grad_norm": 0.796875, "learning_rate": 3.5332131038779663e-06, "loss": 0.8984, "step": 7717 }, { "epoch": 4.143350894265293, "grad_norm": 0.703125, "learning_rate": 3.5288859709008305e-06, "loss": 0.7758, "step": 7718 }, { "epoch": 4.143887789000369, "grad_norm": 0.90234375, "learning_rate": 3.52456128804971e-06, "loss": 0.9521, "step": 7719 }, { "epoch": 4.144424683735445, "grad_norm": 0.5703125, "learning_rate": 3.520239055818114e-06, "loss": 0.694, "step": 7720 }, { "epoch": 4.144961578470521, "grad_norm": 0.7109375, "learning_rate": 3.515919274699264e-06, "loss": 0.9689, "step": 7721 }, { "epoch": 4.145498473205597, "grad_norm": 0.609375, "learning_rate": 3.511601945186091e-06, "loss": 0.7541, "step": 7722 }, { "epoch": 4.146035367940673, "grad_norm": 0.75, "learning_rate": 3.5072870677712642e-06, "loss": 1.0244, "step": 7723 }, { "epoch": 4.1465722626757495, "grad_norm": 0.8125, "learning_rate": 3.5029746429471737e-06, "loss": 0.8331, "step": 7724 }, { "epoch": 4.147109157410825, "grad_norm": 0.75390625, "learning_rate": 3.4986646712059064e-06, "loss": 0.9679, "step": 7725 }, { "epoch": 4.147646052145901, "grad_norm": 0.69921875, "learning_rate": 3.494357153039296e-06, "loss": 0.9258, "step": 7726 }, { "epoch": 4.148182946880977, "grad_norm": 0.640625, "learning_rate": 3.490052088938883e-06, "loss": 0.7397, "step": 7727 }, { "epoch": 4.148719841616053, "grad_norm": 0.6484375, "learning_rate": 3.4857494793959307e-06, "loss": 0.7623, "step": 7728 }, { "epoch": 4.149256736351129, "grad_norm": 0.69140625, "learning_rate": 3.4814493249014116e-06, "loss": 0.7676, "step": 7729 }, { "epoch": 4.149793631086205, "grad_norm": 0.64453125, "learning_rate": 3.4771516259460347e-06, "loss": 0.7425, "step": 7730 }, { "epoch": 4.150330525821281, "grad_norm": 0.68359375, "learning_rate": 3.472856383020229e-06, "loss": 0.7885, "step": 7731 }, { "epoch": 4.150867420556358, "grad_norm": 0.85546875, "learning_rate": 3.4685635966141245e-06, "loss": 1.0595, "step": 7732 }, { "epoch": 4.151404315291433, "grad_norm": 0.80078125, "learning_rate": 3.4642732672175855e-06, "loss": 0.9652, "step": 7733 }, { "epoch": 4.151941210026509, "grad_norm": 0.65625, "learning_rate": 3.4599853953202004e-06, "loss": 0.8826, "step": 7734 }, { "epoch": 4.1524781047615855, "grad_norm": 1.203125, "learning_rate": 3.455699981411259e-06, "loss": 0.8383, "step": 7735 }, { "epoch": 4.153014999496661, "grad_norm": 0.70703125, "learning_rate": 3.4514170259797862e-06, "loss": 0.7869, "step": 7736 }, { "epoch": 4.153551894231737, "grad_norm": 0.734375, "learning_rate": 3.4471365295145304e-06, "loss": 0.9131, "step": 7737 }, { "epoch": 4.154088788966813, "grad_norm": 0.76171875, "learning_rate": 3.4428584925039337e-06, "loss": 0.7697, "step": 7738 }, { "epoch": 4.154625683701889, "grad_norm": 0.6875, "learning_rate": 3.4385829154361783e-06, "loss": 0.8784, "step": 7739 }, { "epoch": 4.155162578436965, "grad_norm": 0.61328125, "learning_rate": 3.4343097987991735e-06, "loss": 0.7595, "step": 7740 }, { "epoch": 4.155699473172041, "grad_norm": 0.65234375, "learning_rate": 3.430039143080524e-06, "loss": 0.7975, "step": 7741 }, { "epoch": 4.156236367907117, "grad_norm": 0.66796875, "learning_rate": 3.4257709487675677e-06, "loss": 0.8956, "step": 7742 }, { "epoch": 4.156773262642194, "grad_norm": 0.64453125, "learning_rate": 3.421505216347368e-06, "loss": 0.9228, "step": 7743 }, { "epoch": 4.157310157377269, "grad_norm": 0.66796875, "learning_rate": 3.4172419463066848e-06, "loss": 0.8605, "step": 7744 }, { "epoch": 4.157847052112345, "grad_norm": 0.63671875, "learning_rate": 3.4129811391320267e-06, "loss": 0.8328, "step": 7745 }, { "epoch": 4.1583839468474215, "grad_norm": 0.6484375, "learning_rate": 3.4087227953095934e-06, "loss": 0.765, "step": 7746 }, { "epoch": 4.158920841582497, "grad_norm": 0.67578125, "learning_rate": 3.4044669153253243e-06, "loss": 0.8829, "step": 7747 }, { "epoch": 4.159457736317573, "grad_norm": 0.75, "learning_rate": 3.4002134996648616e-06, "loss": 0.9027, "step": 7748 }, { "epoch": 4.159994631052649, "grad_norm": 0.64453125, "learning_rate": 3.3959625488135805e-06, "loss": 0.8091, "step": 7749 }, { "epoch": 4.160531525787725, "grad_norm": 0.65625, "learning_rate": 3.3917140632565624e-06, "loss": 0.8279, "step": 7750 }, { "epoch": 4.160531525787725, "eval_loss": 0.9027578234672546, "eval_runtime": 46.8161, "eval_samples_per_second": 20.548, "eval_steps_per_second": 20.548, "step": 7750 }, { "epoch": 4.161068420522801, "grad_norm": 0.71484375, "learning_rate": 3.3874680434786172e-06, "loss": 0.6601, "step": 7751 }, { "epoch": 4.161605315257877, "grad_norm": 0.734375, "learning_rate": 3.3832244899642706e-06, "loss": 0.9747, "step": 7752 }, { "epoch": 4.162142209992953, "grad_norm": 0.703125, "learning_rate": 3.378983403197761e-06, "loss": 0.7435, "step": 7753 }, { "epoch": 4.16267910472803, "grad_norm": 0.6796875, "learning_rate": 3.3747447836630587e-06, "loss": 0.836, "step": 7754 }, { "epoch": 4.163215999463105, "grad_norm": 0.6328125, "learning_rate": 3.3705086318438327e-06, "loss": 0.8198, "step": 7755 }, { "epoch": 4.163752894198181, "grad_norm": 0.65625, "learning_rate": 3.366274948223494e-06, "loss": 0.8979, "step": 7756 }, { "epoch": 4.1642897889332575, "grad_norm": 0.81640625, "learning_rate": 3.3620437332851446e-06, "loss": 0.7899, "step": 7757 }, { "epoch": 4.164826683668333, "grad_norm": 0.8671875, "learning_rate": 3.3578149875116288e-06, "loss": 1.0397, "step": 7758 }, { "epoch": 4.165363578403409, "grad_norm": 0.60546875, "learning_rate": 3.3535887113855052e-06, "loss": 0.8326, "step": 7759 }, { "epoch": 4.165900473138485, "grad_norm": 0.61328125, "learning_rate": 3.3493649053890326e-06, "loss": 0.713, "step": 7760 }, { "epoch": 4.166437367873561, "grad_norm": 0.71484375, "learning_rate": 3.3451435700042083e-06, "loss": 0.8868, "step": 7761 }, { "epoch": 4.166974262608638, "grad_norm": 0.70703125, "learning_rate": 3.34092470571275e-06, "loss": 0.9271, "step": 7762 }, { "epoch": 4.167511157343713, "grad_norm": 0.640625, "learning_rate": 3.336708312996062e-06, "loss": 0.8128, "step": 7763 }, { "epoch": 4.168048052078789, "grad_norm": 0.77734375, "learning_rate": 3.3324943923352974e-06, "loss": 0.7175, "step": 7764 }, { "epoch": 4.168584946813866, "grad_norm": 0.7734375, "learning_rate": 3.328282944211328e-06, "loss": 0.9151, "step": 7765 }, { "epoch": 4.169121841548941, "grad_norm": 0.59375, "learning_rate": 3.3240739691047157e-06, "loss": 0.6801, "step": 7766 }, { "epoch": 4.169658736284017, "grad_norm": 0.71484375, "learning_rate": 3.3198674674957684e-06, "loss": 0.6921, "step": 7767 }, { "epoch": 4.1701956310190935, "grad_norm": 0.80078125, "learning_rate": 3.315663439864508e-06, "loss": 0.8124, "step": 7768 }, { "epoch": 4.170732525754169, "grad_norm": 0.765625, "learning_rate": 3.3114618866906503e-06, "loss": 0.7704, "step": 7769 }, { "epoch": 4.171269420489246, "grad_norm": 0.66015625, "learning_rate": 3.3072628084536593e-06, "loss": 0.7589, "step": 7770 }, { "epoch": 4.171806315224321, "grad_norm": 0.75390625, "learning_rate": 3.3030662056326933e-06, "loss": 1.0278, "step": 7771 }, { "epoch": 4.172343209959397, "grad_norm": 0.68359375, "learning_rate": 3.2988720787066495e-06, "loss": 0.7851, "step": 7772 }, { "epoch": 4.172880104694474, "grad_norm": 0.66015625, "learning_rate": 3.2946804281541178e-06, "loss": 0.7814, "step": 7773 }, { "epoch": 4.173416999429549, "grad_norm": 0.75390625, "learning_rate": 3.290491254453426e-06, "loss": 0.8635, "step": 7774 }, { "epoch": 4.173953894164625, "grad_norm": 0.6640625, "learning_rate": 3.2863045580826145e-06, "loss": 0.8788, "step": 7775 }, { "epoch": 4.174490788899702, "grad_norm": 0.703125, "learning_rate": 3.282120339519429e-06, "loss": 0.8501, "step": 7776 }, { "epoch": 4.175027683634777, "grad_norm": 0.6640625, "learning_rate": 3.277938599241348e-06, "loss": 0.9007, "step": 7777 }, { "epoch": 4.175564578369853, "grad_norm": 0.56640625, "learning_rate": 3.273759337725568e-06, "loss": 0.7333, "step": 7778 }, { "epoch": 4.1761014731049295, "grad_norm": 0.75390625, "learning_rate": 3.2695825554489854e-06, "loss": 0.8948, "step": 7779 }, { "epoch": 4.176638367840005, "grad_norm": 0.7265625, "learning_rate": 3.265408252888222e-06, "loss": 0.7789, "step": 7780 }, { "epoch": 4.177175262575082, "grad_norm": 0.67578125, "learning_rate": 3.2612364305196293e-06, "loss": 0.8759, "step": 7781 }, { "epoch": 4.177712157310157, "grad_norm": 0.6796875, "learning_rate": 3.2570670888192547e-06, "loss": 0.8162, "step": 7782 }, { "epoch": 4.178249052045233, "grad_norm": 0.62109375, "learning_rate": 3.2529002282628756e-06, "loss": 0.9651, "step": 7783 }, { "epoch": 4.17878594678031, "grad_norm": 0.8359375, "learning_rate": 3.24873584932599e-06, "loss": 0.9906, "step": 7784 }, { "epoch": 4.179322841515385, "grad_norm": 0.640625, "learning_rate": 3.2445739524837953e-06, "loss": 0.8634, "step": 7785 }, { "epoch": 4.179859736250461, "grad_norm": 0.65234375, "learning_rate": 3.2404145382112254e-06, "loss": 0.8493, "step": 7786 }, { "epoch": 4.180396630985538, "grad_norm": 0.703125, "learning_rate": 3.2362576069829233e-06, "loss": 0.7157, "step": 7787 }, { "epoch": 4.180933525720613, "grad_norm": 0.61328125, "learning_rate": 3.232103159273245e-06, "loss": 0.9648, "step": 7788 }, { "epoch": 4.181470420455689, "grad_norm": 0.8359375, "learning_rate": 3.2279511955562588e-06, "loss": 0.9492, "step": 7789 }, { "epoch": 4.1820073151907655, "grad_norm": 0.62890625, "learning_rate": 3.223801716305766e-06, "loss": 0.8544, "step": 7790 }, { "epoch": 4.182544209925841, "grad_norm": 0.61328125, "learning_rate": 3.219654721995266e-06, "loss": 0.7633, "step": 7791 }, { "epoch": 4.183081104660918, "grad_norm": 0.75390625, "learning_rate": 3.2155102130979886e-06, "loss": 0.7513, "step": 7792 }, { "epoch": 4.183617999395993, "grad_norm": 0.65625, "learning_rate": 3.2113681900868775e-06, "loss": 0.8225, "step": 7793 }, { "epoch": 4.184154894131069, "grad_norm": 0.65625, "learning_rate": 3.2072286534345826e-06, "loss": 0.8405, "step": 7794 }, { "epoch": 4.184691788866146, "grad_norm": 0.61328125, "learning_rate": 3.2030916036134865e-06, "loss": 0.6495, "step": 7795 }, { "epoch": 4.185228683601221, "grad_norm": 0.59375, "learning_rate": 3.198957041095671e-06, "loss": 0.6269, "step": 7796 }, { "epoch": 4.185765578336297, "grad_norm": 0.71875, "learning_rate": 3.194824966352947e-06, "loss": 0.8509, "step": 7797 }, { "epoch": 4.186302473071374, "grad_norm": 0.76953125, "learning_rate": 3.190695379856831e-06, "loss": 0.7805, "step": 7798 }, { "epoch": 4.186839367806449, "grad_norm": 0.94140625, "learning_rate": 3.186568282078564e-06, "loss": 0.784, "step": 7799 }, { "epoch": 4.187376262541526, "grad_norm": 0.81640625, "learning_rate": 3.182443673489105e-06, "loss": 0.7417, "step": 7800 }, { "epoch": 4.187376262541526, "eval_loss": 0.902660608291626, "eval_runtime": 45.6911, "eval_samples_per_second": 21.054, "eval_steps_per_second": 21.054, "step": 7800 }, { "epoch": 4.1879131572766015, "grad_norm": 0.79296875, "learning_rate": 3.178321554559116e-06, "loss": 0.9144, "step": 7801 }, { "epoch": 4.188450052011677, "grad_norm": 0.70703125, "learning_rate": 3.174201925758988e-06, "loss": 0.7681, "step": 7802 }, { "epoch": 4.188986946746754, "grad_norm": 0.63671875, "learning_rate": 3.170084787558825e-06, "loss": 0.7775, "step": 7803 }, { "epoch": 4.189523841481829, "grad_norm": 0.68359375, "learning_rate": 3.1659701404284423e-06, "loss": 0.79, "step": 7804 }, { "epoch": 4.190060736216905, "grad_norm": 0.5859375, "learning_rate": 3.161857984837366e-06, "loss": 0.7043, "step": 7805 }, { "epoch": 4.190597630951982, "grad_norm": 0.76953125, "learning_rate": 3.157748321254858e-06, "loss": 0.8262, "step": 7806 }, { "epoch": 4.191134525687057, "grad_norm": 0.73046875, "learning_rate": 3.153641150149869e-06, "loss": 0.9911, "step": 7807 }, { "epoch": 4.191671420422134, "grad_norm": 0.73046875, "learning_rate": 3.149536471991088e-06, "loss": 0.8864, "step": 7808 }, { "epoch": 4.19220831515721, "grad_norm": 0.5703125, "learning_rate": 3.145434287246912e-06, "loss": 0.648, "step": 7809 }, { "epoch": 4.192745209892285, "grad_norm": 0.80078125, "learning_rate": 3.141334596385448e-06, "loss": 1.1176, "step": 7810 }, { "epoch": 4.193282104627362, "grad_norm": 0.828125, "learning_rate": 3.137237399874521e-06, "loss": 0.8613, "step": 7811 }, { "epoch": 4.1938189993624375, "grad_norm": 0.734375, "learning_rate": 3.1331426981816845e-06, "loss": 0.8342, "step": 7812 }, { "epoch": 4.194355894097513, "grad_norm": 0.79296875, "learning_rate": 3.1290504917741855e-06, "loss": 0.9059, "step": 7813 }, { "epoch": 4.19489278883259, "grad_norm": 0.6875, "learning_rate": 3.1249607811189934e-06, "loss": 1.1009, "step": 7814 }, { "epoch": 4.195429683567665, "grad_norm": 0.74609375, "learning_rate": 3.1208735666828027e-06, "loss": 0.9495, "step": 7815 }, { "epoch": 4.195966578302741, "grad_norm": 0.8984375, "learning_rate": 3.116788848932017e-06, "loss": 1.0348, "step": 7816 }, { "epoch": 4.196503473037818, "grad_norm": 0.62890625, "learning_rate": 3.112706628332748e-06, "loss": 0.6889, "step": 7817 }, { "epoch": 4.197040367772893, "grad_norm": 0.6484375, "learning_rate": 3.1086269053508337e-06, "loss": 0.8543, "step": 7818 }, { "epoch": 4.19757726250797, "grad_norm": 0.61328125, "learning_rate": 3.1045496804518263e-06, "loss": 0.6374, "step": 7819 }, { "epoch": 4.198114157243046, "grad_norm": 0.64453125, "learning_rate": 3.1004749541009792e-06, "loss": 0.8448, "step": 7820 }, { "epoch": 4.198651051978121, "grad_norm": 0.64453125, "learning_rate": 3.0964027267632784e-06, "loss": 1.0817, "step": 7821 }, { "epoch": 4.199187946713198, "grad_norm": 0.6953125, "learning_rate": 3.092332998903416e-06, "loss": 0.7765, "step": 7822 }, { "epoch": 4.1997248414482735, "grad_norm": 0.67578125, "learning_rate": 3.088265770985788e-06, "loss": 0.708, "step": 7823 }, { "epoch": 4.200261736183349, "grad_norm": 0.8203125, "learning_rate": 3.084201043474527e-06, "loss": 0.9858, "step": 7824 }, { "epoch": 4.200798630918426, "grad_norm": 0.57421875, "learning_rate": 3.080138816833472e-06, "loss": 0.8453, "step": 7825 }, { "epoch": 4.201335525653501, "grad_norm": 0.6484375, "learning_rate": 3.0760790915261685e-06, "loss": 0.8257, "step": 7826 }, { "epoch": 4.201872420388577, "grad_norm": 0.66796875, "learning_rate": 3.072021868015881e-06, "loss": 0.6764, "step": 7827 }, { "epoch": 4.202409315123654, "grad_norm": 0.73828125, "learning_rate": 3.067967146765602e-06, "loss": 0.8771, "step": 7828 }, { "epoch": 4.202946209858729, "grad_norm": 0.77734375, "learning_rate": 3.0639149282380154e-06, "loss": 0.8544, "step": 7829 }, { "epoch": 4.203483104593806, "grad_norm": 0.60546875, "learning_rate": 3.0598652128955295e-06, "loss": 0.7856, "step": 7830 }, { "epoch": 4.204019999328882, "grad_norm": 0.671875, "learning_rate": 3.0558180012002725e-06, "loss": 0.9165, "step": 7831 }, { "epoch": 4.204556894063957, "grad_norm": 0.63671875, "learning_rate": 3.051773293614088e-06, "loss": 0.7057, "step": 7832 }, { "epoch": 4.205093788799034, "grad_norm": 0.77734375, "learning_rate": 3.0477310905985156e-06, "loss": 0.971, "step": 7833 }, { "epoch": 4.2056306835341095, "grad_norm": 0.69140625, "learning_rate": 3.0436913926148336e-06, "loss": 0.8533, "step": 7834 }, { "epoch": 4.206167578269185, "grad_norm": 0.7578125, "learning_rate": 3.0396542001240147e-06, "loss": 0.8808, "step": 7835 }, { "epoch": 4.206704473004262, "grad_norm": 0.65234375, "learning_rate": 3.0356195135867572e-06, "loss": 0.7512, "step": 7836 }, { "epoch": 4.207241367739337, "grad_norm": 0.61328125, "learning_rate": 3.031587333463473e-06, "loss": 0.7196, "step": 7837 }, { "epoch": 4.207778262474414, "grad_norm": 0.8671875, "learning_rate": 3.027557660214281e-06, "loss": 1.1682, "step": 7838 }, { "epoch": 4.20831515720949, "grad_norm": 1.0390625, "learning_rate": 3.023530494299015e-06, "loss": 0.9743, "step": 7839 }, { "epoch": 4.208852051944565, "grad_norm": 1.234375, "learning_rate": 3.0195058361772277e-06, "loss": 0.856, "step": 7840 }, { "epoch": 4.209388946679642, "grad_norm": 0.62890625, "learning_rate": 3.0154836863081905e-06, "loss": 0.7588, "step": 7841 }, { "epoch": 4.209925841414718, "grad_norm": 0.72265625, "learning_rate": 3.011464045150872e-06, "loss": 0.8106, "step": 7842 }, { "epoch": 4.210462736149793, "grad_norm": 0.8046875, "learning_rate": 3.0074469131639665e-06, "loss": 0.8849, "step": 7843 }, { "epoch": 4.21099963088487, "grad_norm": 0.734375, "learning_rate": 3.0034322908058883e-06, "loss": 0.9444, "step": 7844 }, { "epoch": 4.2115365256199455, "grad_norm": 0.66796875, "learning_rate": 2.9994201785347427e-06, "loss": 0.8591, "step": 7845 }, { "epoch": 4.212073420355022, "grad_norm": 0.69140625, "learning_rate": 2.9954105768083773e-06, "loss": 0.9048, "step": 7846 }, { "epoch": 4.212610315090098, "grad_norm": 0.71875, "learning_rate": 2.9914034860843293e-06, "loss": 1.079, "step": 7847 }, { "epoch": 4.213147209825173, "grad_norm": 0.609375, "learning_rate": 2.987398906819852e-06, "loss": 0.6953, "step": 7848 }, { "epoch": 4.21368410456025, "grad_norm": 0.765625, "learning_rate": 2.98339683947193e-06, "loss": 0.9031, "step": 7849 }, { "epoch": 4.214220999295326, "grad_norm": 0.62109375, "learning_rate": 2.9793972844972512e-06, "loss": 0.8121, "step": 7850 }, { "epoch": 4.214220999295326, "eval_loss": 0.9026446342468262, "eval_runtime": 45.83, "eval_samples_per_second": 20.991, "eval_steps_per_second": 20.991, "step": 7850 }, { "epoch": 4.214757894030401, "grad_norm": 0.75390625, "learning_rate": 2.9754002423522024e-06, "loss": 0.7948, "step": 7851 }, { "epoch": 4.215294788765478, "grad_norm": 0.69921875, "learning_rate": 2.9714057134929087e-06, "loss": 0.7093, "step": 7852 }, { "epoch": 4.215831683500554, "grad_norm": 0.76171875, "learning_rate": 2.967413698375196e-06, "loss": 0.7571, "step": 7853 }, { "epoch": 4.216368578235629, "grad_norm": 0.6796875, "learning_rate": 2.9634241974546008e-06, "loss": 0.727, "step": 7854 }, { "epoch": 4.216905472970706, "grad_norm": 0.65625, "learning_rate": 2.9594372111863692e-06, "loss": 0.7647, "step": 7855 }, { "epoch": 4.2174423677057815, "grad_norm": 0.83984375, "learning_rate": 2.9554527400254745e-06, "loss": 0.899, "step": 7856 }, { "epoch": 4.217979262440858, "grad_norm": 0.69140625, "learning_rate": 2.951470784426599e-06, "loss": 0.8867, "step": 7857 }, { "epoch": 4.218516157175934, "grad_norm": 0.62890625, "learning_rate": 2.9474913448441222e-06, "loss": 0.704, "step": 7858 }, { "epoch": 4.219053051911009, "grad_norm": 0.6640625, "learning_rate": 2.9435144217321575e-06, "loss": 0.8729, "step": 7859 }, { "epoch": 4.219589946646086, "grad_norm": 0.6640625, "learning_rate": 2.939540015544523e-06, "loss": 0.8927, "step": 7860 }, { "epoch": 4.220126841381162, "grad_norm": 0.64453125, "learning_rate": 2.935568126734742e-06, "loss": 0.828, "step": 7861 }, { "epoch": 4.220663736116237, "grad_norm": 0.65234375, "learning_rate": 2.9315987557560627e-06, "loss": 0.749, "step": 7862 }, { "epoch": 4.221200630851314, "grad_norm": 0.796875, "learning_rate": 2.9276319030614454e-06, "loss": 0.8385, "step": 7863 }, { "epoch": 4.22173752558639, "grad_norm": 0.69140625, "learning_rate": 2.9236675691035443e-06, "loss": 0.7876, "step": 7864 }, { "epoch": 4.222274420321465, "grad_norm": 0.7265625, "learning_rate": 2.919705754334748e-06, "loss": 1.1208, "step": 7865 }, { "epoch": 4.222811315056542, "grad_norm": 0.609375, "learning_rate": 2.9157464592071525e-06, "loss": 0.638, "step": 7866 }, { "epoch": 4.2233482097916175, "grad_norm": 0.62109375, "learning_rate": 2.9117896841725585e-06, "loss": 1.0634, "step": 7867 }, { "epoch": 4.223885104526694, "grad_norm": 0.5546875, "learning_rate": 2.9078354296824816e-06, "loss": 0.6566, "step": 7868 }, { "epoch": 4.22442199926177, "grad_norm": 0.6171875, "learning_rate": 2.903883696188167e-06, "loss": 0.6774, "step": 7869 }, { "epoch": 4.224958893996845, "grad_norm": 0.76953125, "learning_rate": 2.8999344841405373e-06, "loss": 0.7448, "step": 7870 }, { "epoch": 4.225495788731922, "grad_norm": 0.72265625, "learning_rate": 2.8959877939902653e-06, "loss": 1.0266, "step": 7871 }, { "epoch": 4.226032683466998, "grad_norm": 0.59375, "learning_rate": 2.8920436261877072e-06, "loss": 0.7287, "step": 7872 }, { "epoch": 4.226569578202073, "grad_norm": 0.62109375, "learning_rate": 2.8881019811829475e-06, "loss": 0.9185, "step": 7873 }, { "epoch": 4.22710647293715, "grad_norm": 0.703125, "learning_rate": 2.8841628594257737e-06, "loss": 0.9315, "step": 7874 }, { "epoch": 4.227643367672226, "grad_norm": 0.765625, "learning_rate": 2.8802262613656983e-06, "loss": 0.9369, "step": 7875 }, { "epoch": 4.228180262407302, "grad_norm": 0.71484375, "learning_rate": 2.8762921874519234e-06, "loss": 0.9166, "step": 7876 }, { "epoch": 4.228717157142378, "grad_norm": 0.7109375, "learning_rate": 2.872360638133384e-06, "loss": 0.7692, "step": 7877 }, { "epoch": 4.2292540518774535, "grad_norm": 0.640625, "learning_rate": 2.8684316138587243e-06, "loss": 0.9973, "step": 7878 }, { "epoch": 4.22979094661253, "grad_norm": 0.65625, "learning_rate": 2.8645051150762934e-06, "loss": 0.6537, "step": 7879 }, { "epoch": 4.230327841347606, "grad_norm": 0.55859375, "learning_rate": 2.860581142234145e-06, "loss": 0.7868, "step": 7880 }, { "epoch": 4.230864736082681, "grad_norm": 0.703125, "learning_rate": 2.8566596957800616e-06, "loss": 0.7497, "step": 7881 }, { "epoch": 4.231401630817758, "grad_norm": 0.7109375, "learning_rate": 2.852740776161536e-06, "loss": 1.0806, "step": 7882 }, { "epoch": 4.231938525552834, "grad_norm": 0.66015625, "learning_rate": 2.8488243838257544e-06, "loss": 0.9128, "step": 7883 }, { "epoch": 4.23247542028791, "grad_norm": 0.74609375, "learning_rate": 2.8449105192196316e-06, "loss": 0.7724, "step": 7884 }, { "epoch": 4.233012315022986, "grad_norm": 0.75390625, "learning_rate": 2.840999182789797e-06, "loss": 0.928, "step": 7885 }, { "epoch": 4.233549209758062, "grad_norm": 0.6484375, "learning_rate": 2.837090374982568e-06, "loss": 0.7443, "step": 7886 }, { "epoch": 4.234086104493138, "grad_norm": 0.6484375, "learning_rate": 2.8331840962440015e-06, "loss": 0.8266, "step": 7887 }, { "epoch": 4.234622999228214, "grad_norm": 0.7265625, "learning_rate": 2.829280347019858e-06, "loss": 0.961, "step": 7888 }, { "epoch": 4.2351598939632895, "grad_norm": 0.765625, "learning_rate": 2.8253791277555867e-06, "loss": 0.9082, "step": 7889 }, { "epoch": 4.235696788698366, "grad_norm": 0.7578125, "learning_rate": 2.821480438896376e-06, "loss": 0.8474, "step": 7890 }, { "epoch": 4.236233683433442, "grad_norm": 0.82421875, "learning_rate": 2.81758428088712e-06, "loss": 0.9891, "step": 7891 }, { "epoch": 4.236770578168517, "grad_norm": 0.64453125, "learning_rate": 2.81369065417241e-06, "loss": 0.7695, "step": 7892 }, { "epoch": 4.237307472903594, "grad_norm": 0.73046875, "learning_rate": 2.809799559196566e-06, "loss": 0.9035, "step": 7893 }, { "epoch": 4.23784436763867, "grad_norm": 0.59765625, "learning_rate": 2.80591099640361e-06, "loss": 0.8582, "step": 7894 }, { "epoch": 4.238381262373746, "grad_norm": 0.74609375, "learning_rate": 2.802024966237271e-06, "loss": 0.9073, "step": 7895 }, { "epoch": 4.238918157108822, "grad_norm": 0.5703125, "learning_rate": 2.7981414691410025e-06, "loss": 0.8136, "step": 7896 }, { "epoch": 4.239455051843898, "grad_norm": 0.70703125, "learning_rate": 2.7942605055579523e-06, "loss": 0.6798, "step": 7897 }, { "epoch": 4.239991946578974, "grad_norm": 0.6328125, "learning_rate": 2.7903820759309945e-06, "loss": 0.9977, "step": 7898 }, { "epoch": 4.24052884131405, "grad_norm": 0.6875, "learning_rate": 2.7865061807026997e-06, "loss": 0.7733, "step": 7899 }, { "epoch": 4.2410657360491255, "grad_norm": 0.73828125, "learning_rate": 2.7826328203153617e-06, "loss": 0.877, "step": 7900 }, { "epoch": 4.2410657360491255, "eval_loss": 0.902637243270874, "eval_runtime": 46.3458, "eval_samples_per_second": 20.757, "eval_steps_per_second": 20.757, "step": 7900 }, { "epoch": 4.241602630784202, "grad_norm": 0.6640625, "learning_rate": 2.778761995210985e-06, "loss": 0.7297, "step": 7901 }, { "epoch": 4.242139525519278, "grad_norm": 0.625, "learning_rate": 2.774893705831266e-06, "loss": 0.7969, "step": 7902 }, { "epoch": 4.242676420254353, "grad_norm": 0.7578125, "learning_rate": 2.7710279526176357e-06, "loss": 0.9622, "step": 7903 }, { "epoch": 4.24321331498943, "grad_norm": 0.66015625, "learning_rate": 2.767164736011227e-06, "loss": 0.7954, "step": 7904 }, { "epoch": 4.243750209724506, "grad_norm": 0.6640625, "learning_rate": 2.763304056452878e-06, "loss": 0.8726, "step": 7905 }, { "epoch": 4.244287104459582, "grad_norm": 0.66796875, "learning_rate": 2.759445914383135e-06, "loss": 0.866, "step": 7906 }, { "epoch": 4.244823999194658, "grad_norm": 0.8515625, "learning_rate": 2.755590310242273e-06, "loss": 0.8803, "step": 7907 }, { "epoch": 4.245360893929734, "grad_norm": 0.6640625, "learning_rate": 2.7517372444702564e-06, "loss": 0.8575, "step": 7908 }, { "epoch": 4.24589778866481, "grad_norm": 0.56640625, "learning_rate": 2.7478867175067675e-06, "loss": 0.7142, "step": 7909 }, { "epoch": 4.246434683399886, "grad_norm": 0.62890625, "learning_rate": 2.7440387297912123e-06, "loss": 0.7729, "step": 7910 }, { "epoch": 4.2469715781349615, "grad_norm": 0.7109375, "learning_rate": 2.7401932817626817e-06, "loss": 0.8111, "step": 7911 }, { "epoch": 4.247508472870038, "grad_norm": 0.62890625, "learning_rate": 2.7363503738599943e-06, "loss": 0.699, "step": 7912 }, { "epoch": 4.248045367605114, "grad_norm": 0.65625, "learning_rate": 2.7325100065216823e-06, "loss": 0.7496, "step": 7913 }, { "epoch": 4.24858226234019, "grad_norm": 0.79296875, "learning_rate": 2.7286721801859734e-06, "loss": 0.7449, "step": 7914 }, { "epoch": 4.249119157075266, "grad_norm": 0.6953125, "learning_rate": 2.7248368952908053e-06, "loss": 0.7934, "step": 7915 }, { "epoch": 4.249656051810342, "grad_norm": 0.78125, "learning_rate": 2.7210041522738454e-06, "loss": 1.0268, "step": 7916 }, { "epoch": 4.250192946545418, "grad_norm": 0.61328125, "learning_rate": 2.7171739515724465e-06, "loss": 0.7507, "step": 7917 }, { "epoch": 4.250729841280494, "grad_norm": 0.703125, "learning_rate": 2.713346293623689e-06, "loss": 0.829, "step": 7918 }, { "epoch": 4.25126673601557, "grad_norm": 0.55078125, "learning_rate": 2.7095211788643633e-06, "loss": 0.701, "step": 7919 }, { "epoch": 4.251803630750646, "grad_norm": 0.66796875, "learning_rate": 2.7056986077309525e-06, "loss": 0.9173, "step": 7920 }, { "epoch": 4.252340525485722, "grad_norm": 0.640625, "learning_rate": 2.70187858065967e-06, "loss": 0.6739, "step": 7921 }, { "epoch": 4.252877420220798, "grad_norm": 0.65234375, "learning_rate": 2.6980610980864183e-06, "loss": 0.8878, "step": 7922 }, { "epoch": 4.253414314955874, "grad_norm": 0.77734375, "learning_rate": 2.694246160446831e-06, "loss": 0.9122, "step": 7923 }, { "epoch": 4.25395120969095, "grad_norm": 0.6171875, "learning_rate": 2.6904337681762343e-06, "loss": 0.8183, "step": 7924 }, { "epoch": 4.254488104426026, "grad_norm": 0.75390625, "learning_rate": 2.686623921709669e-06, "loss": 0.8204, "step": 7925 }, { "epoch": 4.255024999161102, "grad_norm": 0.6796875, "learning_rate": 2.6828166214818957e-06, "loss": 0.9076, "step": 7926 }, { "epoch": 4.255561893896178, "grad_norm": 0.81640625, "learning_rate": 2.679011867927367e-06, "loss": 0.8625, "step": 7927 }, { "epoch": 4.256098788631254, "grad_norm": 0.66015625, "learning_rate": 2.6752096614802546e-06, "loss": 0.9715, "step": 7928 }, { "epoch": 4.25663568336633, "grad_norm": 0.7109375, "learning_rate": 2.671410002574448e-06, "loss": 0.9036, "step": 7929 }, { "epoch": 4.257172578101406, "grad_norm": 0.6015625, "learning_rate": 2.6676128916435256e-06, "loss": 0.6897, "step": 7930 }, { "epoch": 4.257709472836482, "grad_norm": 0.640625, "learning_rate": 2.6638183291207874e-06, "loss": 0.8025, "step": 7931 }, { "epoch": 4.258246367571558, "grad_norm": 0.71484375, "learning_rate": 2.6600263154392457e-06, "loss": 0.7686, "step": 7932 }, { "epoch": 4.258783262306634, "grad_norm": 0.65234375, "learning_rate": 2.65623685103161e-06, "loss": 0.8541, "step": 7933 }, { "epoch": 4.25932015704171, "grad_norm": 0.6328125, "learning_rate": 2.652449936330309e-06, "loss": 0.7334, "step": 7934 }, { "epoch": 4.259857051776786, "grad_norm": 0.66015625, "learning_rate": 2.6486655717674863e-06, "loss": 0.697, "step": 7935 }, { "epoch": 4.260393946511862, "grad_norm": 0.76171875, "learning_rate": 2.6448837577749713e-06, "loss": 0.9147, "step": 7936 }, { "epoch": 4.260930841246938, "grad_norm": 0.64453125, "learning_rate": 2.641104494784327e-06, "loss": 0.7542, "step": 7937 }, { "epoch": 4.261467735982014, "grad_norm": 0.65625, "learning_rate": 2.6373277832268145e-06, "loss": 0.7827, "step": 7938 }, { "epoch": 4.26200463071709, "grad_norm": 0.7421875, "learning_rate": 2.633553623533405e-06, "loss": 0.7254, "step": 7939 }, { "epoch": 4.262541525452166, "grad_norm": 0.7421875, "learning_rate": 2.6297820161347686e-06, "loss": 0.9928, "step": 7940 }, { "epoch": 4.263078420187242, "grad_norm": 0.71484375, "learning_rate": 2.6260129614612995e-06, "loss": 0.8077, "step": 7941 }, { "epoch": 4.263615314922318, "grad_norm": 0.62890625, "learning_rate": 2.6222464599431006e-06, "loss": 0.7353, "step": 7942 }, { "epoch": 4.264152209657394, "grad_norm": 0.765625, "learning_rate": 2.6184825120099676e-06, "loss": 0.8158, "step": 7943 }, { "epoch": 4.26468910439247, "grad_norm": 0.6484375, "learning_rate": 2.6147211180914176e-06, "loss": 0.9351, "step": 7944 }, { "epoch": 4.265225999127546, "grad_norm": 0.6875, "learning_rate": 2.61096227861668e-06, "loss": 0.8199, "step": 7945 }, { "epoch": 4.265762893862622, "grad_norm": 0.73828125, "learning_rate": 2.6072059940146775e-06, "loss": 0.6854, "step": 7946 }, { "epoch": 4.266299788597698, "grad_norm": 0.6796875, "learning_rate": 2.603452264714057e-06, "loss": 0.8996, "step": 7947 }, { "epoch": 4.266836683332774, "grad_norm": 0.86328125, "learning_rate": 2.5997010911431614e-06, "loss": 0.8856, "step": 7948 }, { "epoch": 4.26737357806785, "grad_norm": 0.84375, "learning_rate": 2.595952473730043e-06, "loss": 1.1729, "step": 7949 }, { "epoch": 4.267910472802926, "grad_norm": 0.6171875, "learning_rate": 2.5922064129024733e-06, "loss": 0.7371, "step": 7950 }, { "epoch": 4.267910472802926, "eval_loss": 0.9027069211006165, "eval_runtime": 46.1566, "eval_samples_per_second": 20.842, "eval_steps_per_second": 20.842, "step": 7950 }, { "epoch": 4.268447367538002, "grad_norm": 0.78515625, "learning_rate": 2.588462909087927e-06, "loss": 0.9572, "step": 7951 }, { "epoch": 4.2689842622730785, "grad_norm": 0.7109375, "learning_rate": 2.5847219627135764e-06, "loss": 0.7605, "step": 7952 }, { "epoch": 4.269521157008154, "grad_norm": 0.7578125, "learning_rate": 2.5809835742063193e-06, "loss": 1.0415, "step": 7953 }, { "epoch": 4.27005805174323, "grad_norm": 0.73828125, "learning_rate": 2.577247743992753e-06, "loss": 1.1646, "step": 7954 }, { "epoch": 4.270594946478306, "grad_norm": 0.6875, "learning_rate": 2.5735144724991806e-06, "loss": 0.8315, "step": 7955 }, { "epoch": 4.271131841213382, "grad_norm": 0.6953125, "learning_rate": 2.5697837601516116e-06, "loss": 0.7845, "step": 7956 }, { "epoch": 4.271668735948458, "grad_norm": 0.87109375, "learning_rate": 2.5660556073757695e-06, "loss": 0.7312, "step": 7957 }, { "epoch": 4.272205630683534, "grad_norm": 0.63671875, "learning_rate": 2.562330014597092e-06, "loss": 0.9128, "step": 7958 }, { "epoch": 4.27274252541861, "grad_norm": 0.6796875, "learning_rate": 2.558606982240705e-06, "loss": 0.6866, "step": 7959 }, { "epoch": 4.273279420153687, "grad_norm": 0.98046875, "learning_rate": 2.5548865107314607e-06, "loss": 1.1446, "step": 7960 }, { "epoch": 4.273816314888762, "grad_norm": 0.7578125, "learning_rate": 2.5511686004939086e-06, "loss": 0.9014, "step": 7961 }, { "epoch": 4.274353209623838, "grad_norm": 0.6171875, "learning_rate": 2.5474532519523087e-06, "loss": 0.7237, "step": 7962 }, { "epoch": 4.2748901043589145, "grad_norm": 0.60546875, "learning_rate": 2.543740465530636e-06, "loss": 0.8437, "step": 7963 }, { "epoch": 4.27542699909399, "grad_norm": 0.8515625, "learning_rate": 2.5400302416525632e-06, "loss": 0.8711, "step": 7964 }, { "epoch": 4.275963893829066, "grad_norm": 0.84765625, "learning_rate": 2.5363225807414655e-06, "loss": 0.872, "step": 7965 }, { "epoch": 4.276500788564142, "grad_norm": 0.6484375, "learning_rate": 2.5326174832204426e-06, "loss": 0.7451, "step": 7966 }, { "epoch": 4.277037683299218, "grad_norm": 0.66015625, "learning_rate": 2.5289149495122956e-06, "loss": 0.7314, "step": 7967 }, { "epoch": 4.277574578034294, "grad_norm": 0.7109375, "learning_rate": 2.5252149800395197e-06, "loss": 0.9796, "step": 7968 }, { "epoch": 4.27811147276937, "grad_norm": 0.640625, "learning_rate": 2.5215175752243357e-06, "loss": 0.8763, "step": 7969 }, { "epoch": 4.278648367504446, "grad_norm": 1.1640625, "learning_rate": 2.51782273548867e-06, "loss": 0.9131, "step": 7970 }, { "epoch": 4.279185262239523, "grad_norm": 0.8125, "learning_rate": 2.5141304612541354e-06, "loss": 0.7431, "step": 7971 }, { "epoch": 4.279722156974598, "grad_norm": 0.69140625, "learning_rate": 2.5104407529420838e-06, "loss": 0.9591, "step": 7972 }, { "epoch": 4.280259051709674, "grad_norm": 0.578125, "learning_rate": 2.5067536109735474e-06, "loss": 0.5859, "step": 7973 }, { "epoch": 4.2807959464447505, "grad_norm": 0.98046875, "learning_rate": 2.503069035769276e-06, "loss": 1.0047, "step": 7974 }, { "epoch": 4.281332841179826, "grad_norm": 0.55859375, "learning_rate": 2.499387027749725e-06, "loss": 0.6877, "step": 7975 }, { "epoch": 4.281869735914902, "grad_norm": 0.73828125, "learning_rate": 2.4957075873350665e-06, "loss": 0.7373, "step": 7976 }, { "epoch": 4.282406630649978, "grad_norm": 0.80859375, "learning_rate": 2.492030714945162e-06, "loss": 1.0971, "step": 7977 }, { "epoch": 4.282943525385054, "grad_norm": 0.65625, "learning_rate": 2.4883564109995957e-06, "loss": 0.7235, "step": 7978 }, { "epoch": 4.28348042012013, "grad_norm": 0.71484375, "learning_rate": 2.4846846759176506e-06, "loss": 0.9833, "step": 7979 }, { "epoch": 4.284017314855206, "grad_norm": 0.79296875, "learning_rate": 2.48101551011832e-06, "loss": 0.8382, "step": 7980 }, { "epoch": 4.284554209590282, "grad_norm": 0.6796875, "learning_rate": 2.4773489140202942e-06, "loss": 1.1936, "step": 7981 }, { "epoch": 4.285091104325359, "grad_norm": 0.6328125, "learning_rate": 2.4736848880419827e-06, "loss": 1.0067, "step": 7982 }, { "epoch": 4.285627999060434, "grad_norm": 0.6015625, "learning_rate": 2.470023432601504e-06, "loss": 0.6354, "step": 7983 }, { "epoch": 4.28616489379551, "grad_norm": 0.6015625, "learning_rate": 2.4663645481166648e-06, "loss": 0.8343, "step": 7984 }, { "epoch": 4.2867017885305865, "grad_norm": 0.734375, "learning_rate": 2.462708235004996e-06, "loss": 0.9136, "step": 7985 }, { "epoch": 4.287238683265662, "grad_norm": 0.765625, "learning_rate": 2.459054493683735e-06, "loss": 0.9607, "step": 7986 }, { "epoch": 4.287775578000738, "grad_norm": 0.73828125, "learning_rate": 2.4554033245698105e-06, "loss": 0.7222, "step": 7987 }, { "epoch": 4.288312472735814, "grad_norm": 0.734375, "learning_rate": 2.4517547280798693e-06, "loss": 0.8734, "step": 7988 }, { "epoch": 4.28884936747089, "grad_norm": 0.81640625, "learning_rate": 2.4481087046302752e-06, "loss": 0.9613, "step": 7989 }, { "epoch": 4.289386262205967, "grad_norm": 0.7578125, "learning_rate": 2.444465254637063e-06, "loss": 0.7166, "step": 7990 }, { "epoch": 4.289923156941042, "grad_norm": 0.7265625, "learning_rate": 2.4408243785160075e-06, "loss": 0.8868, "step": 7991 }, { "epoch": 4.290460051676118, "grad_norm": 0.8046875, "learning_rate": 2.437186076682585e-06, "loss": 0.9208, "step": 7992 }, { "epoch": 4.290996946411195, "grad_norm": 0.69921875, "learning_rate": 2.4335503495519636e-06, "loss": 0.9259, "step": 7993 }, { "epoch": 4.29153384114627, "grad_norm": 0.62109375, "learning_rate": 2.429917197539025e-06, "loss": 0.7361, "step": 7994 }, { "epoch": 4.292070735881346, "grad_norm": 0.671875, "learning_rate": 2.4262866210583686e-06, "loss": 0.8867, "step": 7995 }, { "epoch": 4.2926076306164225, "grad_norm": 0.73046875, "learning_rate": 2.4226586205242735e-06, "loss": 0.7309, "step": 7996 }, { "epoch": 4.293144525351498, "grad_norm": 0.8203125, "learning_rate": 2.419033196350756e-06, "loss": 1.1732, "step": 7997 }, { "epoch": 4.293681420086575, "grad_norm": 0.8046875, "learning_rate": 2.4154103489515107e-06, "loss": 0.9048, "step": 7998 }, { "epoch": 4.29421831482165, "grad_norm": 0.8125, "learning_rate": 2.411790078739959e-06, "loss": 0.7071, "step": 7999 }, { "epoch": 4.294755209556726, "grad_norm": 0.6171875, "learning_rate": 2.408172386129212e-06, "loss": 0.8387, "step": 8000 }, { "epoch": 4.294755209556726, "eval_loss": 0.9026855230331421, "eval_runtime": 46.7378, "eval_samples_per_second": 20.583, "eval_steps_per_second": 20.583, "step": 8000 }, { "epoch": 4.295292104291803, "grad_norm": 0.6875, "learning_rate": 2.4045572715321014e-06, "loss": 0.971, "step": 8001 }, { "epoch": 4.295828999026878, "grad_norm": 0.6171875, "learning_rate": 2.400944735361152e-06, "loss": 0.8212, "step": 8002 }, { "epoch": 4.296365893761954, "grad_norm": 0.65625, "learning_rate": 2.397334778028601e-06, "loss": 0.8329, "step": 8003 }, { "epoch": 4.296902788497031, "grad_norm": 0.71484375, "learning_rate": 2.3937273999463967e-06, "loss": 0.8759, "step": 8004 }, { "epoch": 4.297439683232106, "grad_norm": 0.62890625, "learning_rate": 2.390122601526179e-06, "loss": 0.8097, "step": 8005 }, { "epoch": 4.297976577967182, "grad_norm": 0.71484375, "learning_rate": 2.3865203831793e-06, "loss": 1.0514, "step": 8006 }, { "epoch": 4.2985134727022585, "grad_norm": 0.6015625, "learning_rate": 2.382920745316822e-06, "loss": 0.8337, "step": 8007 }, { "epoch": 4.299050367437334, "grad_norm": 0.72265625, "learning_rate": 2.379323688349516e-06, "loss": 0.9042, "step": 8008 }, { "epoch": 4.299587262172411, "grad_norm": 0.76171875, "learning_rate": 2.3757292126878382e-06, "loss": 0.8833, "step": 8009 }, { "epoch": 4.300124156907486, "grad_norm": 0.703125, "learning_rate": 2.372137318741968e-06, "loss": 0.7728, "step": 8010 }, { "epoch": 4.300661051642562, "grad_norm": 0.75, "learning_rate": 2.3685480069217973e-06, "loss": 0.8913, "step": 8011 }, { "epoch": 4.301197946377639, "grad_norm": 0.7109375, "learning_rate": 2.3649612776368958e-06, "loss": 0.7886, "step": 8012 }, { "epoch": 4.301734841112714, "grad_norm": 0.66015625, "learning_rate": 2.3613771312965633e-06, "loss": 0.8771, "step": 8013 }, { "epoch": 4.30227173584779, "grad_norm": 0.6875, "learning_rate": 2.3577955683098037e-06, "loss": 0.9066, "step": 8014 }, { "epoch": 4.3028086305828666, "grad_norm": 0.73046875, "learning_rate": 2.3542165890853006e-06, "loss": 0.9207, "step": 8015 }, { "epoch": 4.303345525317942, "grad_norm": 0.78515625, "learning_rate": 2.3506401940314727e-06, "loss": 0.909, "step": 8016 }, { "epoch": 4.303882420053018, "grad_norm": 0.6875, "learning_rate": 2.3470663835564317e-06, "loss": 1.0307, "step": 8017 }, { "epoch": 4.3044193147880945, "grad_norm": 0.671875, "learning_rate": 2.3434951580679876e-06, "loss": 0.9595, "step": 8018 }, { "epoch": 4.30495620952317, "grad_norm": 0.64453125, "learning_rate": 2.3399265179736675e-06, "loss": 0.6826, "step": 8019 }, { "epoch": 4.305493104258247, "grad_norm": 0.62109375, "learning_rate": 2.336360463680706e-06, "loss": 0.7527, "step": 8020 }, { "epoch": 4.306029998993322, "grad_norm": 0.7421875, "learning_rate": 2.3327969955960204e-06, "loss": 0.9664, "step": 8021 }, { "epoch": 4.306566893728398, "grad_norm": 0.83984375, "learning_rate": 2.329236114126257e-06, "loss": 0.9296, "step": 8022 }, { "epoch": 4.307103788463475, "grad_norm": 0.6484375, "learning_rate": 2.3256778196777544e-06, "loss": 0.7358, "step": 8023 }, { "epoch": 4.30764068319855, "grad_norm": 0.57421875, "learning_rate": 2.3221221126565606e-06, "loss": 0.752, "step": 8024 }, { "epoch": 4.308177577933626, "grad_norm": 0.6015625, "learning_rate": 2.3185689934684255e-06, "loss": 0.7511, "step": 8025 }, { "epoch": 4.3087144726687026, "grad_norm": 0.61328125, "learning_rate": 2.315018462518803e-06, "loss": 0.8922, "step": 8026 }, { "epoch": 4.309251367403778, "grad_norm": 0.59375, "learning_rate": 2.3114705202128607e-06, "loss": 0.6395, "step": 8027 }, { "epoch": 4.309788262138855, "grad_norm": 0.5859375, "learning_rate": 2.307925166955452e-06, "loss": 0.7146, "step": 8028 }, { "epoch": 4.3103251568739305, "grad_norm": 0.75, "learning_rate": 2.304382403151156e-06, "loss": 0.9844, "step": 8029 }, { "epoch": 4.310862051609006, "grad_norm": 0.9375, "learning_rate": 2.300842229204253e-06, "loss": 0.7643, "step": 8030 }, { "epoch": 4.311398946344083, "grad_norm": 0.66015625, "learning_rate": 2.2973046455187054e-06, "loss": 0.7227, "step": 8031 }, { "epoch": 4.311935841079158, "grad_norm": 0.6484375, "learning_rate": 2.293769652498204e-06, "loss": 0.7683, "step": 8032 }, { "epoch": 4.312472735814234, "grad_norm": 0.66796875, "learning_rate": 2.29023725054614e-06, "loss": 0.8554, "step": 8033 }, { "epoch": 4.313009630549311, "grad_norm": 0.70703125, "learning_rate": 2.2867074400655967e-06, "loss": 0.83, "step": 8034 }, { "epoch": 4.313546525284386, "grad_norm": 0.640625, "learning_rate": 2.2831802214593775e-06, "loss": 0.7588, "step": 8035 }, { "epoch": 4.314083420019463, "grad_norm": 0.63671875, "learning_rate": 2.2796555951299814e-06, "loss": 0.8654, "step": 8036 }, { "epoch": 4.3146203147545386, "grad_norm": 0.67578125, "learning_rate": 2.2761335614796097e-06, "loss": 1.0302, "step": 8037 }, { "epoch": 4.315157209489614, "grad_norm": 0.6875, "learning_rate": 2.2726141209101712e-06, "loss": 0.8198, "step": 8038 }, { "epoch": 4.315694104224691, "grad_norm": 0.59765625, "learning_rate": 2.269097273823287e-06, "loss": 0.7993, "step": 8039 }, { "epoch": 4.3162309989597665, "grad_norm": 0.71484375, "learning_rate": 2.2655830206202655e-06, "loss": 1.0365, "step": 8040 }, { "epoch": 4.316767893694842, "grad_norm": 0.6875, "learning_rate": 2.262071361702128e-06, "loss": 0.811, "step": 8041 }, { "epoch": 4.317304788429919, "grad_norm": 0.81640625, "learning_rate": 2.258562297469599e-06, "loss": 0.9286, "step": 8042 }, { "epoch": 4.317841683164994, "grad_norm": 0.7109375, "learning_rate": 2.255055828323113e-06, "loss": 0.7901, "step": 8043 }, { "epoch": 4.31837857790007, "grad_norm": 0.59765625, "learning_rate": 2.2515519546627924e-06, "loss": 0.7912, "step": 8044 }, { "epoch": 4.318915472635147, "grad_norm": 0.60546875, "learning_rate": 2.248050676888486e-06, "loss": 0.7476, "step": 8045 }, { "epoch": 4.319452367370222, "grad_norm": 0.78125, "learning_rate": 2.244551995399724e-06, "loss": 1.039, "step": 8046 }, { "epoch": 4.319989262105299, "grad_norm": 0.66796875, "learning_rate": 2.241055910595757e-06, "loss": 0.7083, "step": 8047 }, { "epoch": 4.3205261568403746, "grad_norm": 0.58203125, "learning_rate": 2.237562422875525e-06, "loss": 0.8362, "step": 8048 }, { "epoch": 4.32106305157545, "grad_norm": 0.6796875, "learning_rate": 2.2340715326376854e-06, "loss": 0.9089, "step": 8049 }, { "epoch": 4.321599946310527, "grad_norm": 0.72265625, "learning_rate": 2.23058324028059e-06, "loss": 0.8789, "step": 8050 }, { "epoch": 4.321599946310527, "eval_loss": 0.9027815461158752, "eval_runtime": 46.3347, "eval_samples_per_second": 20.762, "eval_steps_per_second": 20.762, "step": 8050 }, { "epoch": 4.3221368410456025, "grad_norm": 0.57421875, "learning_rate": 2.2270975462022973e-06, "loss": 0.7919, "step": 8051 }, { "epoch": 4.322673735780678, "grad_norm": 0.70703125, "learning_rate": 2.2236144508005735e-06, "loss": 0.7806, "step": 8052 }, { "epoch": 4.323210630515755, "grad_norm": 0.66015625, "learning_rate": 2.2201339544728776e-06, "loss": 0.9323, "step": 8053 }, { "epoch": 4.32374752525083, "grad_norm": 0.65234375, "learning_rate": 2.2166560576163813e-06, "loss": 1.1064, "step": 8054 }, { "epoch": 4.324284419985906, "grad_norm": 0.62109375, "learning_rate": 2.213180760627959e-06, "loss": 0.7244, "step": 8055 }, { "epoch": 4.324821314720983, "grad_norm": 0.71484375, "learning_rate": 2.209708063904184e-06, "loss": 1.0234, "step": 8056 }, { "epoch": 4.325358209456058, "grad_norm": 0.62890625, "learning_rate": 2.206237967841332e-06, "loss": 0.7779, "step": 8057 }, { "epoch": 4.325895104191135, "grad_norm": 0.7109375, "learning_rate": 2.2027704728353917e-06, "loss": 0.8836, "step": 8058 }, { "epoch": 4.3264319989262106, "grad_norm": 0.70703125, "learning_rate": 2.1993055792820377e-06, "loss": 0.988, "step": 8059 }, { "epoch": 4.326968893661286, "grad_norm": 0.640625, "learning_rate": 2.1958432875766653e-06, "loss": 0.7226, "step": 8060 }, { "epoch": 4.327505788396363, "grad_norm": 0.87109375, "learning_rate": 2.1923835981143697e-06, "loss": 0.9372, "step": 8061 }, { "epoch": 4.3280426831314385, "grad_norm": 0.6328125, "learning_rate": 2.1889265112899377e-06, "loss": 0.8695, "step": 8062 }, { "epoch": 4.328579577866514, "grad_norm": 0.6328125, "learning_rate": 2.1854720274978673e-06, "loss": 0.8805, "step": 8063 }, { "epoch": 4.329116472601591, "grad_norm": 0.7734375, "learning_rate": 2.182020147132366e-06, "loss": 0.9173, "step": 8064 }, { "epoch": 4.329653367336666, "grad_norm": 0.609375, "learning_rate": 2.178570870587335e-06, "loss": 0.7687, "step": 8065 }, { "epoch": 4.330190262071743, "grad_norm": 0.72265625, "learning_rate": 2.1751241982563702e-06, "loss": 0.9901, "step": 8066 }, { "epoch": 4.330727156806819, "grad_norm": 0.7734375, "learning_rate": 2.1716801305327884e-06, "loss": 0.9185, "step": 8067 }, { "epoch": 4.331264051541894, "grad_norm": 0.65234375, "learning_rate": 2.168238667809605e-06, "loss": 0.6998, "step": 8068 }, { "epoch": 4.331800946276971, "grad_norm": 0.78125, "learning_rate": 2.1647998104795286e-06, "loss": 0.8205, "step": 8069 }, { "epoch": 4.3323378410120466, "grad_norm": 0.734375, "learning_rate": 2.1613635589349756e-06, "loss": 0.9229, "step": 8070 }, { "epoch": 4.332874735747122, "grad_norm": 0.734375, "learning_rate": 2.157929913568074e-06, "loss": 0.7987, "step": 8071 }, { "epoch": 4.333411630482199, "grad_norm": 0.6953125, "learning_rate": 2.1544988747706414e-06, "loss": 0.7715, "step": 8072 }, { "epoch": 4.3339485252172745, "grad_norm": 0.78515625, "learning_rate": 2.151070442934197e-06, "loss": 0.907, "step": 8073 }, { "epoch": 4.334485419952351, "grad_norm": 0.8046875, "learning_rate": 2.147644618449976e-06, "loss": 1.0381, "step": 8074 }, { "epoch": 4.335022314687427, "grad_norm": 0.7109375, "learning_rate": 2.144221401708904e-06, "loss": 0.8121, "step": 8075 }, { "epoch": 4.335559209422502, "grad_norm": 0.7421875, "learning_rate": 2.140800793101613e-06, "loss": 0.8282, "step": 8076 }, { "epoch": 4.336096104157579, "grad_norm": 0.6484375, "learning_rate": 2.1373827930184444e-06, "loss": 0.7177, "step": 8077 }, { "epoch": 4.336632998892655, "grad_norm": 0.52734375, "learning_rate": 2.1339674018494292e-06, "loss": 0.7439, "step": 8078 }, { "epoch": 4.33716989362773, "grad_norm": 0.6171875, "learning_rate": 2.130554619984304e-06, "loss": 0.9383, "step": 8079 }, { "epoch": 4.337706788362807, "grad_norm": 0.62890625, "learning_rate": 2.1271444478125235e-06, "loss": 0.7015, "step": 8080 }, { "epoch": 4.3382436830978826, "grad_norm": 0.66015625, "learning_rate": 2.123736885723221e-06, "loss": 0.8099, "step": 8081 }, { "epoch": 4.338780577832958, "grad_norm": 0.75390625, "learning_rate": 2.120331934105238e-06, "loss": 0.9487, "step": 8082 }, { "epoch": 4.339317472568035, "grad_norm": 0.71875, "learning_rate": 2.1169295933471307e-06, "loss": 0.8674, "step": 8083 }, { "epoch": 4.3398543673031105, "grad_norm": 0.671875, "learning_rate": 2.113529863837152e-06, "loss": 0.9466, "step": 8084 }, { "epoch": 4.340391262038187, "grad_norm": 0.55859375, "learning_rate": 2.1101327459632446e-06, "loss": 0.7537, "step": 8085 }, { "epoch": 4.340928156773263, "grad_norm": 0.625, "learning_rate": 2.1067382401130707e-06, "loss": 0.8198, "step": 8086 }, { "epoch": 4.341465051508338, "grad_norm": 0.69921875, "learning_rate": 2.1033463466739816e-06, "loss": 0.9313, "step": 8087 }, { "epoch": 4.342001946243415, "grad_norm": 0.6640625, "learning_rate": 2.0999570660330344e-06, "loss": 0.8846, "step": 8088 }, { "epoch": 4.342538840978491, "grad_norm": 0.67578125, "learning_rate": 2.0965703985769996e-06, "loss": 0.7537, "step": 8089 }, { "epoch": 4.343075735713566, "grad_norm": 0.69921875, "learning_rate": 2.093186344692327e-06, "loss": 0.8586, "step": 8090 }, { "epoch": 4.343612630448643, "grad_norm": 0.62109375, "learning_rate": 2.0898049047651802e-06, "loss": 0.7498, "step": 8091 }, { "epoch": 4.3441495251837186, "grad_norm": 0.58203125, "learning_rate": 2.0864260791814305e-06, "loss": 0.6604, "step": 8092 }, { "epoch": 4.344686419918794, "grad_norm": 0.70703125, "learning_rate": 2.0830498683266446e-06, "loss": 0.8944, "step": 8093 }, { "epoch": 4.345223314653871, "grad_norm": 0.71484375, "learning_rate": 2.079676272586084e-06, "loss": 0.8687, "step": 8094 }, { "epoch": 4.3457602093889465, "grad_norm": 0.609375, "learning_rate": 2.0763052923447213e-06, "loss": 0.7794, "step": 8095 }, { "epoch": 4.346297104124023, "grad_norm": 0.6484375, "learning_rate": 2.0729369279872375e-06, "loss": 0.7759, "step": 8096 }, { "epoch": 4.346833998859099, "grad_norm": 0.61328125, "learning_rate": 2.0695711798979915e-06, "loss": 0.7231, "step": 8097 }, { "epoch": 4.347370893594174, "grad_norm": 0.63671875, "learning_rate": 2.0662080484610686e-06, "loss": 0.8852, "step": 8098 }, { "epoch": 4.347907788329251, "grad_norm": 0.68359375, "learning_rate": 2.0628475340602416e-06, "loss": 0.7639, "step": 8099 }, { "epoch": 4.348444683064327, "grad_norm": 2.5, "learning_rate": 2.059489637078982e-06, "loss": 1.0297, "step": 8100 }, { "epoch": 4.348444683064327, "eval_loss": 0.9026857018470764, "eval_runtime": 46.6968, "eval_samples_per_second": 20.601, "eval_steps_per_second": 20.601, "step": 8100 }, { "epoch": 4.348981577799402, "grad_norm": 0.69140625, "learning_rate": 2.0561343579004715e-06, "loss": 0.8775, "step": 8101 }, { "epoch": 4.349518472534479, "grad_norm": 0.66015625, "learning_rate": 2.0527816969075964e-06, "loss": 0.849, "step": 8102 }, { "epoch": 4.3500553672695546, "grad_norm": 0.875, "learning_rate": 2.049431654482928e-06, "loss": 0.9483, "step": 8103 }, { "epoch": 4.350592262004631, "grad_norm": 0.69921875, "learning_rate": 2.0460842310087548e-06, "loss": 0.912, "step": 8104 }, { "epoch": 4.351129156739707, "grad_norm": 0.69140625, "learning_rate": 2.042739426867063e-06, "loss": 0.9216, "step": 8105 }, { "epoch": 4.3516660514747825, "grad_norm": 0.6953125, "learning_rate": 2.0393972424395308e-06, "loss": 0.83, "step": 8106 }, { "epoch": 4.352202946209859, "grad_norm": 0.6484375, "learning_rate": 2.0360576781075417e-06, "loss": 0.882, "step": 8107 }, { "epoch": 4.352739840944935, "grad_norm": 0.765625, "learning_rate": 2.0327207342521885e-06, "loss": 0.848, "step": 8108 }, { "epoch": 4.35327673568001, "grad_norm": 0.74609375, "learning_rate": 2.029386411254258e-06, "loss": 0.9986, "step": 8109 }, { "epoch": 4.353813630415087, "grad_norm": 0.671875, "learning_rate": 2.026054709494235e-06, "loss": 0.7709, "step": 8110 }, { "epoch": 4.354350525150163, "grad_norm": 0.62109375, "learning_rate": 2.0227256293523096e-06, "loss": 0.9553, "step": 8111 }, { "epoch": 4.354887419885239, "grad_norm": 0.7265625, "learning_rate": 2.0193991712083804e-06, "loss": 1.0477, "step": 8112 }, { "epoch": 4.355424314620315, "grad_norm": 0.63671875, "learning_rate": 2.0160753354420246e-06, "loss": 0.7884, "step": 8113 }, { "epoch": 4.3559612093553906, "grad_norm": 0.5703125, "learning_rate": 2.0127541224325474e-06, "loss": 0.827, "step": 8114 }, { "epoch": 4.356498104090467, "grad_norm": 0.8046875, "learning_rate": 2.0094355325589315e-06, "loss": 0.9843, "step": 8115 }, { "epoch": 4.357034998825543, "grad_norm": 0.67578125, "learning_rate": 2.006119566199871e-06, "loss": 0.7487, "step": 8116 }, { "epoch": 4.3575718935606185, "grad_norm": 0.5859375, "learning_rate": 2.0028062237337634e-06, "loss": 0.7653, "step": 8117 }, { "epoch": 4.358108788295695, "grad_norm": 0.7890625, "learning_rate": 1.999495505538704e-06, "loss": 0.9481, "step": 8118 }, { "epoch": 4.358645683030771, "grad_norm": 0.671875, "learning_rate": 1.996187411992481e-06, "loss": 0.7662, "step": 8119 }, { "epoch": 4.359182577765846, "grad_norm": 0.7265625, "learning_rate": 1.992881943472594e-06, "loss": 0.8063, "step": 8120 }, { "epoch": 4.359719472500923, "grad_norm": 0.62890625, "learning_rate": 1.9895791003562425e-06, "loss": 0.7327, "step": 8121 }, { "epoch": 4.360256367235999, "grad_norm": 0.80859375, "learning_rate": 1.986278883020318e-06, "loss": 0.9624, "step": 8122 }, { "epoch": 4.360793261971075, "grad_norm": 0.64453125, "learning_rate": 1.9829812918414183e-06, "loss": 0.7113, "step": 8123 }, { "epoch": 4.361330156706151, "grad_norm": 0.7578125, "learning_rate": 1.9796863271958373e-06, "loss": 0.6861, "step": 8124 }, { "epoch": 4.3618670514412266, "grad_norm": 0.80859375, "learning_rate": 1.9763939894595797e-06, "loss": 0.865, "step": 8125 }, { "epoch": 4.362403946176303, "grad_norm": 0.73828125, "learning_rate": 1.9731042790083367e-06, "loss": 0.7453, "step": 8126 }, { "epoch": 4.362940840911379, "grad_norm": 0.79296875, "learning_rate": 1.9698171962175078e-06, "loss": 0.9922, "step": 8127 }, { "epoch": 4.3634777356464545, "grad_norm": 0.6328125, "learning_rate": 1.9665327414621876e-06, "loss": 0.8148, "step": 8128 }, { "epoch": 4.364014630381531, "grad_norm": 0.82421875, "learning_rate": 1.9632509151171763e-06, "loss": 0.9316, "step": 8129 }, { "epoch": 4.364551525116607, "grad_norm": 0.76171875, "learning_rate": 1.959971717556977e-06, "loss": 1.1203, "step": 8130 }, { "epoch": 4.365088419851682, "grad_norm": 0.67578125, "learning_rate": 1.956695149155785e-06, "loss": 0.7563, "step": 8131 }, { "epoch": 4.365625314586759, "grad_norm": 0.69921875, "learning_rate": 1.95342121028749e-06, "loss": 0.7891, "step": 8132 }, { "epoch": 4.366162209321835, "grad_norm": 0.67578125, "learning_rate": 1.950149901325696e-06, "loss": 0.8395, "step": 8133 }, { "epoch": 4.366699104056911, "grad_norm": 0.63671875, "learning_rate": 1.9468812226437067e-06, "loss": 0.6971, "step": 8134 }, { "epoch": 4.367235998791987, "grad_norm": 0.77734375, "learning_rate": 1.9436151746145077e-06, "loss": 0.8547, "step": 8135 }, { "epoch": 4.3677728935270625, "grad_norm": 0.6328125, "learning_rate": 1.9403517576108034e-06, "loss": 0.6845, "step": 8136 }, { "epoch": 4.368309788262139, "grad_norm": 0.640625, "learning_rate": 1.9370909720049957e-06, "loss": 0.995, "step": 8137 }, { "epoch": 4.368846682997215, "grad_norm": 0.6875, "learning_rate": 1.9338328181691705e-06, "loss": 0.9313, "step": 8138 }, { "epoch": 4.3693835777322905, "grad_norm": 0.5859375, "learning_rate": 1.9305772964751304e-06, "loss": 0.7913, "step": 8139 }, { "epoch": 4.369920472467367, "grad_norm": 0.75390625, "learning_rate": 1.9273244072943815e-06, "loss": 0.9748, "step": 8140 }, { "epoch": 4.370457367202443, "grad_norm": 0.6953125, "learning_rate": 1.924074150998101e-06, "loss": 0.7767, "step": 8141 }, { "epoch": 4.370994261937519, "grad_norm": 0.59765625, "learning_rate": 1.9208265279571926e-06, "loss": 0.8383, "step": 8142 }, { "epoch": 4.371531156672595, "grad_norm": 0.5625, "learning_rate": 1.917581538542254e-06, "loss": 0.6568, "step": 8143 }, { "epoch": 4.372068051407671, "grad_norm": 0.5625, "learning_rate": 1.9143391831235728e-06, "loss": 0.6376, "step": 8144 }, { "epoch": 4.372604946142747, "grad_norm": 0.6484375, "learning_rate": 1.9110994620711476e-06, "loss": 0.7222, "step": 8145 }, { "epoch": 4.373141840877823, "grad_norm": 0.6640625, "learning_rate": 1.9078623757546767e-06, "loss": 0.8181, "step": 8146 }, { "epoch": 4.3736787356128985, "grad_norm": 0.83203125, "learning_rate": 1.9046279245435421e-06, "loss": 0.775, "step": 8147 }, { "epoch": 4.374215630347975, "grad_norm": 0.71875, "learning_rate": 1.9013961088068438e-06, "loss": 0.9851, "step": 8148 }, { "epoch": 4.374752525083051, "grad_norm": 0.75, "learning_rate": 1.8981669289133668e-06, "loss": 0.728, "step": 8149 }, { "epoch": 4.375289419818127, "grad_norm": 0.59765625, "learning_rate": 1.8949403852316084e-06, "loss": 0.7222, "step": 8150 }, { "epoch": 4.375289419818127, "eval_loss": 0.9027720093727112, "eval_runtime": 46.7202, "eval_samples_per_second": 20.591, "eval_steps_per_second": 20.591, "step": 8150 }, { "epoch": 4.375826314553203, "grad_norm": 0.62890625, "learning_rate": 1.8917164781297486e-06, "loss": 0.7624, "step": 8151 }, { "epoch": 4.376363209288279, "grad_norm": 0.703125, "learning_rate": 1.8884952079756796e-06, "loss": 0.8883, "step": 8152 }, { "epoch": 4.376900104023355, "grad_norm": 0.640625, "learning_rate": 1.885276575136999e-06, "loss": 0.8894, "step": 8153 }, { "epoch": 4.377436998758431, "grad_norm": 0.63671875, "learning_rate": 1.8820605799809793e-06, "loss": 0.7652, "step": 8154 }, { "epoch": 4.377973893493507, "grad_norm": 0.6484375, "learning_rate": 1.8788472228746135e-06, "loss": 0.8017, "step": 8155 }, { "epoch": 4.378510788228583, "grad_norm": 0.76953125, "learning_rate": 1.8756365041845913e-06, "loss": 0.8047, "step": 8156 }, { "epoch": 4.379047682963659, "grad_norm": 0.734375, "learning_rate": 1.872428424277284e-06, "loss": 0.859, "step": 8157 }, { "epoch": 4.3795845776987345, "grad_norm": 0.7265625, "learning_rate": 1.8692229835187813e-06, "loss": 0.8247, "step": 8158 }, { "epoch": 4.380121472433811, "grad_norm": 0.69921875, "learning_rate": 1.8660201822748664e-06, "loss": 0.6967, "step": 8159 }, { "epoch": 4.380658367168887, "grad_norm": 0.87109375, "learning_rate": 1.8628200209110131e-06, "loss": 1.0077, "step": 8160 }, { "epoch": 4.381195261903963, "grad_norm": 0.60546875, "learning_rate": 1.8596224997924043e-06, "loss": 1.0272, "step": 8161 }, { "epoch": 4.381732156639039, "grad_norm": 0.7109375, "learning_rate": 1.8564276192839208e-06, "loss": 0.7763, "step": 8162 }, { "epoch": 4.382269051374115, "grad_norm": 0.61328125, "learning_rate": 1.8532353797501318e-06, "loss": 0.8049, "step": 8163 }, { "epoch": 4.382805946109191, "grad_norm": 0.7421875, "learning_rate": 1.8500457815553152e-06, "loss": 0.9138, "step": 8164 }, { "epoch": 4.383342840844267, "grad_norm": 0.7578125, "learning_rate": 1.8468588250634494e-06, "loss": 0.9941, "step": 8165 }, { "epoch": 4.383879735579343, "grad_norm": 0.73828125, "learning_rate": 1.8436745106382047e-06, "loss": 0.9625, "step": 8166 }, { "epoch": 4.384416630314419, "grad_norm": 0.671875, "learning_rate": 1.8404928386429432e-06, "loss": 0.7265, "step": 8167 }, { "epoch": 4.384953525049495, "grad_norm": 0.77734375, "learning_rate": 1.837313809440741e-06, "loss": 0.8467, "step": 8168 }, { "epoch": 4.3854904197845705, "grad_norm": 0.703125, "learning_rate": 1.8341374233943693e-06, "loss": 0.8626, "step": 8169 }, { "epoch": 4.386027314519647, "grad_norm": 0.63671875, "learning_rate": 1.830963680866285e-06, "loss": 0.9047, "step": 8170 }, { "epoch": 4.386564209254723, "grad_norm": 0.69140625, "learning_rate": 1.8277925822186625e-06, "loss": 0.949, "step": 8171 }, { "epoch": 4.387101103989799, "grad_norm": 1.0234375, "learning_rate": 1.824624127813354e-06, "loss": 0.7457, "step": 8172 }, { "epoch": 4.387637998724875, "grad_norm": 0.7109375, "learning_rate": 1.8214583180119288e-06, "loss": 0.7518, "step": 8173 }, { "epoch": 4.388174893459951, "grad_norm": 0.8046875, "learning_rate": 1.8182951531756364e-06, "loss": 0.8073, "step": 8174 }, { "epoch": 4.388711788195027, "grad_norm": 0.7578125, "learning_rate": 1.815134633665444e-06, "loss": 0.6695, "step": 8175 }, { "epoch": 4.389248682930103, "grad_norm": 0.61328125, "learning_rate": 1.811976759841999e-06, "loss": 0.7827, "step": 8176 }, { "epoch": 4.389785577665179, "grad_norm": 0.953125, "learning_rate": 1.8088215320656577e-06, "loss": 0.9644, "step": 8177 }, { "epoch": 4.390322472400255, "grad_norm": 0.74609375, "learning_rate": 1.8056689506964764e-06, "loss": 0.9412, "step": 8178 }, { "epoch": 4.390859367135331, "grad_norm": 1.015625, "learning_rate": 1.802519016094198e-06, "loss": 0.9344, "step": 8179 }, { "epoch": 4.391396261870407, "grad_norm": 0.73046875, "learning_rate": 1.799371728618271e-06, "loss": 0.8584, "step": 8180 }, { "epoch": 4.391933156605483, "grad_norm": 0.7578125, "learning_rate": 1.7962270886278442e-06, "loss": 0.9523, "step": 8181 }, { "epoch": 4.392470051340559, "grad_norm": 0.6875, "learning_rate": 1.7930850964817614e-06, "loss": 0.9462, "step": 8182 }, { "epoch": 4.393006946075635, "grad_norm": 0.7734375, "learning_rate": 1.789945752538555e-06, "loss": 0.852, "step": 8183 }, { "epoch": 4.393543840810711, "grad_norm": 0.65625, "learning_rate": 1.7868090571564744e-06, "loss": 0.8787, "step": 8184 }, { "epoch": 4.394080735545787, "grad_norm": 0.61328125, "learning_rate": 1.7836750106934474e-06, "loss": 0.8575, "step": 8185 }, { "epoch": 4.394617630280863, "grad_norm": 0.71875, "learning_rate": 1.780543613507113e-06, "loss": 0.8147, "step": 8186 }, { "epoch": 4.395154525015939, "grad_norm": 0.75, "learning_rate": 1.7774148659548072e-06, "loss": 1.0441, "step": 8187 }, { "epoch": 4.3956914197510155, "grad_norm": 0.68359375, "learning_rate": 1.7742887683935506e-06, "loss": 1.0124, "step": 8188 }, { "epoch": 4.396228314486091, "grad_norm": 0.69921875, "learning_rate": 1.7711653211800739e-06, "loss": 0.7514, "step": 8189 }, { "epoch": 4.396765209221167, "grad_norm": 0.625, "learning_rate": 1.7680445246708115e-06, "loss": 0.6354, "step": 8190 }, { "epoch": 4.397302103956243, "grad_norm": 0.8046875, "learning_rate": 1.7649263792218734e-06, "loss": 1.0701, "step": 8191 }, { "epoch": 4.397838998691319, "grad_norm": 1.1328125, "learning_rate": 1.761810885189083e-06, "loss": 0.9761, "step": 8192 }, { "epoch": 4.398375893426395, "grad_norm": 0.67578125, "learning_rate": 1.7586980429279588e-06, "loss": 0.865, "step": 8193 }, { "epoch": 4.398912788161471, "grad_norm": 0.703125, "learning_rate": 1.7555878527937164e-06, "loss": 0.7017, "step": 8194 }, { "epoch": 4.399449682896547, "grad_norm": 0.8046875, "learning_rate": 1.752480315141264e-06, "loss": 0.8551, "step": 8195 }, { "epoch": 4.399986577631623, "grad_norm": 0.62890625, "learning_rate": 1.7493754303252147e-06, "loss": 0.7346, "step": 8196 }, { "epoch": 4.400523472366699, "grad_norm": 0.640625, "learning_rate": 1.746273198699877e-06, "loss": 0.7483, "step": 8197 }, { "epoch": 4.401060367101775, "grad_norm": 0.58984375, "learning_rate": 1.743173620619251e-06, "loss": 0.8668, "step": 8198 }, { "epoch": 4.4015972618368515, "grad_norm": 0.58984375, "learning_rate": 1.7400766964370375e-06, "loss": 0.6748, "step": 8199 }, { "epoch": 4.402134156571927, "grad_norm": 0.609375, "learning_rate": 1.7369824265066365e-06, "loss": 0.8673, "step": 8200 }, { "epoch": 4.402134156571927, "eval_loss": 0.9027100205421448, "eval_runtime": 46.0007, "eval_samples_per_second": 20.913, "eval_steps_per_second": 20.913, "step": 8200 }, { "epoch": 4.402671051307003, "grad_norm": 0.63671875, "learning_rate": 1.7338908111811408e-06, "loss": 0.9569, "step": 8201 }, { "epoch": 4.403207946042079, "grad_norm": 0.734375, "learning_rate": 1.7308018508133462e-06, "loss": 0.8345, "step": 8202 }, { "epoch": 4.403744840777155, "grad_norm": 0.62109375, "learning_rate": 1.7277155457557426e-06, "loss": 0.8133, "step": 8203 }, { "epoch": 4.404281735512231, "grad_norm": 0.66796875, "learning_rate": 1.7246318963605123e-06, "loss": 0.9958, "step": 8204 }, { "epoch": 4.404818630247307, "grad_norm": 0.625, "learning_rate": 1.7215509029795436e-06, "loss": 0.8414, "step": 8205 }, { "epoch": 4.405355524982383, "grad_norm": 0.5625, "learning_rate": 1.7184725659644157e-06, "loss": 0.7352, "step": 8206 }, { "epoch": 4.405892419717459, "grad_norm": 0.68359375, "learning_rate": 1.7153968856664038e-06, "loss": 0.8369, "step": 8207 }, { "epoch": 4.406429314452535, "grad_norm": 0.765625, "learning_rate": 1.7123238624364795e-06, "loss": 0.8539, "step": 8208 }, { "epoch": 4.406966209187611, "grad_norm": 0.68359375, "learning_rate": 1.7092534966253182e-06, "loss": 0.7729, "step": 8209 }, { "epoch": 4.4075031039226875, "grad_norm": 0.76171875, "learning_rate": 1.7061857885832893e-06, "loss": 0.8306, "step": 8210 }, { "epoch": 4.408039998657763, "grad_norm": 0.61328125, "learning_rate": 1.703120738660452e-06, "loss": 0.729, "step": 8211 }, { "epoch": 4.408576893392839, "grad_norm": 0.76171875, "learning_rate": 1.7000583472065734e-06, "loss": 0.8055, "step": 8212 }, { "epoch": 4.409113788127915, "grad_norm": 0.66796875, "learning_rate": 1.696998614571102e-06, "loss": 0.8735, "step": 8213 }, { "epoch": 4.409650682862991, "grad_norm": 0.65625, "learning_rate": 1.6939415411031972e-06, "loss": 0.9641, "step": 8214 }, { "epoch": 4.410187577598067, "grad_norm": 0.7734375, "learning_rate": 1.6908871271517135e-06, "loss": 0.849, "step": 8215 }, { "epoch": 4.410724472333143, "grad_norm": 0.64453125, "learning_rate": 1.687835373065194e-06, "loss": 0.9493, "step": 8216 }, { "epoch": 4.411261367068219, "grad_norm": 0.75390625, "learning_rate": 1.6847862791918829e-06, "loss": 0.8929, "step": 8217 }, { "epoch": 4.411798261803296, "grad_norm": 0.8125, "learning_rate": 1.681739845879718e-06, "loss": 0.8998, "step": 8218 }, { "epoch": 4.412335156538371, "grad_norm": 0.671875, "learning_rate": 1.6786960734763408e-06, "loss": 0.7558, "step": 8219 }, { "epoch": 4.412872051273447, "grad_norm": 0.66796875, "learning_rate": 1.6756549623290818e-06, "loss": 0.8733, "step": 8220 }, { "epoch": 4.4134089460085235, "grad_norm": 0.7734375, "learning_rate": 1.6726165127849692e-06, "loss": 0.7853, "step": 8221 }, { "epoch": 4.413945840743599, "grad_norm": 0.73046875, "learning_rate": 1.6695807251907335e-06, "loss": 0.9125, "step": 8222 }, { "epoch": 4.414482735478675, "grad_norm": 0.97265625, "learning_rate": 1.6665475998927872e-06, "loss": 1.0584, "step": 8223 }, { "epoch": 4.415019630213751, "grad_norm": 0.703125, "learning_rate": 1.663517137237261e-06, "loss": 0.8236, "step": 8224 }, { "epoch": 4.415556524948827, "grad_norm": 0.7421875, "learning_rate": 1.6604893375699594e-06, "loss": 0.867, "step": 8225 }, { "epoch": 4.416093419683904, "grad_norm": 0.73828125, "learning_rate": 1.6574642012363945e-06, "loss": 0.8422, "step": 8226 }, { "epoch": 4.416630314418979, "grad_norm": 0.765625, "learning_rate": 1.654441728581771e-06, "loss": 0.8131, "step": 8227 }, { "epoch": 4.417167209154055, "grad_norm": 0.65625, "learning_rate": 1.651421919950999e-06, "loss": 0.8059, "step": 8228 }, { "epoch": 4.417704103889132, "grad_norm": 0.7734375, "learning_rate": 1.6484047756886666e-06, "loss": 0.7703, "step": 8229 }, { "epoch": 4.418240998624207, "grad_norm": 0.73046875, "learning_rate": 1.6453902961390738e-06, "loss": 0.9109, "step": 8230 }, { "epoch": 4.418777893359283, "grad_norm": 0.6796875, "learning_rate": 1.6423784816462145e-06, "loss": 0.789, "step": 8231 }, { "epoch": 4.4193147880943595, "grad_norm": 0.69921875, "learning_rate": 1.6393693325537724e-06, "loss": 0.8911, "step": 8232 }, { "epoch": 4.419851682829435, "grad_norm": 0.74609375, "learning_rate": 1.6363628492051225e-06, "loss": 0.8648, "step": 8233 }, { "epoch": 4.420388577564511, "grad_norm": 0.6328125, "learning_rate": 1.6333590319433461e-06, "loss": 0.7889, "step": 8234 }, { "epoch": 4.420925472299587, "grad_norm": 0.61328125, "learning_rate": 1.6303578811112246e-06, "loss": 0.7976, "step": 8235 }, { "epoch": 4.421462367034663, "grad_norm": 0.80859375, "learning_rate": 1.6273593970512174e-06, "loss": 0.8777, "step": 8236 }, { "epoch": 4.42199926176974, "grad_norm": 0.671875, "learning_rate": 1.624363580105495e-06, "loss": 0.8699, "step": 8237 }, { "epoch": 4.422536156504815, "grad_norm": 0.80859375, "learning_rate": 1.6213704306159178e-06, "loss": 0.7813, "step": 8238 }, { "epoch": 4.423073051239891, "grad_norm": 0.625, "learning_rate": 1.6183799489240398e-06, "loss": 0.7055, "step": 8239 }, { "epoch": 4.423609945974968, "grad_norm": 0.78125, "learning_rate": 1.615392135371116e-06, "loss": 0.8254, "step": 8240 }, { "epoch": 4.424146840710043, "grad_norm": 0.76953125, "learning_rate": 1.612406990298093e-06, "loss": 0.9572, "step": 8241 }, { "epoch": 4.424683735445119, "grad_norm": 0.6875, "learning_rate": 1.6094245140456072e-06, "loss": 0.8455, "step": 8242 }, { "epoch": 4.4252206301801955, "grad_norm": 0.66015625, "learning_rate": 1.6064447069540024e-06, "loss": 0.762, "step": 8243 }, { "epoch": 4.425757524915271, "grad_norm": 0.6328125, "learning_rate": 1.603467569363315e-06, "loss": 0.8894, "step": 8244 }, { "epoch": 4.426294419650347, "grad_norm": 0.66015625, "learning_rate": 1.600493101613268e-06, "loss": 0.8646, "step": 8245 }, { "epoch": 4.426831314385423, "grad_norm": 0.63671875, "learning_rate": 1.5975213040432897e-06, "loss": 0.9088, "step": 8246 }, { "epoch": 4.427368209120499, "grad_norm": 0.7421875, "learning_rate": 1.5945521769925004e-06, "loss": 0.912, "step": 8247 }, { "epoch": 4.427905103855576, "grad_norm": 0.66015625, "learning_rate": 1.5915857207997126e-06, "loss": 0.9382, "step": 8248 }, { "epoch": 4.428441998590651, "grad_norm": 0.64453125, "learning_rate": 1.5886219358034387e-06, "loss": 0.7741, "step": 8249 }, { "epoch": 4.428978893325727, "grad_norm": 0.64453125, "learning_rate": 1.5856608223418806e-06, "loss": 0.7866, "step": 8250 }, { "epoch": 4.428978893325727, "eval_loss": 0.9027072787284851, "eval_runtime": 46.2751, "eval_samples_per_second": 20.789, "eval_steps_per_second": 20.789, "step": 8250 }, { "epoch": 4.429515788060804, "grad_norm": 0.70703125, "learning_rate": 1.5827023807529456e-06, "loss": 0.8154, "step": 8251 }, { "epoch": 4.430052682795879, "grad_norm": 0.6171875, "learning_rate": 1.579746611374222e-06, "loss": 0.8866, "step": 8252 }, { "epoch": 4.430589577530955, "grad_norm": 0.62890625, "learning_rate": 1.5767935145430068e-06, "loss": 0.7472, "step": 8253 }, { "epoch": 4.4311264722660315, "grad_norm": 0.6953125, "learning_rate": 1.5738430905962776e-06, "loss": 0.9132, "step": 8254 }, { "epoch": 4.431663367001107, "grad_norm": 0.6484375, "learning_rate": 1.5708953398707204e-06, "loss": 0.8426, "step": 8255 }, { "epoch": 4.432200261736184, "grad_norm": 0.84765625, "learning_rate": 1.5679502627027136e-06, "loss": 0.9216, "step": 8256 }, { "epoch": 4.432737156471259, "grad_norm": 0.6484375, "learning_rate": 1.5650078594283275e-06, "loss": 0.6307, "step": 8257 }, { "epoch": 4.433274051206335, "grad_norm": 0.8359375, "learning_rate": 1.5620681303833178e-06, "loss": 0.8897, "step": 8258 }, { "epoch": 4.433810945941412, "grad_norm": 0.671875, "learning_rate": 1.559131075903153e-06, "loss": 0.7865, "step": 8259 }, { "epoch": 4.434347840676487, "grad_norm": 0.70703125, "learning_rate": 1.5561966963229924e-06, "loss": 0.8214, "step": 8260 }, { "epoch": 4.434884735411563, "grad_norm": 0.63671875, "learning_rate": 1.5532649919776738e-06, "loss": 0.8391, "step": 8261 }, { "epoch": 4.43542163014664, "grad_norm": 0.8125, "learning_rate": 1.550335963201749e-06, "loss": 0.6785, "step": 8262 }, { "epoch": 4.435958524881715, "grad_norm": 0.73828125, "learning_rate": 1.5474096103294616e-06, "loss": 0.8154, "step": 8263 }, { "epoch": 4.436495419616792, "grad_norm": 0.5859375, "learning_rate": 1.5444859336947393e-06, "loss": 0.6973, "step": 8264 }, { "epoch": 4.4370323143518675, "grad_norm": 0.7109375, "learning_rate": 1.5415649336312093e-06, "loss": 0.8809, "step": 8265 }, { "epoch": 4.437569209086943, "grad_norm": 0.57421875, "learning_rate": 1.5386466104722053e-06, "loss": 0.983, "step": 8266 }, { "epoch": 4.43810610382202, "grad_norm": 0.7109375, "learning_rate": 1.5357309645507273e-06, "loss": 0.6873, "step": 8267 }, { "epoch": 4.438642998557095, "grad_norm": 0.6796875, "learning_rate": 1.532817996199501e-06, "loss": 0.8135, "step": 8268 }, { "epoch": 4.439179893292171, "grad_norm": 0.6796875, "learning_rate": 1.5299077057509297e-06, "loss": 0.7436, "step": 8269 }, { "epoch": 4.439716788027248, "grad_norm": 0.70703125, "learning_rate": 1.527000093537112e-06, "loss": 0.8359, "step": 8270 }, { "epoch": 4.440253682762323, "grad_norm": 0.70703125, "learning_rate": 1.5240951598898463e-06, "loss": 0.9547, "step": 8271 }, { "epoch": 4.440790577497399, "grad_norm": 0.88671875, "learning_rate": 1.5211929051406226e-06, "loss": 0.9057, "step": 8272 }, { "epoch": 4.441327472232476, "grad_norm": 0.7421875, "learning_rate": 1.5182933296206208e-06, "loss": 0.9338, "step": 8273 }, { "epoch": 4.441864366967551, "grad_norm": 0.78515625, "learning_rate": 1.5153964336607234e-06, "loss": 0.9568, "step": 8274 }, { "epoch": 4.442401261702628, "grad_norm": 0.62890625, "learning_rate": 1.5125022175914994e-06, "loss": 0.6984, "step": 8275 }, { "epoch": 4.4429381564377035, "grad_norm": 0.6640625, "learning_rate": 1.5096106817432231e-06, "loss": 0.7784, "step": 8276 }, { "epoch": 4.443475051172779, "grad_norm": 0.79296875, "learning_rate": 1.5067218264458422e-06, "loss": 0.7825, "step": 8277 }, { "epoch": 4.444011945907856, "grad_norm": 0.78125, "learning_rate": 1.503835652029023e-06, "loss": 0.9397, "step": 8278 }, { "epoch": 4.444548840642931, "grad_norm": 0.67578125, "learning_rate": 1.500952158822111e-06, "loss": 0.7453, "step": 8279 }, { "epoch": 4.445085735378007, "grad_norm": 0.63671875, "learning_rate": 1.498071347154148e-06, "loss": 0.7393, "step": 8280 }, { "epoch": 4.445622630113084, "grad_norm": 0.70703125, "learning_rate": 1.4951932173538713e-06, "loss": 0.8314, "step": 8281 }, { "epoch": 4.446159524848159, "grad_norm": 0.640625, "learning_rate": 1.492317769749721e-06, "loss": 0.9092, "step": 8282 }, { "epoch": 4.446696419583235, "grad_norm": 0.5859375, "learning_rate": 1.4894450046698039e-06, "loss": 0.6895, "step": 8283 }, { "epoch": 4.447233314318312, "grad_norm": 0.7421875, "learning_rate": 1.4865749224419496e-06, "loss": 0.7358, "step": 8284 }, { "epoch": 4.447770209053387, "grad_norm": 0.6796875, "learning_rate": 1.4837075233936765e-06, "loss": 1.0611, "step": 8285 }, { "epoch": 4.448307103788464, "grad_norm": 0.84765625, "learning_rate": 1.4808428078521785e-06, "loss": 0.731, "step": 8286 }, { "epoch": 4.4488439985235395, "grad_norm": 0.67578125, "learning_rate": 1.4779807761443636e-06, "loss": 0.9906, "step": 8287 }, { "epoch": 4.449380893258615, "grad_norm": 0.625, "learning_rate": 1.4751214285968263e-06, "loss": 0.8957, "step": 8288 }, { "epoch": 4.449917787993692, "grad_norm": 0.8828125, "learning_rate": 1.4722647655358496e-06, "loss": 0.8662, "step": 8289 }, { "epoch": 4.450454682728767, "grad_norm": 0.55859375, "learning_rate": 1.4694107872874175e-06, "loss": 0.8454, "step": 8290 }, { "epoch": 4.450991577463843, "grad_norm": 0.61328125, "learning_rate": 1.466559494177211e-06, "loss": 0.7678, "step": 8291 }, { "epoch": 4.45152847219892, "grad_norm": 0.62890625, "learning_rate": 1.463710886530592e-06, "loss": 0.8827, "step": 8292 }, { "epoch": 4.452065366933995, "grad_norm": 0.66796875, "learning_rate": 1.4608649646726202e-06, "loss": 0.8198, "step": 8293 }, { "epoch": 4.452602261669072, "grad_norm": 0.84375, "learning_rate": 1.4580217289280524e-06, "loss": 1.0586, "step": 8294 }, { "epoch": 4.453139156404148, "grad_norm": 0.71484375, "learning_rate": 1.4551811796213483e-06, "loss": 0.8704, "step": 8295 }, { "epoch": 4.453676051139223, "grad_norm": 0.796875, "learning_rate": 1.4523433170766348e-06, "loss": 0.7902, "step": 8296 }, { "epoch": 4.4542129458743, "grad_norm": 0.8984375, "learning_rate": 1.4495081416177613e-06, "loss": 0.8432, "step": 8297 }, { "epoch": 4.4547498406093755, "grad_norm": 0.62109375, "learning_rate": 1.4466756535682496e-06, "loss": 0.8071, "step": 8298 }, { "epoch": 4.455286735344451, "grad_norm": 0.69140625, "learning_rate": 1.4438458532513272e-06, "loss": 0.8832, "step": 8299 }, { "epoch": 4.455823630079528, "grad_norm": 0.6171875, "learning_rate": 1.4410187409899023e-06, "loss": 0.7187, "step": 8300 }, { "epoch": 4.455823630079528, "eval_loss": 0.9027251601219177, "eval_runtime": 46.0364, "eval_samples_per_second": 20.896, "eval_steps_per_second": 20.896, "step": 8300 }, { "epoch": 4.456360524814603, "grad_norm": 0.640625, "learning_rate": 1.4381943171065947e-06, "loss": 0.7582, "step": 8301 }, { "epoch": 4.45689741954968, "grad_norm": 0.76171875, "learning_rate": 1.4353725819236969e-06, "loss": 0.8503, "step": 8302 }, { "epoch": 4.457434314284756, "grad_norm": 0.66015625, "learning_rate": 1.432553535763212e-06, "loss": 0.7626, "step": 8303 }, { "epoch": 4.457971209019831, "grad_norm": 0.61328125, "learning_rate": 1.4297371789468272e-06, "loss": 0.8649, "step": 8304 }, { "epoch": 4.458508103754908, "grad_norm": 1.015625, "learning_rate": 1.4269235117959218e-06, "loss": 0.93, "step": 8305 }, { "epoch": 4.459044998489984, "grad_norm": 1.859375, "learning_rate": 1.4241125346315691e-06, "loss": 0.9761, "step": 8306 }, { "epoch": 4.459581893225059, "grad_norm": 0.69921875, "learning_rate": 1.421304247774552e-06, "loss": 0.7589, "step": 8307 }, { "epoch": 4.460118787960136, "grad_norm": 0.671875, "learning_rate": 1.4184986515453108e-06, "loss": 0.6836, "step": 8308 }, { "epoch": 4.4606556826952115, "grad_norm": 0.6875, "learning_rate": 1.4156957462640092e-06, "loss": 0.8432, "step": 8309 }, { "epoch": 4.461192577430287, "grad_norm": 0.56640625, "learning_rate": 1.4128955322504966e-06, "loss": 0.6673, "step": 8310 }, { "epoch": 4.461729472165364, "grad_norm": 0.77734375, "learning_rate": 1.4100980098243093e-06, "loss": 0.9891, "step": 8311 }, { "epoch": 4.462266366900439, "grad_norm": 0.7578125, "learning_rate": 1.4073031793046782e-06, "loss": 0.8597, "step": 8312 }, { "epoch": 4.462803261635516, "grad_norm": 0.7265625, "learning_rate": 1.4045110410105367e-06, "loss": 1.0593, "step": 8313 }, { "epoch": 4.463340156370592, "grad_norm": 0.65625, "learning_rate": 1.401721595260494e-06, "loss": 0.8861, "step": 8314 }, { "epoch": 4.463877051105667, "grad_norm": 0.76171875, "learning_rate": 1.3989348423728678e-06, "loss": 0.6185, "step": 8315 }, { "epoch": 4.464413945840744, "grad_norm": 0.74609375, "learning_rate": 1.3961507826656589e-06, "loss": 0.9124, "step": 8316 }, { "epoch": 4.46495084057582, "grad_norm": 0.6171875, "learning_rate": 1.3933694164565663e-06, "loss": 0.8167, "step": 8317 }, { "epoch": 4.465487735310895, "grad_norm": 0.6171875, "learning_rate": 1.3905907440629752e-06, "loss": 0.7692, "step": 8318 }, { "epoch": 4.466024630045972, "grad_norm": 0.64453125, "learning_rate": 1.3878147658019674e-06, "loss": 0.6746, "step": 8319 }, { "epoch": 4.4665615247810475, "grad_norm": 0.7734375, "learning_rate": 1.3850414819903235e-06, "loss": 0.8528, "step": 8320 }, { "epoch": 4.467098419516123, "grad_norm": 0.58203125, "learning_rate": 1.382270892944501e-06, "loss": 0.7957, "step": 8321 }, { "epoch": 4.4676353142512, "grad_norm": 0.6875, "learning_rate": 1.379502998980664e-06, "loss": 0.8642, "step": 8322 }, { "epoch": 4.468172208986275, "grad_norm": 0.6796875, "learning_rate": 1.3767378004146681e-06, "loss": 0.7861, "step": 8323 }, { "epoch": 4.468709103721352, "grad_norm": 0.75390625, "learning_rate": 1.3739752975620523e-06, "loss": 0.8906, "step": 8324 }, { "epoch": 4.469245998456428, "grad_norm": 0.7109375, "learning_rate": 1.3712154907380537e-06, "loss": 0.7889, "step": 8325 }, { "epoch": 4.469782893191503, "grad_norm": 1.2578125, "learning_rate": 1.3684583802576034e-06, "loss": 0.8172, "step": 8326 }, { "epoch": 4.47031978792658, "grad_norm": 0.6875, "learning_rate": 1.3657039664353166e-06, "loss": 0.9455, "step": 8327 }, { "epoch": 4.470856682661656, "grad_norm": 0.671875, "learning_rate": 1.362952249585514e-06, "loss": 0.8061, "step": 8328 }, { "epoch": 4.471393577396731, "grad_norm": 0.70703125, "learning_rate": 1.3602032300222e-06, "loss": 0.8529, "step": 8329 }, { "epoch": 4.471930472131808, "grad_norm": 0.83203125, "learning_rate": 1.357456908059071e-06, "loss": 0.7752, "step": 8330 }, { "epoch": 4.4724673668668835, "grad_norm": 0.78125, "learning_rate": 1.354713284009515e-06, "loss": 0.8533, "step": 8331 }, { "epoch": 4.47300426160196, "grad_norm": 0.58984375, "learning_rate": 1.3519723581866207e-06, "loss": 0.6661, "step": 8332 }, { "epoch": 4.473541156337036, "grad_norm": 0.60546875, "learning_rate": 1.349234130903157e-06, "loss": 0.7504, "step": 8333 }, { "epoch": 4.474078051072111, "grad_norm": 0.8203125, "learning_rate": 1.3464986024715908e-06, "loss": 0.9392, "step": 8334 }, { "epoch": 4.474614945807188, "grad_norm": 0.73046875, "learning_rate": 1.3437657732040782e-06, "loss": 0.8801, "step": 8335 }, { "epoch": 4.475151840542264, "grad_norm": 0.74609375, "learning_rate": 1.3410356434124777e-06, "loss": 0.8965, "step": 8336 }, { "epoch": 4.475688735277339, "grad_norm": 0.66796875, "learning_rate": 1.3383082134083237e-06, "loss": 0.8219, "step": 8337 }, { "epoch": 4.476225630012416, "grad_norm": 0.7421875, "learning_rate": 1.3355834835028586e-06, "loss": 0.8903, "step": 8338 }, { "epoch": 4.476762524747492, "grad_norm": 0.69921875, "learning_rate": 1.3328614540069979e-06, "loss": 0.8317, "step": 8339 }, { "epoch": 4.477299419482568, "grad_norm": 0.73828125, "learning_rate": 1.3301421252313651e-06, "loss": 0.8192, "step": 8340 }, { "epoch": 4.477836314217644, "grad_norm": 0.65234375, "learning_rate": 1.327425497486276e-06, "loss": 0.7166, "step": 8341 }, { "epoch": 4.4783732089527195, "grad_norm": 0.703125, "learning_rate": 1.3247115710817242e-06, "loss": 0.9011, "step": 8342 }, { "epoch": 4.478910103687796, "grad_norm": 0.625, "learning_rate": 1.3220003463274038e-06, "loss": 0.78, "step": 8343 }, { "epoch": 4.479446998422872, "grad_norm": 0.74609375, "learning_rate": 1.3192918235327006e-06, "loss": 0.8576, "step": 8344 }, { "epoch": 4.479983893157947, "grad_norm": 0.73046875, "learning_rate": 1.3165860030066951e-06, "loss": 0.9406, "step": 8345 }, { "epoch": 4.480520787893024, "grad_norm": 0.609375, "learning_rate": 1.3138828850581515e-06, "loss": 0.8202, "step": 8346 }, { "epoch": 4.4810576826281, "grad_norm": 0.79296875, "learning_rate": 1.3111824699955283e-06, "loss": 0.899, "step": 8347 }, { "epoch": 4.481594577363175, "grad_norm": 0.59375, "learning_rate": 1.3084847581269849e-06, "loss": 0.7537, "step": 8348 }, { "epoch": 4.482131472098252, "grad_norm": 0.64453125, "learning_rate": 1.305789749760361e-06, "loss": 0.6815, "step": 8349 }, { "epoch": 4.482668366833328, "grad_norm": 0.796875, "learning_rate": 1.303097445203183e-06, "loss": 0.8237, "step": 8350 }, { "epoch": 4.482668366833328, "eval_loss": 0.9026736617088318, "eval_runtime": 46.1503, "eval_samples_per_second": 20.845, "eval_steps_per_second": 20.845, "step": 8350 }, { "epoch": 4.483205261568404, "grad_norm": 0.671875, "learning_rate": 1.3004078447626883e-06, "loss": 0.9488, "step": 8351 }, { "epoch": 4.48374215630348, "grad_norm": 0.70703125, "learning_rate": 1.297720948745787e-06, "loss": 1.0053, "step": 8352 }, { "epoch": 4.4842790510385555, "grad_norm": 0.69921875, "learning_rate": 1.2950367574590894e-06, "loss": 0.7899, "step": 8353 }, { "epoch": 4.484815945773632, "grad_norm": 0.75, "learning_rate": 1.2923552712089005e-06, "loss": 1.1165, "step": 8354 }, { "epoch": 4.485352840508708, "grad_norm": 0.7578125, "learning_rate": 1.289676490301206e-06, "loss": 0.9944, "step": 8355 }, { "epoch": 4.485889735243783, "grad_norm": 0.625, "learning_rate": 1.2870004150416893e-06, "loss": 0.7521, "step": 8356 }, { "epoch": 4.48642662997886, "grad_norm": 0.68359375, "learning_rate": 1.2843270457357283e-06, "loss": 0.8393, "step": 8357 }, { "epoch": 4.486963524713936, "grad_norm": 0.76953125, "learning_rate": 1.28165638268839e-06, "loss": 0.889, "step": 8358 }, { "epoch": 4.487500419449011, "grad_norm": 0.78125, "learning_rate": 1.2789884262044194e-06, "loss": 0.9361, "step": 8359 }, { "epoch": 4.488037314184088, "grad_norm": 0.71875, "learning_rate": 1.2763231765882732e-06, "loss": 0.8338, "step": 8360 }, { "epoch": 4.488574208919164, "grad_norm": 0.73828125, "learning_rate": 1.2736606341440916e-06, "loss": 0.8465, "step": 8361 }, { "epoch": 4.48911110365424, "grad_norm": 0.7578125, "learning_rate": 1.2710007991757005e-06, "loss": 0.9843, "step": 8362 }, { "epoch": 4.489647998389316, "grad_norm": 0.62890625, "learning_rate": 1.2683436719866188e-06, "loss": 0.6926, "step": 8363 }, { "epoch": 4.4901848931243915, "grad_norm": 0.6953125, "learning_rate": 1.2656892528800674e-06, "loss": 0.8333, "step": 8364 }, { "epoch": 4.490721787859468, "grad_norm": 0.6328125, "learning_rate": 1.2630375421589375e-06, "loss": 0.8713, "step": 8365 }, { "epoch": 4.491258682594544, "grad_norm": 0.71875, "learning_rate": 1.2603885401258314e-06, "loss": 0.8221, "step": 8366 }, { "epoch": 4.491795577329619, "grad_norm": 0.7421875, "learning_rate": 1.2577422470830324e-06, "loss": 1.1343, "step": 8367 }, { "epoch": 4.492332472064696, "grad_norm": 0.90625, "learning_rate": 1.2550986633325102e-06, "loss": 0.8528, "step": 8368 }, { "epoch": 4.492869366799772, "grad_norm": 0.609375, "learning_rate": 1.2524577891759342e-06, "loss": 0.8208, "step": 8369 }, { "epoch": 4.493406261534848, "grad_norm": 0.8203125, "learning_rate": 1.2498196249146665e-06, "loss": 0.9491, "step": 8370 }, { "epoch": 4.493943156269924, "grad_norm": 1.0, "learning_rate": 1.2471841708497494e-06, "loss": 0.9982, "step": 8371 }, { "epoch": 4.494480051005, "grad_norm": 0.6171875, "learning_rate": 1.244551427281923e-06, "loss": 0.5941, "step": 8372 }, { "epoch": 4.495016945740076, "grad_norm": 0.61328125, "learning_rate": 1.2419213945116194e-06, "loss": 0.8029, "step": 8373 }, { "epoch": 4.495553840475152, "grad_norm": 0.765625, "learning_rate": 1.2392940728389536e-06, "loss": 0.7634, "step": 8374 }, { "epoch": 4.4960907352102275, "grad_norm": 0.6796875, "learning_rate": 1.2366694625637442e-06, "loss": 0.8302, "step": 8375 }, { "epoch": 4.496627629945304, "grad_norm": 0.6484375, "learning_rate": 1.2340475639854826e-06, "loss": 0.8304, "step": 8376 }, { "epoch": 4.49716452468038, "grad_norm": 0.76171875, "learning_rate": 1.231428377403368e-06, "loss": 1.0635, "step": 8377 }, { "epoch": 4.497701419415456, "grad_norm": 0.81640625, "learning_rate": 1.2288119031162782e-06, "loss": 0.828, "step": 8378 }, { "epoch": 4.498238314150532, "grad_norm": 0.69921875, "learning_rate": 1.2261981414227852e-06, "loss": 0.9418, "step": 8379 }, { "epoch": 4.498775208885608, "grad_norm": 0.6640625, "learning_rate": 1.2235870926211619e-06, "loss": 0.8015, "step": 8380 }, { "epoch": 4.499312103620684, "grad_norm": 0.734375, "learning_rate": 1.2209787570093501e-06, "loss": 0.783, "step": 8381 }, { "epoch": 4.49984899835576, "grad_norm": 0.6640625, "learning_rate": 1.2183731348850035e-06, "loss": 0.8503, "step": 8382 }, { "epoch": 4.500385893090836, "grad_norm": 0.76171875, "learning_rate": 1.2157702265454512e-06, "loss": 0.9928, "step": 8383 }, { "epoch": 4.500922787825912, "grad_norm": 0.6171875, "learning_rate": 1.2131700322877166e-06, "loss": 0.7411, "step": 8384 }, { "epoch": 4.501459682560988, "grad_norm": 0.65234375, "learning_rate": 1.210572552408515e-06, "loss": 0.8119, "step": 8385 }, { "epoch": 4.5019965772960635, "grad_norm": 0.734375, "learning_rate": 1.2079777872042592e-06, "loss": 1.1473, "step": 8386 }, { "epoch": 4.50253347203114, "grad_norm": 0.65234375, "learning_rate": 1.2053857369710348e-06, "loss": 0.831, "step": 8387 }, { "epoch": 4.503070366766216, "grad_norm": 0.67578125, "learning_rate": 1.2027964020046329e-06, "loss": 0.7662, "step": 8388 }, { "epoch": 4.503607261501292, "grad_norm": 0.6640625, "learning_rate": 1.2002097826005337e-06, "loss": 0.9336, "step": 8389 }, { "epoch": 4.504144156236368, "grad_norm": 0.734375, "learning_rate": 1.197625879053893e-06, "loss": 0.9547, "step": 8390 }, { "epoch": 4.504681050971444, "grad_norm": 0.703125, "learning_rate": 1.1950446916595748e-06, "loss": 0.979, "step": 8391 }, { "epoch": 4.50521794570652, "grad_norm": 0.80859375, "learning_rate": 1.192466220712124e-06, "loss": 0.7748, "step": 8392 }, { "epoch": 4.505754840441596, "grad_norm": 0.53515625, "learning_rate": 1.189890466505772e-06, "loss": 0.7529, "step": 8393 }, { "epoch": 4.506291735176672, "grad_norm": 0.73828125, "learning_rate": 1.1873174293344474e-06, "loss": 0.9212, "step": 8394 }, { "epoch": 4.506828629911748, "grad_norm": 0.6484375, "learning_rate": 1.1847471094917712e-06, "loss": 0.8835, "step": 8395 }, { "epoch": 4.507365524646824, "grad_norm": 0.75390625, "learning_rate": 1.1821795072710446e-06, "loss": 0.7913, "step": 8396 }, { "epoch": 4.5079024193818995, "grad_norm": 0.640625, "learning_rate": 1.179614622965261e-06, "loss": 0.7279, "step": 8397 }, { "epoch": 4.508439314116976, "grad_norm": 0.75, "learning_rate": 1.1770524568671142e-06, "loss": 0.8137, "step": 8398 }, { "epoch": 4.508976208852052, "grad_norm": 0.69921875, "learning_rate": 1.1744930092689733e-06, "loss": 0.9252, "step": 8399 }, { "epoch": 4.509513103587128, "grad_norm": 0.59765625, "learning_rate": 1.171936280462907e-06, "loss": 0.8223, "step": 8400 }, { "epoch": 4.509513103587128, "eval_loss": 0.9027235507965088, "eval_runtime": 46.0731, "eval_samples_per_second": 20.88, "eval_steps_per_second": 20.88, "step": 8400 }, { "epoch": 4.510049998322204, "grad_norm": 0.71875, "learning_rate": 1.169382270740668e-06, "loss": 0.8002, "step": 8401 }, { "epoch": 4.51058689305728, "grad_norm": 0.73046875, "learning_rate": 1.1668309803937044e-06, "loss": 0.9601, "step": 8402 }, { "epoch": 4.511123787792356, "grad_norm": 0.59375, "learning_rate": 1.1642824097131438e-06, "loss": 0.6579, "step": 8403 }, { "epoch": 4.511660682527432, "grad_norm": 0.7890625, "learning_rate": 1.1617365589898178e-06, "loss": 0.8987, "step": 8404 }, { "epoch": 4.512197577262508, "grad_norm": 0.91015625, "learning_rate": 1.1591934285142408e-06, "loss": 0.9232, "step": 8405 }, { "epoch": 4.512734471997584, "grad_norm": 0.8671875, "learning_rate": 1.156653018576609e-06, "loss": 0.8704, "step": 8406 }, { "epoch": 4.51327136673266, "grad_norm": 0.69140625, "learning_rate": 1.1541153294668178e-06, "loss": 0.8116, "step": 8407 }, { "epoch": 4.5138082614677355, "grad_norm": 0.71484375, "learning_rate": 1.151580361474458e-06, "loss": 0.9706, "step": 8408 }, { "epoch": 4.514345156202812, "grad_norm": 0.62890625, "learning_rate": 1.14904811488879e-06, "loss": 0.8165, "step": 8409 }, { "epoch": 4.514882050937888, "grad_norm": 0.78515625, "learning_rate": 1.1465185899987797e-06, "loss": 0.9603, "step": 8410 }, { "epoch": 4.515418945672964, "grad_norm": 0.6484375, "learning_rate": 1.1439917870930793e-06, "loss": 0.8335, "step": 8411 }, { "epoch": 4.51595584040804, "grad_norm": 0.60546875, "learning_rate": 1.1414677064600282e-06, "loss": 0.7845, "step": 8412 }, { "epoch": 4.516492735143116, "grad_norm": 0.68359375, "learning_rate": 1.138946348387654e-06, "loss": 0.7134, "step": 8413 }, { "epoch": 4.517029629878192, "grad_norm": 0.796875, "learning_rate": 1.136427713163679e-06, "loss": 0.918, "step": 8414 }, { "epoch": 4.517566524613268, "grad_norm": 0.60546875, "learning_rate": 1.1339118010755073e-06, "loss": 0.826, "step": 8415 }, { "epoch": 4.5181034193483445, "grad_norm": 0.62890625, "learning_rate": 1.1313986124102394e-06, "loss": 0.8624, "step": 8416 }, { "epoch": 4.51864031408342, "grad_norm": 0.65625, "learning_rate": 1.1288881474546625e-06, "loss": 0.7929, "step": 8417 }, { "epoch": 4.519177208818496, "grad_norm": 0.953125, "learning_rate": 1.1263804064952532e-06, "loss": 0.8775, "step": 8418 }, { "epoch": 4.519714103553572, "grad_norm": 0.65234375, "learning_rate": 1.1238753898181715e-06, "loss": 0.8291, "step": 8419 }, { "epoch": 4.520250998288648, "grad_norm": 0.72265625, "learning_rate": 1.1213730977092746e-06, "loss": 1.0208, "step": 8420 }, { "epoch": 4.520787893023724, "grad_norm": 0.65234375, "learning_rate": 1.1188735304541065e-06, "loss": 0.8897, "step": 8421 }, { "epoch": 4.5213247877588, "grad_norm": 0.66015625, "learning_rate": 1.1163766883378973e-06, "loss": 0.877, "step": 8422 }, { "epoch": 4.521861682493876, "grad_norm": 1.1640625, "learning_rate": 1.113882571645572e-06, "loss": 0.8493, "step": 8423 }, { "epoch": 4.522398577228952, "grad_norm": 0.80859375, "learning_rate": 1.111391180661736e-06, "loss": 0.9505, "step": 8424 }, { "epoch": 4.522935471964028, "grad_norm": 0.63671875, "learning_rate": 1.1089025156706928e-06, "loss": 0.8568, "step": 8425 }, { "epoch": 4.523472366699104, "grad_norm": 0.63671875, "learning_rate": 1.106416576956426e-06, "loss": 0.9714, "step": 8426 }, { "epoch": 4.5240092614341805, "grad_norm": 0.6484375, "learning_rate": 1.10393336480262e-06, "loss": 0.8765, "step": 8427 }, { "epoch": 4.524546156169256, "grad_norm": 0.65234375, "learning_rate": 1.1014528794926316e-06, "loss": 0.7755, "step": 8428 }, { "epoch": 4.525083050904332, "grad_norm": 0.68359375, "learning_rate": 1.09897512130952e-06, "loss": 0.798, "step": 8429 }, { "epoch": 4.525619945639408, "grad_norm": 0.66796875, "learning_rate": 1.0965000905360345e-06, "loss": 1.0487, "step": 8430 }, { "epoch": 4.526156840374484, "grad_norm": 0.60546875, "learning_rate": 1.0940277874545962e-06, "loss": 0.6858, "step": 8431 }, { "epoch": 4.52669373510956, "grad_norm": 0.8046875, "learning_rate": 1.091558212347335e-06, "loss": 0.9631, "step": 8432 }, { "epoch": 4.527230629844636, "grad_norm": 0.7265625, "learning_rate": 1.0890913654960617e-06, "loss": 0.8211, "step": 8433 }, { "epoch": 4.527767524579712, "grad_norm": 0.69140625, "learning_rate": 1.0866272471822647e-06, "loss": 0.8105, "step": 8434 }, { "epoch": 4.528304419314788, "grad_norm": 0.5703125, "learning_rate": 1.0841658576871388e-06, "loss": 0.7098, "step": 8435 }, { "epoch": 4.528841314049864, "grad_norm": 0.68359375, "learning_rate": 1.0817071972915592e-06, "loss": 0.7504, "step": 8436 }, { "epoch": 4.52937820878494, "grad_norm": 0.734375, "learning_rate": 1.0792512662760879e-06, "loss": 0.9925, "step": 8437 }, { "epoch": 4.5299151035200165, "grad_norm": 0.671875, "learning_rate": 1.076798064920978e-06, "loss": 0.9186, "step": 8438 }, { "epoch": 4.530451998255092, "grad_norm": 0.75, "learning_rate": 1.0743475935061725e-06, "loss": 0.8319, "step": 8439 }, { "epoch": 4.530988892990168, "grad_norm": 0.6953125, "learning_rate": 1.0718998523113004e-06, "loss": 1.0014, "step": 8440 }, { "epoch": 4.531525787725244, "grad_norm": 0.56640625, "learning_rate": 1.0694548416156774e-06, "loss": 0.7704, "step": 8441 }, { "epoch": 4.53206268246032, "grad_norm": 0.6484375, "learning_rate": 1.067012561698319e-06, "loss": 0.9283, "step": 8442 }, { "epoch": 4.532599577195396, "grad_norm": 0.734375, "learning_rate": 1.0645730128379132e-06, "loss": 1.0053, "step": 8443 }, { "epoch": 4.533136471930472, "grad_norm": 0.67578125, "learning_rate": 1.0621361953128407e-06, "loss": 0.809, "step": 8444 }, { "epoch": 4.533673366665548, "grad_norm": 0.66015625, "learning_rate": 1.0597021094011755e-06, "loss": 0.7751, "step": 8445 }, { "epoch": 4.534210261400624, "grad_norm": 0.7265625, "learning_rate": 1.057270755380685e-06, "loss": 1.0098, "step": 8446 }, { "epoch": 4.5347471561357, "grad_norm": 0.8125, "learning_rate": 1.054842133528805e-06, "loss": 0.886, "step": 8447 }, { "epoch": 4.535284050870776, "grad_norm": 0.65234375, "learning_rate": 1.052416244122681e-06, "loss": 0.7425, "step": 8448 }, { "epoch": 4.5358209456058525, "grad_norm": 0.68359375, "learning_rate": 1.0499930874391357e-06, "loss": 0.7849, "step": 8449 }, { "epoch": 4.536357840340928, "grad_norm": 0.75, "learning_rate": 1.047572663754684e-06, "loss": 0.8093, "step": 8450 }, { "epoch": 4.536357840340928, "eval_loss": 0.9026585817337036, "eval_runtime": 46.8877, "eval_samples_per_second": 20.517, "eval_steps_per_second": 20.517, "step": 8450 }, { "epoch": 4.536894735076004, "grad_norm": 0.703125, "learning_rate": 1.0451549733455219e-06, "loss": 0.8279, "step": 8451 }, { "epoch": 4.53743162981108, "grad_norm": 0.7421875, "learning_rate": 1.0427400164875423e-06, "loss": 0.8743, "step": 8452 }, { "epoch": 4.537968524546156, "grad_norm": 0.70703125, "learning_rate": 1.0403277934563165e-06, "loss": 0.8063, "step": 8453 }, { "epoch": 4.538505419281233, "grad_norm": 0.70703125, "learning_rate": 1.0379183045271162e-06, "loss": 0.7471, "step": 8454 }, { "epoch": 4.539042314016308, "grad_norm": 0.6875, "learning_rate": 1.0355115499748936e-06, "loss": 0.8042, "step": 8455 }, { "epoch": 4.539579208751384, "grad_norm": 0.78125, "learning_rate": 1.0331075300742898e-06, "loss": 0.6991, "step": 8456 }, { "epoch": 4.5401161034864606, "grad_norm": 0.62109375, "learning_rate": 1.0307062450996302e-06, "loss": 0.6688, "step": 8457 }, { "epoch": 4.540652998221536, "grad_norm": 0.69140625, "learning_rate": 1.0283076953249371e-06, "loss": 0.7197, "step": 8458 }, { "epoch": 4.541189892956612, "grad_norm": 0.66015625, "learning_rate": 1.0259118810239139e-06, "loss": 0.9387, "step": 8459 }, { "epoch": 4.5417267876916885, "grad_norm": 0.5703125, "learning_rate": 1.023518802469947e-06, "loss": 0.7205, "step": 8460 }, { "epoch": 4.542263682426764, "grad_norm": 0.640625, "learning_rate": 1.0211284599361271e-06, "loss": 0.7472, "step": 8461 }, { "epoch": 4.54280057716184, "grad_norm": 0.6953125, "learning_rate": 1.0187408536952158e-06, "loss": 0.7683, "step": 8462 }, { "epoch": 4.543337471896916, "grad_norm": 0.640625, "learning_rate": 1.0163559840196708e-06, "loss": 0.7384, "step": 8463 }, { "epoch": 4.543874366631992, "grad_norm": 0.6875, "learning_rate": 1.0139738511816404e-06, "loss": 0.8512, "step": 8464 }, { "epoch": 4.544411261367069, "grad_norm": 0.74609375, "learning_rate": 1.0115944554529467e-06, "loss": 1.043, "step": 8465 }, { "epoch": 4.544948156102144, "grad_norm": 0.71484375, "learning_rate": 1.0092177971051137e-06, "loss": 0.7697, "step": 8466 }, { "epoch": 4.54548505083722, "grad_norm": 0.69921875, "learning_rate": 1.006843876409355e-06, "loss": 0.9354, "step": 8467 }, { "epoch": 4.5460219455722966, "grad_norm": 0.6875, "learning_rate": 1.0044726936365574e-06, "loss": 0.8374, "step": 8468 }, { "epoch": 4.546558840307372, "grad_norm": 0.59375, "learning_rate": 1.002104249057298e-06, "loss": 0.6824, "step": 8469 }, { "epoch": 4.547095735042448, "grad_norm": 0.859375, "learning_rate": 9.997385429418555e-07, "loss": 0.64, "step": 8470 }, { "epoch": 4.5476326297775245, "grad_norm": 0.6640625, "learning_rate": 9.973755755601888e-07, "loss": 0.8754, "step": 8471 }, { "epoch": 4.5481695245126, "grad_norm": 0.74609375, "learning_rate": 9.950153471819324e-07, "loss": 0.9445, "step": 8472 }, { "epoch": 4.548706419247676, "grad_norm": 0.66796875, "learning_rate": 9.926578580764234e-07, "loss": 0.8091, "step": 8473 }, { "epoch": 4.549243313982752, "grad_norm": 0.75390625, "learning_rate": 9.90303108512683e-07, "loss": 0.8357, "step": 8474 }, { "epoch": 4.549780208717828, "grad_norm": 0.6953125, "learning_rate": 9.879510987594182e-07, "loss": 0.7445, "step": 8475 }, { "epoch": 4.550317103452905, "grad_norm": 0.7578125, "learning_rate": 9.856018290850172e-07, "loss": 0.791, "step": 8476 }, { "epoch": 4.55085399818798, "grad_norm": 0.75390625, "learning_rate": 9.832552997575684e-07, "loss": 0.9048, "step": 8477 }, { "epoch": 4.551390892923056, "grad_norm": 0.6328125, "learning_rate": 9.809115110448325e-07, "loss": 0.7591, "step": 8478 }, { "epoch": 4.5519277876581326, "grad_norm": 0.7421875, "learning_rate": 9.785704632142734e-07, "loss": 0.8141, "step": 8479 }, { "epoch": 4.552464682393208, "grad_norm": 0.66015625, "learning_rate": 9.76232156533033e-07, "loss": 1.0441, "step": 8480 }, { "epoch": 4.553001577128284, "grad_norm": 0.83984375, "learning_rate": 9.738965912679366e-07, "loss": 0.8971, "step": 8481 }, { "epoch": 4.5535384718633605, "grad_norm": 0.78515625, "learning_rate": 9.715637676855043e-07, "loss": 0.7573, "step": 8482 }, { "epoch": 4.554075366598436, "grad_norm": 0.69921875, "learning_rate": 9.692336860519458e-07, "loss": 0.7689, "step": 8483 }, { "epoch": 4.554612261333512, "grad_norm": 0.6796875, "learning_rate": 9.66906346633148e-07, "loss": 0.8638, "step": 8484 }, { "epoch": 4.555149156068588, "grad_norm": 0.63671875, "learning_rate": 9.645817496946903e-07, "loss": 0.9896, "step": 8485 }, { "epoch": 4.555686050803664, "grad_norm": 0.69921875, "learning_rate": 9.622598955018352e-07, "loss": 0.8124, "step": 8486 }, { "epoch": 4.556222945538741, "grad_norm": 0.59765625, "learning_rate": 9.599407843195434e-07, "loss": 0.7511, "step": 8487 }, { "epoch": 4.556759840273816, "grad_norm": 0.62109375, "learning_rate": 9.576244164124504e-07, "loss": 0.6803, "step": 8488 }, { "epoch": 4.557296735008892, "grad_norm": 0.66796875, "learning_rate": 9.553107920448806e-07, "loss": 0.7879, "step": 8489 }, { "epoch": 4.5578336297439686, "grad_norm": 0.6015625, "learning_rate": 9.529999114808535e-07, "loss": 0.7233, "step": 8490 }, { "epoch": 4.558370524479044, "grad_norm": 0.63671875, "learning_rate": 9.506917749840665e-07, "loss": 0.7202, "step": 8491 }, { "epoch": 4.558907419214121, "grad_norm": 0.67578125, "learning_rate": 9.48386382817909e-07, "loss": 0.8645, "step": 8492 }, { "epoch": 4.5594443139491965, "grad_norm": 0.6953125, "learning_rate": 9.460837352454566e-07, "loss": 0.735, "step": 8493 }, { "epoch": 4.559981208684272, "grad_norm": 0.73046875, "learning_rate": 9.437838325294629e-07, "loss": 0.8704, "step": 8494 }, { "epoch": 4.560518103419349, "grad_norm": 0.68359375, "learning_rate": 9.414866749323819e-07, "loss": 0.9574, "step": 8495 }, { "epoch": 4.561054998154424, "grad_norm": 0.7578125, "learning_rate": 9.391922627163513e-07, "loss": 0.8052, "step": 8496 }, { "epoch": 4.5615918928895, "grad_norm": 0.7890625, "learning_rate": 9.369005961431837e-07, "loss": 0.8902, "step": 8497 }, { "epoch": 4.562128787624577, "grad_norm": 0.734375, "learning_rate": 9.34611675474395e-07, "loss": 0.8239, "step": 8498 }, { "epoch": 4.562665682359652, "grad_norm": 0.60546875, "learning_rate": 9.32325500971179e-07, "loss": 0.7137, "step": 8499 }, { "epoch": 4.563202577094728, "grad_norm": 0.71484375, "learning_rate": 9.300420728944131e-07, "loss": 0.815, "step": 8500 }, { "epoch": 4.563202577094728, "eval_loss": 0.902619481086731, "eval_runtime": 45.7874, "eval_samples_per_second": 21.01, "eval_steps_per_second": 21.01, "step": 8500 }, { "epoch": 4.5637394718298046, "grad_norm": 1.0859375, "learning_rate": 9.277613915046723e-07, "loss": 0.7932, "step": 8501 }, { "epoch": 4.56427636656488, "grad_norm": 0.6953125, "learning_rate": 9.25483457062204e-07, "loss": 0.9043, "step": 8502 }, { "epoch": 4.564813261299957, "grad_norm": 0.6640625, "learning_rate": 9.232082698269557e-07, "loss": 0.6617, "step": 8503 }, { "epoch": 4.5653501560350325, "grad_norm": 0.75, "learning_rate": 9.209358300585474e-07, "loss": 0.7606, "step": 8504 }, { "epoch": 4.565887050770108, "grad_norm": 0.6875, "learning_rate": 9.18666138016297e-07, "loss": 0.7473, "step": 8505 }, { "epoch": 4.566423945505185, "grad_norm": 0.58203125, "learning_rate": 9.163991939592081e-07, "loss": 0.7939, "step": 8506 }, { "epoch": 4.56696084024026, "grad_norm": 0.65625, "learning_rate": 9.141349981459657e-07, "loss": 0.8223, "step": 8507 }, { "epoch": 4.567497734975336, "grad_norm": 0.73828125, "learning_rate": 9.118735508349435e-07, "loss": 0.9082, "step": 8508 }, { "epoch": 4.568034629710413, "grad_norm": 0.71875, "learning_rate": 9.096148522841991e-07, "loss": 0.817, "step": 8509 }, { "epoch": 4.568571524445488, "grad_norm": 0.6328125, "learning_rate": 9.073589027514789e-07, "loss": 0.8953, "step": 8510 }, { "epoch": 4.569108419180564, "grad_norm": 0.69921875, "learning_rate": 9.051057024942189e-07, "loss": 0.8384, "step": 8511 }, { "epoch": 4.5696453139156405, "grad_norm": 0.73828125, "learning_rate": 9.028552517695354e-07, "loss": 0.7219, "step": 8512 }, { "epoch": 4.570182208650716, "grad_norm": 0.63671875, "learning_rate": 9.006075508342287e-07, "loss": 0.7437, "step": 8513 }, { "epoch": 4.570719103385793, "grad_norm": 0.66015625, "learning_rate": 8.983625999447992e-07, "loss": 0.6465, "step": 8514 }, { "epoch": 4.5712559981208685, "grad_norm": 0.84765625, "learning_rate": 8.961203993574196e-07, "loss": 0.9275, "step": 8515 }, { "epoch": 4.571792892855944, "grad_norm": 0.79296875, "learning_rate": 8.938809493279493e-07, "loss": 0.8312, "step": 8516 }, { "epoch": 4.572329787591021, "grad_norm": 0.62109375, "learning_rate": 8.916442501119477e-07, "loss": 0.8756, "step": 8517 }, { "epoch": 4.572866682326096, "grad_norm": 0.74609375, "learning_rate": 8.894103019646438e-07, "loss": 0.888, "step": 8518 }, { "epoch": 4.573403577061172, "grad_norm": 0.63671875, "learning_rate": 8.871791051409562e-07, "loss": 0.6117, "step": 8519 }, { "epoch": 4.573940471796249, "grad_norm": 0.671875, "learning_rate": 8.849506598954977e-07, "loss": 0.8381, "step": 8520 }, { "epoch": 4.574477366531324, "grad_norm": 0.74609375, "learning_rate": 8.827249664825649e-07, "loss": 0.8809, "step": 8521 }, { "epoch": 4.5750142612664, "grad_norm": 0.671875, "learning_rate": 8.805020251561324e-07, "loss": 0.9112, "step": 8522 }, { "epoch": 4.5755511560014765, "grad_norm": 0.5625, "learning_rate": 8.782818361698642e-07, "loss": 0.6419, "step": 8523 }, { "epoch": 4.576088050736552, "grad_norm": 0.6953125, "learning_rate": 8.760643997771212e-07, "loss": 0.7746, "step": 8524 }, { "epoch": 4.576624945471629, "grad_norm": 0.60546875, "learning_rate": 8.73849716230929e-07, "loss": 0.8607, "step": 8525 }, { "epoch": 4.5771618402067045, "grad_norm": 0.71484375, "learning_rate": 8.716377857840241e-07, "loss": 0.8451, "step": 8526 }, { "epoch": 4.57769873494178, "grad_norm": 0.55078125, "learning_rate": 8.69428608688802e-07, "loss": 0.6915, "step": 8527 }, { "epoch": 4.578235629676857, "grad_norm": 0.84765625, "learning_rate": 8.672221851973694e-07, "loss": 0.8643, "step": 8528 }, { "epoch": 4.578772524411932, "grad_norm": 0.73046875, "learning_rate": 8.650185155614998e-07, "loss": 0.8649, "step": 8529 }, { "epoch": 4.579309419147009, "grad_norm": 0.7890625, "learning_rate": 8.628176000326615e-07, "loss": 0.7384, "step": 8530 }, { "epoch": 4.579846313882085, "grad_norm": 0.703125, "learning_rate": 8.60619438862012e-07, "loss": 0.866, "step": 8531 }, { "epoch": 4.58038320861716, "grad_norm": 0.65625, "learning_rate": 8.584240323003811e-07, "loss": 0.8837, "step": 8532 }, { "epoch": 4.580920103352237, "grad_norm": 0.6796875, "learning_rate": 8.562313805982963e-07, "loss": 0.7141, "step": 8533 }, { "epoch": 4.5814569980873125, "grad_norm": 0.71875, "learning_rate": 8.54041484005974e-07, "loss": 0.9274, "step": 8534 }, { "epoch": 4.581993892822388, "grad_norm": 0.59765625, "learning_rate": 8.51854342773295e-07, "loss": 0.7147, "step": 8535 }, { "epoch": 4.582530787557465, "grad_norm": 0.65625, "learning_rate": 8.496699571498485e-07, "loss": 0.8153, "step": 8536 }, { "epoch": 4.5830676822925405, "grad_norm": 0.85546875, "learning_rate": 8.474883273848988e-07, "loss": 0.7827, "step": 8537 }, { "epoch": 4.583604577027616, "grad_norm": 0.58203125, "learning_rate": 8.45309453727397e-07, "loss": 0.8565, "step": 8538 }, { "epoch": 4.584141471762693, "grad_norm": 0.71875, "learning_rate": 8.431333364259831e-07, "loss": 1.0511, "step": 8539 }, { "epoch": 4.584678366497768, "grad_norm": 0.83984375, "learning_rate": 8.409599757289777e-07, "loss": 0.9118, "step": 8540 }, { "epoch": 4.585215261232845, "grad_norm": 0.63671875, "learning_rate": 8.387893718843853e-07, "loss": 0.7262, "step": 8541 }, { "epoch": 4.585752155967921, "grad_norm": 0.72265625, "learning_rate": 8.366215251399078e-07, "loss": 0.8432, "step": 8542 }, { "epoch": 4.586289050702996, "grad_norm": 0.6875, "learning_rate": 8.344564357429141e-07, "loss": 0.8498, "step": 8543 }, { "epoch": 4.586825945438073, "grad_norm": 0.6875, "learning_rate": 8.322941039404758e-07, "loss": 0.8212, "step": 8544 }, { "epoch": 4.5873628401731485, "grad_norm": 0.6640625, "learning_rate": 8.301345299793373e-07, "loss": 0.774, "step": 8545 }, { "epoch": 4.587899734908224, "grad_norm": 0.6015625, "learning_rate": 8.279777141059347e-07, "loss": 0.8208, "step": 8546 }, { "epoch": 4.588436629643301, "grad_norm": 0.73046875, "learning_rate": 8.258236565663908e-07, "loss": 0.7831, "step": 8547 }, { "epoch": 4.5889735243783765, "grad_norm": 0.71484375, "learning_rate": 8.236723576065087e-07, "loss": 1.0149, "step": 8548 }, { "epoch": 4.589510419113452, "grad_norm": 0.796875, "learning_rate": 8.215238174717809e-07, "loss": 0.695, "step": 8549 }, { "epoch": 4.590047313848529, "grad_norm": 0.5859375, "learning_rate": 8.193780364073755e-07, "loss": 0.7278, "step": 8550 }, { "epoch": 4.590047313848529, "eval_loss": 0.9027642011642456, "eval_runtime": 46.3462, "eval_samples_per_second": 20.757, "eval_steps_per_second": 20.757, "step": 8550 }, { "epoch": 4.590584208583604, "grad_norm": 0.6875, "learning_rate": 8.172350146581658e-07, "loss": 0.8991, "step": 8551 }, { "epoch": 4.591121103318681, "grad_norm": 0.5625, "learning_rate": 8.150947524686869e-07, "loss": 0.7414, "step": 8552 }, { "epoch": 4.591657998053757, "grad_norm": 0.71875, "learning_rate": 8.129572500831739e-07, "loss": 0.775, "step": 8553 }, { "epoch": 4.592194892788832, "grad_norm": 0.65234375, "learning_rate": 8.108225077455428e-07, "loss": 0.7875, "step": 8554 }, { "epoch": 4.592731787523909, "grad_norm": 0.68359375, "learning_rate": 8.08690525699396e-07, "loss": 0.8853, "step": 8555 }, { "epoch": 4.5932686822589845, "grad_norm": 0.66015625, "learning_rate": 8.065613041880194e-07, "loss": 0.9799, "step": 8556 }, { "epoch": 4.59380557699406, "grad_norm": 0.890625, "learning_rate": 8.044348434543797e-07, "loss": 1.0202, "step": 8557 }, { "epoch": 4.594342471729137, "grad_norm": 0.69140625, "learning_rate": 8.023111437411357e-07, "loss": 1.0828, "step": 8558 }, { "epoch": 4.5948793664642125, "grad_norm": 0.65625, "learning_rate": 8.001902052906351e-07, "loss": 1.018, "step": 8559 }, { "epoch": 4.595416261199288, "grad_norm": 0.859375, "learning_rate": 7.980720283448956e-07, "loss": 0.8757, "step": 8560 }, { "epoch": 4.595953155934365, "grad_norm": 0.72265625, "learning_rate": 7.959566131456264e-07, "loss": 0.772, "step": 8561 }, { "epoch": 4.59649005066944, "grad_norm": 0.70703125, "learning_rate": 7.938439599342317e-07, "loss": 0.9219, "step": 8562 }, { "epoch": 4.597026945404517, "grad_norm": 0.75, "learning_rate": 7.917340689517828e-07, "loss": 1.0212, "step": 8563 }, { "epoch": 4.597563840139593, "grad_norm": 0.7421875, "learning_rate": 7.896269404390505e-07, "loss": 1.0342, "step": 8564 }, { "epoch": 4.598100734874668, "grad_norm": 0.7421875, "learning_rate": 7.875225746364873e-07, "loss": 1.0311, "step": 8565 }, { "epoch": 4.598637629609745, "grad_norm": 0.65234375, "learning_rate": 7.854209717842231e-07, "loss": 0.8156, "step": 8566 }, { "epoch": 4.5991745243448205, "grad_norm": 0.6875, "learning_rate": 7.833221321220774e-07, "loss": 0.7297, "step": 8567 }, { "epoch": 4.599711419079897, "grad_norm": 0.6796875, "learning_rate": 7.812260558895584e-07, "loss": 0.8428, "step": 8568 }, { "epoch": 4.600248313814973, "grad_norm": 0.671875, "learning_rate": 7.791327433258499e-07, "loss": 0.7613, "step": 8569 }, { "epoch": 4.6007852085500485, "grad_norm": 0.67578125, "learning_rate": 7.770421946698275e-07, "loss": 0.817, "step": 8570 }, { "epoch": 4.601322103285125, "grad_norm": 0.7265625, "learning_rate": 7.749544101600504e-07, "loss": 1.1847, "step": 8571 }, { "epoch": 4.601858998020201, "grad_norm": 0.5859375, "learning_rate": 7.728693900347639e-07, "loss": 0.6922, "step": 8572 }, { "epoch": 4.602395892755276, "grad_norm": 0.60546875, "learning_rate": 7.707871345318862e-07, "loss": 0.6023, "step": 8573 }, { "epoch": 4.602932787490353, "grad_norm": 0.765625, "learning_rate": 7.687076438890356e-07, "loss": 0.8528, "step": 8574 }, { "epoch": 4.603469682225429, "grad_norm": 0.78125, "learning_rate": 7.666309183435111e-07, "loss": 0.9262, "step": 8575 }, { "epoch": 4.604006576960504, "grad_norm": 0.7109375, "learning_rate": 7.645569581322898e-07, "loss": 1.0498, "step": 8576 }, { "epoch": 4.604543471695581, "grad_norm": 0.703125, "learning_rate": 7.624857634920324e-07, "loss": 0.8385, "step": 8577 }, { "epoch": 4.6050803664306565, "grad_norm": 0.83203125, "learning_rate": 7.60417334659097e-07, "loss": 1.0223, "step": 8578 }, { "epoch": 4.605617261165733, "grad_norm": 0.609375, "learning_rate": 7.583516718695088e-07, "loss": 0.6642, "step": 8579 }, { "epoch": 4.606154155900809, "grad_norm": 0.72265625, "learning_rate": 7.562887753589904e-07, "loss": 0.9018, "step": 8580 }, { "epoch": 4.6066910506358845, "grad_norm": 0.62109375, "learning_rate": 7.542286453629505e-07, "loss": 0.8002, "step": 8581 }, { "epoch": 4.607227945370961, "grad_norm": 0.7109375, "learning_rate": 7.52171282116465e-07, "loss": 0.9679, "step": 8582 }, { "epoch": 4.607764840106037, "grad_norm": 0.703125, "learning_rate": 7.501166858543101e-07, "loss": 0.9165, "step": 8583 }, { "epoch": 4.608301734841112, "grad_norm": 0.56640625, "learning_rate": 7.480648568109428e-07, "loss": 0.7012, "step": 8584 }, { "epoch": 4.608838629576189, "grad_norm": 0.6171875, "learning_rate": 7.460157952205033e-07, "loss": 0.7311, "step": 8585 }, { "epoch": 4.609375524311265, "grad_norm": 0.58984375, "learning_rate": 7.439695013168075e-07, "loss": 0.7021, "step": 8586 }, { "epoch": 4.60991241904634, "grad_norm": 0.7578125, "learning_rate": 7.41925975333374e-07, "loss": 0.9652, "step": 8587 }, { "epoch": 4.610449313781417, "grad_norm": 0.66015625, "learning_rate": 7.398852175033888e-07, "loss": 0.7665, "step": 8588 }, { "epoch": 4.6109862085164925, "grad_norm": 0.6953125, "learning_rate": 7.378472280597292e-07, "loss": 0.7301, "step": 8589 }, { "epoch": 4.611523103251569, "grad_norm": 0.6484375, "learning_rate": 7.358120072349595e-07, "loss": 0.814, "step": 8590 }, { "epoch": 4.612059997986645, "grad_norm": 0.65234375, "learning_rate": 7.337795552613158e-07, "loss": 0.7413, "step": 8591 }, { "epoch": 4.6125968927217205, "grad_norm": 0.74609375, "learning_rate": 7.31749872370735e-07, "loss": 0.8707, "step": 8592 }, { "epoch": 4.613133787456797, "grad_norm": 0.66015625, "learning_rate": 7.297229587948262e-07, "loss": 0.7414, "step": 8593 }, { "epoch": 4.613670682191873, "grad_norm": 0.703125, "learning_rate": 7.276988147648878e-07, "loss": 0.8463, "step": 8594 }, { "epoch": 4.614207576926948, "grad_norm": 0.68359375, "learning_rate": 7.256774405118932e-07, "loss": 0.7516, "step": 8595 }, { "epoch": 4.614744471662025, "grad_norm": 0.6328125, "learning_rate": 7.236588362665136e-07, "loss": 0.8484, "step": 8596 }, { "epoch": 4.615281366397101, "grad_norm": 0.765625, "learning_rate": 7.216430022591008e-07, "loss": 0.8635, "step": 8597 }, { "epoch": 4.615818261132176, "grad_norm": 0.734375, "learning_rate": 7.196299387196764e-07, "loss": 0.7139, "step": 8598 }, { "epoch": 4.616355155867253, "grad_norm": 0.62109375, "learning_rate": 7.176196458779622e-07, "loss": 0.6986, "step": 8599 }, { "epoch": 4.6168920506023285, "grad_norm": 0.61328125, "learning_rate": 7.156121239633607e-07, "loss": 0.7515, "step": 8600 }, { "epoch": 4.6168920506023285, "eval_loss": 0.9026636481285095, "eval_runtime": 46.4391, "eval_samples_per_second": 20.715, "eval_steps_per_second": 20.715, "step": 8600 }, { "epoch": 4.617428945337405, "grad_norm": 0.6875, "learning_rate": 7.136073732049553e-07, "loss": 0.6794, "step": 8601 }, { "epoch": 4.617965840072481, "grad_norm": 0.78515625, "learning_rate": 7.116053938315048e-07, "loss": 1.0429, "step": 8602 }, { "epoch": 4.6185027348075565, "grad_norm": 0.73046875, "learning_rate": 7.096061860714709e-07, "loss": 0.8808, "step": 8603 }, { "epoch": 4.619039629542633, "grad_norm": 0.73046875, "learning_rate": 7.076097501529794e-07, "loss": 0.9149, "step": 8604 }, { "epoch": 4.619576524277709, "grad_norm": 0.6953125, "learning_rate": 7.056160863038563e-07, "loss": 0.8187, "step": 8605 }, { "epoch": 4.620113419012785, "grad_norm": 0.7578125, "learning_rate": 7.036251947516032e-07, "loss": 0.7818, "step": 8606 }, { "epoch": 4.620650313747861, "grad_norm": 0.9453125, "learning_rate": 7.016370757233992e-07, "loss": 0.7576, "step": 8607 }, { "epoch": 4.621187208482937, "grad_norm": 0.671875, "learning_rate": 6.996517294461213e-07, "loss": 0.9365, "step": 8608 }, { "epoch": 4.621724103218013, "grad_norm": 0.6015625, "learning_rate": 6.976691561463217e-07, "loss": 0.7382, "step": 8609 }, { "epoch": 4.622260997953089, "grad_norm": 0.6328125, "learning_rate": 6.956893560502359e-07, "loss": 0.7634, "step": 8610 }, { "epoch": 4.6227978926881645, "grad_norm": 0.6015625, "learning_rate": 6.937123293837805e-07, "loss": 0.8106, "step": 8611 }, { "epoch": 4.623334787423241, "grad_norm": 0.76953125, "learning_rate": 6.91738076372564e-07, "loss": 0.7068, "step": 8612 }, { "epoch": 4.623871682158317, "grad_norm": 0.6875, "learning_rate": 6.897665972418755e-07, "loss": 0.7453, "step": 8613 }, { "epoch": 4.6244085768933925, "grad_norm": 0.65234375, "learning_rate": 6.877978922166767e-07, "loss": 0.7838, "step": 8614 }, { "epoch": 4.624945471628469, "grad_norm": 0.72265625, "learning_rate": 6.858319615216296e-07, "loss": 0.8509, "step": 8615 }, { "epoch": 4.625482366363545, "grad_norm": 0.73828125, "learning_rate": 6.838688053810716e-07, "loss": 0.8256, "step": 8616 }, { "epoch": 4.626019261098621, "grad_norm": 0.703125, "learning_rate": 6.819084240190205e-07, "loss": 0.7821, "step": 8617 }, { "epoch": 4.626556155833697, "grad_norm": 0.66796875, "learning_rate": 6.799508176591834e-07, "loss": 0.9111, "step": 8618 }, { "epoch": 4.627093050568773, "grad_norm": 0.65234375, "learning_rate": 6.779959865249458e-07, "loss": 0.7661, "step": 8619 }, { "epoch": 4.627629945303849, "grad_norm": 0.6953125, "learning_rate": 6.760439308393763e-07, "loss": 0.6803, "step": 8620 }, { "epoch": 4.628166840038925, "grad_norm": 0.75, "learning_rate": 6.740946508252355e-07, "loss": 0.9843, "step": 8621 }, { "epoch": 4.6287037347740005, "grad_norm": 0.6015625, "learning_rate": 6.721481467049568e-07, "loss": 0.7459, "step": 8622 }, { "epoch": 4.629240629509077, "grad_norm": 0.8515625, "learning_rate": 6.702044187006623e-07, "loss": 0.7871, "step": 8623 }, { "epoch": 4.629777524244153, "grad_norm": 0.71484375, "learning_rate": 6.682634670341525e-07, "loss": 0.7931, "step": 8624 }, { "epoch": 4.6303144189792285, "grad_norm": 0.828125, "learning_rate": 6.663252919269225e-07, "loss": 0.9139, "step": 8625 }, { "epoch": 4.630851313714305, "grad_norm": 0.61328125, "learning_rate": 6.643898936001369e-07, "loss": 0.8128, "step": 8626 }, { "epoch": 4.631388208449381, "grad_norm": 0.6953125, "learning_rate": 6.62457272274647e-07, "loss": 0.8223, "step": 8627 }, { "epoch": 4.631925103184457, "grad_norm": 0.69140625, "learning_rate": 6.605274281709928e-07, "loss": 0.9057, "step": 8628 }, { "epoch": 4.632461997919533, "grad_norm": 0.6328125, "learning_rate": 6.586003615093955e-07, "loss": 0.8061, "step": 8629 }, { "epoch": 4.632998892654609, "grad_norm": 0.9453125, "learning_rate": 6.566760725097543e-07, "loss": 0.9027, "step": 8630 }, { "epoch": 4.633535787389685, "grad_norm": 0.94140625, "learning_rate": 6.547545613916572e-07, "loss": 0.6901, "step": 8631 }, { "epoch": 4.634072682124761, "grad_norm": 0.71484375, "learning_rate": 6.528358283743735e-07, "loss": 0.9554, "step": 8632 }, { "epoch": 4.6346095768598365, "grad_norm": 0.64453125, "learning_rate": 6.509198736768529e-07, "loss": 0.7403, "step": 8633 }, { "epoch": 4.635146471594913, "grad_norm": 0.625, "learning_rate": 6.490066975177317e-07, "loss": 0.7575, "step": 8634 }, { "epoch": 4.635683366329989, "grad_norm": 0.62109375, "learning_rate": 6.470963001153269e-07, "loss": 0.8484, "step": 8635 }, { "epoch": 4.6362202610650645, "grad_norm": 0.625, "learning_rate": 6.451886816876363e-07, "loss": 0.6832, "step": 8636 }, { "epoch": 4.636757155800141, "grad_norm": 0.734375, "learning_rate": 6.432838424523441e-07, "loss": 0.9626, "step": 8637 }, { "epoch": 4.637294050535217, "grad_norm": 0.68359375, "learning_rate": 6.413817826268236e-07, "loss": 0.8905, "step": 8638 }, { "epoch": 4.637830945270293, "grad_norm": 0.70703125, "learning_rate": 6.394825024281154e-07, "loss": 0.7792, "step": 8639 }, { "epoch": 4.638367840005369, "grad_norm": 0.7578125, "learning_rate": 6.375860020729541e-07, "loss": 0.9051, "step": 8640 }, { "epoch": 4.638904734740445, "grad_norm": 0.70703125, "learning_rate": 6.356922817777583e-07, "loss": 0.7692, "step": 8641 }, { "epoch": 4.639441629475521, "grad_norm": 0.83203125, "learning_rate": 6.338013417586192e-07, "loss": 0.8862, "step": 8642 }, { "epoch": 4.639978524210597, "grad_norm": 0.62109375, "learning_rate": 6.319131822313224e-07, "loss": 0.7468, "step": 8643 }, { "epoch": 4.640515418945673, "grad_norm": 0.63671875, "learning_rate": 6.30027803411326e-07, "loss": 0.8482, "step": 8644 }, { "epoch": 4.641052313680749, "grad_norm": 0.8515625, "learning_rate": 6.281452055137804e-07, "loss": 1.0957, "step": 8645 }, { "epoch": 4.641589208415825, "grad_norm": 0.59765625, "learning_rate": 6.262653887535081e-07, "loss": 0.7458, "step": 8646 }, { "epoch": 4.642126103150901, "grad_norm": 0.7421875, "learning_rate": 6.243883533450262e-07, "loss": 0.7512, "step": 8647 }, { "epoch": 4.642662997885977, "grad_norm": 0.77734375, "learning_rate": 6.22514099502522e-07, "loss": 0.9586, "step": 8648 }, { "epoch": 4.643199892621053, "grad_norm": 0.68359375, "learning_rate": 6.206426274398769e-07, "loss": 0.8132, "step": 8649 }, { "epoch": 4.643736787356129, "grad_norm": 0.6875, "learning_rate": 6.187739373706508e-07, "loss": 0.9041, "step": 8650 }, { "epoch": 4.643736787356129, "eval_loss": 0.9026292562484741, "eval_runtime": 46.1503, "eval_samples_per_second": 20.845, "eval_steps_per_second": 20.845, "step": 8650 }, { "epoch": 4.644273682091205, "grad_norm": 0.71875, "learning_rate": 6.16908029508076e-07, "loss": 0.6977, "step": 8651 }, { "epoch": 4.644810576826281, "grad_norm": 0.63671875, "learning_rate": 6.150449040650874e-07, "loss": 0.7941, "step": 8652 }, { "epoch": 4.645347471561357, "grad_norm": 0.6953125, "learning_rate": 6.13184561254282e-07, "loss": 0.7809, "step": 8653 }, { "epoch": 4.645884366296433, "grad_norm": 0.6328125, "learning_rate": 6.113270012879535e-07, "loss": 0.725, "step": 8654 }, { "epoch": 4.646421261031509, "grad_norm": 0.68359375, "learning_rate": 6.094722243780714e-07, "loss": 0.8718, "step": 8655 }, { "epoch": 4.646958155766585, "grad_norm": 0.7421875, "learning_rate": 6.076202307362888e-07, "loss": 0.7824, "step": 8656 }, { "epoch": 4.647495050501661, "grad_norm": 0.703125, "learning_rate": 6.05771020573942e-07, "loss": 0.8179, "step": 8657 }, { "epoch": 4.648031945236737, "grad_norm": 0.90234375, "learning_rate": 6.039245941020511e-07, "loss": 0.7794, "step": 8658 }, { "epoch": 4.648568839971813, "grad_norm": 0.63671875, "learning_rate": 6.020809515313142e-07, "loss": 0.9527, "step": 8659 }, { "epoch": 4.649105734706889, "grad_norm": 0.609375, "learning_rate": 6.002400930721186e-07, "loss": 0.671, "step": 8660 }, { "epoch": 4.649642629441965, "grad_norm": 0.703125, "learning_rate": 5.984020189345241e-07, "loss": 0.8642, "step": 8661 }, { "epoch": 4.650179524177041, "grad_norm": 0.625, "learning_rate": 5.965667293282795e-07, "loss": 0.8086, "step": 8662 }, { "epoch": 4.650716418912117, "grad_norm": 0.5625, "learning_rate": 5.947342244628174e-07, "loss": 0.7407, "step": 8663 }, { "epoch": 4.651253313647193, "grad_norm": 0.671875, "learning_rate": 5.929045045472453e-07, "loss": 0.865, "step": 8664 }, { "epoch": 4.651790208382269, "grad_norm": 0.73828125, "learning_rate": 5.910775697903603e-07, "loss": 0.7146, "step": 8665 }, { "epoch": 4.652327103117345, "grad_norm": 0.71875, "learning_rate": 5.892534204006428e-07, "loss": 0.9263, "step": 8666 }, { "epoch": 4.652863997852421, "grad_norm": 0.64453125, "learning_rate": 5.874320565862429e-07, "loss": 0.9011, "step": 8667 }, { "epoch": 4.653400892587497, "grad_norm": 0.63671875, "learning_rate": 5.856134785550082e-07, "loss": 0.8471, "step": 8668 }, { "epoch": 4.653937787322573, "grad_norm": 0.60546875, "learning_rate": 5.837976865144563e-07, "loss": 0.6945, "step": 8669 }, { "epoch": 4.654474682057649, "grad_norm": 0.6875, "learning_rate": 5.81984680671796e-07, "loss": 0.9044, "step": 8670 }, { "epoch": 4.655011576792725, "grad_norm": 0.7109375, "learning_rate": 5.801744612339121e-07, "loss": 0.7981, "step": 8671 }, { "epoch": 4.655548471527801, "grad_norm": 0.6953125, "learning_rate": 5.783670284073727e-07, "loss": 0.7813, "step": 8672 }, { "epoch": 4.656085366262877, "grad_norm": 0.79296875, "learning_rate": 5.765623823984351e-07, "loss": 0.8944, "step": 8673 }, { "epoch": 4.656622260997953, "grad_norm": 0.8203125, "learning_rate": 5.747605234130233e-07, "loss": 0.937, "step": 8674 }, { "epoch": 4.657159155733029, "grad_norm": 0.6640625, "learning_rate": 5.729614516567588e-07, "loss": 0.6755, "step": 8675 }, { "epoch": 4.657696050468105, "grad_norm": 0.69921875, "learning_rate": 5.711651673349333e-07, "loss": 0.9269, "step": 8676 }, { "epoch": 4.658232945203181, "grad_norm": 0.703125, "learning_rate": 5.693716706525326e-07, "loss": 0.7199, "step": 8677 }, { "epoch": 4.658769839938257, "grad_norm": 0.7265625, "learning_rate": 5.675809618142097e-07, "loss": 0.8776, "step": 8678 }, { "epoch": 4.659306734673333, "grad_norm": 0.64453125, "learning_rate": 5.657930410243123e-07, "loss": 1.0098, "step": 8679 }, { "epoch": 4.659843629408409, "grad_norm": 0.7578125, "learning_rate": 5.640079084868632e-07, "loss": 0.9221, "step": 8680 }, { "epoch": 4.660380524143485, "grad_norm": 0.66015625, "learning_rate": 5.622255644055663e-07, "loss": 0.7009, "step": 8681 }, { "epoch": 4.660917418878562, "grad_norm": 0.796875, "learning_rate": 5.604460089838143e-07, "loss": 0.9555, "step": 8682 }, { "epoch": 4.661454313613637, "grad_norm": 0.70703125, "learning_rate": 5.586692424246725e-07, "loss": 0.8738, "step": 8683 }, { "epoch": 4.661991208348713, "grad_norm": 0.703125, "learning_rate": 5.568952649308957e-07, "loss": 1.0114, "step": 8684 }, { "epoch": 4.6625281030837895, "grad_norm": 0.62890625, "learning_rate": 5.551240767049187e-07, "loss": 0.8329, "step": 8685 }, { "epoch": 4.663064997818865, "grad_norm": 0.65625, "learning_rate": 5.533556779488497e-07, "loss": 0.7449, "step": 8686 }, { "epoch": 4.663601892553941, "grad_norm": 0.5546875, "learning_rate": 5.515900688644909e-07, "loss": 0.7247, "step": 8687 }, { "epoch": 4.664138787289017, "grad_norm": 0.84375, "learning_rate": 5.498272496533202e-07, "loss": 1.0778, "step": 8688 }, { "epoch": 4.664675682024093, "grad_norm": 0.703125, "learning_rate": 5.480672205164961e-07, "loss": 1.026, "step": 8689 }, { "epoch": 4.665212576759169, "grad_norm": 0.70703125, "learning_rate": 5.463099816548579e-07, "loss": 0.8403, "step": 8690 }, { "epoch": 4.665749471494245, "grad_norm": 0.6484375, "learning_rate": 5.445555332689367e-07, "loss": 0.8892, "step": 8691 }, { "epoch": 4.666286366229321, "grad_norm": 0.703125, "learning_rate": 5.428038755589282e-07, "loss": 0.7186, "step": 8692 }, { "epoch": 4.666823260964398, "grad_norm": 0.64453125, "learning_rate": 5.410550087247251e-07, "loss": 0.7183, "step": 8693 }, { "epoch": 4.667360155699473, "grad_norm": 0.56640625, "learning_rate": 5.393089329658929e-07, "loss": 0.7167, "step": 8694 }, { "epoch": 4.667897050434549, "grad_norm": 0.64453125, "learning_rate": 5.375656484816832e-07, "loss": 0.8619, "step": 8695 }, { "epoch": 4.6684339451696255, "grad_norm": 0.7109375, "learning_rate": 5.358251554710203e-07, "loss": 0.9259, "step": 8696 }, { "epoch": 4.668970839904701, "grad_norm": 0.62890625, "learning_rate": 5.340874541325231e-07, "loss": 0.7198, "step": 8697 }, { "epoch": 4.669507734639777, "grad_norm": 0.73046875, "learning_rate": 5.323525446644828e-07, "loss": 1.0486, "step": 8698 }, { "epoch": 4.670044629374853, "grad_norm": 0.69140625, "learning_rate": 5.306204272648747e-07, "loss": 0.8277, "step": 8699 }, { "epoch": 4.670581524109929, "grad_norm": 0.7890625, "learning_rate": 5.288911021313542e-07, "loss": 0.7683, "step": 8700 }, { "epoch": 4.670581524109929, "eval_loss": 0.9025928974151611, "eval_runtime": 46.0125, "eval_samples_per_second": 20.907, "eval_steps_per_second": 20.907, "step": 8700 }, { "epoch": 4.671118418845005, "grad_norm": 0.6640625, "learning_rate": 5.271645694612637e-07, "loss": 0.7914, "step": 8701 }, { "epoch": 4.671655313580081, "grad_norm": 0.6796875, "learning_rate": 5.254408294516178e-07, "loss": 0.7356, "step": 8702 }, { "epoch": 4.672192208315157, "grad_norm": 1.078125, "learning_rate": 5.237198822991174e-07, "loss": 0.8037, "step": 8703 }, { "epoch": 4.672729103050234, "grad_norm": 0.6953125, "learning_rate": 5.220017282001472e-07, "loss": 0.7648, "step": 8704 }, { "epoch": 4.673265997785309, "grad_norm": 0.84375, "learning_rate": 5.202863673507668e-07, "loss": 0.9845, "step": 8705 }, { "epoch": 4.673802892520385, "grad_norm": 0.6640625, "learning_rate": 5.185737999467227e-07, "loss": 0.8392, "step": 8706 }, { "epoch": 4.6743397872554615, "grad_norm": 0.8515625, "learning_rate": 5.168640261834417e-07, "loss": 0.8451, "step": 8707 }, { "epoch": 4.674876681990537, "grad_norm": 0.69921875, "learning_rate": 5.151570462560262e-07, "loss": 0.8909, "step": 8708 }, { "epoch": 4.675413576725613, "grad_norm": 0.71875, "learning_rate": 5.134528603592676e-07, "loss": 0.9123, "step": 8709 }, { "epoch": 4.675950471460689, "grad_norm": 0.76171875, "learning_rate": 5.117514686876379e-07, "loss": 0.8133, "step": 8710 }, { "epoch": 4.676487366195765, "grad_norm": 0.72265625, "learning_rate": 5.100528714352848e-07, "loss": 0.8935, "step": 8711 }, { "epoch": 4.677024260930841, "grad_norm": 0.75390625, "learning_rate": 5.083570687960337e-07, "loss": 1.0045, "step": 8712 }, { "epoch": 4.677561155665917, "grad_norm": 0.70703125, "learning_rate": 5.066640609634049e-07, "loss": 0.9306, "step": 8713 }, { "epoch": 4.678098050400993, "grad_norm": 0.734375, "learning_rate": 5.049738481305938e-07, "loss": 0.9433, "step": 8714 }, { "epoch": 4.67863494513607, "grad_norm": 0.671875, "learning_rate": 5.032864304904656e-07, "loss": 0.8312, "step": 8715 }, { "epoch": 4.679171839871145, "grad_norm": 0.59375, "learning_rate": 5.016018082355856e-07, "loss": 0.741, "step": 8716 }, { "epoch": 4.679708734606221, "grad_norm": 0.62890625, "learning_rate": 4.999199815581834e-07, "loss": 0.775, "step": 8717 }, { "epoch": 4.6802456293412975, "grad_norm": 0.73046875, "learning_rate": 4.982409506501779e-07, "loss": 0.9109, "step": 8718 }, { "epoch": 4.680782524076373, "grad_norm": 0.640625, "learning_rate": 4.965647157031739e-07, "loss": 0.8084, "step": 8719 }, { "epoch": 4.68131941881145, "grad_norm": 0.64453125, "learning_rate": 4.948912769084463e-07, "loss": 0.877, "step": 8720 }, { "epoch": 4.681856313546525, "grad_norm": 0.66796875, "learning_rate": 4.932206344569562e-07, "loss": 0.7337, "step": 8721 }, { "epoch": 4.682393208281601, "grad_norm": 0.6484375, "learning_rate": 4.915527885393429e-07, "loss": 0.8027, "step": 8722 }, { "epoch": 4.682930103016678, "grad_norm": 0.76171875, "learning_rate": 4.898877393459317e-07, "loss": 0.9704, "step": 8723 }, { "epoch": 4.683466997751753, "grad_norm": 0.69140625, "learning_rate": 4.882254870667236e-07, "loss": 1.0239, "step": 8724 }, { "epoch": 4.684003892486829, "grad_norm": 0.62109375, "learning_rate": 4.865660318914056e-07, "loss": 0.7066, "step": 8725 }, { "epoch": 4.684540787221906, "grad_norm": 0.78125, "learning_rate": 4.849093740093403e-07, "loss": 1.0652, "step": 8726 }, { "epoch": 4.685077681956981, "grad_norm": 0.61328125, "learning_rate": 4.832555136095762e-07, "loss": 0.7992, "step": 8727 }, { "epoch": 4.685614576692057, "grad_norm": 0.64453125, "learning_rate": 4.816044508808349e-07, "loss": 1.0047, "step": 8728 }, { "epoch": 4.6861514714271335, "grad_norm": 1.0234375, "learning_rate": 4.799561860115265e-07, "loss": 0.9498, "step": 8729 }, { "epoch": 4.686688366162209, "grad_norm": 0.8203125, "learning_rate": 4.783107191897368e-07, "loss": 0.9027, "step": 8730 }, { "epoch": 4.687225260897286, "grad_norm": 0.70703125, "learning_rate": 4.766680506032378e-07, "loss": 0.8182, "step": 8731 }, { "epoch": 4.687762155632361, "grad_norm": 0.6875, "learning_rate": 4.750281804394796e-07, "loss": 0.7541, "step": 8732 }, { "epoch": 4.688299050367437, "grad_norm": 0.59765625, "learning_rate": 4.733911088855847e-07, "loss": 0.823, "step": 8733 }, { "epoch": 4.688835945102514, "grad_norm": 0.80078125, "learning_rate": 4.7175683612837027e-07, "loss": 0.8659, "step": 8734 }, { "epoch": 4.689372839837589, "grad_norm": 0.72265625, "learning_rate": 4.701253623543289e-07, "loss": 0.7827, "step": 8735 }, { "epoch": 4.689909734572665, "grad_norm": 0.7265625, "learning_rate": 4.684966877496283e-07, "loss": 0.9438, "step": 8736 }, { "epoch": 4.690446629307742, "grad_norm": 0.74609375, "learning_rate": 4.668708125001198e-07, "loss": 0.9088, "step": 8737 }, { "epoch": 4.690983524042817, "grad_norm": 0.74609375, "learning_rate": 4.65247736791341e-07, "loss": 0.9442, "step": 8738 }, { "epoch": 4.691520418777893, "grad_norm": 0.8984375, "learning_rate": 4.636274608085023e-07, "loss": 0.9731, "step": 8739 }, { "epoch": 4.6920573135129695, "grad_norm": 0.71875, "learning_rate": 4.6200998473649725e-07, "loss": 0.792, "step": 8740 }, { "epoch": 4.692594208248045, "grad_norm": 0.66015625, "learning_rate": 4.6039530875990057e-07, "loss": 0.8352, "step": 8741 }, { "epoch": 4.693131102983122, "grad_norm": 0.6875, "learning_rate": 4.5878343306297033e-07, "loss": 0.7705, "step": 8742 }, { "epoch": 4.693667997718197, "grad_norm": 0.8125, "learning_rate": 4.5717435782963726e-07, "loss": 0.9759, "step": 8743 }, { "epoch": 4.694204892453273, "grad_norm": 0.66015625, "learning_rate": 4.5556808324352386e-07, "loss": 0.807, "step": 8744 }, { "epoch": 4.69474178718835, "grad_norm": 0.76171875, "learning_rate": 4.5396460948791963e-07, "loss": 1.0424, "step": 8745 }, { "epoch": 4.695278681923425, "grad_norm": 0.625, "learning_rate": 4.5236393674580324e-07, "loss": 1.0559, "step": 8746 }, { "epoch": 4.695815576658501, "grad_norm": 0.60546875, "learning_rate": 4.5076606519983135e-07, "loss": 0.8719, "step": 8747 }, { "epoch": 4.696352471393578, "grad_norm": 0.82421875, "learning_rate": 4.491709950323469e-07, "loss": 0.7993, "step": 8748 }, { "epoch": 4.696889366128653, "grad_norm": 0.66796875, "learning_rate": 4.4757872642536e-07, "loss": 0.8002, "step": 8749 }, { "epoch": 4.697426260863729, "grad_norm": 0.6875, "learning_rate": 4.459892595605697e-07, "loss": 0.8538, "step": 8750 }, { "epoch": 4.697426260863729, "eval_loss": 0.9027050137519836, "eval_runtime": 45.879, "eval_samples_per_second": 20.968, "eval_steps_per_second": 20.968, "step": 8750 }, { "epoch": 4.6979631555988055, "grad_norm": 0.67578125, "learning_rate": 4.444025946193586e-07, "loss": 0.9229, "step": 8751 }, { "epoch": 4.698500050333881, "grad_norm": 0.7109375, "learning_rate": 4.4281873178278475e-07, "loss": 0.9207, "step": 8752 }, { "epoch": 4.699036945068958, "grad_norm": 0.609375, "learning_rate": 4.412376712315813e-07, "loss": 0.7659, "step": 8753 }, { "epoch": 4.699573839804033, "grad_norm": 0.6015625, "learning_rate": 4.396594131461734e-07, "loss": 0.725, "step": 8754 }, { "epoch": 4.700110734539109, "grad_norm": 0.609375, "learning_rate": 4.3808395770665865e-07, "loss": 0.6612, "step": 8755 }, { "epoch": 4.700647629274186, "grad_norm": 0.78125, "learning_rate": 4.365113050928127e-07, "loss": 0.9153, "step": 8756 }, { "epoch": 4.701184524009261, "grad_norm": 0.71484375, "learning_rate": 4.349414554841003e-07, "loss": 0.8003, "step": 8757 }, { "epoch": 4.701721418744338, "grad_norm": 0.5625, "learning_rate": 4.3337440905966143e-07, "loss": 0.6134, "step": 8758 }, { "epoch": 4.702258313479414, "grad_norm": 0.640625, "learning_rate": 4.318101659983087e-07, "loss": 0.8941, "step": 8759 }, { "epoch": 4.702795208214489, "grad_norm": 0.6328125, "learning_rate": 4.302487264785521e-07, "loss": 0.6993, "step": 8760 }, { "epoch": 4.703332102949566, "grad_norm": 0.7421875, "learning_rate": 4.2869009067856293e-07, "loss": 0.8569, "step": 8761 }, { "epoch": 4.7038689976846415, "grad_norm": 0.8203125, "learning_rate": 4.2713425877620726e-07, "loss": 0.9141, "step": 8762 }, { "epoch": 4.704405892419717, "grad_norm": 0.703125, "learning_rate": 4.255812309490209e-07, "loss": 0.9555, "step": 8763 }, { "epoch": 4.704942787154794, "grad_norm": 0.640625, "learning_rate": 4.2403100737422595e-07, "loss": 0.8367, "step": 8764 }, { "epoch": 4.705479681889869, "grad_norm": 0.59765625, "learning_rate": 4.2248358822872256e-07, "loss": 0.6338, "step": 8765 }, { "epoch": 4.706016576624945, "grad_norm": 0.75390625, "learning_rate": 4.2093897368909173e-07, "loss": 1.0227, "step": 8766 }, { "epoch": 4.706553471360022, "grad_norm": 0.68359375, "learning_rate": 4.1939716393159245e-07, "loss": 0.7389, "step": 8767 }, { "epoch": 4.707090366095097, "grad_norm": 0.9140625, "learning_rate": 4.178581591321645e-07, "loss": 0.7469, "step": 8768 }, { "epoch": 4.707627260830174, "grad_norm": 0.6484375, "learning_rate": 4.163219594664286e-07, "loss": 0.5844, "step": 8769 }, { "epoch": 4.70816415556525, "grad_norm": 0.78515625, "learning_rate": 4.147885651096861e-07, "loss": 0.8751, "step": 8770 }, { "epoch": 4.708701050300325, "grad_norm": 0.609375, "learning_rate": 4.13257976236911e-07, "loss": 0.8312, "step": 8771 }, { "epoch": 4.709237945035402, "grad_norm": 0.7109375, "learning_rate": 4.1173019302276905e-07, "loss": 0.8552, "step": 8772 }, { "epoch": 4.7097748397704775, "grad_norm": 0.8359375, "learning_rate": 4.1020521564159863e-07, "loss": 0.8906, "step": 8773 }, { "epoch": 4.710311734505553, "grad_norm": 0.7578125, "learning_rate": 4.0868304426741334e-07, "loss": 0.8281, "step": 8774 }, { "epoch": 4.71084862924063, "grad_norm": 0.68359375, "learning_rate": 4.071636790739186e-07, "loss": 0.5785, "step": 8775 }, { "epoch": 4.711385523975705, "grad_norm": 0.9140625, "learning_rate": 4.0564712023449246e-07, "loss": 0.7485, "step": 8776 }, { "epoch": 4.711922418710781, "grad_norm": 0.71484375, "learning_rate": 4.041333679221909e-07, "loss": 1.0854, "step": 8777 }, { "epoch": 4.712459313445858, "grad_norm": 0.5703125, "learning_rate": 4.0262242230975353e-07, "loss": 0.717, "step": 8778 }, { "epoch": 4.712996208180933, "grad_norm": 0.80078125, "learning_rate": 4.01114283569598e-07, "loss": 0.8729, "step": 8779 }, { "epoch": 4.71353310291601, "grad_norm": 0.70703125, "learning_rate": 3.996089518738255e-07, "loss": 0.8346, "step": 8780 }, { "epoch": 4.714069997651086, "grad_norm": 0.71875, "learning_rate": 3.98106427394207e-07, "loss": 0.7998, "step": 8781 }, { "epoch": 4.714606892386161, "grad_norm": 0.65234375, "learning_rate": 3.9660671030220254e-07, "loss": 0.8557, "step": 8782 }, { "epoch": 4.715143787121238, "grad_norm": 0.69921875, "learning_rate": 3.95109800768953e-07, "loss": 0.8293, "step": 8783 }, { "epoch": 4.7156806818563135, "grad_norm": 0.6953125, "learning_rate": 3.9361569896526616e-07, "loss": 0.9219, "step": 8784 }, { "epoch": 4.716217576591389, "grad_norm": 0.7109375, "learning_rate": 3.921244050616446e-07, "loss": 0.9588, "step": 8785 }, { "epoch": 4.716754471326466, "grad_norm": 0.796875, "learning_rate": 3.9063591922826604e-07, "loss": 0.9811, "step": 8786 }, { "epoch": 4.717291366061541, "grad_norm": 0.69140625, "learning_rate": 3.891502416349779e-07, "loss": 0.696, "step": 8787 }, { "epoch": 4.717828260796617, "grad_norm": 0.76953125, "learning_rate": 3.876673724513169e-07, "loss": 0.8588, "step": 8788 }, { "epoch": 4.718365155531694, "grad_norm": 0.64453125, "learning_rate": 3.8618731184650035e-07, "loss": 0.9651, "step": 8789 }, { "epoch": 4.718902050266769, "grad_norm": 0.69140625, "learning_rate": 3.84710059989421e-07, "loss": 0.8654, "step": 8790 }, { "epoch": 4.719438945001846, "grad_norm": 0.703125, "learning_rate": 3.8323561704864676e-07, "loss": 0.8077, "step": 8791 }, { "epoch": 4.719975839736922, "grad_norm": 0.78125, "learning_rate": 3.817639831924402e-07, "loss": 0.804, "step": 8792 }, { "epoch": 4.720512734471997, "grad_norm": 0.765625, "learning_rate": 3.8029515858872543e-07, "loss": 0.9968, "step": 8793 }, { "epoch": 4.721049629207074, "grad_norm": 0.65234375, "learning_rate": 3.788291434051183e-07, "loss": 0.9055, "step": 8794 }, { "epoch": 4.7215865239421495, "grad_norm": 0.77734375, "learning_rate": 3.773659378089045e-07, "loss": 0.9227, "step": 8795 }, { "epoch": 4.722123418677226, "grad_norm": 0.69921875, "learning_rate": 3.759055419670615e-07, "loss": 0.8571, "step": 8796 }, { "epoch": 4.722660313412302, "grad_norm": 0.71484375, "learning_rate": 3.744479560462311e-07, "loss": 0.898, "step": 8797 }, { "epoch": 4.723197208147377, "grad_norm": 0.65234375, "learning_rate": 3.7299318021274953e-07, "loss": 0.7944, "step": 8798 }, { "epoch": 4.723734102882454, "grad_norm": 0.703125, "learning_rate": 3.715412146326203e-07, "loss": 1.0103, "step": 8799 }, { "epoch": 4.72427099761753, "grad_norm": 0.63671875, "learning_rate": 3.700920594715329e-07, "loss": 0.837, "step": 8800 }, { "epoch": 4.72427099761753, "eval_loss": 0.9027426838874817, "eval_runtime": 46.2574, "eval_samples_per_second": 20.797, "eval_steps_per_second": 20.797, "step": 8800 }, { "epoch": 4.724807892352605, "grad_norm": 0.6640625, "learning_rate": 3.6864571489485524e-07, "loss": 0.7167, "step": 8801 }, { "epoch": 4.725344787087682, "grad_norm": 0.67578125, "learning_rate": 3.6720218106763016e-07, "loss": 0.9992, "step": 8802 }, { "epoch": 4.725881681822758, "grad_norm": 0.76171875, "learning_rate": 3.6576145815458707e-07, "loss": 0.8699, "step": 8803 }, { "epoch": 4.726418576557833, "grad_norm": 0.73828125, "learning_rate": 3.6432354632012776e-07, "loss": 0.9403, "step": 8804 }, { "epoch": 4.72695547129291, "grad_norm": 0.6171875, "learning_rate": 3.6288844572834046e-07, "loss": 0.8786, "step": 8805 }, { "epoch": 4.7274923660279855, "grad_norm": 0.671875, "learning_rate": 3.6145615654298024e-07, "loss": 0.7672, "step": 8806 }, { "epoch": 4.728029260763062, "grad_norm": 0.69921875, "learning_rate": 3.60026678927497e-07, "loss": 0.7765, "step": 8807 }, { "epoch": 4.728566155498138, "grad_norm": 0.66015625, "learning_rate": 3.586000130450101e-07, "loss": 0.8297, "step": 8808 }, { "epoch": 4.729103050233213, "grad_norm": 0.69140625, "learning_rate": 3.5717615905832003e-07, "loss": 0.8162, "step": 8809 }, { "epoch": 4.72963994496829, "grad_norm": 0.71875, "learning_rate": 3.557551171299051e-07, "loss": 0.8945, "step": 8810 }, { "epoch": 4.730176839703366, "grad_norm": 0.69140625, "learning_rate": 3.5433688742192995e-07, "loss": 0.8245, "step": 8811 }, { "epoch": 4.730713734438441, "grad_norm": 0.6875, "learning_rate": 3.529214700962236e-07, "loss": 0.8519, "step": 8812 }, { "epoch": 4.731250629173518, "grad_norm": 0.58984375, "learning_rate": 3.5150886531430673e-07, "loss": 0.6998, "step": 8813 }, { "epoch": 4.731787523908594, "grad_norm": 0.7265625, "learning_rate": 3.5009907323737825e-07, "loss": 0.7962, "step": 8814 }, { "epoch": 4.732324418643669, "grad_norm": 0.8125, "learning_rate": 3.4869209402630943e-07, "loss": 0.8786, "step": 8815 }, { "epoch": 4.732861313378746, "grad_norm": 0.640625, "learning_rate": 3.4728792784165797e-07, "loss": 0.8427, "step": 8816 }, { "epoch": 4.7333982081138215, "grad_norm": 0.78515625, "learning_rate": 3.458865748436541e-07, "loss": 0.8291, "step": 8817 }, { "epoch": 4.733935102848898, "grad_norm": 0.77734375, "learning_rate": 3.444880351922114e-07, "loss": 0.9613, "step": 8818 }, { "epoch": 4.734471997583974, "grad_norm": 0.70703125, "learning_rate": 3.430923090469218e-07, "loss": 0.8607, "step": 8819 }, { "epoch": 4.735008892319049, "grad_norm": 0.76171875, "learning_rate": 3.416993965670523e-07, "loss": 0.7448, "step": 8820 }, { "epoch": 4.735545787054126, "grad_norm": 0.90234375, "learning_rate": 3.403092979115563e-07, "loss": 0.9397, "step": 8821 }, { "epoch": 4.736082681789202, "grad_norm": 0.6640625, "learning_rate": 3.389220132390569e-07, "loss": 0.8316, "step": 8822 }, { "epoch": 4.736619576524277, "grad_norm": 0.6796875, "learning_rate": 3.3753754270786365e-07, "loss": 0.879, "step": 8823 }, { "epoch": 4.737156471259354, "grad_norm": 0.78125, "learning_rate": 3.36155886475964e-07, "loss": 0.6894, "step": 8824 }, { "epoch": 4.73769336599443, "grad_norm": 0.90234375, "learning_rate": 3.3477704470101787e-07, "loss": 1.0246, "step": 8825 }, { "epoch": 4.738230260729505, "grad_norm": 0.6796875, "learning_rate": 3.33401017540369e-07, "loss": 0.9192, "step": 8826 }, { "epoch": 4.738767155464582, "grad_norm": 0.609375, "learning_rate": 3.3202780515104447e-07, "loss": 0.9538, "step": 8827 }, { "epoch": 4.7393040501996575, "grad_norm": 0.6328125, "learning_rate": 3.306574076897412e-07, "loss": 0.7781, "step": 8828 }, { "epoch": 4.739840944934734, "grad_norm": 0.671875, "learning_rate": 3.292898253128368e-07, "loss": 0.8118, "step": 8829 }, { "epoch": 4.74037783966981, "grad_norm": 0.66796875, "learning_rate": 3.279250581763982e-07, "loss": 1.0324, "step": 8830 }, { "epoch": 4.740914734404885, "grad_norm": 0.83203125, "learning_rate": 3.2656310643615074e-07, "loss": 0.9676, "step": 8831 }, { "epoch": 4.741451629139962, "grad_norm": 0.703125, "learning_rate": 3.2520397024752016e-07, "loss": 0.9083, "step": 8832 }, { "epoch": 4.741988523875038, "grad_norm": 0.5078125, "learning_rate": 3.238476497655962e-07, "loss": 0.5256, "step": 8833 }, { "epoch": 4.742525418610114, "grad_norm": 0.796875, "learning_rate": 3.224941451451552e-07, "loss": 0.9408, "step": 8834 }, { "epoch": 4.74306231334519, "grad_norm": 0.69921875, "learning_rate": 3.211434565406457e-07, "loss": 0.9741, "step": 8835 }, { "epoch": 4.743599208080266, "grad_norm": 0.94140625, "learning_rate": 3.197955841062028e-07, "loss": 0.7964, "step": 8836 }, { "epoch": 4.744136102815342, "grad_norm": 0.74609375, "learning_rate": 3.184505279956312e-07, "loss": 0.7008, "step": 8837 }, { "epoch": 4.744672997550418, "grad_norm": 0.70703125, "learning_rate": 3.171082883624221e-07, "loss": 0.9395, "step": 8838 }, { "epoch": 4.7452098922854935, "grad_norm": 0.7734375, "learning_rate": 3.1576886535973906e-07, "loss": 0.8211, "step": 8839 }, { "epoch": 4.74574678702057, "grad_norm": 0.63671875, "learning_rate": 3.144322591404292e-07, "loss": 0.9074, "step": 8840 }, { "epoch": 4.746283681755646, "grad_norm": 0.65234375, "learning_rate": 3.1309846985701774e-07, "loss": 0.8083, "step": 8841 }, { "epoch": 4.746820576490721, "grad_norm": 0.6640625, "learning_rate": 3.117674976617024e-07, "loss": 0.9155, "step": 8842 }, { "epoch": 4.747357471225798, "grad_norm": 0.80859375, "learning_rate": 3.104393427063701e-07, "loss": 0.8939, "step": 8843 }, { "epoch": 4.747894365960874, "grad_norm": 0.66796875, "learning_rate": 3.091140051425745e-07, "loss": 0.6923, "step": 8844 }, { "epoch": 4.74843126069595, "grad_norm": 0.72265625, "learning_rate": 3.077914851215585e-07, "loss": 0.8931, "step": 8845 }, { "epoch": 4.748968155431026, "grad_norm": 0.67578125, "learning_rate": 3.0647178279423206e-07, "loss": 0.9833, "step": 8846 }, { "epoch": 4.749505050166102, "grad_norm": 0.6953125, "learning_rate": 3.051548983111968e-07, "loss": 0.8665, "step": 8847 }, { "epoch": 4.750041944901178, "grad_norm": 0.65234375, "learning_rate": 3.038408318227187e-07, "loss": 0.8933, "step": 8848 }, { "epoch": 4.750578839636254, "grad_norm": 0.71875, "learning_rate": 3.025295834787556e-07, "loss": 0.9279, "step": 8849 }, { "epoch": 4.7511157343713295, "grad_norm": 0.5234375, "learning_rate": 3.01221153428935e-07, "loss": 0.7077, "step": 8850 }, { "epoch": 4.7511157343713295, "eval_loss": 0.9026684761047363, "eval_runtime": 47.3564, "eval_samples_per_second": 20.314, "eval_steps_per_second": 20.314, "step": 8850 }, { "epoch": 4.751652629106406, "grad_norm": 0.5703125, "learning_rate": 2.999155418225652e-07, "loss": 0.6956, "step": 8851 }, { "epoch": 4.752189523841482, "grad_norm": 0.70703125, "learning_rate": 2.9861274880863267e-07, "loss": 0.8107, "step": 8852 }, { "epoch": 4.752726418576557, "grad_norm": 0.640625, "learning_rate": 2.973127745358045e-07, "loss": 0.9229, "step": 8853 }, { "epoch": 4.753263313311634, "grad_norm": 0.69140625, "learning_rate": 2.960156191524233e-07, "loss": 0.9259, "step": 8854 }, { "epoch": 4.75380020804671, "grad_norm": 0.6796875, "learning_rate": 2.9472128280650933e-07, "loss": 0.8425, "step": 8855 }, { "epoch": 4.754337102781786, "grad_norm": 0.609375, "learning_rate": 2.934297656457641e-07, "loss": 0.739, "step": 8856 }, { "epoch": 4.754873997516862, "grad_norm": 0.8671875, "learning_rate": 2.9214106781756676e-07, "loss": 0.9329, "step": 8857 }, { "epoch": 4.755410892251938, "grad_norm": 0.6875, "learning_rate": 2.9085518946897484e-07, "loss": 0.7387, "step": 8858 }, { "epoch": 4.755947786987014, "grad_norm": 0.6328125, "learning_rate": 2.8957213074671806e-07, "loss": 0.7765, "step": 8859 }, { "epoch": 4.75648468172209, "grad_norm": 0.67578125, "learning_rate": 2.8829189179721547e-07, "loss": 0.7699, "step": 8860 }, { "epoch": 4.7570215764571655, "grad_norm": 0.6796875, "learning_rate": 2.870144727665558e-07, "loss": 0.8795, "step": 8861 }, { "epoch": 4.757558471192242, "grad_norm": 0.87109375, "learning_rate": 2.857398738005113e-07, "loss": 1.0587, "step": 8862 }, { "epoch": 4.758095365927318, "grad_norm": 0.609375, "learning_rate": 2.8446809504452665e-07, "loss": 0.8542, "step": 8863 }, { "epoch": 4.758632260662393, "grad_norm": 0.8203125, "learning_rate": 2.8319913664372755e-07, "loss": 0.8493, "step": 8864 }, { "epoch": 4.75916915539747, "grad_norm": 0.70703125, "learning_rate": 2.8193299874292313e-07, "loss": 0.8772, "step": 8865 }, { "epoch": 4.759706050132546, "grad_norm": 0.7421875, "learning_rate": 2.8066968148658945e-07, "loss": 0.8548, "step": 8866 }, { "epoch": 4.760242944867622, "grad_norm": 0.734375, "learning_rate": 2.794091850188918e-07, "loss": 0.9739, "step": 8867 }, { "epoch": 4.760779839602698, "grad_norm": 0.7578125, "learning_rate": 2.7815150948366784e-07, "loss": 1.0512, "step": 8868 }, { "epoch": 4.761316734337774, "grad_norm": 0.82421875, "learning_rate": 2.7689665502443064e-07, "loss": 0.9138, "step": 8869 }, { "epoch": 4.76185362907285, "grad_norm": 0.7421875, "learning_rate": 2.7564462178438224e-07, "loss": 0.903, "step": 8870 }, { "epoch": 4.762390523807926, "grad_norm": 0.59765625, "learning_rate": 2.743954099063889e-07, "loss": 0.7088, "step": 8871 }, { "epoch": 4.762927418543002, "grad_norm": 0.60546875, "learning_rate": 2.7314901953300334e-07, "loss": 0.71, "step": 8872 }, { "epoch": 4.763464313278078, "grad_norm": 0.58984375, "learning_rate": 2.7190545080645336e-07, "loss": 0.7832, "step": 8873 }, { "epoch": 4.764001208013154, "grad_norm": 0.61328125, "learning_rate": 2.7066470386865053e-07, "loss": 0.7738, "step": 8874 }, { "epoch": 4.76453810274823, "grad_norm": 0.64453125, "learning_rate": 2.6942677886117597e-07, "loss": 0.9703, "step": 8875 }, { "epoch": 4.765074997483306, "grad_norm": 0.609375, "learning_rate": 2.681916759252917e-07, "loss": 0.759, "step": 8876 }, { "epoch": 4.765611892218382, "grad_norm": 0.703125, "learning_rate": 2.669593952019406e-07, "loss": 0.9378, "step": 8877 }, { "epoch": 4.766148786953458, "grad_norm": 0.7109375, "learning_rate": 2.657299368317434e-07, "loss": 0.8146, "step": 8878 }, { "epoch": 4.766685681688534, "grad_norm": 0.77734375, "learning_rate": 2.645033009549935e-07, "loss": 0.743, "step": 8879 }, { "epoch": 4.76722257642361, "grad_norm": 0.953125, "learning_rate": 2.632794877116651e-07, "loss": 0.8351, "step": 8880 }, { "epoch": 4.767759471158686, "grad_norm": 0.7265625, "learning_rate": 2.620584972414131e-07, "loss": 0.8081, "step": 8881 }, { "epoch": 4.768296365893762, "grad_norm": 0.68359375, "learning_rate": 2.6084032968356763e-07, "loss": 0.9437, "step": 8882 }, { "epoch": 4.768833260628838, "grad_norm": 0.5859375, "learning_rate": 2.5962498517713706e-07, "loss": 0.8131, "step": 8883 }, { "epoch": 4.769370155363914, "grad_norm": 0.77734375, "learning_rate": 2.5841246386080765e-07, "loss": 0.7389, "step": 8884 }, { "epoch": 4.76990705009899, "grad_norm": 0.59375, "learning_rate": 2.572027658729409e-07, "loss": 0.7616, "step": 8885 }, { "epoch": 4.770443944834066, "grad_norm": 0.6484375, "learning_rate": 2.55995891351582e-07, "loss": 0.8065, "step": 8886 }, { "epoch": 4.770980839569142, "grad_norm": 0.6015625, "learning_rate": 2.5479184043445127e-07, "loss": 0.8393, "step": 8887 }, { "epoch": 4.771517734304218, "grad_norm": 0.73828125, "learning_rate": 2.535906132589416e-07, "loss": 0.8836, "step": 8888 }, { "epoch": 4.772054629039294, "grad_norm": 0.796875, "learning_rate": 2.5239220996212955e-07, "loss": 0.999, "step": 8889 }, { "epoch": 4.77259152377437, "grad_norm": 0.66015625, "learning_rate": 2.5119663068077227e-07, "loss": 0.7627, "step": 8890 }, { "epoch": 4.773128418509446, "grad_norm": 0.7421875, "learning_rate": 2.500038755512968e-07, "loss": 0.922, "step": 8891 }, { "epoch": 4.773665313244522, "grad_norm": 0.6171875, "learning_rate": 2.488139447098109e-07, "loss": 0.7346, "step": 8892 }, { "epoch": 4.774202207979598, "grad_norm": 0.76171875, "learning_rate": 2.47626838292106e-07, "loss": 0.9338, "step": 8893 }, { "epoch": 4.774739102714674, "grad_norm": 0.83984375, "learning_rate": 2.4644255643363755e-07, "loss": 0.9669, "step": 8894 }, { "epoch": 4.77527599744975, "grad_norm": 0.59765625, "learning_rate": 2.452610992695559e-07, "loss": 0.7975, "step": 8895 }, { "epoch": 4.775812892184826, "grad_norm": 0.625, "learning_rate": 2.440824669346753e-07, "loss": 0.7544, "step": 8896 }, { "epoch": 4.776349786919902, "grad_norm": 0.74609375, "learning_rate": 2.429066595634938e-07, "loss": 0.9778, "step": 8897 }, { "epoch": 4.776886681654978, "grad_norm": 0.66796875, "learning_rate": 2.4173367729018457e-07, "loss": 0.7701, "step": 8898 }, { "epoch": 4.777423576390054, "grad_norm": 0.73046875, "learning_rate": 2.405635202486017e-07, "loss": 1.0011, "step": 8899 }, { "epoch": 4.77796047112513, "grad_norm": 0.6171875, "learning_rate": 2.3939618857227174e-07, "loss": 0.8734, "step": 8900 }, { "epoch": 4.77796047112513, "eval_loss": 0.9026810526847839, "eval_runtime": 46.4544, "eval_samples_per_second": 20.709, "eval_steps_per_second": 20.709, "step": 8900 }, { "epoch": 4.778497365860206, "grad_norm": 0.60546875, "learning_rate": 2.3823168239440752e-07, "loss": 0.8939, "step": 8901 }, { "epoch": 4.779034260595282, "grad_norm": 1.0390625, "learning_rate": 2.3707000184788887e-07, "loss": 0.9509, "step": 8902 }, { "epoch": 4.779571155330358, "grad_norm": 0.58984375, "learning_rate": 2.3591114706528195e-07, "loss": 0.7246, "step": 8903 }, { "epoch": 4.780108050065434, "grad_norm": 0.796875, "learning_rate": 2.3475511817882268e-07, "loss": 0.8044, "step": 8904 }, { "epoch": 4.78064494480051, "grad_norm": 0.55078125, "learning_rate": 2.336019153204305e-07, "loss": 0.646, "step": 8905 }, { "epoch": 4.781181839535586, "grad_norm": 0.6796875, "learning_rate": 2.3245153862170298e-07, "loss": 0.888, "step": 8906 }, { "epoch": 4.781718734270662, "grad_norm": 0.66796875, "learning_rate": 2.3130398821391007e-07, "loss": 0.9328, "step": 8907 }, { "epoch": 4.782255629005738, "grad_norm": 0.6328125, "learning_rate": 2.301592642280026e-07, "loss": 0.7437, "step": 8908 }, { "epoch": 4.782792523740814, "grad_norm": 0.70703125, "learning_rate": 2.290173667946066e-07, "loss": 0.8947, "step": 8909 }, { "epoch": 4.7833294184758905, "grad_norm": 0.78125, "learning_rate": 2.27878296044029e-07, "loss": 0.7926, "step": 8910 }, { "epoch": 4.783866313210966, "grad_norm": 0.62109375, "learning_rate": 2.267420521062491e-07, "loss": 0.919, "step": 8911 }, { "epoch": 4.784403207946042, "grad_norm": 0.68359375, "learning_rate": 2.2560863511093267e-07, "loss": 0.9469, "step": 8912 }, { "epoch": 4.7849401026811185, "grad_norm": 0.7578125, "learning_rate": 2.2447804518741232e-07, "loss": 0.8512, "step": 8913 }, { "epoch": 4.785476997416194, "grad_norm": 0.78515625, "learning_rate": 2.2335028246470159e-07, "loss": 0.8271, "step": 8914 }, { "epoch": 4.78601389215127, "grad_norm": 0.6953125, "learning_rate": 2.2222534707149745e-07, "loss": 0.9416, "step": 8915 }, { "epoch": 4.786550786886346, "grad_norm": 0.7578125, "learning_rate": 2.2110323913616392e-07, "loss": 0.8162, "step": 8916 }, { "epoch": 4.787087681621422, "grad_norm": 1.1875, "learning_rate": 2.199839587867486e-07, "loss": 0.9047, "step": 8917 }, { "epoch": 4.787624576356498, "grad_norm": 0.78125, "learning_rate": 2.1886750615097984e-07, "loss": 0.8363, "step": 8918 }, { "epoch": 4.788161471091574, "grad_norm": 0.67578125, "learning_rate": 2.1775388135625573e-07, "loss": 0.817, "step": 8919 }, { "epoch": 4.78869836582665, "grad_norm": 0.71484375, "learning_rate": 2.16643084529658e-07, "loss": 0.9826, "step": 8920 }, { "epoch": 4.7892352605617265, "grad_norm": 0.75, "learning_rate": 2.1553511579793518e-07, "loss": 0.8506, "step": 8921 }, { "epoch": 4.789772155296802, "grad_norm": 0.7578125, "learning_rate": 2.144299752875306e-07, "loss": 0.8597, "step": 8922 }, { "epoch": 4.790309050031878, "grad_norm": 0.8671875, "learning_rate": 2.1332766312454623e-07, "loss": 1.0634, "step": 8923 }, { "epoch": 4.7908459447669545, "grad_norm": 0.6484375, "learning_rate": 2.12228179434773e-07, "loss": 0.7643, "step": 8924 }, { "epoch": 4.79138283950203, "grad_norm": 0.61328125, "learning_rate": 2.1113152434367733e-07, "loss": 0.8133, "step": 8925 }, { "epoch": 4.791919734237106, "grad_norm": 0.6328125, "learning_rate": 2.1003769797640072e-07, "loss": 0.8601, "step": 8926 }, { "epoch": 4.792456628972182, "grad_norm": 0.83984375, "learning_rate": 2.0894670045776278e-07, "loss": 0.8927, "step": 8927 }, { "epoch": 4.792993523707258, "grad_norm": 0.72265625, "learning_rate": 2.0785853191225835e-07, "loss": 0.811, "step": 8928 }, { "epoch": 4.793530418442334, "grad_norm": 0.71484375, "learning_rate": 2.0677319246406312e-07, "loss": 0.7321, "step": 8929 }, { "epoch": 4.79406731317741, "grad_norm": 0.66015625, "learning_rate": 2.0569068223702803e-07, "loss": 0.7338, "step": 8930 }, { "epoch": 4.794604207912486, "grad_norm": 0.6875, "learning_rate": 2.0461100135467925e-07, "loss": 0.8936, "step": 8931 }, { "epoch": 4.7951411026475625, "grad_norm": 0.62109375, "learning_rate": 2.035341499402238e-07, "loss": 0.7176, "step": 8932 }, { "epoch": 4.795677997382638, "grad_norm": 0.671875, "learning_rate": 2.0246012811654391e-07, "loss": 0.9077, "step": 8933 }, { "epoch": 4.796214892117714, "grad_norm": 0.71484375, "learning_rate": 2.013889360061999e-07, "loss": 0.8842, "step": 8934 }, { "epoch": 4.7967517868527905, "grad_norm": 0.64453125, "learning_rate": 2.0032057373142454e-07, "loss": 0.943, "step": 8935 }, { "epoch": 4.797288681587866, "grad_norm": 0.640625, "learning_rate": 1.9925504141413697e-07, "loss": 0.7171, "step": 8936 }, { "epoch": 4.797825576322942, "grad_norm": 0.5703125, "learning_rate": 1.9819233917592606e-07, "loss": 0.8325, "step": 8937 }, { "epoch": 4.798362471058018, "grad_norm": 0.73046875, "learning_rate": 1.9713246713805588e-07, "loss": 0.7394, "step": 8938 }, { "epoch": 4.798899365793094, "grad_norm": 0.64453125, "learning_rate": 1.9607542542147416e-07, "loss": 0.957, "step": 8939 }, { "epoch": 4.79943626052817, "grad_norm": 0.77734375, "learning_rate": 1.9502121414680385e-07, "loss": 0.9808, "step": 8940 }, { "epoch": 4.799973155263246, "grad_norm": 0.9921875, "learning_rate": 1.9396983343434315e-07, "loss": 0.9053, "step": 8941 }, { "epoch": 4.800510049998322, "grad_norm": 0.8359375, "learning_rate": 1.9292128340406556e-07, "loss": 1.0395, "step": 8942 }, { "epoch": 4.8010469447333985, "grad_norm": 0.65625, "learning_rate": 1.9187556417562812e-07, "loss": 0.7515, "step": 8943 }, { "epoch": 4.801583839468474, "grad_norm": 0.734375, "learning_rate": 1.9083267586835484e-07, "loss": 0.9611, "step": 8944 }, { "epoch": 4.80212073420355, "grad_norm": 0.65625, "learning_rate": 1.8979261860125885e-07, "loss": 0.7925, "step": 8945 }, { "epoch": 4.8026576289386265, "grad_norm": 0.78125, "learning_rate": 1.887553924930202e-07, "loss": 0.9188, "step": 8946 }, { "epoch": 4.803194523673702, "grad_norm": 0.75390625, "learning_rate": 1.877209976619998e-07, "loss": 0.8435, "step": 8947 }, { "epoch": 4.803731418408779, "grad_norm": 0.734375, "learning_rate": 1.8668943422623652e-07, "loss": 1.0408, "step": 8948 }, { "epoch": 4.804268313143854, "grad_norm": 0.65625, "learning_rate": 1.8566070230344179e-07, "loss": 0.8568, "step": 8949 }, { "epoch": 4.80480520787893, "grad_norm": 0.671875, "learning_rate": 1.8463480201101334e-07, "loss": 0.8391, "step": 8950 }, { "epoch": 4.80480520787893, "eval_loss": 0.9026857018470764, "eval_runtime": 46.5931, "eval_samples_per_second": 20.647, "eval_steps_per_second": 20.647, "step": 8950 }, { "epoch": 4.805342102614007, "grad_norm": 0.60546875, "learning_rate": 1.8361173346601034e-07, "loss": 0.6363, "step": 8951 }, { "epoch": 4.805878997349082, "grad_norm": 0.6796875, "learning_rate": 1.8259149678518662e-07, "loss": 0.6233, "step": 8952 }, { "epoch": 4.806415892084158, "grad_norm": 0.74609375, "learning_rate": 1.8157409208495735e-07, "loss": 0.7178, "step": 8953 }, { "epoch": 4.8069527868192345, "grad_norm": 0.7265625, "learning_rate": 1.805595194814269e-07, "loss": 0.8452, "step": 8954 }, { "epoch": 4.80748968155431, "grad_norm": 0.6875, "learning_rate": 1.7954777909036656e-07, "loss": 0.9099, "step": 8955 }, { "epoch": 4.808026576289386, "grad_norm": 0.71484375, "learning_rate": 1.7853887102723123e-07, "loss": 0.8106, "step": 8956 }, { "epoch": 4.8085634710244625, "grad_norm": 0.62890625, "learning_rate": 1.7753279540714817e-07, "loss": 0.7842, "step": 8957 }, { "epoch": 4.809100365759538, "grad_norm": 0.88671875, "learning_rate": 1.7652955234492285e-07, "loss": 1.0033, "step": 8958 }, { "epoch": 4.809637260494615, "grad_norm": 0.6796875, "learning_rate": 1.7552914195504145e-07, "loss": 0.7133, "step": 8959 }, { "epoch": 4.81017415522969, "grad_norm": 0.69921875, "learning_rate": 1.7453156435165986e-07, "loss": 0.8815, "step": 8960 }, { "epoch": 4.810711049964766, "grad_norm": 0.58984375, "learning_rate": 1.735368196486148e-07, "loss": 0.7701, "step": 8961 }, { "epoch": 4.811247944699843, "grad_norm": 0.6875, "learning_rate": 1.7254490795942378e-07, "loss": 0.8473, "step": 8962 }, { "epoch": 4.811784839434918, "grad_norm": 0.73046875, "learning_rate": 1.7155582939727134e-07, "loss": 0.7896, "step": 8963 }, { "epoch": 4.812321734169994, "grad_norm": 0.66015625, "learning_rate": 1.7056958407502544e-07, "loss": 0.8496, "step": 8964 }, { "epoch": 4.8128586289050705, "grad_norm": 0.6875, "learning_rate": 1.6958617210522943e-07, "loss": 0.6718, "step": 8965 }, { "epoch": 4.813395523640146, "grad_norm": 0.59375, "learning_rate": 1.6860559360010463e-07, "loss": 0.8748, "step": 8966 }, { "epoch": 4.813932418375222, "grad_norm": 0.78125, "learning_rate": 1.6762784867154212e-07, "loss": 0.8733, "step": 8967 }, { "epoch": 4.8144693131102985, "grad_norm": 0.62890625, "learning_rate": 1.666529374311221e-07, "loss": 0.8356, "step": 8968 }, { "epoch": 4.815006207845374, "grad_norm": 0.65234375, "learning_rate": 1.6568085999008888e-07, "loss": 0.6574, "step": 8969 }, { "epoch": 4.815543102580451, "grad_norm": 0.66796875, "learning_rate": 1.6471161645937038e-07, "loss": 0.76, "step": 8970 }, { "epoch": 4.816079997315526, "grad_norm": 0.64453125, "learning_rate": 1.6374520694957263e-07, "loss": 0.7994, "step": 8971 }, { "epoch": 4.816616892050602, "grad_norm": 0.7734375, "learning_rate": 1.6278163157097127e-07, "loss": 0.9175, "step": 8972 }, { "epoch": 4.817153786785679, "grad_norm": 0.7890625, "learning_rate": 1.6182089043352277e-07, "loss": 0.7615, "step": 8973 }, { "epoch": 4.817690681520754, "grad_norm": 0.6953125, "learning_rate": 1.608629836468617e-07, "loss": 0.8836, "step": 8974 }, { "epoch": 4.81822757625583, "grad_norm": 0.68359375, "learning_rate": 1.5990791132029503e-07, "loss": 0.7681, "step": 8975 }, { "epoch": 4.8187644709909065, "grad_norm": 0.94140625, "learning_rate": 1.589556735628106e-07, "loss": 0.9287, "step": 8976 }, { "epoch": 4.819301365725982, "grad_norm": 0.68359375, "learning_rate": 1.5800627048307148e-07, "loss": 0.9093, "step": 8977 }, { "epoch": 4.819838260461058, "grad_norm": 0.89453125, "learning_rate": 1.5705970218941323e-07, "loss": 0.9605, "step": 8978 }, { "epoch": 4.8203751551961345, "grad_norm": 0.6015625, "learning_rate": 1.56115968789855e-07, "loss": 0.812, "step": 8979 }, { "epoch": 4.82091204993121, "grad_norm": 0.5859375, "learning_rate": 1.5517507039208568e-07, "loss": 0.6377, "step": 8980 }, { "epoch": 4.821448944666287, "grad_norm": 0.6796875, "learning_rate": 1.5423700710347765e-07, "loss": 0.6437, "step": 8981 }, { "epoch": 4.821985839401362, "grad_norm": 0.7421875, "learning_rate": 1.5330177903107036e-07, "loss": 0.8401, "step": 8982 }, { "epoch": 4.822522734136438, "grad_norm": 0.78125, "learning_rate": 1.523693862815895e-07, "loss": 0.9717, "step": 8983 }, { "epoch": 4.823059628871515, "grad_norm": 0.80859375, "learning_rate": 1.514398289614305e-07, "loss": 0.8353, "step": 8984 }, { "epoch": 4.82359652360659, "grad_norm": 0.671875, "learning_rate": 1.5051310717666967e-07, "loss": 0.8644, "step": 8985 }, { "epoch": 4.824133418341667, "grad_norm": 0.63671875, "learning_rate": 1.4958922103305574e-07, "loss": 0.8548, "step": 8986 }, { "epoch": 4.8246703130767425, "grad_norm": 0.75390625, "learning_rate": 1.4866817063601547e-07, "loss": 0.7597, "step": 8987 }, { "epoch": 4.825207207811818, "grad_norm": 0.73828125, "learning_rate": 1.4774995609065646e-07, "loss": 1.0039, "step": 8988 }, { "epoch": 4.825744102546895, "grad_norm": 0.70703125, "learning_rate": 1.4683457750175323e-07, "loss": 0.7785, "step": 8989 }, { "epoch": 4.8262809972819705, "grad_norm": 0.7578125, "learning_rate": 1.4592203497376667e-07, "loss": 0.9638, "step": 8990 }, { "epoch": 4.826817892017046, "grad_norm": 0.65234375, "learning_rate": 1.4501232861082735e-07, "loss": 0.708, "step": 8991 }, { "epoch": 4.827354786752123, "grad_norm": 0.76953125, "learning_rate": 1.4410545851674118e-07, "loss": 0.9362, "step": 8992 }, { "epoch": 4.827891681487198, "grad_norm": 0.6953125, "learning_rate": 1.432014247950003e-07, "loss": 0.7802, "step": 8993 }, { "epoch": 4.828428576222274, "grad_norm": 0.71484375, "learning_rate": 1.4230022754876115e-07, "loss": 0.8584, "step": 8994 }, { "epoch": 4.828965470957351, "grad_norm": 0.74609375, "learning_rate": 1.4140186688086366e-07, "loss": 0.7625, "step": 8995 }, { "epoch": 4.829502365692426, "grad_norm": 0.796875, "learning_rate": 1.4050634289382302e-07, "loss": 0.8922, "step": 8996 }, { "epoch": 4.830039260427503, "grad_norm": 0.6640625, "learning_rate": 1.3961365568982698e-07, "loss": 0.7938, "step": 8997 }, { "epoch": 4.8305761551625785, "grad_norm": 0.68359375, "learning_rate": 1.3872380537074404e-07, "loss": 0.8929, "step": 8998 }, { "epoch": 4.831113049897654, "grad_norm": 0.64453125, "learning_rate": 1.3783679203811794e-07, "loss": 0.7465, "step": 8999 }, { "epoch": 4.831649944632731, "grad_norm": 0.64453125, "learning_rate": 1.3695261579316777e-07, "loss": 0.7243, "step": 9000 }, { "epoch": 4.831649944632731, "eval_loss": 0.9027743935585022, "eval_runtime": 46.2143, "eval_samples_per_second": 20.816, "eval_steps_per_second": 20.816, "step": 9000 }, { "epoch": 4.8321868393678065, "grad_norm": 0.7265625, "learning_rate": 1.3607127673678777e-07, "loss": 0.936, "step": 9001 }, { "epoch": 4.832723734102882, "grad_norm": 0.6484375, "learning_rate": 1.3519277496955307e-07, "loss": 0.8228, "step": 9002 }, { "epoch": 4.833260628837959, "grad_norm": 0.7890625, "learning_rate": 1.343171105917085e-07, "loss": 0.9724, "step": 9003 }, { "epoch": 4.833797523573034, "grad_norm": 0.55859375, "learning_rate": 1.334442837031824e-07, "loss": 0.5936, "step": 9004 }, { "epoch": 4.83433441830811, "grad_norm": 0.71484375, "learning_rate": 1.3257429440357016e-07, "loss": 0.7406, "step": 9005 }, { "epoch": 4.834871313043187, "grad_norm": 0.74609375, "learning_rate": 1.3170714279215346e-07, "loss": 0.7096, "step": 9006 }, { "epoch": 4.835408207778262, "grad_norm": 0.6953125, "learning_rate": 1.3084282896788368e-07, "loss": 0.814, "step": 9007 }, { "epoch": 4.835945102513339, "grad_norm": 0.6484375, "learning_rate": 1.2998135302938753e-07, "loss": 0.8512, "step": 9008 }, { "epoch": 4.8364819972484145, "grad_norm": 1.0546875, "learning_rate": 1.2912271507497243e-07, "loss": 0.9884, "step": 9009 }, { "epoch": 4.83701889198349, "grad_norm": 0.80859375, "learning_rate": 1.2826691520262114e-07, "loss": 0.9556, "step": 9010 }, { "epoch": 4.837555786718567, "grad_norm": 0.68359375, "learning_rate": 1.2741395350998887e-07, "loss": 0.8672, "step": 9011 }, { "epoch": 4.8380926814536425, "grad_norm": 0.63671875, "learning_rate": 1.2656383009441163e-07, "loss": 0.6835, "step": 9012 }, { "epoch": 4.838629576188718, "grad_norm": 0.72265625, "learning_rate": 1.2571654505289798e-07, "loss": 0.8848, "step": 9013 }, { "epoch": 4.839166470923795, "grad_norm": 0.63671875, "learning_rate": 1.2487209848213165e-07, "loss": 0.8183, "step": 9014 }, { "epoch": 4.83970336565887, "grad_norm": 0.6640625, "learning_rate": 1.2403049047847725e-07, "loss": 0.9652, "step": 9015 }, { "epoch": 4.840240260393946, "grad_norm": 0.828125, "learning_rate": 1.231917211379746e-07, "loss": 1.0092, "step": 9016 }, { "epoch": 4.840777155129023, "grad_norm": 0.65234375, "learning_rate": 1.2235579055633327e-07, "loss": 0.7694, "step": 9017 }, { "epoch": 4.841314049864098, "grad_norm": 0.69921875, "learning_rate": 1.2152269882894918e-07, "loss": 0.7765, "step": 9018 }, { "epoch": 4.841850944599175, "grad_norm": 0.73046875, "learning_rate": 1.2069244605088236e-07, "loss": 0.8609, "step": 9019 }, { "epoch": 4.8423878393342505, "grad_norm": 0.66015625, "learning_rate": 1.1986503231688207e-07, "loss": 0.8848, "step": 9020 }, { "epoch": 4.842924734069326, "grad_norm": 0.6953125, "learning_rate": 1.1904045772135886e-07, "loss": 0.6962, "step": 9021 }, { "epoch": 4.843461628804403, "grad_norm": 0.7578125, "learning_rate": 1.1821872235841525e-07, "loss": 0.9984, "step": 9022 }, { "epoch": 4.8439985235394785, "grad_norm": 0.7265625, "learning_rate": 1.1739982632181512e-07, "loss": 0.8243, "step": 9023 }, { "epoch": 4.844535418274555, "grad_norm": 0.55078125, "learning_rate": 1.1658376970500873e-07, "loss": 0.6503, "step": 9024 }, { "epoch": 4.845072313009631, "grad_norm": 0.73046875, "learning_rate": 1.1577055260111602e-07, "loss": 0.6767, "step": 9025 }, { "epoch": 4.845609207744706, "grad_norm": 0.64453125, "learning_rate": 1.1496017510293777e-07, "loss": 0.7621, "step": 9026 }, { "epoch": 4.846146102479782, "grad_norm": 0.609375, "learning_rate": 1.1415263730294445e-07, "loss": 0.7268, "step": 9027 }, { "epoch": 4.846682997214859, "grad_norm": 0.58984375, "learning_rate": 1.1334793929329291e-07, "loss": 0.7942, "step": 9028 }, { "epoch": 4.847219891949934, "grad_norm": 0.73828125, "learning_rate": 1.1254608116580412e-07, "loss": 0.7029, "step": 9029 }, { "epoch": 4.847756786685011, "grad_norm": 0.7109375, "learning_rate": 1.117470630119799e-07, "loss": 0.785, "step": 9030 }, { "epoch": 4.8482936814200865, "grad_norm": 0.78515625, "learning_rate": 1.109508849230001e-07, "loss": 0.8514, "step": 9031 }, { "epoch": 4.848830576155162, "grad_norm": 0.62890625, "learning_rate": 1.1015754698971981e-07, "loss": 0.7724, "step": 9032 }, { "epoch": 4.849367470890239, "grad_norm": 0.6640625, "learning_rate": 1.0936704930266661e-07, "loss": 0.8121, "step": 9033 }, { "epoch": 4.8499043656253145, "grad_norm": 0.7109375, "learning_rate": 1.0857939195204891e-07, "loss": 0.7787, "step": 9034 }, { "epoch": 4.850441260360391, "grad_norm": 0.6484375, "learning_rate": 1.0779457502774482e-07, "loss": 0.755, "step": 9035 }, { "epoch": 4.850978155095467, "grad_norm": 0.6875, "learning_rate": 1.07012598619316e-07, "loss": 0.9678, "step": 9036 }, { "epoch": 4.851515049830542, "grad_norm": 0.71484375, "learning_rate": 1.0623346281599389e-07, "loss": 0.6666, "step": 9037 }, { "epoch": 4.852051944565619, "grad_norm": 0.69921875, "learning_rate": 1.054571677066879e-07, "loss": 0.8457, "step": 9038 }, { "epoch": 4.852588839300695, "grad_norm": 0.93359375, "learning_rate": 1.0468371337998273e-07, "loss": 0.8623, "step": 9039 }, { "epoch": 4.85312573403577, "grad_norm": 0.703125, "learning_rate": 1.0391309992413833e-07, "loss": 1.0475, "step": 9040 }, { "epoch": 4.853662628770847, "grad_norm": 0.6640625, "learning_rate": 1.0314532742709271e-07, "loss": 0.8704, "step": 9041 }, { "epoch": 4.8541995235059225, "grad_norm": 0.6640625, "learning_rate": 1.023803959764591e-07, "loss": 0.7975, "step": 9042 }, { "epoch": 4.854736418240998, "grad_norm": 0.76171875, "learning_rate": 1.0161830565952602e-07, "loss": 0.8559, "step": 9043 }, { "epoch": 4.855273312976075, "grad_norm": 0.671875, "learning_rate": 1.0085905656325445e-07, "loss": 0.743, "step": 9044 }, { "epoch": 4.8558102077111505, "grad_norm": 0.546875, "learning_rate": 1.0010264877428899e-07, "loss": 1.0218, "step": 9045 }, { "epoch": 4.856347102446227, "grad_norm": 0.6015625, "learning_rate": 9.934908237894114e-08, "loss": 0.8144, "step": 9046 }, { "epoch": 4.856883997181303, "grad_norm": 0.6953125, "learning_rate": 9.859835746320601e-08, "loss": 0.8618, "step": 9047 }, { "epoch": 4.857420891916378, "grad_norm": 0.72265625, "learning_rate": 9.785047411274839e-08, "loss": 0.9914, "step": 9048 }, { "epoch": 4.857957786651455, "grad_norm": 0.609375, "learning_rate": 9.710543241291115e-08, "loss": 0.7195, "step": 9049 }, { "epoch": 4.858494681386531, "grad_norm": 0.67578125, "learning_rate": 9.636323244871514e-08, "loss": 0.6905, "step": 9050 }, { "epoch": 4.858494681386531, "eval_loss": 0.9026276469230652, "eval_runtime": 45.9643, "eval_samples_per_second": 20.929, "eval_steps_per_second": 20.929, "step": 9050 }, { "epoch": 4.859031576121606, "grad_norm": 0.671875, "learning_rate": 9.562387430485099e-08, "loss": 0.8008, "step": 9051 }, { "epoch": 4.859568470856683, "grad_norm": 0.671875, "learning_rate": 9.488735806569283e-08, "loss": 1.2051, "step": 9052 }, { "epoch": 4.8601053655917585, "grad_norm": 0.6796875, "learning_rate": 9.415368381528455e-08, "loss": 0.7499, "step": 9053 }, { "epoch": 4.860642260326834, "grad_norm": 0.703125, "learning_rate": 9.342285163734809e-08, "loss": 0.8332, "step": 9054 }, { "epoch": 4.861179155061911, "grad_norm": 0.68359375, "learning_rate": 9.269486161528063e-08, "loss": 0.8453, "step": 9055 }, { "epoch": 4.8617160497969865, "grad_norm": 0.6953125, "learning_rate": 9.196971383215458e-08, "loss": 0.8733, "step": 9056 }, { "epoch": 4.862252944532063, "grad_norm": 0.78515625, "learning_rate": 9.124740837072043e-08, "loss": 0.9948, "step": 9057 }, { "epoch": 4.862789839267139, "grad_norm": 0.80078125, "learning_rate": 9.052794531339837e-08, "loss": 0.8759, "step": 9058 }, { "epoch": 4.863326734002214, "grad_norm": 0.67578125, "learning_rate": 8.981132474229214e-08, "loss": 0.9871, "step": 9059 }, { "epoch": 4.863863628737291, "grad_norm": 0.7421875, "learning_rate": 8.909754673917525e-08, "loss": 1.0342, "step": 9060 }, { "epoch": 4.864400523472367, "grad_norm": 0.72265625, "learning_rate": 8.83866113854992e-08, "loss": 0.7909, "step": 9061 }, { "epoch": 4.864937418207443, "grad_norm": 0.765625, "learning_rate": 8.767851876239074e-08, "loss": 1.0984, "step": 9062 }, { "epoch": 4.865474312942519, "grad_norm": 0.73046875, "learning_rate": 8.697326895065472e-08, "loss": 0.8747, "step": 9063 }, { "epoch": 4.8660112076775945, "grad_norm": 0.60546875, "learning_rate": 8.627086203076284e-08, "loss": 0.8663, "step": 9064 }, { "epoch": 4.86654810241267, "grad_norm": 0.71875, "learning_rate": 8.557129808287601e-08, "loss": 1.0312, "step": 9065 }, { "epoch": 4.867084997147747, "grad_norm": 0.65625, "learning_rate": 8.487457718681924e-08, "loss": 0.7214, "step": 9066 }, { "epoch": 4.8676218918828225, "grad_norm": 0.69140625, "learning_rate": 8.418069942209561e-08, "loss": 0.8832, "step": 9067 }, { "epoch": 4.868158786617899, "grad_norm": 0.61328125, "learning_rate": 8.348966486788622e-08, "loss": 0.7356, "step": 9068 }, { "epoch": 4.868695681352975, "grad_norm": 0.76171875, "learning_rate": 8.280147360305302e-08, "loss": 0.8824, "step": 9069 }, { "epoch": 4.86923257608805, "grad_norm": 0.64453125, "learning_rate": 8.211612570611926e-08, "loss": 0.9492, "step": 9070 }, { "epoch": 4.869769470823127, "grad_norm": 0.65234375, "learning_rate": 8.143362125529463e-08, "loss": 0.7919, "step": 9071 }, { "epoch": 4.870306365558203, "grad_norm": 0.76953125, "learning_rate": 8.075396032846405e-08, "loss": 0.8273, "step": 9072 }, { "epoch": 4.870843260293279, "grad_norm": 0.7578125, "learning_rate": 8.007714300318214e-08, "loss": 0.7849, "step": 9073 }, { "epoch": 4.871380155028355, "grad_norm": 0.734375, "learning_rate": 7.940316935668435e-08, "loss": 0.8282, "step": 9074 }, { "epoch": 4.8719170497634305, "grad_norm": 0.69921875, "learning_rate": 7.873203946587581e-08, "loss": 0.7447, "step": 9075 }, { "epoch": 4.872453944498507, "grad_norm": 0.6953125, "learning_rate": 7.806375340734806e-08, "loss": 0.8362, "step": 9076 }, { "epoch": 4.872990839233583, "grad_norm": 0.77734375, "learning_rate": 7.739831125735397e-08, "loss": 1.0521, "step": 9077 }, { "epoch": 4.8735277339686585, "grad_norm": 0.87109375, "learning_rate": 7.673571309183281e-08, "loss": 0.8534, "step": 9078 }, { "epoch": 4.874064628703735, "grad_norm": 0.796875, "learning_rate": 7.607595898639352e-08, "loss": 0.8739, "step": 9079 }, { "epoch": 4.874601523438811, "grad_norm": 0.69140625, "learning_rate": 7.54190490163259e-08, "loss": 0.8838, "step": 9080 }, { "epoch": 4.875138418173886, "grad_norm": 0.55859375, "learning_rate": 7.476498325658665e-08, "loss": 0.6999, "step": 9081 }, { "epoch": 4.875675312908963, "grad_norm": 0.70703125, "learning_rate": 7.411376178181606e-08, "loss": 0.8838, "step": 9082 }, { "epoch": 4.876212207644039, "grad_norm": 0.94921875, "learning_rate": 7.346538466632414e-08, "loss": 0.9781, "step": 9083 }, { "epoch": 4.876749102379115, "grad_norm": 0.7109375, "learning_rate": 7.281985198410169e-08, "loss": 0.8947, "step": 9084 }, { "epoch": 4.877285997114191, "grad_norm": 0.65234375, "learning_rate": 7.217716380881479e-08, "loss": 0.8, "step": 9085 }, { "epoch": 4.8778228918492665, "grad_norm": 0.7265625, "learning_rate": 7.153732021379367e-08, "loss": 0.9429, "step": 9086 }, { "epoch": 4.878359786584343, "grad_norm": 0.62890625, "learning_rate": 7.090032127206048e-08, "loss": 0.7188, "step": 9087 }, { "epoch": 4.878896681319419, "grad_norm": 0.8203125, "learning_rate": 7.026616705630429e-08, "loss": 0.8974, "step": 9088 }, { "epoch": 4.8794335760544945, "grad_norm": 0.76171875, "learning_rate": 6.963485763888389e-08, "loss": 0.9295, "step": 9089 }, { "epoch": 4.879970470789571, "grad_norm": 0.94140625, "learning_rate": 6.90063930918472e-08, "loss": 1.2854, "step": 9090 }, { "epoch": 4.880507365524647, "grad_norm": 0.69140625, "learning_rate": 6.838077348690353e-08, "loss": 0.8567, "step": 9091 }, { "epoch": 4.881044260259722, "grad_norm": 0.7421875, "learning_rate": 6.775799889544854e-08, "loss": 0.8553, "step": 9092 }, { "epoch": 4.881581154994799, "grad_norm": 0.71875, "learning_rate": 6.71380693885476e-08, "loss": 0.8453, "step": 9093 }, { "epoch": 4.882118049729875, "grad_norm": 0.76171875, "learning_rate": 6.652098503694137e-08, "loss": 0.7966, "step": 9094 }, { "epoch": 4.882654944464951, "grad_norm": 0.5859375, "learning_rate": 6.59067459110485e-08, "loss": 0.7641, "step": 9095 }, { "epoch": 4.883191839200027, "grad_norm": 0.703125, "learning_rate": 6.529535208096016e-08, "loss": 1.0088, "step": 9096 }, { "epoch": 4.8837287339351025, "grad_norm": 0.76171875, "learning_rate": 6.468680361644275e-08, "loss": 0.7856, "step": 9097 }, { "epoch": 4.884265628670179, "grad_norm": 0.67578125, "learning_rate": 6.40811005869435e-08, "loss": 0.7847, "step": 9098 }, { "epoch": 4.884802523405255, "grad_norm": 0.66015625, "learning_rate": 6.347824306157935e-08, "loss": 0.6684, "step": 9099 }, { "epoch": 4.885339418140331, "grad_norm": 0.6796875, "learning_rate": 6.287823110914248e-08, "loss": 0.8787, "step": 9100 }, { "epoch": 4.885339418140331, "eval_loss": 0.90264892578125, "eval_runtime": 46.047, "eval_samples_per_second": 20.892, "eval_steps_per_second": 20.892, "step": 9100 }, { "epoch": 4.885876312875407, "grad_norm": 0.96875, "learning_rate": 6.228106479810036e-08, "loss": 1.0519, "step": 9101 }, { "epoch": 4.886413207610483, "grad_norm": 0.671875, "learning_rate": 6.168674419660126e-08, "loss": 0.9016, "step": 9102 }, { "epoch": 4.886950102345558, "grad_norm": 0.59375, "learning_rate": 6.109526937246313e-08, "loss": 0.6995, "step": 9103 }, { "epoch": 4.887486997080635, "grad_norm": 0.67578125, "learning_rate": 6.050664039317921e-08, "loss": 0.794, "step": 9104 }, { "epoch": 4.888023891815711, "grad_norm": 0.69140625, "learning_rate": 5.992085732592079e-08, "loss": 0.8779, "step": 9105 }, { "epoch": 4.888560786550787, "grad_norm": 0.70703125, "learning_rate": 5.9337920237534375e-08, "loss": 0.9777, "step": 9106 }, { "epoch": 4.889097681285863, "grad_norm": 0.75390625, "learning_rate": 5.875782919453898e-08, "loss": 0.7092, "step": 9107 }, { "epoch": 4.8896345760209385, "grad_norm": 0.70703125, "learning_rate": 5.818058426312889e-08, "loss": 0.769, "step": 9108 }, { "epoch": 4.890171470756015, "grad_norm": 0.78125, "learning_rate": 5.760618550917918e-08, "loss": 0.8875, "step": 9109 }, { "epoch": 4.890708365491091, "grad_norm": 0.68359375, "learning_rate": 5.7034632998231865e-08, "loss": 0.9944, "step": 9110 }, { "epoch": 4.891245260226167, "grad_norm": 0.75, "learning_rate": 5.646592679550977e-08, "loss": 0.8484, "step": 9111 }, { "epoch": 4.891782154961243, "grad_norm": 0.765625, "learning_rate": 5.590006696591099e-08, "loss": 0.8101, "step": 9112 }, { "epoch": 4.892319049696319, "grad_norm": 0.703125, "learning_rate": 5.533705357400609e-08, "loss": 0.9721, "step": 9113 }, { "epoch": 4.892855944431395, "grad_norm": 0.609375, "learning_rate": 5.477688668404091e-08, "loss": 0.8035, "step": 9114 }, { "epoch": 4.893392839166471, "grad_norm": 0.69140625, "learning_rate": 5.42195663599393e-08, "loss": 0.9419, "step": 9115 }, { "epoch": 4.893929733901547, "grad_norm": 0.69921875, "learning_rate": 5.366509266529762e-08, "loss": 0.8662, "step": 9116 }, { "epoch": 4.894466628636623, "grad_norm": 0.703125, "learning_rate": 5.311346566339026e-08, "loss": 0.8157, "step": 9117 }, { "epoch": 4.895003523371699, "grad_norm": 0.75, "learning_rate": 5.256468541716131e-08, "loss": 0.8522, "step": 9118 }, { "epoch": 4.8955404181067745, "grad_norm": 0.66796875, "learning_rate": 5.201875198923567e-08, "loss": 0.8041, "step": 9119 }, { "epoch": 4.896077312841851, "grad_norm": 0.6328125, "learning_rate": 5.1475665441910733e-08, "loss": 0.7414, "step": 9120 }, { "epoch": 4.896614207576927, "grad_norm": 0.625, "learning_rate": 5.0935425837159156e-08, "loss": 0.8075, "step": 9121 }, { "epoch": 4.897151102312003, "grad_norm": 0.734375, "learning_rate": 5.039803323663162e-08, "loss": 0.8519, "step": 9122 }, { "epoch": 4.897687997047079, "grad_norm": 0.67578125, "learning_rate": 4.986348770165128e-08, "loss": 0.8335, "step": 9123 }, { "epoch": 4.898224891782155, "grad_norm": 0.6484375, "learning_rate": 4.9331789293211026e-08, "loss": 0.679, "step": 9124 }, { "epoch": 4.898761786517231, "grad_norm": 0.68359375, "learning_rate": 4.88029380719901e-08, "loss": 0.7697, "step": 9125 }, { "epoch": 4.899298681252307, "grad_norm": 0.7421875, "learning_rate": 4.827693409833467e-08, "loss": 0.7572, "step": 9126 }, { "epoch": 4.899835575987383, "grad_norm": 0.66796875, "learning_rate": 4.775377743226894e-08, "loss": 0.8181, "step": 9127 }, { "epoch": 4.900372470722459, "grad_norm": 0.765625, "learning_rate": 4.7233468133492386e-08, "loss": 0.8985, "step": 9128 }, { "epoch": 4.900909365457535, "grad_norm": 0.6484375, "learning_rate": 4.671600626137973e-08, "loss": 0.7619, "step": 9129 }, { "epoch": 4.9014462601926105, "grad_norm": 0.703125, "learning_rate": 4.6201391874978186e-08, "loss": 0.8286, "step": 9130 }, { "epoch": 4.901983154927687, "grad_norm": 0.78125, "learning_rate": 4.568962503301022e-08, "loss": 0.9733, "step": 9131 }, { "epoch": 4.902520049662763, "grad_norm": 0.5859375, "learning_rate": 4.5180705793876344e-08, "loss": 0.7693, "step": 9132 }, { "epoch": 4.903056944397839, "grad_norm": 0.6015625, "learning_rate": 4.4674634215655095e-08, "loss": 0.8485, "step": 9133 }, { "epoch": 4.903593839132915, "grad_norm": 0.6484375, "learning_rate": 4.417141035608641e-08, "loss": 0.7162, "step": 9134 }, { "epoch": 4.904130733867991, "grad_norm": 0.81640625, "learning_rate": 4.367103427260211e-08, "loss": 0.9069, "step": 9135 }, { "epoch": 4.904667628603067, "grad_norm": 0.58203125, "learning_rate": 4.317350602230097e-08, "loss": 0.7779, "step": 9136 }, { "epoch": 4.905204523338143, "grad_norm": 0.82421875, "learning_rate": 4.2678825661954245e-08, "loss": 0.9377, "step": 9137 }, { "epoch": 4.9057414180732195, "grad_norm": 0.69921875, "learning_rate": 4.218699324801123e-08, "loss": 0.9546, "step": 9138 }, { "epoch": 4.906278312808295, "grad_norm": 0.75, "learning_rate": 4.1698008836599245e-08, "loss": 0.9133, "step": 9139 }, { "epoch": 4.906815207543371, "grad_norm": 0.68359375, "learning_rate": 4.121187248351532e-08, "loss": 0.837, "step": 9140 }, { "epoch": 4.9073521022784465, "grad_norm": 0.6015625, "learning_rate": 4.072858424423454e-08, "loss": 0.7976, "step": 9141 }, { "epoch": 4.907888997013523, "grad_norm": 0.7109375, "learning_rate": 4.024814417390443e-08, "loss": 0.8258, "step": 9142 }, { "epoch": 4.908425891748599, "grad_norm": 0.7578125, "learning_rate": 3.977055232735061e-08, "loss": 0.8559, "step": 9143 }, { "epoch": 4.908962786483675, "grad_norm": 1.0234375, "learning_rate": 3.9295808759073904e-08, "loss": 0.9074, "step": 9144 }, { "epoch": 4.909499681218751, "grad_norm": 0.8046875, "learning_rate": 3.8823913523247654e-08, "loss": 0.9378, "step": 9145 }, { "epoch": 4.910036575953827, "grad_norm": 0.63671875, "learning_rate": 3.835486667371768e-08, "loss": 0.8543, "step": 9146 }, { "epoch": 4.910573470688903, "grad_norm": 0.65625, "learning_rate": 3.788866826401338e-08, "loss": 0.7796, "step": 9147 }, { "epoch": 4.911110365423979, "grad_norm": 0.765625, "learning_rate": 3.7425318347331096e-08, "loss": 0.9348, "step": 9148 }, { "epoch": 4.9116472601590555, "grad_norm": 0.76953125, "learning_rate": 3.696481697654519e-08, "loss": 0.7457, "step": 9149 }, { "epoch": 4.912184154894131, "grad_norm": 0.7109375, "learning_rate": 3.65071642042053e-08, "loss": 0.9105, "step": 9150 }, { "epoch": 4.912184154894131, "eval_loss": 0.9025958776473999, "eval_runtime": 46.1145, "eval_samples_per_second": 20.861, "eval_steps_per_second": 20.861, "step": 9150 }, { "epoch": 4.912721049629207, "grad_norm": 0.66796875, "learning_rate": 3.605236008253632e-08, "loss": 0.8419, "step": 9151 }, { "epoch": 4.913257944364283, "grad_norm": 0.828125, "learning_rate": 3.5600404663435615e-08, "loss": 0.8061, "step": 9152 }, { "epoch": 4.913794839099359, "grad_norm": 0.6796875, "learning_rate": 3.515129799847583e-08, "loss": 0.8222, "step": 9153 }, { "epoch": 4.914331733834435, "grad_norm": 0.68359375, "learning_rate": 3.470504013890763e-08, "loss": 0.8127, "step": 9154 }, { "epoch": 4.914868628569511, "grad_norm": 0.6328125, "learning_rate": 3.426163113565417e-08, "loss": 0.8199, "step": 9155 }, { "epoch": 4.915405523304587, "grad_norm": 0.6484375, "learning_rate": 3.3821071039313865e-08, "loss": 0.7451, "step": 9156 }, { "epoch": 4.915942418039663, "grad_norm": 0.6171875, "learning_rate": 3.3383359900160396e-08, "loss": 0.8019, "step": 9157 }, { "epoch": 4.916479312774739, "grad_norm": 0.69921875, "learning_rate": 3.294849776814268e-08, "loss": 0.7966, "step": 9158 }, { "epoch": 4.917016207509815, "grad_norm": 0.76171875, "learning_rate": 3.2516484692882156e-08, "loss": 0.697, "step": 9159 }, { "epoch": 4.9175531022448915, "grad_norm": 0.609375, "learning_rate": 3.208732072368104e-08, "loss": 0.87, "step": 9160 }, { "epoch": 4.918089996979967, "grad_norm": 0.6953125, "learning_rate": 3.1661005909505716e-08, "loss": 0.8946, "step": 9161 }, { "epoch": 4.918626891715043, "grad_norm": 0.6796875, "learning_rate": 3.123754029901171e-08, "loss": 0.9249, "step": 9162 }, { "epoch": 4.919163786450119, "grad_norm": 0.5546875, "learning_rate": 3.081692394051594e-08, "loss": 0.7447, "step": 9163 }, { "epoch": 4.919700681185195, "grad_norm": 0.67578125, "learning_rate": 3.0399156882021665e-08, "loss": 0.6506, "step": 9164 }, { "epoch": 4.920237575920271, "grad_norm": 0.75390625, "learning_rate": 2.998423917119353e-08, "loss": 0.8776, "step": 9165 }, { "epoch": 4.920774470655347, "grad_norm": 0.7578125, "learning_rate": 2.9572170855385327e-08, "loss": 0.8856, "step": 9166 }, { "epoch": 4.921311365390423, "grad_norm": 0.6953125, "learning_rate": 2.916295198161778e-08, "loss": 0.9974, "step": 9167 }, { "epoch": 4.921848260125499, "grad_norm": 0.609375, "learning_rate": 2.8756582596586867e-08, "loss": 0.8075, "step": 9168 }, { "epoch": 4.922385154860575, "grad_norm": 0.73828125, "learning_rate": 2.8353062746663827e-08, "loss": 0.8311, "step": 9169 }, { "epoch": 4.922922049595651, "grad_norm": 0.70703125, "learning_rate": 2.7952392477900714e-08, "loss": 0.8493, "step": 9170 }, { "epoch": 4.9234589443307275, "grad_norm": 0.6796875, "learning_rate": 2.7554571836010957e-08, "loss": 0.9469, "step": 9171 }, { "epoch": 4.923995839065803, "grad_norm": 0.70703125, "learning_rate": 2.7159600866397127e-08, "loss": 0.9322, "step": 9172 }, { "epoch": 4.924532733800879, "grad_norm": 0.73046875, "learning_rate": 2.6767479614125957e-08, "loss": 0.9732, "step": 9173 }, { "epoch": 4.925069628535955, "grad_norm": 0.7109375, "learning_rate": 2.6378208123947757e-08, "loss": 0.8026, "step": 9174 }, { "epoch": 4.925606523271031, "grad_norm": 0.58203125, "learning_rate": 2.599178644028255e-08, "loss": 0.6899, "step": 9175 }, { "epoch": 4.926143418006108, "grad_norm": 0.796875, "learning_rate": 2.5608214607222847e-08, "loss": 0.7736, "step": 9176 }, { "epoch": 4.926680312741183, "grad_norm": 1.0625, "learning_rate": 2.5227492668541962e-08, "loss": 0.8652, "step": 9177 }, { "epoch": 4.927217207476259, "grad_norm": 0.59765625, "learning_rate": 2.4849620667682928e-08, "loss": 0.8428, "step": 9178 }, { "epoch": 4.927754102211335, "grad_norm": 0.7421875, "learning_rate": 2.44745986477668e-08, "loss": 0.7495, "step": 9179 }, { "epoch": 4.928290996946411, "grad_norm": 0.734375, "learning_rate": 2.4102426651589905e-08, "loss": 0.9329, "step": 9180 }, { "epoch": 4.928827891681487, "grad_norm": 0.6875, "learning_rate": 2.3733104721618273e-08, "loss": 0.8752, "step": 9181 }, { "epoch": 4.9293647864165635, "grad_norm": 0.6328125, "learning_rate": 2.3366632899998743e-08, "loss": 0.7079, "step": 9182 }, { "epoch": 4.929901681151639, "grad_norm": 0.6640625, "learning_rate": 2.3003011228550643e-08, "loss": 0.7176, "step": 9183 }, { "epoch": 4.930438575886715, "grad_norm": 0.62109375, "learning_rate": 2.2642239748765782e-08, "loss": 0.7696, "step": 9184 }, { "epoch": 4.930975470621791, "grad_norm": 0.77734375, "learning_rate": 2.228431850181678e-08, "loss": 0.887, "step": 9185 }, { "epoch": 4.931512365356867, "grad_norm": 0.65625, "learning_rate": 2.192924752854042e-08, "loss": 0.7757, "step": 9186 }, { "epoch": 4.932049260091944, "grad_norm": 0.64453125, "learning_rate": 2.157702686946261e-08, "loss": 0.9643, "step": 9187 }, { "epoch": 4.932586154827019, "grad_norm": 0.62109375, "learning_rate": 2.1227656564767883e-08, "loss": 0.891, "step": 9188 }, { "epoch": 4.933123049562095, "grad_norm": 0.83984375, "learning_rate": 2.0881136654332668e-08, "loss": 0.8697, "step": 9189 }, { "epoch": 4.933659944297172, "grad_norm": 0.71484375, "learning_rate": 2.0537467177692005e-08, "loss": 0.8539, "step": 9190 }, { "epoch": 4.934196839032247, "grad_norm": 0.70703125, "learning_rate": 2.0196648174064525e-08, "loss": 0.6448, "step": 9191 }, { "epoch": 4.934733733767323, "grad_norm": 0.72265625, "learning_rate": 1.9858679682346892e-08, "loss": 0.7918, "step": 9192 }, { "epoch": 4.9352706285023995, "grad_norm": 0.70703125, "learning_rate": 1.9523561741097153e-08, "loss": 0.7458, "step": 9193 }, { "epoch": 4.935807523237475, "grad_norm": 0.74609375, "learning_rate": 1.919129438856526e-08, "loss": 1.1106, "step": 9194 }, { "epoch": 4.936344417972551, "grad_norm": 0.8515625, "learning_rate": 1.886187766265979e-08, "loss": 0.8313, "step": 9195 }, { "epoch": 4.936881312707627, "grad_norm": 0.75, "learning_rate": 1.8535311600975656e-08, "loss": 1.0673, "step": 9196 }, { "epoch": 4.937418207442703, "grad_norm": 0.66015625, "learning_rate": 1.8211596240777508e-08, "loss": 0.8802, "step": 9197 }, { "epoch": 4.93795510217778, "grad_norm": 0.6875, "learning_rate": 1.789073161900523e-08, "loss": 0.8451, "step": 9198 }, { "epoch": 4.938491996912855, "grad_norm": 0.60546875, "learning_rate": 1.7572717772273982e-08, "loss": 0.699, "step": 9199 }, { "epoch": 4.939028891647931, "grad_norm": 0.79296875, "learning_rate": 1.7257554736871407e-08, "loss": 0.9295, "step": 9200 }, { "epoch": 4.939028891647931, "eval_loss": 0.9025484323501587, "eval_runtime": 46.1447, "eval_samples_per_second": 20.847, "eval_steps_per_second": 20.847, "step": 9200 }, { "epoch": 4.939565786383008, "grad_norm": 0.65625, "learning_rate": 1.6945242548763174e-08, "loss": 0.8636, "step": 9201 }, { "epoch": 4.940102681118083, "grad_norm": 0.65625, "learning_rate": 1.6635781243587446e-08, "loss": 0.7182, "step": 9202 }, { "epoch": 4.940639575853159, "grad_norm": 0.7265625, "learning_rate": 1.632917085665764e-08, "loss": 0.7932, "step": 9203 }, { "epoch": 4.9411764705882355, "grad_norm": 0.6875, "learning_rate": 1.6025411422962433e-08, "loss": 0.9285, "step": 9204 }, { "epoch": 4.941713365323311, "grad_norm": 0.5625, "learning_rate": 1.5724502977162993e-08, "loss": 0.6713, "step": 9205 }, { "epoch": 4.942250260058387, "grad_norm": 0.66796875, "learning_rate": 1.542644555359851e-08, "loss": 0.8203, "step": 9206 }, { "epoch": 4.942787154793463, "grad_norm": 0.76171875, "learning_rate": 1.5131239186280675e-08, "loss": 1.0477, "step": 9207 }, { "epoch": 4.943324049528539, "grad_norm": 0.6640625, "learning_rate": 1.4838883908896428e-08, "loss": 0.8604, "step": 9208 }, { "epoch": 4.943860944263616, "grad_norm": 0.75390625, "learning_rate": 1.4549379754807969e-08, "loss": 0.9156, "step": 9209 }, { "epoch": 4.944397838998691, "grad_norm": 0.7421875, "learning_rate": 1.4262726757049982e-08, "loss": 0.7788, "step": 9210 }, { "epoch": 4.944934733733767, "grad_norm": 0.796875, "learning_rate": 1.3978924948332416e-08, "loss": 0.8569, "step": 9211 }, { "epoch": 4.945471628468844, "grad_norm": 1.15625, "learning_rate": 1.3697974361043254e-08, "loss": 0.8195, "step": 9212 }, { "epoch": 4.946008523203919, "grad_norm": 0.671875, "learning_rate": 1.3419875027240181e-08, "loss": 0.6782, "step": 9213 }, { "epoch": 4.946545417938996, "grad_norm": 0.62890625, "learning_rate": 1.3144626978658925e-08, "loss": 0.9173, "step": 9214 }, { "epoch": 4.9470823126740715, "grad_norm": 0.640625, "learning_rate": 1.2872230246707695e-08, "loss": 0.8079, "step": 9215 }, { "epoch": 4.947619207409147, "grad_norm": 0.671875, "learning_rate": 1.2602684862472736e-08, "loss": 0.9495, "step": 9216 }, { "epoch": 4.948156102144223, "grad_norm": 0.55859375, "learning_rate": 1.233599085671e-08, "loss": 0.7799, "step": 9217 }, { "epoch": 4.948692996879299, "grad_norm": 0.6875, "learning_rate": 1.2072148259856253e-08, "loss": 0.7358, "step": 9218 }, { "epoch": 4.949229891614375, "grad_norm": 0.69140625, "learning_rate": 1.1811157102015192e-08, "loss": 0.9001, "step": 9219 }, { "epoch": 4.949766786349452, "grad_norm": 0.62890625, "learning_rate": 1.1553017412971323e-08, "loss": 0.7197, "step": 9220 }, { "epoch": 4.950303681084527, "grad_norm": 0.75390625, "learning_rate": 1.1297729222181642e-08, "loss": 0.8497, "step": 9221 }, { "epoch": 4.950840575819603, "grad_norm": 0.66015625, "learning_rate": 1.1045292558775622e-08, "loss": 0.7473, "step": 9222 }, { "epoch": 4.95137747055468, "grad_norm": 0.65234375, "learning_rate": 1.0795707451563553e-08, "loss": 0.8857, "step": 9223 }, { "epoch": 4.951914365289755, "grad_norm": 0.8671875, "learning_rate": 1.0548973929022655e-08, "loss": 0.9471, "step": 9224 }, { "epoch": 4.952451260024832, "grad_norm": 0.77734375, "learning_rate": 1.0305092019308182e-08, "loss": 0.8383, "step": 9225 }, { "epoch": 4.9529881547599075, "grad_norm": 0.78515625, "learning_rate": 1.0064061750253429e-08, "loss": 0.9087, "step": 9226 }, { "epoch": 4.953525049494983, "grad_norm": 0.6796875, "learning_rate": 9.825883149361392e-09, "loss": 0.7706, "step": 9227 }, { "epoch": 4.95406194423006, "grad_norm": 0.765625, "learning_rate": 9.590556243810333e-09, "loss": 0.8166, "step": 9228 }, { "epoch": 4.954598838965135, "grad_norm": 0.6484375, "learning_rate": 9.358081060456548e-09, "loss": 0.8657, "step": 9229 }, { "epoch": 4.955135733700211, "grad_norm": 0.640625, "learning_rate": 9.128457625823261e-09, "loss": 0.8099, "step": 9230 }, { "epoch": 4.955672628435288, "grad_norm": 0.61328125, "learning_rate": 8.901685966120065e-09, "loss": 0.7184, "step": 9231 }, { "epoch": 4.956209523170363, "grad_norm": 0.69140625, "learning_rate": 8.677766107217933e-09, "loss": 0.9066, "step": 9232 }, { "epoch": 4.956746417905439, "grad_norm": 0.76953125, "learning_rate": 8.456698074674197e-09, "loss": 0.9604, "step": 9233 }, { "epoch": 4.957283312640516, "grad_norm": 0.73828125, "learning_rate": 8.238481893713123e-09, "loss": 0.9137, "step": 9234 }, { "epoch": 4.957820207375591, "grad_norm": 0.77734375, "learning_rate": 8.023117589237017e-09, "loss": 0.8297, "step": 9235 }, { "epoch": 4.958357102110668, "grad_norm": 0.6015625, "learning_rate": 7.810605185820663e-09, "loss": 0.6425, "step": 9236 }, { "epoch": 4.9588939968457435, "grad_norm": 0.609375, "learning_rate": 7.600944707714108e-09, "loss": 0.7196, "step": 9237 }, { "epoch": 4.959430891580819, "grad_norm": 0.71484375, "learning_rate": 7.394136178842659e-09, "loss": 1.0134, "step": 9238 }, { "epoch": 4.959967786315896, "grad_norm": 0.71875, "learning_rate": 7.190179622806881e-09, "loss": 0.8386, "step": 9239 }, { "epoch": 4.960504681050971, "grad_norm": 0.62890625, "learning_rate": 6.989075062879824e-09, "loss": 0.6899, "step": 9240 }, { "epoch": 4.961041575786047, "grad_norm": 0.6796875, "learning_rate": 6.790822522009799e-09, "loss": 0.9326, "step": 9241 }, { "epoch": 4.961578470521124, "grad_norm": 0.6171875, "learning_rate": 6.595422022820375e-09, "loss": 0.8257, "step": 9242 }, { "epoch": 4.962115365256199, "grad_norm": 0.609375, "learning_rate": 6.402873587610381e-09, "loss": 0.9685, "step": 9243 }, { "epoch": 4.962652259991275, "grad_norm": 0.6796875, "learning_rate": 6.213177238348356e-09, "loss": 0.8452, "step": 9244 }, { "epoch": 4.963189154726352, "grad_norm": 0.79296875, "learning_rate": 6.0263329966836476e-09, "loss": 0.7881, "step": 9245 }, { "epoch": 4.963726049461427, "grad_norm": 0.68359375, "learning_rate": 5.842340883940866e-09, "loss": 0.8278, "step": 9246 }, { "epoch": 4.964262944196504, "grad_norm": 0.609375, "learning_rate": 5.661200921108778e-09, "loss": 0.7414, "step": 9247 }, { "epoch": 4.9647998389315795, "grad_norm": 0.68359375, "learning_rate": 5.48291312886251e-09, "loss": 0.6948, "step": 9248 }, { "epoch": 4.965336733666655, "grad_norm": 0.71484375, "learning_rate": 5.3074775275469e-09, "loss": 0.9773, "step": 9249 }, { "epoch": 4.965873628401732, "grad_norm": 0.87890625, "learning_rate": 5.134894137179269e-09, "loss": 1.0437, "step": 9250 }, { "epoch": 4.965873628401732, "eval_loss": 0.9026386737823486, "eval_runtime": 45.7381, "eval_samples_per_second": 21.033, "eval_steps_per_second": 21.033, "step": 9250 }, { "epoch": 4.966410523136807, "grad_norm": 0.7109375, "learning_rate": 4.965162977454973e-09, "loss": 0.7639, "step": 9251 }, { "epoch": 4.966947417871884, "grad_norm": 0.58203125, "learning_rate": 4.798284067741854e-09, "loss": 0.6897, "step": 9252 }, { "epoch": 4.96748431260696, "grad_norm": 0.75, "learning_rate": 4.6342574270830104e-09, "loss": 0.9268, "step": 9253 }, { "epoch": 4.968021207342035, "grad_norm": 0.69921875, "learning_rate": 4.4730830741940285e-09, "loss": 0.9176, "step": 9254 }, { "epoch": 4.968558102077111, "grad_norm": 0.80078125, "learning_rate": 4.3147610274713034e-09, "loss": 0.8222, "step": 9255 }, { "epoch": 4.969094996812188, "grad_norm": 0.75, "learning_rate": 4.159291304980939e-09, "loss": 0.7937, "step": 9256 }, { "epoch": 4.969631891547263, "grad_norm": 0.8046875, "learning_rate": 4.006673924458748e-09, "loss": 0.9635, "step": 9257 }, { "epoch": 4.97016878628234, "grad_norm": 0.69140625, "learning_rate": 3.8569089033241305e-09, "loss": 0.8897, "step": 9258 }, { "epoch": 4.9707056810174155, "grad_norm": 0.68359375, "learning_rate": 3.7099962586689685e-09, "loss": 1.0692, "step": 9259 }, { "epoch": 4.971242575752491, "grad_norm": 0.55078125, "learning_rate": 3.565936007254855e-09, "loss": 0.7195, "step": 9260 }, { "epoch": 4.971779470487568, "grad_norm": 0.71484375, "learning_rate": 3.424728165521418e-09, "loss": 0.8123, "step": 9261 }, { "epoch": 4.972316365222643, "grad_norm": 0.58984375, "learning_rate": 3.2863727495807685e-09, "loss": 0.6917, "step": 9262 }, { "epoch": 4.97285325995772, "grad_norm": 0.6640625, "learning_rate": 3.150869775225829e-09, "loss": 0.8037, "step": 9263 }, { "epoch": 4.973390154692796, "grad_norm": 0.66796875, "learning_rate": 3.0182192579136792e-09, "loss": 0.9472, "step": 9264 }, { "epoch": 4.973927049427871, "grad_norm": 0.69921875, "learning_rate": 2.8884212127849865e-09, "loss": 0.7353, "step": 9265 }, { "epoch": 4.974463944162948, "grad_norm": 0.62890625, "learning_rate": 2.7614756546501254e-09, "loss": 0.8754, "step": 9266 }, { "epoch": 4.975000838898024, "grad_norm": 0.703125, "learning_rate": 2.637382597994731e-09, "loss": 0.8499, "step": 9267 }, { "epoch": 4.975537733633099, "grad_norm": 0.625, "learning_rate": 2.5161420569796977e-09, "loss": 0.8057, "step": 9268 }, { "epoch": 4.976074628368176, "grad_norm": 0.66015625, "learning_rate": 2.3977540454411805e-09, "loss": 0.7986, "step": 9269 }, { "epoch": 4.9766115231032515, "grad_norm": 0.671875, "learning_rate": 2.282218576887818e-09, "loss": 0.7206, "step": 9270 }, { "epoch": 4.977148417838327, "grad_norm": 0.765625, "learning_rate": 2.169535664503508e-09, "loss": 0.9028, "step": 9271 }, { "epoch": 4.977685312573404, "grad_norm": 0.69140625, "learning_rate": 2.0597053211474095e-09, "loss": 0.9991, "step": 9272 }, { "epoch": 4.978222207308479, "grad_norm": 0.734375, "learning_rate": 1.9527275593539397e-09, "loss": 0.7342, "step": 9273 }, { "epoch": 4.978759102043556, "grad_norm": 0.72265625, "learning_rate": 1.8486023913272255e-09, "loss": 0.8345, "step": 9274 }, { "epoch": 4.979295996778632, "grad_norm": 0.58203125, "learning_rate": 1.7473298289494288e-09, "loss": 0.6468, "step": 9275 }, { "epoch": 4.979832891513707, "grad_norm": 0.6328125, "learning_rate": 1.648909883780747e-09, "loss": 0.6854, "step": 9276 }, { "epoch": 4.980369786248784, "grad_norm": 0.70703125, "learning_rate": 1.553342567051086e-09, "loss": 0.9647, "step": 9277 }, { "epoch": 4.98090668098386, "grad_norm": 0.62890625, "learning_rate": 1.4606278896628357e-09, "loss": 0.8783, "step": 9278 }, { "epoch": 4.981443575718935, "grad_norm": 0.6015625, "learning_rate": 1.3707658621964215e-09, "loss": 0.7503, "step": 9279 }, { "epoch": 4.981980470454012, "grad_norm": 0.8125, "learning_rate": 1.2837564949103042e-09, "loss": 0.9091, "step": 9280 }, { "epoch": 4.9825173651890875, "grad_norm": 0.8125, "learning_rate": 1.1995997977298779e-09, "loss": 0.876, "step": 9281 }, { "epoch": 4.983054259924163, "grad_norm": 0.68359375, "learning_rate": 1.1182957802585715e-09, "loss": 0.7893, "step": 9282 }, { "epoch": 4.98359115465924, "grad_norm": 0.62890625, "learning_rate": 1.039844451775074e-09, "loss": 0.6618, "step": 9283 }, { "epoch": 4.984128049394315, "grad_norm": 0.71875, "learning_rate": 9.642458212305584e-10, "loss": 0.7565, "step": 9284 }, { "epoch": 4.984664944129392, "grad_norm": 0.71875, "learning_rate": 8.914998972542332e-10, "loss": 0.8786, "step": 9285 }, { "epoch": 4.985201838864468, "grad_norm": 0.68359375, "learning_rate": 8.216066881450157e-10, "loss": 0.8447, "step": 9286 }, { "epoch": 4.985738733599543, "grad_norm": 0.578125, "learning_rate": 7.545662018826339e-10, "loss": 0.843, "step": 9287 }, { "epoch": 4.98627562833462, "grad_norm": 0.828125, "learning_rate": 6.90378446110973e-10, "loss": 1.0274, "step": 9288 }, { "epoch": 4.986812523069696, "grad_norm": 0.8515625, "learning_rate": 6.290434281575052e-10, "loss": 0.8809, "step": 9289 }, { "epoch": 4.987349417804772, "grad_norm": 0.8046875, "learning_rate": 5.705611550221868e-10, "loss": 0.8501, "step": 9290 }, { "epoch": 4.987886312539848, "grad_norm": 0.625, "learning_rate": 5.149316333802334e-10, "loss": 0.7096, "step": 9291 }, { "epoch": 4.9884232072749235, "grad_norm": 0.765625, "learning_rate": 4.621548695737943e-10, "loss": 0.8204, "step": 9292 }, { "epoch": 4.988960102009999, "grad_norm": 0.62109375, "learning_rate": 4.122308696313804e-10, "loss": 0.7726, "step": 9293 }, { "epoch": 4.989496996745076, "grad_norm": 0.671875, "learning_rate": 3.651596392456602e-10, "loss": 0.9576, "step": 9294 }, { "epoch": 4.990033891480151, "grad_norm": 0.73828125, "learning_rate": 3.2094118379288885e-10, "loss": 0.7951, "step": 9295 }, { "epoch": 4.990570786215228, "grad_norm": 0.828125, "learning_rate": 2.795755083134788e-10, "loss": 1.0182, "step": 9296 }, { "epoch": 4.991107680950304, "grad_norm": 0.765625, "learning_rate": 2.41062617531429e-10, "loss": 0.8384, "step": 9297 }, { "epoch": 4.991644575685379, "grad_norm": 0.8671875, "learning_rate": 2.054025158404471e-10, "loss": 0.8835, "step": 9298 }, { "epoch": 4.992181470420456, "grad_norm": 0.71484375, "learning_rate": 1.7259520730950052e-10, "loss": 0.8248, "step": 9299 }, { "epoch": 4.992718365155532, "grad_norm": 0.77734375, "learning_rate": 1.4264069568281635e-10, "loss": 0.9296, "step": 9300 }, { "epoch": 4.992718365155532, "eval_loss": 0.9026424288749695, "eval_runtime": 45.729, "eval_samples_per_second": 21.037, "eval_steps_per_second": 21.037, "step": 9300 }, { "epoch": 4.993255259890608, "grad_norm": 0.76171875, "learning_rate": 1.1553898437988154e-10, "loss": 1.0127, "step": 9301 }, { "epoch": 4.993792154625684, "grad_norm": 0.65234375, "learning_rate": 9.129007648989162e-11, "loss": 0.8544, "step": 9302 }, { "epoch": 4.9943290493607595, "grad_norm": 0.6953125, "learning_rate": 6.989397478285309e-11, "loss": 0.8514, "step": 9303 }, { "epoch": 4.994865944095836, "grad_norm": 0.86328125, "learning_rate": 5.1350681698481004e-11, "loss": 0.818, "step": 9304 }, { "epoch": 4.995402838830912, "grad_norm": 0.6640625, "learning_rate": 3.5660199354525806e-11, "loss": 0.734, "step": 9305 }, { "epoch": 4.995939733565987, "grad_norm": 0.64453125, "learning_rate": 2.2822529538446545e-11, "loss": 0.8164, "step": 9306 }, { "epoch": 4.996476628301064, "grad_norm": 0.6484375, "learning_rate": 1.2837673721288746e-11, "loss": 0.6163, "step": 9307 }, { "epoch": 4.99701352303614, "grad_norm": 0.62109375, "learning_rate": 5.705633035479885e-12, "loss": 0.7307, "step": 9308 }, { "epoch": 4.997550417771215, "grad_norm": 0.6953125, "learning_rate": 1.4264082998094451e-12, "loss": 0.786, "step": 9309 }, { "epoch": 4.998087312506292, "grad_norm": 0.75, "learning_rate": 0.0, "loss": 0.8572, "step": 9310 } ], "logging_steps": 1, "max_steps": 9310, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.079528727663903e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }