{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 2666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007501875468867217, "grad_norm": 0.16957804560661316, "learning_rate": 3.7453183520599253e-07, "loss": 0.5067, "step": 1 }, { "epoch": 0.0015003750937734434, "grad_norm": 0.2590717077255249, "learning_rate": 7.490636704119851e-07, "loss": 0.6979, "step": 2 }, { "epoch": 0.002250562640660165, "grad_norm": 0.18719057738780975, "learning_rate": 1.1235955056179775e-06, "loss": 0.6109, "step": 3 }, { "epoch": 0.003000750187546887, "grad_norm": 0.16934776306152344, "learning_rate": 1.4981273408239701e-06, "loss": 0.5901, "step": 4 }, { "epoch": 0.0037509377344336083, "grad_norm": 0.21556782722473145, "learning_rate": 1.8726591760299627e-06, "loss": 0.5248, "step": 5 }, { "epoch": 0.00450112528132033, "grad_norm": 0.2548677325248718, "learning_rate": 2.247191011235955e-06, "loss": 0.6855, "step": 6 }, { "epoch": 0.005251312828207052, "grad_norm": 0.1761707216501236, "learning_rate": 2.621722846441948e-06, "loss": 0.5643, "step": 7 }, { "epoch": 0.006001500375093774, "grad_norm": 0.33832332491874695, "learning_rate": 2.9962546816479402e-06, "loss": 0.6426, "step": 8 }, { "epoch": 0.006751687921980495, "grad_norm": 0.2214318811893463, "learning_rate": 3.3707865168539327e-06, "loss": 0.593, "step": 9 }, { "epoch": 0.007501875468867217, "grad_norm": 0.17969368398189545, "learning_rate": 3.7453183520599255e-06, "loss": 0.5737, "step": 10 }, { "epoch": 0.008252063015753939, "grad_norm": 0.21642841398715973, "learning_rate": 4.1198501872659175e-06, "loss": 0.6448, "step": 11 }, { "epoch": 0.00900225056264066, "grad_norm": 0.19058872759342194, "learning_rate": 4.49438202247191e-06, "loss": 0.6507, "step": 12 }, { "epoch": 0.009752438109527382, "grad_norm": 0.26897871494293213, "learning_rate": 4.868913857677903e-06, "loss": 0.7443, "step": 13 }, { "epoch": 0.010502625656414103, "grad_norm": 0.31465813517570496, "learning_rate": 5.243445692883896e-06, "loss": 0.7296, "step": 14 }, { "epoch": 0.011252813203300824, "grad_norm": 0.2601154148578644, "learning_rate": 5.617977528089888e-06, "loss": 0.701, "step": 15 }, { "epoch": 0.012003000750187547, "grad_norm": 0.24808494746685028, "learning_rate": 5.9925093632958805e-06, "loss": 0.6712, "step": 16 }, { "epoch": 0.012753188297074268, "grad_norm": 0.19653230905532837, "learning_rate": 6.367041198501873e-06, "loss": 0.5046, "step": 17 }, { "epoch": 0.01350337584396099, "grad_norm": 0.17001445591449738, "learning_rate": 6.741573033707865e-06, "loss": 0.5631, "step": 18 }, { "epoch": 0.014253563390847712, "grad_norm": 0.22667701542377472, "learning_rate": 7.116104868913858e-06, "loss": 0.6215, "step": 19 }, { "epoch": 0.015003750937734433, "grad_norm": 0.25909850001335144, "learning_rate": 7.490636704119851e-06, "loss": 0.6636, "step": 20 }, { "epoch": 0.015753938484621154, "grad_norm": 0.2788105607032776, "learning_rate": 7.865168539325843e-06, "loss": 0.698, "step": 21 }, { "epoch": 0.016504126031507877, "grad_norm": 0.3595254123210907, "learning_rate": 8.239700374531835e-06, "loss": 0.7906, "step": 22 }, { "epoch": 0.0172543135783946, "grad_norm": 0.2044590711593628, "learning_rate": 8.614232209737828e-06, "loss": 0.6113, "step": 23 }, { "epoch": 0.01800450112528132, "grad_norm": 0.2928810119628906, "learning_rate": 8.98876404494382e-06, "loss": 0.6482, "step": 24 }, { "epoch": 0.018754688672168042, "grad_norm": 0.288282185792923, "learning_rate": 9.363295880149813e-06, "loss": 0.6606, "step": 25 }, { "epoch": 0.019504876219054765, "grad_norm": 0.34736546874046326, "learning_rate": 9.737827715355806e-06, "loss": 0.6072, "step": 26 }, { "epoch": 0.020255063765941484, "grad_norm": 0.29511576890945435, "learning_rate": 1.0112359550561798e-05, "loss": 0.5874, "step": 27 }, { "epoch": 0.021005251312828207, "grad_norm": 0.46607428789138794, "learning_rate": 1.0486891385767791e-05, "loss": 0.8514, "step": 28 }, { "epoch": 0.02175543885971493, "grad_norm": 0.3923439383506775, "learning_rate": 1.0861423220973783e-05, "loss": 0.722, "step": 29 }, { "epoch": 0.02250562640660165, "grad_norm": 0.32116979360580444, "learning_rate": 1.1235955056179776e-05, "loss": 0.6284, "step": 30 }, { "epoch": 0.023255813953488372, "grad_norm": 0.38401302695274353, "learning_rate": 1.161048689138577e-05, "loss": 0.6293, "step": 31 }, { "epoch": 0.024006001500375095, "grad_norm": 0.3105871379375458, "learning_rate": 1.1985018726591761e-05, "loss": 0.593, "step": 32 }, { "epoch": 0.024756189047261814, "grad_norm": 0.47100821137428284, "learning_rate": 1.2359550561797752e-05, "loss": 0.6236, "step": 33 }, { "epoch": 0.025506376594148537, "grad_norm": 0.42858728766441345, "learning_rate": 1.2734082397003746e-05, "loss": 0.755, "step": 34 }, { "epoch": 0.02625656414103526, "grad_norm": 0.42198970913887024, "learning_rate": 1.3108614232209737e-05, "loss": 0.6563, "step": 35 }, { "epoch": 0.02700675168792198, "grad_norm": 0.418535441160202, "learning_rate": 1.348314606741573e-05, "loss": 0.6429, "step": 36 }, { "epoch": 0.0277569392348087, "grad_norm": 0.4164247512817383, "learning_rate": 1.3857677902621724e-05, "loss": 0.566, "step": 37 }, { "epoch": 0.028507126781695424, "grad_norm": 0.40723326802253723, "learning_rate": 1.4232209737827715e-05, "loss": 0.5636, "step": 38 }, { "epoch": 0.029257314328582147, "grad_norm": 0.5074004530906677, "learning_rate": 1.4606741573033709e-05, "loss": 0.8102, "step": 39 }, { "epoch": 0.030007501875468866, "grad_norm": 0.3689713776111603, "learning_rate": 1.4981273408239702e-05, "loss": 0.6078, "step": 40 }, { "epoch": 0.03075768942235559, "grad_norm": 0.3278329372406006, "learning_rate": 1.5355805243445692e-05, "loss": 0.691, "step": 41 }, { "epoch": 0.03150787696924231, "grad_norm": 0.39348167181015015, "learning_rate": 1.5730337078651687e-05, "loss": 0.5553, "step": 42 }, { "epoch": 0.03225806451612903, "grad_norm": 0.314788818359375, "learning_rate": 1.610486891385768e-05, "loss": 0.6037, "step": 43 }, { "epoch": 0.033008252063015754, "grad_norm": 0.2796263098716736, "learning_rate": 1.647940074906367e-05, "loss": 0.5301, "step": 44 }, { "epoch": 0.03375843960990248, "grad_norm": 0.3108610212802887, "learning_rate": 1.6853932584269665e-05, "loss": 0.6372, "step": 45 }, { "epoch": 0.0345086271567892, "grad_norm": 0.2994379699230194, "learning_rate": 1.7228464419475657e-05, "loss": 0.6297, "step": 46 }, { "epoch": 0.035258814703675916, "grad_norm": 0.29583239555358887, "learning_rate": 1.760299625468165e-05, "loss": 0.5376, "step": 47 }, { "epoch": 0.03600900225056264, "grad_norm": 0.31138482689857483, "learning_rate": 1.797752808988764e-05, "loss": 0.5986, "step": 48 }, { "epoch": 0.03675918979744936, "grad_norm": 0.3490433990955353, "learning_rate": 1.8352059925093635e-05, "loss": 0.5412, "step": 49 }, { "epoch": 0.037509377344336084, "grad_norm": 0.18739935755729675, "learning_rate": 1.8726591760299626e-05, "loss": 0.5105, "step": 50 }, { "epoch": 0.03825956489122281, "grad_norm": 0.18920950591564178, "learning_rate": 1.9101123595505618e-05, "loss": 0.4076, "step": 51 }, { "epoch": 0.03900975243810953, "grad_norm": 0.2046889066696167, "learning_rate": 1.9475655430711613e-05, "loss": 0.4934, "step": 52 }, { "epoch": 0.03975993998499625, "grad_norm": 0.32364991307258606, "learning_rate": 1.9850187265917604e-05, "loss": 0.5735, "step": 53 }, { "epoch": 0.04051012753188297, "grad_norm": 0.23437035083770752, "learning_rate": 2.0224719101123596e-05, "loss": 0.5063, "step": 54 }, { "epoch": 0.04126031507876969, "grad_norm": 0.272062212228775, "learning_rate": 2.059925093632959e-05, "loss": 0.4123, "step": 55 }, { "epoch": 0.042010502625656414, "grad_norm": 0.20376525819301605, "learning_rate": 2.0973782771535582e-05, "loss": 0.4534, "step": 56 }, { "epoch": 0.04276069017254314, "grad_norm": 0.20052243769168854, "learning_rate": 2.1348314606741574e-05, "loss": 0.4475, "step": 57 }, { "epoch": 0.04351087771942986, "grad_norm": 0.384977251291275, "learning_rate": 2.1722846441947566e-05, "loss": 0.5774, "step": 58 }, { "epoch": 0.04426106526631658, "grad_norm": 0.169937863945961, "learning_rate": 2.209737827715356e-05, "loss": 0.4018, "step": 59 }, { "epoch": 0.0450112528132033, "grad_norm": 0.21411824226379395, "learning_rate": 2.2471910112359552e-05, "loss": 0.4357, "step": 60 }, { "epoch": 0.04576144036009002, "grad_norm": 0.244670107960701, "learning_rate": 2.2846441947565544e-05, "loss": 0.4884, "step": 61 }, { "epoch": 0.046511627906976744, "grad_norm": 0.283292293548584, "learning_rate": 2.322097378277154e-05, "loss": 0.4547, "step": 62 }, { "epoch": 0.047261815453863466, "grad_norm": 0.20470695197582245, "learning_rate": 2.359550561797753e-05, "loss": 0.5491, "step": 63 }, { "epoch": 0.04801200300075019, "grad_norm": 0.2549651861190796, "learning_rate": 2.3970037453183522e-05, "loss": 0.5141, "step": 64 }, { "epoch": 0.04876219054763691, "grad_norm": 0.24697761237621307, "learning_rate": 2.4344569288389517e-05, "loss": 0.512, "step": 65 }, { "epoch": 0.04951237809452363, "grad_norm": 0.15838152170181274, "learning_rate": 2.4719101123595505e-05, "loss": 0.3329, "step": 66 }, { "epoch": 0.05026256564141035, "grad_norm": 0.13997496664524078, "learning_rate": 2.50936329588015e-05, "loss": 0.3171, "step": 67 }, { "epoch": 0.05101275318829707, "grad_norm": 0.18878750503063202, "learning_rate": 2.546816479400749e-05, "loss": 0.413, "step": 68 }, { "epoch": 0.051762940735183796, "grad_norm": 0.14333204925060272, "learning_rate": 2.5842696629213486e-05, "loss": 0.4371, "step": 69 }, { "epoch": 0.05251312828207052, "grad_norm": 0.15564358234405518, "learning_rate": 2.6217228464419475e-05, "loss": 0.4236, "step": 70 }, { "epoch": 0.05326331582895724, "grad_norm": 0.18392668664455414, "learning_rate": 2.6591760299625466e-05, "loss": 0.4332, "step": 71 }, { "epoch": 0.05401350337584396, "grad_norm": 0.13817186653614044, "learning_rate": 2.696629213483146e-05, "loss": 0.3495, "step": 72 }, { "epoch": 0.05476369092273068, "grad_norm": 0.19368527829647064, "learning_rate": 2.7340823970037456e-05, "loss": 0.4083, "step": 73 }, { "epoch": 0.0555138784696174, "grad_norm": 0.2148423194885254, "learning_rate": 2.7715355805243448e-05, "loss": 0.4343, "step": 74 }, { "epoch": 0.056264066016504126, "grad_norm": 0.17818492650985718, "learning_rate": 2.8089887640449443e-05, "loss": 0.5151, "step": 75 }, { "epoch": 0.05701425356339085, "grad_norm": 0.23069259524345398, "learning_rate": 2.846441947565543e-05, "loss": 0.4168, "step": 76 }, { "epoch": 0.05776444111027757, "grad_norm": 0.14029783010482788, "learning_rate": 2.8838951310861422e-05, "loss": 0.3783, "step": 77 }, { "epoch": 0.058514628657164294, "grad_norm": 0.18246188759803772, "learning_rate": 2.9213483146067417e-05, "loss": 0.4802, "step": 78 }, { "epoch": 0.05926481620405101, "grad_norm": 0.24328601360321045, "learning_rate": 2.958801498127341e-05, "loss": 0.431, "step": 79 }, { "epoch": 0.06001500375093773, "grad_norm": 0.18796172738075256, "learning_rate": 2.9962546816479404e-05, "loss": 0.3205, "step": 80 }, { "epoch": 0.060765191297824456, "grad_norm": 0.20128509402275085, "learning_rate": 3.0337078651685396e-05, "loss": 0.4262, "step": 81 }, { "epoch": 0.06151537884471118, "grad_norm": 0.22152861952781677, "learning_rate": 3.0711610486891384e-05, "loss": 0.4422, "step": 82 }, { "epoch": 0.0622655663915979, "grad_norm": 0.24586685001850128, "learning_rate": 3.108614232209738e-05, "loss": 0.4053, "step": 83 }, { "epoch": 0.06301575393848462, "grad_norm": 0.18768605589866638, "learning_rate": 3.1460674157303374e-05, "loss": 0.3701, "step": 84 }, { "epoch": 0.06376594148537135, "grad_norm": 0.21827884018421173, "learning_rate": 3.183520599250936e-05, "loss": 0.3414, "step": 85 }, { "epoch": 0.06451612903225806, "grad_norm": 0.15999992191791534, "learning_rate": 3.220973782771536e-05, "loss": 0.4116, "step": 86 }, { "epoch": 0.06526631657914479, "grad_norm": 0.18176347017288208, "learning_rate": 3.258426966292135e-05, "loss": 0.3384, "step": 87 }, { "epoch": 0.06601650412603151, "grad_norm": 0.2036697417497635, "learning_rate": 3.295880149812734e-05, "loss": 0.3607, "step": 88 }, { "epoch": 0.06676669167291822, "grad_norm": 0.19556917250156403, "learning_rate": 3.3333333333333335e-05, "loss": 0.4191, "step": 89 }, { "epoch": 0.06751687921980495, "grad_norm": 0.23337404429912567, "learning_rate": 3.370786516853933e-05, "loss": 0.3996, "step": 90 }, { "epoch": 0.06826706676669167, "grad_norm": 0.20131956040859222, "learning_rate": 3.408239700374532e-05, "loss": 0.3909, "step": 91 }, { "epoch": 0.0690172543135784, "grad_norm": 0.18916216492652893, "learning_rate": 3.445692883895131e-05, "loss": 0.4448, "step": 92 }, { "epoch": 0.06976744186046512, "grad_norm": 0.15950234234333038, "learning_rate": 3.483146067415731e-05, "loss": 0.3839, "step": 93 }, { "epoch": 0.07051762940735183, "grad_norm": 0.23887372016906738, "learning_rate": 3.52059925093633e-05, "loss": 0.4366, "step": 94 }, { "epoch": 0.07126781695423856, "grad_norm": 0.23991575837135315, "learning_rate": 3.558052434456929e-05, "loss": 0.3729, "step": 95 }, { "epoch": 0.07201800450112528, "grad_norm": 0.1565224826335907, "learning_rate": 3.595505617977528e-05, "loss": 0.2412, "step": 96 }, { "epoch": 0.072768192048012, "grad_norm": 0.26748421788215637, "learning_rate": 3.6329588014981274e-05, "loss": 0.3533, "step": 97 }, { "epoch": 0.07351837959489872, "grad_norm": 0.314481645822525, "learning_rate": 3.670411985018727e-05, "loss": 0.4788, "step": 98 }, { "epoch": 0.07426856714178545, "grad_norm": 0.3370957672595978, "learning_rate": 3.7078651685393264e-05, "loss": 0.4331, "step": 99 }, { "epoch": 0.07501875468867217, "grad_norm": 0.22964751720428467, "learning_rate": 3.745318352059925e-05, "loss": 0.4117, "step": 100 }, { "epoch": 0.07576894223555888, "grad_norm": 0.17034278810024261, "learning_rate": 3.782771535580524e-05, "loss": 0.3435, "step": 101 }, { "epoch": 0.07651912978244561, "grad_norm": 0.18526990711688995, "learning_rate": 3.8202247191011236e-05, "loss": 0.3852, "step": 102 }, { "epoch": 0.07726931732933233, "grad_norm": 0.2640657424926758, "learning_rate": 3.857677902621723e-05, "loss": 0.3983, "step": 103 }, { "epoch": 0.07801950487621906, "grad_norm": 0.300622820854187, "learning_rate": 3.8951310861423226e-05, "loss": 0.4192, "step": 104 }, { "epoch": 0.07876969242310577, "grad_norm": 0.212265744805336, "learning_rate": 3.9325842696629214e-05, "loss": 0.3733, "step": 105 }, { "epoch": 0.0795198799699925, "grad_norm": 0.2566777169704437, "learning_rate": 3.970037453183521e-05, "loss": 0.3594, "step": 106 }, { "epoch": 0.08027006751687922, "grad_norm": 0.2779830992221832, "learning_rate": 4.00749063670412e-05, "loss": 0.3431, "step": 107 }, { "epoch": 0.08102025506376594, "grad_norm": 0.1657448709011078, "learning_rate": 4.044943820224719e-05, "loss": 0.3098, "step": 108 }, { "epoch": 0.08177044261065267, "grad_norm": 0.27562883496284485, "learning_rate": 4.082397003745319e-05, "loss": 0.365, "step": 109 }, { "epoch": 0.08252063015753938, "grad_norm": 0.287000834941864, "learning_rate": 4.119850187265918e-05, "loss": 0.3768, "step": 110 }, { "epoch": 0.08327081770442611, "grad_norm": 0.19993917644023895, "learning_rate": 4.157303370786517e-05, "loss": 0.3858, "step": 111 }, { "epoch": 0.08402100525131283, "grad_norm": 0.22518040239810944, "learning_rate": 4.1947565543071165e-05, "loss": 0.3765, "step": 112 }, { "epoch": 0.08477119279819954, "grad_norm": 0.24824552237987518, "learning_rate": 4.232209737827715e-05, "loss": 0.3574, "step": 113 }, { "epoch": 0.08552138034508627, "grad_norm": 0.4001130759716034, "learning_rate": 4.269662921348315e-05, "loss": 0.4179, "step": 114 }, { "epoch": 0.08627156789197299, "grad_norm": 0.20386168360710144, "learning_rate": 4.307116104868914e-05, "loss": 0.375, "step": 115 }, { "epoch": 0.08702175543885972, "grad_norm": 0.30691832304000854, "learning_rate": 4.344569288389513e-05, "loss": 0.4305, "step": 116 }, { "epoch": 0.08777194298574643, "grad_norm": 0.15938441455364227, "learning_rate": 4.3820224719101126e-05, "loss": 0.2996, "step": 117 }, { "epoch": 0.08852213053263316, "grad_norm": 0.1876952052116394, "learning_rate": 4.419475655430712e-05, "loss": 0.2517, "step": 118 }, { "epoch": 0.08927231807951988, "grad_norm": 0.23303130269050598, "learning_rate": 4.456928838951311e-05, "loss": 0.3336, "step": 119 }, { "epoch": 0.0900225056264066, "grad_norm": 0.24461005628108978, "learning_rate": 4.4943820224719104e-05, "loss": 0.3524, "step": 120 }, { "epoch": 0.09077269317329333, "grad_norm": 0.21877430379390717, "learning_rate": 4.531835205992509e-05, "loss": 0.3116, "step": 121 }, { "epoch": 0.09152288072018004, "grad_norm": 0.233370840549469, "learning_rate": 4.569288389513109e-05, "loss": 0.3753, "step": 122 }, { "epoch": 0.09227306826706677, "grad_norm": 0.1965615451335907, "learning_rate": 4.606741573033708e-05, "loss": 0.3264, "step": 123 }, { "epoch": 0.09302325581395349, "grad_norm": 0.27194592356681824, "learning_rate": 4.644194756554308e-05, "loss": 0.4833, "step": 124 }, { "epoch": 0.09377344336084022, "grad_norm": 0.24370448291301727, "learning_rate": 4.6816479400749066e-05, "loss": 0.3407, "step": 125 }, { "epoch": 0.09452363090772693, "grad_norm": 0.3411618769168854, "learning_rate": 4.719101123595506e-05, "loss": 0.3265, "step": 126 }, { "epoch": 0.09527381845461365, "grad_norm": 0.2782435119152069, "learning_rate": 4.756554307116105e-05, "loss": 0.4101, "step": 127 }, { "epoch": 0.09602400600150038, "grad_norm": 0.2854892611503601, "learning_rate": 4.7940074906367044e-05, "loss": 0.3991, "step": 128 }, { "epoch": 0.0967741935483871, "grad_norm": 0.2517203390598297, "learning_rate": 4.831460674157304e-05, "loss": 0.2181, "step": 129 }, { "epoch": 0.09752438109527382, "grad_norm": 0.20972402393817902, "learning_rate": 4.8689138576779034e-05, "loss": 0.2843, "step": 130 }, { "epoch": 0.09827456864216054, "grad_norm": 0.3067452311515808, "learning_rate": 4.906367041198502e-05, "loss": 0.3686, "step": 131 }, { "epoch": 0.09902475618904726, "grad_norm": 0.3077613115310669, "learning_rate": 4.943820224719101e-05, "loss": 0.3986, "step": 132 }, { "epoch": 0.09977494373593399, "grad_norm": 0.17937254905700684, "learning_rate": 4.9812734082397005e-05, "loss": 0.2327, "step": 133 }, { "epoch": 0.1005251312828207, "grad_norm": 0.18352457880973816, "learning_rate": 5.0187265917603e-05, "loss": 0.2862, "step": 134 }, { "epoch": 0.10127531882970743, "grad_norm": 0.29749104380607605, "learning_rate": 5.0561797752808995e-05, "loss": 0.384, "step": 135 }, { "epoch": 0.10202550637659415, "grad_norm": 0.3033657371997833, "learning_rate": 5.093632958801498e-05, "loss": 0.3188, "step": 136 }, { "epoch": 0.10277569392348088, "grad_norm": 0.264631062746048, "learning_rate": 5.131086142322098e-05, "loss": 0.2616, "step": 137 }, { "epoch": 0.10352588147036759, "grad_norm": 0.2146955281496048, "learning_rate": 5.168539325842697e-05, "loss": 0.3429, "step": 138 }, { "epoch": 0.10427606901725431, "grad_norm": 0.24487510323524475, "learning_rate": 5.205992509363297e-05, "loss": 0.2921, "step": 139 }, { "epoch": 0.10502625656414104, "grad_norm": 0.27245110273361206, "learning_rate": 5.243445692883895e-05, "loss": 0.3252, "step": 140 }, { "epoch": 0.10577644411102775, "grad_norm": 0.20059911906719208, "learning_rate": 5.2808988764044944e-05, "loss": 0.3357, "step": 141 }, { "epoch": 0.10652663165791448, "grad_norm": 0.1966572254896164, "learning_rate": 5.318352059925093e-05, "loss": 0.2387, "step": 142 }, { "epoch": 0.1072768192048012, "grad_norm": 0.29327839612960815, "learning_rate": 5.355805243445693e-05, "loss": 0.363, "step": 143 }, { "epoch": 0.10802700675168792, "grad_norm": 0.30395713448524475, "learning_rate": 5.393258426966292e-05, "loss": 0.4361, "step": 144 }, { "epoch": 0.10877719429857464, "grad_norm": 0.3542068898677826, "learning_rate": 5.430711610486892e-05, "loss": 0.2894, "step": 145 }, { "epoch": 0.10952738184546136, "grad_norm": 0.3275168538093567, "learning_rate": 5.468164794007491e-05, "loss": 0.3543, "step": 146 }, { "epoch": 0.11027756939234809, "grad_norm": 0.41734519600868225, "learning_rate": 5.50561797752809e-05, "loss": 0.4531, "step": 147 }, { "epoch": 0.1110277569392348, "grad_norm": 0.2748587727546692, "learning_rate": 5.5430711610486895e-05, "loss": 0.3611, "step": 148 }, { "epoch": 0.11177794448612154, "grad_norm": 0.2709265649318695, "learning_rate": 5.580524344569289e-05, "loss": 0.3403, "step": 149 }, { "epoch": 0.11252813203300825, "grad_norm": 0.31612610816955566, "learning_rate": 5.6179775280898885e-05, "loss": 0.3227, "step": 150 }, { "epoch": 0.11327831957989497, "grad_norm": 0.25628960132598877, "learning_rate": 5.6554307116104874e-05, "loss": 0.3653, "step": 151 }, { "epoch": 0.1140285071267817, "grad_norm": 0.28955215215682983, "learning_rate": 5.692883895131086e-05, "loss": 0.3077, "step": 152 }, { "epoch": 0.11477869467366841, "grad_norm": 0.240242600440979, "learning_rate": 5.730337078651685e-05, "loss": 0.3646, "step": 153 }, { "epoch": 0.11552888222055514, "grad_norm": 0.17536325752735138, "learning_rate": 5.7677902621722845e-05, "loss": 0.2707, "step": 154 }, { "epoch": 0.11627906976744186, "grad_norm": 0.25519630312919617, "learning_rate": 5.805243445692884e-05, "loss": 0.2952, "step": 155 }, { "epoch": 0.11702925731432859, "grad_norm": 0.27612656354904175, "learning_rate": 5.8426966292134835e-05, "loss": 0.3169, "step": 156 }, { "epoch": 0.1177794448612153, "grad_norm": 0.2675272822380066, "learning_rate": 5.880149812734082e-05, "loss": 0.3441, "step": 157 }, { "epoch": 0.11852963240810202, "grad_norm": 0.3260287940502167, "learning_rate": 5.917602996254682e-05, "loss": 0.4491, "step": 158 }, { "epoch": 0.11927981995498875, "grad_norm": 0.3030382990837097, "learning_rate": 5.955056179775281e-05, "loss": 0.3232, "step": 159 }, { "epoch": 0.12003000750187547, "grad_norm": 0.39289069175720215, "learning_rate": 5.992509363295881e-05, "loss": 0.3467, "step": 160 }, { "epoch": 0.1207801950487622, "grad_norm": 0.27724504470825195, "learning_rate": 6.02996254681648e-05, "loss": 0.245, "step": 161 }, { "epoch": 0.12153038259564891, "grad_norm": 0.2558315694332123, "learning_rate": 6.067415730337079e-05, "loss": 0.329, "step": 162 }, { "epoch": 0.12228057014253563, "grad_norm": 0.2716682553291321, "learning_rate": 6.104868913857679e-05, "loss": 0.336, "step": 163 }, { "epoch": 0.12303075768942236, "grad_norm": 0.3051481246948242, "learning_rate": 6.142322097378277e-05, "loss": 0.3738, "step": 164 }, { "epoch": 0.12378094523630907, "grad_norm": 0.2308456301689148, "learning_rate": 6.179775280898876e-05, "loss": 0.2988, "step": 165 }, { "epoch": 0.1245311327831958, "grad_norm": 0.23969238996505737, "learning_rate": 6.217228464419476e-05, "loss": 0.2487, "step": 166 }, { "epoch": 0.12528132033008252, "grad_norm": 0.2843066453933716, "learning_rate": 6.254681647940075e-05, "loss": 0.2846, "step": 167 }, { "epoch": 0.12603150787696923, "grad_norm": 0.26107192039489746, "learning_rate": 6.292134831460675e-05, "loss": 0.2872, "step": 168 }, { "epoch": 0.12678169542385595, "grad_norm": 0.25685277581214905, "learning_rate": 6.329588014981274e-05, "loss": 0.2634, "step": 169 }, { "epoch": 0.1275318829707427, "grad_norm": 0.3931554853916168, "learning_rate": 6.367041198501872e-05, "loss": 0.2984, "step": 170 }, { "epoch": 0.1282820705176294, "grad_norm": 0.3400406837463379, "learning_rate": 6.404494382022472e-05, "loss": 0.2792, "step": 171 }, { "epoch": 0.12903225806451613, "grad_norm": 0.34246256947517395, "learning_rate": 6.441947565543071e-05, "loss": 0.3607, "step": 172 }, { "epoch": 0.12978244561140284, "grad_norm": 0.3287302553653717, "learning_rate": 6.479400749063671e-05, "loss": 0.3235, "step": 173 }, { "epoch": 0.13053263315828958, "grad_norm": 0.24843423068523407, "learning_rate": 6.51685393258427e-05, "loss": 0.2526, "step": 174 }, { "epoch": 0.1312828207051763, "grad_norm": 0.2889952063560486, "learning_rate": 6.55430711610487e-05, "loss": 0.2938, "step": 175 }, { "epoch": 0.13203300825206302, "grad_norm": 0.26350104808807373, "learning_rate": 6.591760299625468e-05, "loss": 0.2327, "step": 176 }, { "epoch": 0.13278319579894973, "grad_norm": 0.26324084401130676, "learning_rate": 6.629213483146067e-05, "loss": 0.3035, "step": 177 }, { "epoch": 0.13353338334583645, "grad_norm": 0.31495001912117004, "learning_rate": 6.666666666666667e-05, "loss": 0.3379, "step": 178 }, { "epoch": 0.1342835708927232, "grad_norm": 0.2633056938648224, "learning_rate": 6.704119850187266e-05, "loss": 0.2981, "step": 179 }, { "epoch": 0.1350337584396099, "grad_norm": 0.34318479895591736, "learning_rate": 6.741573033707866e-05, "loss": 0.3598, "step": 180 }, { "epoch": 0.13578394598649662, "grad_norm": 0.3447207808494568, "learning_rate": 6.779026217228464e-05, "loss": 0.3589, "step": 181 }, { "epoch": 0.13653413353338334, "grad_norm": 0.3212548792362213, "learning_rate": 6.816479400749064e-05, "loss": 0.3215, "step": 182 }, { "epoch": 0.13728432108027006, "grad_norm": 0.31946229934692383, "learning_rate": 6.853932584269663e-05, "loss": 0.2843, "step": 183 }, { "epoch": 0.1380345086271568, "grad_norm": 0.35676705837249756, "learning_rate": 6.891385767790263e-05, "loss": 0.3139, "step": 184 }, { "epoch": 0.13878469617404351, "grad_norm": 0.307155966758728, "learning_rate": 6.928838951310862e-05, "loss": 0.2522, "step": 185 }, { "epoch": 0.13953488372093023, "grad_norm": 0.23999446630477905, "learning_rate": 6.966292134831462e-05, "loss": 0.271, "step": 186 }, { "epoch": 0.14028507126781695, "grad_norm": 0.20766986906528473, "learning_rate": 7.003745318352061e-05, "loss": 0.1641, "step": 187 }, { "epoch": 0.14103525881470366, "grad_norm": 0.3591254651546478, "learning_rate": 7.04119850187266e-05, "loss": 0.3971, "step": 188 }, { "epoch": 0.1417854463615904, "grad_norm": 0.27526983618736267, "learning_rate": 7.078651685393259e-05, "loss": 0.2737, "step": 189 }, { "epoch": 0.14253563390847712, "grad_norm": 0.3869401216506958, "learning_rate": 7.116104868913858e-05, "loss": 0.3669, "step": 190 }, { "epoch": 0.14328582145536384, "grad_norm": 0.3631754219532013, "learning_rate": 7.153558052434456e-05, "loss": 0.3376, "step": 191 }, { "epoch": 0.14403600900225055, "grad_norm": 0.2576736807823181, "learning_rate": 7.191011235955056e-05, "loss": 0.3803, "step": 192 }, { "epoch": 0.1447861965491373, "grad_norm": 0.2559492588043213, "learning_rate": 7.228464419475655e-05, "loss": 0.2486, "step": 193 }, { "epoch": 0.145536384096024, "grad_norm": 0.3264792263507843, "learning_rate": 7.265917602996255e-05, "loss": 0.3883, "step": 194 }, { "epoch": 0.14628657164291073, "grad_norm": 0.2363443374633789, "learning_rate": 7.303370786516854e-05, "loss": 0.3333, "step": 195 }, { "epoch": 0.14703675918979744, "grad_norm": 0.27175259590148926, "learning_rate": 7.340823970037454e-05, "loss": 0.2989, "step": 196 }, { "epoch": 0.14778694673668416, "grad_norm": 0.21516774594783783, "learning_rate": 7.378277153558053e-05, "loss": 0.2439, "step": 197 }, { "epoch": 0.1485371342835709, "grad_norm": 0.433608740568161, "learning_rate": 7.415730337078653e-05, "loss": 0.2982, "step": 198 }, { "epoch": 0.14928732183045762, "grad_norm": 0.3271982669830322, "learning_rate": 7.453183520599252e-05, "loss": 0.3294, "step": 199 }, { "epoch": 0.15003750937734434, "grad_norm": 0.30376243591308594, "learning_rate": 7.49063670411985e-05, "loss": 0.3648, "step": 200 }, { "epoch": 0.15003750937734434, "eval_loss": 0.3126641809940338, "eval_runtime": 8.9312, "eval_samples_per_second": 6.046, "eval_steps_per_second": 1.568, "step": 200 }, { "epoch": 0.15078769692423105, "grad_norm": 0.23975402116775513, "learning_rate": 7.52808988764045e-05, "loss": 0.2743, "step": 201 }, { "epoch": 0.15153788447111777, "grad_norm": 0.36072632670402527, "learning_rate": 7.565543071161048e-05, "loss": 0.2869, "step": 202 }, { "epoch": 0.1522880720180045, "grad_norm": 0.26643139123916626, "learning_rate": 7.602996254681648e-05, "loss": 0.2488, "step": 203 }, { "epoch": 0.15303825956489123, "grad_norm": 0.37966683506965637, "learning_rate": 7.640449438202247e-05, "loss": 0.3597, "step": 204 }, { "epoch": 0.15378844711177794, "grad_norm": 0.31449559330940247, "learning_rate": 7.677902621722847e-05, "loss": 0.2953, "step": 205 }, { "epoch": 0.15453863465866466, "grad_norm": 0.23182997107505798, "learning_rate": 7.715355805243446e-05, "loss": 0.2057, "step": 206 }, { "epoch": 0.15528882220555137, "grad_norm": 0.27765166759490967, "learning_rate": 7.752808988764046e-05, "loss": 0.2507, "step": 207 }, { "epoch": 0.15603900975243812, "grad_norm": 0.2892444431781769, "learning_rate": 7.790262172284645e-05, "loss": 0.3173, "step": 208 }, { "epoch": 0.15678919729932483, "grad_norm": 0.2923356890678406, "learning_rate": 7.827715355805245e-05, "loss": 0.3063, "step": 209 }, { "epoch": 0.15753938484621155, "grad_norm": 0.24453075230121613, "learning_rate": 7.865168539325843e-05, "loss": 0.2534, "step": 210 }, { "epoch": 0.15828957239309827, "grad_norm": 0.3415100872516632, "learning_rate": 7.902621722846442e-05, "loss": 0.397, "step": 211 }, { "epoch": 0.159039759939985, "grad_norm": 0.29072514176368713, "learning_rate": 7.940074906367042e-05, "loss": 0.342, "step": 212 }, { "epoch": 0.15978994748687173, "grad_norm": 0.29593101143836975, "learning_rate": 7.97752808988764e-05, "loss": 0.3607, "step": 213 }, { "epoch": 0.16054013503375844, "grad_norm": 0.23408815264701843, "learning_rate": 8.01498127340824e-05, "loss": 0.235, "step": 214 }, { "epoch": 0.16129032258064516, "grad_norm": 0.26164984703063965, "learning_rate": 8.052434456928839e-05, "loss": 0.2432, "step": 215 }, { "epoch": 0.16204051012753187, "grad_norm": 0.322074294090271, "learning_rate": 8.089887640449438e-05, "loss": 0.2905, "step": 216 }, { "epoch": 0.16279069767441862, "grad_norm": 0.2085808366537094, "learning_rate": 8.127340823970038e-05, "loss": 0.2681, "step": 217 }, { "epoch": 0.16354088522130533, "grad_norm": 0.3317839503288269, "learning_rate": 8.164794007490637e-05, "loss": 0.2513, "step": 218 }, { "epoch": 0.16429107276819205, "grad_norm": 0.45130306482315063, "learning_rate": 8.202247191011237e-05, "loss": 0.3101, "step": 219 }, { "epoch": 0.16504126031507876, "grad_norm": 0.40590327978134155, "learning_rate": 8.239700374531836e-05, "loss": 0.2936, "step": 220 }, { "epoch": 0.16579144786196548, "grad_norm": 0.27543216943740845, "learning_rate": 8.277153558052434e-05, "loss": 0.2582, "step": 221 }, { "epoch": 0.16654163540885222, "grad_norm": 0.3400556743144989, "learning_rate": 8.314606741573034e-05, "loss": 0.2716, "step": 222 }, { "epoch": 0.16729182295573894, "grad_norm": 0.2772960364818573, "learning_rate": 8.352059925093633e-05, "loss": 0.3182, "step": 223 }, { "epoch": 0.16804201050262565, "grad_norm": 0.31811845302581787, "learning_rate": 8.389513108614233e-05, "loss": 0.3069, "step": 224 }, { "epoch": 0.16879219804951237, "grad_norm": 0.32427364587783813, "learning_rate": 8.426966292134831e-05, "loss": 0.2887, "step": 225 }, { "epoch": 0.1695423855963991, "grad_norm": 0.399421751499176, "learning_rate": 8.46441947565543e-05, "loss": 0.3242, "step": 226 }, { "epoch": 0.17029257314328583, "grad_norm": 0.3481862246990204, "learning_rate": 8.50187265917603e-05, "loss": 0.3179, "step": 227 }, { "epoch": 0.17104276069017255, "grad_norm": 0.2746341824531555, "learning_rate": 8.53932584269663e-05, "loss": 0.3096, "step": 228 }, { "epoch": 0.17179294823705926, "grad_norm": 0.2853356897830963, "learning_rate": 8.576779026217229e-05, "loss": 0.3036, "step": 229 }, { "epoch": 0.17254313578394598, "grad_norm": 0.24907056987285614, "learning_rate": 8.614232209737829e-05, "loss": 0.2902, "step": 230 }, { "epoch": 0.17329332333083272, "grad_norm": 0.321097731590271, "learning_rate": 8.651685393258427e-05, "loss": 0.313, "step": 231 }, { "epoch": 0.17404351087771944, "grad_norm": 0.30011144280433655, "learning_rate": 8.689138576779026e-05, "loss": 0.2857, "step": 232 }, { "epoch": 0.17479369842460615, "grad_norm": 0.3504459261894226, "learning_rate": 8.726591760299626e-05, "loss": 0.3063, "step": 233 }, { "epoch": 0.17554388597149287, "grad_norm": 0.3415544629096985, "learning_rate": 8.764044943820225e-05, "loss": 0.297, "step": 234 }, { "epoch": 0.17629407351837958, "grad_norm": 0.3717586398124695, "learning_rate": 8.801498127340825e-05, "loss": 0.2724, "step": 235 }, { "epoch": 0.17704426106526633, "grad_norm": 0.2720591127872467, "learning_rate": 8.838951310861424e-05, "loss": 0.2477, "step": 236 }, { "epoch": 0.17779444861215304, "grad_norm": 0.34325769543647766, "learning_rate": 8.876404494382022e-05, "loss": 0.3138, "step": 237 }, { "epoch": 0.17854463615903976, "grad_norm": 0.2513119578361511, "learning_rate": 8.913857677902622e-05, "loss": 0.2744, "step": 238 }, { "epoch": 0.17929482370592648, "grad_norm": 0.3230467736721039, "learning_rate": 8.951310861423221e-05, "loss": 0.32, "step": 239 }, { "epoch": 0.1800450112528132, "grad_norm": 0.31532153487205505, "learning_rate": 8.988764044943821e-05, "loss": 0.2679, "step": 240 }, { "epoch": 0.18079519879969994, "grad_norm": 0.4205916225910187, "learning_rate": 9.02621722846442e-05, "loss": 0.3462, "step": 241 }, { "epoch": 0.18154538634658665, "grad_norm": 0.28419923782348633, "learning_rate": 9.063670411985018e-05, "loss": 0.268, "step": 242 }, { "epoch": 0.18229557389347337, "grad_norm": 0.35711121559143066, "learning_rate": 9.101123595505618e-05, "loss": 0.316, "step": 243 }, { "epoch": 0.18304576144036008, "grad_norm": 0.35523149371147156, "learning_rate": 9.138576779026217e-05, "loss": 0.2264, "step": 244 }, { "epoch": 0.1837959489872468, "grad_norm": 0.2740556299686432, "learning_rate": 9.176029962546817e-05, "loss": 0.2663, "step": 245 }, { "epoch": 0.18454613653413354, "grad_norm": 0.2906612455844879, "learning_rate": 9.213483146067416e-05, "loss": 0.3182, "step": 246 }, { "epoch": 0.18529632408102026, "grad_norm": 0.31456026434898376, "learning_rate": 9.250936329588016e-05, "loss": 0.2812, "step": 247 }, { "epoch": 0.18604651162790697, "grad_norm": 0.2652457356452942, "learning_rate": 9.288389513108615e-05, "loss": 0.2706, "step": 248 }, { "epoch": 0.1867966991747937, "grad_norm": 0.3360106348991394, "learning_rate": 9.325842696629214e-05, "loss": 0.2826, "step": 249 }, { "epoch": 0.18754688672168043, "grad_norm": 0.26761558651924133, "learning_rate": 9.363295880149813e-05, "loss": 0.2585, "step": 250 }, { "epoch": 0.18829707426856715, "grad_norm": 0.3208111524581909, "learning_rate": 9.400749063670413e-05, "loss": 0.2462, "step": 251 }, { "epoch": 0.18904726181545387, "grad_norm": 0.28124290704727173, "learning_rate": 9.438202247191012e-05, "loss": 0.249, "step": 252 }, { "epoch": 0.18979744936234058, "grad_norm": 0.31529223918914795, "learning_rate": 9.47565543071161e-05, "loss": 0.259, "step": 253 }, { "epoch": 0.1905476369092273, "grad_norm": 0.26012852787971497, "learning_rate": 9.51310861423221e-05, "loss": 0.2302, "step": 254 }, { "epoch": 0.19129782445611404, "grad_norm": 0.323860764503479, "learning_rate": 9.550561797752809e-05, "loss": 0.3243, "step": 255 }, { "epoch": 0.19204801200300076, "grad_norm": 0.3862539529800415, "learning_rate": 9.588014981273409e-05, "loss": 0.3169, "step": 256 }, { "epoch": 0.19279819954988747, "grad_norm": 0.2932683825492859, "learning_rate": 9.625468164794008e-05, "loss": 0.2573, "step": 257 }, { "epoch": 0.1935483870967742, "grad_norm": 0.2936646342277527, "learning_rate": 9.662921348314608e-05, "loss": 0.2402, "step": 258 }, { "epoch": 0.1942985746436609, "grad_norm": 0.3209379315376282, "learning_rate": 9.700374531835207e-05, "loss": 0.2635, "step": 259 }, { "epoch": 0.19504876219054765, "grad_norm": 0.41373732686042786, "learning_rate": 9.737827715355807e-05, "loss": 0.3145, "step": 260 }, { "epoch": 0.19579894973743436, "grad_norm": 0.3355521857738495, "learning_rate": 9.775280898876405e-05, "loss": 0.3388, "step": 261 }, { "epoch": 0.19654913728432108, "grad_norm": 0.27347877621650696, "learning_rate": 9.812734082397004e-05, "loss": 0.2475, "step": 262 }, { "epoch": 0.1972993248312078, "grad_norm": 0.3057049512863159, "learning_rate": 9.850187265917602e-05, "loss": 0.283, "step": 263 }, { "epoch": 0.1980495123780945, "grad_norm": 0.43207481503486633, "learning_rate": 9.887640449438202e-05, "loss": 0.3064, "step": 264 }, { "epoch": 0.19879969992498125, "grad_norm": 0.36544108390808105, "learning_rate": 9.925093632958801e-05, "loss": 0.3395, "step": 265 }, { "epoch": 0.19954988747186797, "grad_norm": 0.2667524218559265, "learning_rate": 9.962546816479401e-05, "loss": 0.2374, "step": 266 }, { "epoch": 0.2003000750187547, "grad_norm": 0.36456984281539917, "learning_rate": 0.0001, "loss": 0.3246, "step": 267 }, { "epoch": 0.2010502625656414, "grad_norm": 0.27113181352615356, "learning_rate": 9.99999571274618e-05, "loss": 0.1872, "step": 268 }, { "epoch": 0.20180045011252815, "grad_norm": 0.3271729350090027, "learning_rate": 9.999982850992069e-05, "loss": 0.2508, "step": 269 }, { "epoch": 0.20255063765941486, "grad_norm": 0.3091783821582794, "learning_rate": 9.999961414759727e-05, "loss": 0.3442, "step": 270 }, { "epoch": 0.20330082520630158, "grad_norm": 0.30331188440322876, "learning_rate": 9.999931404085912e-05, "loss": 0.3164, "step": 271 }, { "epoch": 0.2040510127531883, "grad_norm": 0.29597944021224976, "learning_rate": 9.999892819022092e-05, "loss": 0.3533, "step": 272 }, { "epoch": 0.204801200300075, "grad_norm": 0.2322920262813568, "learning_rate": 9.999845659634435e-05, "loss": 0.2596, "step": 273 }, { "epoch": 0.20555138784696175, "grad_norm": 0.33546024560928345, "learning_rate": 9.999789926003814e-05, "loss": 0.2787, "step": 274 }, { "epoch": 0.20630157539384847, "grad_norm": 0.20232991874217987, "learning_rate": 9.999725618225808e-05, "loss": 0.2823, "step": 275 }, { "epoch": 0.20705176294073518, "grad_norm": 0.31621062755584717, "learning_rate": 9.999652736410698e-05, "loss": 0.2587, "step": 276 }, { "epoch": 0.2078019504876219, "grad_norm": 0.3425423800945282, "learning_rate": 9.999571280683468e-05, "loss": 0.3626, "step": 277 }, { "epoch": 0.20855213803450862, "grad_norm": 0.26360300183296204, "learning_rate": 9.99948125118381e-05, "loss": 0.2956, "step": 278 }, { "epoch": 0.20930232558139536, "grad_norm": 0.26937419176101685, "learning_rate": 9.999382648066113e-05, "loss": 0.2999, "step": 279 }, { "epoch": 0.21005251312828208, "grad_norm": 0.30764681100845337, "learning_rate": 9.999275471499472e-05, "loss": 0.2912, "step": 280 }, { "epoch": 0.2108027006751688, "grad_norm": 0.31329846382141113, "learning_rate": 9.999159721667685e-05, "loss": 0.29, "step": 281 }, { "epoch": 0.2115528882220555, "grad_norm": 0.3319954574108124, "learning_rate": 9.999035398769252e-05, "loss": 0.3168, "step": 282 }, { "epoch": 0.21230307576894222, "grad_norm": 0.3386693298816681, "learning_rate": 9.998902503017372e-05, "loss": 0.3592, "step": 283 }, { "epoch": 0.21305326331582897, "grad_norm": 0.3803078234195709, "learning_rate": 9.99876103463995e-05, "loss": 0.2912, "step": 284 }, { "epoch": 0.21380345086271568, "grad_norm": 0.3271186649799347, "learning_rate": 9.998610993879589e-05, "loss": 0.2903, "step": 285 }, { "epoch": 0.2145536384096024, "grad_norm": 0.3750841021537781, "learning_rate": 9.998452380993597e-05, "loss": 0.258, "step": 286 }, { "epoch": 0.21530382595648911, "grad_norm": 0.3441598415374756, "learning_rate": 9.998285196253977e-05, "loss": 0.2194, "step": 287 }, { "epoch": 0.21605401350337583, "grad_norm": 0.3111169636249542, "learning_rate": 9.998109439947434e-05, "loss": 0.2364, "step": 288 }, { "epoch": 0.21680420105026257, "grad_norm": 0.2794364094734192, "learning_rate": 9.997925112375375e-05, "loss": 0.2587, "step": 289 }, { "epoch": 0.2175543885971493, "grad_norm": 0.3304993212223053, "learning_rate": 9.997732213853902e-05, "loss": 0.3041, "step": 290 }, { "epoch": 0.218304576144036, "grad_norm": 0.37727826833724976, "learning_rate": 9.997530744713817e-05, "loss": 0.3522, "step": 291 }, { "epoch": 0.21905476369092272, "grad_norm": 0.2943536937236786, "learning_rate": 9.997320705300621e-05, "loss": 0.3347, "step": 292 }, { "epoch": 0.21980495123780946, "grad_norm": 0.36034390330314636, "learning_rate": 9.997102095974508e-05, "loss": 0.3346, "step": 293 }, { "epoch": 0.22055513878469618, "grad_norm": 0.29582419991493225, "learning_rate": 9.996874917110378e-05, "loss": 0.1816, "step": 294 }, { "epoch": 0.2213053263315829, "grad_norm": 0.4059109687805176, "learning_rate": 9.996639169097811e-05, "loss": 0.3228, "step": 295 }, { "epoch": 0.2220555138784696, "grad_norm": 0.4772152900695801, "learning_rate": 9.996394852341098e-05, "loss": 0.4012, "step": 296 }, { "epoch": 0.22280570142535633, "grad_norm": 0.3214409053325653, "learning_rate": 9.996141967259218e-05, "loss": 0.2382, "step": 297 }, { "epoch": 0.22355588897224307, "grad_norm": 0.2654603123664856, "learning_rate": 9.995880514285841e-05, "loss": 0.2611, "step": 298 }, { "epoch": 0.2243060765191298, "grad_norm": 0.27516403794288635, "learning_rate": 9.995610493869336e-05, "loss": 0.2683, "step": 299 }, { "epoch": 0.2250562640660165, "grad_norm": 0.3601084053516388, "learning_rate": 9.99533190647276e-05, "loss": 0.3622, "step": 300 }, { "epoch": 0.22580645161290322, "grad_norm": 0.37099161744117737, "learning_rate": 9.995044752573864e-05, "loss": 0.2697, "step": 301 }, { "epoch": 0.22655663915978994, "grad_norm": 0.2741296589374542, "learning_rate": 9.994749032665085e-05, "loss": 0.2926, "step": 302 }, { "epoch": 0.22730682670667668, "grad_norm": 0.40920963883399963, "learning_rate": 9.994444747253559e-05, "loss": 0.2704, "step": 303 }, { "epoch": 0.2280570142535634, "grad_norm": 0.2439938336610794, "learning_rate": 9.9941318968611e-05, "loss": 0.185, "step": 304 }, { "epoch": 0.2288072018004501, "grad_norm": 0.3881830871105194, "learning_rate": 9.993810482024221e-05, "loss": 0.3389, "step": 305 }, { "epoch": 0.22955738934733683, "grad_norm": 0.3043234646320343, "learning_rate": 9.993480503294114e-05, "loss": 0.2681, "step": 306 }, { "epoch": 0.23030757689422354, "grad_norm": 0.2616919279098511, "learning_rate": 9.993141961236661e-05, "loss": 0.2934, "step": 307 }, { "epoch": 0.23105776444111029, "grad_norm": 0.30381616950035095, "learning_rate": 9.992794856432426e-05, "loss": 0.2703, "step": 308 }, { "epoch": 0.231807951987997, "grad_norm": 0.2548627257347107, "learning_rate": 9.992439189476661e-05, "loss": 0.3137, "step": 309 }, { "epoch": 0.23255813953488372, "grad_norm": 0.3002263307571411, "learning_rate": 9.992074960979301e-05, "loss": 0.274, "step": 310 }, { "epoch": 0.23330832708177043, "grad_norm": 0.2693195343017578, "learning_rate": 9.991702171564961e-05, "loss": 0.341, "step": 311 }, { "epoch": 0.23405851462865718, "grad_norm": 0.2616502642631531, "learning_rate": 9.991320821872939e-05, "loss": 0.2879, "step": 312 }, { "epoch": 0.2348087021755439, "grad_norm": 0.30761799216270447, "learning_rate": 9.990930912557209e-05, "loss": 0.2341, "step": 313 }, { "epoch": 0.2355588897224306, "grad_norm": 0.2792080044746399, "learning_rate": 9.990532444286431e-05, "loss": 0.2587, "step": 314 }, { "epoch": 0.23630907726931732, "grad_norm": 0.30447396636009216, "learning_rate": 9.990125417743937e-05, "loss": 0.306, "step": 315 }, { "epoch": 0.23705926481620404, "grad_norm": 0.30313432216644287, "learning_rate": 9.989709833627736e-05, "loss": 0.2169, "step": 316 }, { "epoch": 0.23780945236309078, "grad_norm": 0.3800298273563385, "learning_rate": 9.989285692650518e-05, "loss": 0.3239, "step": 317 }, { "epoch": 0.2385596399099775, "grad_norm": 0.4133218824863434, "learning_rate": 9.98885299553964e-05, "loss": 0.3734, "step": 318 }, { "epoch": 0.23930982745686422, "grad_norm": 0.23686738312244415, "learning_rate": 9.988411743037134e-05, "loss": 0.245, "step": 319 }, { "epoch": 0.24006001500375093, "grad_norm": 0.38337042927742004, "learning_rate": 9.987961935899706e-05, "loss": 0.3188, "step": 320 }, { "epoch": 0.24081020255063765, "grad_norm": 0.41182252764701843, "learning_rate": 9.987503574898731e-05, "loss": 0.2844, "step": 321 }, { "epoch": 0.2415603900975244, "grad_norm": 0.27099326252937317, "learning_rate": 9.987036660820255e-05, "loss": 0.2605, "step": 322 }, { "epoch": 0.2423105776444111, "grad_norm": 0.2866607904434204, "learning_rate": 9.986561194464985e-05, "loss": 0.2707, "step": 323 }, { "epoch": 0.24306076519129782, "grad_norm": 0.264517217874527, "learning_rate": 9.986077176648303e-05, "loss": 0.2779, "step": 324 }, { "epoch": 0.24381095273818454, "grad_norm": 0.28065624833106995, "learning_rate": 9.985584608200251e-05, "loss": 0.3724, "step": 325 }, { "epoch": 0.24456114028507125, "grad_norm": 0.24853992462158203, "learning_rate": 9.985083489965534e-05, "loss": 0.2526, "step": 326 }, { "epoch": 0.245311327831958, "grad_norm": 0.2943599820137024, "learning_rate": 9.984573822803521e-05, "loss": 0.2047, "step": 327 }, { "epoch": 0.24606151537884471, "grad_norm": 0.36511868238449097, "learning_rate": 9.984055607588242e-05, "loss": 0.3295, "step": 328 }, { "epoch": 0.24681170292573143, "grad_norm": 0.28034162521362305, "learning_rate": 9.983528845208384e-05, "loss": 0.2348, "step": 329 }, { "epoch": 0.24756189047261815, "grad_norm": 0.30771222710609436, "learning_rate": 9.982993536567293e-05, "loss": 0.2384, "step": 330 }, { "epoch": 0.2483120780195049, "grad_norm": 0.3448368012905121, "learning_rate": 9.98244968258297e-05, "loss": 0.2746, "step": 331 }, { "epoch": 0.2490622655663916, "grad_norm": 0.27195751667022705, "learning_rate": 9.981897284188073e-05, "loss": 0.2924, "step": 332 }, { "epoch": 0.24981245311327832, "grad_norm": 0.2395554482936859, "learning_rate": 9.981336342329909e-05, "loss": 0.1978, "step": 333 }, { "epoch": 0.25056264066016504, "grad_norm": 0.3751216232776642, "learning_rate": 9.980766857970438e-05, "loss": 0.3081, "step": 334 }, { "epoch": 0.25131282820705175, "grad_norm": 0.4142260253429413, "learning_rate": 9.98018883208627e-05, "loss": 0.2663, "step": 335 }, { "epoch": 0.25206301575393847, "grad_norm": 0.17451901733875275, "learning_rate": 9.979602265668664e-05, "loss": 0.2235, "step": 336 }, { "epoch": 0.2528132033008252, "grad_norm": 0.2726498246192932, "learning_rate": 9.979007159723521e-05, "loss": 0.2676, "step": 337 }, { "epoch": 0.2535633908477119, "grad_norm": 0.2192569226026535, "learning_rate": 9.97840351527139e-05, "loss": 0.2391, "step": 338 }, { "epoch": 0.25431357839459867, "grad_norm": 0.34619376063346863, "learning_rate": 9.977791333347462e-05, "loss": 0.2532, "step": 339 }, { "epoch": 0.2550637659414854, "grad_norm": 0.3849112093448639, "learning_rate": 9.97717061500157e-05, "loss": 0.3341, "step": 340 }, { "epoch": 0.2558139534883721, "grad_norm": 0.20521502196788788, "learning_rate": 9.976541361298184e-05, "loss": 0.2555, "step": 341 }, { "epoch": 0.2565641410352588, "grad_norm": 0.34699925780296326, "learning_rate": 9.97590357331641e-05, "loss": 0.3168, "step": 342 }, { "epoch": 0.25731432858214554, "grad_norm": 0.3625432848930359, "learning_rate": 9.975257252149994e-05, "loss": 0.283, "step": 343 }, { "epoch": 0.25806451612903225, "grad_norm": 0.3631323575973511, "learning_rate": 9.974602398907313e-05, "loss": 0.2613, "step": 344 }, { "epoch": 0.25881470367591897, "grad_norm": 0.3348166048526764, "learning_rate": 9.973939014711375e-05, "loss": 0.2505, "step": 345 }, { "epoch": 0.2595648912228057, "grad_norm": 0.315660685300827, "learning_rate": 9.973267100699819e-05, "loss": 0.2148, "step": 346 }, { "epoch": 0.2603150787696924, "grad_norm": 0.33237096667289734, "learning_rate": 9.972586658024911e-05, "loss": 0.3652, "step": 347 }, { "epoch": 0.26106526631657917, "grad_norm": 0.25689029693603516, "learning_rate": 9.971897687853544e-05, "loss": 0.2439, "step": 348 }, { "epoch": 0.2618154538634659, "grad_norm": 0.3529447317123413, "learning_rate": 9.971200191367234e-05, "loss": 0.2913, "step": 349 }, { "epoch": 0.2625656414103526, "grad_norm": 0.35498178005218506, "learning_rate": 9.970494169762117e-05, "loss": 0.2625, "step": 350 }, { "epoch": 0.2633158289572393, "grad_norm": 0.36856216192245483, "learning_rate": 9.969779624248954e-05, "loss": 0.3403, "step": 351 }, { "epoch": 0.26406601650412603, "grad_norm": 0.3177325427532196, "learning_rate": 9.969056556053116e-05, "loss": 0.3391, "step": 352 }, { "epoch": 0.26481620405101275, "grad_norm": 0.30229535698890686, "learning_rate": 9.968324966414597e-05, "loss": 0.4226, "step": 353 }, { "epoch": 0.26556639159789946, "grad_norm": 0.41339901089668274, "learning_rate": 9.967584856588e-05, "loss": 0.3964, "step": 354 }, { "epoch": 0.2663165791447862, "grad_norm": 0.2893449068069458, "learning_rate": 9.966836227842538e-05, "loss": 0.2949, "step": 355 }, { "epoch": 0.2670667666916729, "grad_norm": 0.30889108777046204, "learning_rate": 9.96607908146204e-05, "loss": 0.2402, "step": 356 }, { "epoch": 0.2678169542385596, "grad_norm": 0.23413531482219696, "learning_rate": 9.965313418744935e-05, "loss": 0.222, "step": 357 }, { "epoch": 0.2685671417854464, "grad_norm": 0.22721260786056519, "learning_rate": 9.964539241004261e-05, "loss": 0.2022, "step": 358 }, { "epoch": 0.2693173293323331, "grad_norm": 0.30268383026123047, "learning_rate": 9.963756549567654e-05, "loss": 0.3403, "step": 359 }, { "epoch": 0.2700675168792198, "grad_norm": 0.3824634253978729, "learning_rate": 9.962965345777353e-05, "loss": 0.311, "step": 360 }, { "epoch": 0.27081770442610653, "grad_norm": 0.2816515564918518, "learning_rate": 9.962165630990196e-05, "loss": 0.2851, "step": 361 }, { "epoch": 0.27156789197299325, "grad_norm": 0.28467679023742676, "learning_rate": 9.961357406577617e-05, "loss": 0.2529, "step": 362 }, { "epoch": 0.27231807951987996, "grad_norm": 0.2800656259059906, "learning_rate": 9.960540673925636e-05, "loss": 0.2387, "step": 363 }, { "epoch": 0.2730682670667667, "grad_norm": 0.3433350622653961, "learning_rate": 9.959715434434873e-05, "loss": 0.2902, "step": 364 }, { "epoch": 0.2738184546136534, "grad_norm": 0.33678990602493286, "learning_rate": 9.958881689520531e-05, "loss": 0.334, "step": 365 }, { "epoch": 0.2745686421605401, "grad_norm": 0.31413450837135315, "learning_rate": 9.958039440612402e-05, "loss": 0.2935, "step": 366 }, { "epoch": 0.2753188297074269, "grad_norm": 0.35551297664642334, "learning_rate": 9.957188689154859e-05, "loss": 0.3206, "step": 367 }, { "epoch": 0.2760690172543136, "grad_norm": 0.2744191884994507, "learning_rate": 9.956329436606857e-05, "loss": 0.2544, "step": 368 }, { "epoch": 0.2768192048012003, "grad_norm": 0.3117627799510956, "learning_rate": 9.955461684441928e-05, "loss": 0.2869, "step": 369 }, { "epoch": 0.27756939234808703, "grad_norm": 0.37252742052078247, "learning_rate": 9.954585434148183e-05, "loss": 0.2948, "step": 370 }, { "epoch": 0.27831957989497375, "grad_norm": 0.34387722611427307, "learning_rate": 9.953700687228306e-05, "loss": 0.2929, "step": 371 }, { "epoch": 0.27906976744186046, "grad_norm": 0.22328947484493256, "learning_rate": 9.952807445199549e-05, "loss": 0.2665, "step": 372 }, { "epoch": 0.2798199549887472, "grad_norm": 0.2621975243091583, "learning_rate": 9.951905709593735e-05, "loss": 0.2878, "step": 373 }, { "epoch": 0.2805701425356339, "grad_norm": 0.30426445603370667, "learning_rate": 9.950995481957251e-05, "loss": 0.3188, "step": 374 }, { "epoch": 0.2813203300825206, "grad_norm": 0.2481190413236618, "learning_rate": 9.950076763851049e-05, "loss": 0.2091, "step": 375 }, { "epoch": 0.2820705176294073, "grad_norm": 0.3240612745285034, "learning_rate": 9.949149556850638e-05, "loss": 0.3141, "step": 376 }, { "epoch": 0.2828207051762941, "grad_norm": 0.31994742155075073, "learning_rate": 9.94821386254609e-05, "loss": 0.2054, "step": 377 }, { "epoch": 0.2835708927231808, "grad_norm": 0.2470906674861908, "learning_rate": 9.947269682542027e-05, "loss": 0.2656, "step": 378 }, { "epoch": 0.2843210802700675, "grad_norm": 0.2776731252670288, "learning_rate": 9.946317018457622e-05, "loss": 0.3121, "step": 379 }, { "epoch": 0.28507126781695424, "grad_norm": 0.31275856494903564, "learning_rate": 9.945355871926605e-05, "loss": 0.3361, "step": 380 }, { "epoch": 0.28582145536384096, "grad_norm": 0.28119826316833496, "learning_rate": 9.944386244597244e-05, "loss": 0.4173, "step": 381 }, { "epoch": 0.2865716429107277, "grad_norm": 0.3146207332611084, "learning_rate": 9.943408138132357e-05, "loss": 0.2468, "step": 382 }, { "epoch": 0.2873218304576144, "grad_norm": 0.23670701682567596, "learning_rate": 9.942421554209297e-05, "loss": 0.2717, "step": 383 }, { "epoch": 0.2880720180045011, "grad_norm": 0.33597809076309204, "learning_rate": 9.94142649451996e-05, "loss": 0.2864, "step": 384 }, { "epoch": 0.2888222055513878, "grad_norm": 0.28663918375968933, "learning_rate": 9.940422960770776e-05, "loss": 0.4193, "step": 385 }, { "epoch": 0.2895723930982746, "grad_norm": 0.32313409447669983, "learning_rate": 9.939410954682706e-05, "loss": 0.2815, "step": 386 }, { "epoch": 0.2903225806451613, "grad_norm": 0.30829834938049316, "learning_rate": 9.938390477991242e-05, "loss": 0.2444, "step": 387 }, { "epoch": 0.291072768192048, "grad_norm": 0.28018006682395935, "learning_rate": 9.937361532446399e-05, "loss": 0.2943, "step": 388 }, { "epoch": 0.29182295573893474, "grad_norm": 0.3178389072418213, "learning_rate": 9.936324119812719e-05, "loss": 0.2805, "step": 389 }, { "epoch": 0.29257314328582146, "grad_norm": 0.3474150598049164, "learning_rate": 9.93527824186926e-05, "loss": 0.3129, "step": 390 }, { "epoch": 0.2933233308327082, "grad_norm": 0.33827823400497437, "learning_rate": 9.934223900409603e-05, "loss": 0.3013, "step": 391 }, { "epoch": 0.2940735183795949, "grad_norm": 0.31353774666786194, "learning_rate": 9.933161097241837e-05, "loss": 0.1782, "step": 392 }, { "epoch": 0.2948237059264816, "grad_norm": 0.32364851236343384, "learning_rate": 9.932089834188567e-05, "loss": 0.2784, "step": 393 }, { "epoch": 0.2955738934733683, "grad_norm": 0.30517297983169556, "learning_rate": 9.931010113086902e-05, "loss": 0.2847, "step": 394 }, { "epoch": 0.29632408102025504, "grad_norm": 0.3241008520126343, "learning_rate": 9.929921935788457e-05, "loss": 0.2292, "step": 395 }, { "epoch": 0.2970742685671418, "grad_norm": 0.29124343395233154, "learning_rate": 9.928825304159351e-05, "loss": 0.2788, "step": 396 }, { "epoch": 0.2978244561140285, "grad_norm": 0.27112913131713867, "learning_rate": 9.927720220080199e-05, "loss": 0.2531, "step": 397 }, { "epoch": 0.29857464366091524, "grad_norm": 0.3055080473423004, "learning_rate": 9.926606685446109e-05, "loss": 0.2767, "step": 398 }, { "epoch": 0.29932483120780196, "grad_norm": 0.2561214566230774, "learning_rate": 9.925484702166686e-05, "loss": 0.2525, "step": 399 }, { "epoch": 0.30007501875468867, "grad_norm": 0.2637074291706085, "learning_rate": 9.924354272166017e-05, "loss": 0.2407, "step": 400 }, { "epoch": 0.30007501875468867, "eval_loss": 0.28551289439201355, "eval_runtime": 8.94, "eval_samples_per_second": 6.04, "eval_steps_per_second": 1.566, "step": 400 }, { "epoch": 0.3008252063015754, "grad_norm": 0.29666826128959656, "learning_rate": 9.923215397382684e-05, "loss": 0.2698, "step": 401 }, { "epoch": 0.3015753938484621, "grad_norm": 0.2682521939277649, "learning_rate": 9.92206807976974e-05, "loss": 0.1811, "step": 402 }, { "epoch": 0.3023255813953488, "grad_norm": 0.36996138095855713, "learning_rate": 9.920912321294723e-05, "loss": 0.3215, "step": 403 }, { "epoch": 0.30307576894223553, "grad_norm": 0.24857886135578156, "learning_rate": 9.919748123939647e-05, "loss": 0.2252, "step": 404 }, { "epoch": 0.3038259564891223, "grad_norm": 0.473545640707016, "learning_rate": 9.918575489700993e-05, "loss": 0.2713, "step": 405 }, { "epoch": 0.304576144036009, "grad_norm": 0.36468034982681274, "learning_rate": 9.917394420589716e-05, "loss": 0.2275, "step": 406 }, { "epoch": 0.30532633158289574, "grad_norm": 0.2764924168586731, "learning_rate": 9.916204918631231e-05, "loss": 0.199, "step": 407 }, { "epoch": 0.30607651912978245, "grad_norm": 0.2723712921142578, "learning_rate": 9.915006985865416e-05, "loss": 0.2131, "step": 408 }, { "epoch": 0.30682670667666917, "grad_norm": 0.25162285566329956, "learning_rate": 9.913800624346612e-05, "loss": 0.2137, "step": 409 }, { "epoch": 0.3075768942235559, "grad_norm": 0.41007739305496216, "learning_rate": 9.912585836143606e-05, "loss": 0.3514, "step": 410 }, { "epoch": 0.3083270817704426, "grad_norm": 0.321746826171875, "learning_rate": 9.911362623339642e-05, "loss": 0.2447, "step": 411 }, { "epoch": 0.3090772693173293, "grad_norm": 0.34145185351371765, "learning_rate": 9.91013098803241e-05, "loss": 0.3194, "step": 412 }, { "epoch": 0.30982745686421603, "grad_norm": 0.2433912605047226, "learning_rate": 9.908890932334042e-05, "loss": 0.2822, "step": 413 }, { "epoch": 0.31057764441110275, "grad_norm": 0.39996811747550964, "learning_rate": 9.907642458371111e-05, "loss": 0.2702, "step": 414 }, { "epoch": 0.3113278319579895, "grad_norm": 0.3486824929714203, "learning_rate": 9.906385568284629e-05, "loss": 0.3314, "step": 415 }, { "epoch": 0.31207801950487624, "grad_norm": 0.3654520511627197, "learning_rate": 9.905120264230036e-05, "loss": 0.3227, "step": 416 }, { "epoch": 0.31282820705176295, "grad_norm": 0.20507247745990753, "learning_rate": 9.903846548377206e-05, "loss": 0.2941, "step": 417 }, { "epoch": 0.31357839459864967, "grad_norm": 0.32427701354026794, "learning_rate": 9.902564422910436e-05, "loss": 0.252, "step": 418 }, { "epoch": 0.3143285821455364, "grad_norm": 0.34143608808517456, "learning_rate": 9.901273890028444e-05, "loss": 0.3127, "step": 419 }, { "epoch": 0.3150787696924231, "grad_norm": 0.30293047428131104, "learning_rate": 9.899974951944367e-05, "loss": 0.2907, "step": 420 }, { "epoch": 0.3158289572393098, "grad_norm": 0.4177364110946655, "learning_rate": 9.898667610885757e-05, "loss": 0.2647, "step": 421 }, { "epoch": 0.31657914478619653, "grad_norm": 0.26891201734542847, "learning_rate": 9.897351869094573e-05, "loss": 0.2736, "step": 422 }, { "epoch": 0.31732933233308325, "grad_norm": 0.2967749238014221, "learning_rate": 9.896027728827185e-05, "loss": 0.3033, "step": 423 }, { "epoch": 0.31807951987997, "grad_norm": 0.3020254373550415, "learning_rate": 9.894695192354362e-05, "loss": 0.2248, "step": 424 }, { "epoch": 0.31882970742685673, "grad_norm": 0.40887290239334106, "learning_rate": 9.893354261961274e-05, "loss": 0.3493, "step": 425 }, { "epoch": 0.31957989497374345, "grad_norm": 0.2315233200788498, "learning_rate": 9.892004939947482e-05, "loss": 0.2337, "step": 426 }, { "epoch": 0.32033008252063017, "grad_norm": 0.32775694131851196, "learning_rate": 9.890647228626944e-05, "loss": 0.3288, "step": 427 }, { "epoch": 0.3210802700675169, "grad_norm": 0.2940825819969177, "learning_rate": 9.889281130327997e-05, "loss": 0.2369, "step": 428 }, { "epoch": 0.3218304576144036, "grad_norm": 0.3502274453639984, "learning_rate": 9.887906647393368e-05, "loss": 0.2688, "step": 429 }, { "epoch": 0.3225806451612903, "grad_norm": 0.2302337884902954, "learning_rate": 9.88652378218016e-05, "loss": 0.2728, "step": 430 }, { "epoch": 0.32333083270817703, "grad_norm": 0.26802852749824524, "learning_rate": 9.885132537059849e-05, "loss": 0.2455, "step": 431 }, { "epoch": 0.32408102025506375, "grad_norm": 0.2934998571872711, "learning_rate": 9.883732914418285e-05, "loss": 0.3238, "step": 432 }, { "epoch": 0.32483120780195046, "grad_norm": 0.2558884620666504, "learning_rate": 9.882324916655681e-05, "loss": 0.2663, "step": 433 }, { "epoch": 0.32558139534883723, "grad_norm": 0.35943272709846497, "learning_rate": 9.880908546186616e-05, "loss": 0.2996, "step": 434 }, { "epoch": 0.32633158289572395, "grad_norm": 0.3362939655780792, "learning_rate": 9.879483805440027e-05, "loss": 0.3034, "step": 435 }, { "epoch": 0.32708177044261066, "grad_norm": 0.2822950482368469, "learning_rate": 9.8780506968592e-05, "loss": 0.258, "step": 436 }, { "epoch": 0.3278319579894974, "grad_norm": 0.20314159989356995, "learning_rate": 9.876609222901781e-05, "loss": 0.199, "step": 437 }, { "epoch": 0.3285821455363841, "grad_norm": 0.3416706919670105, "learning_rate": 9.875159386039749e-05, "loss": 0.2788, "step": 438 }, { "epoch": 0.3293323330832708, "grad_norm": 0.2846376895904541, "learning_rate": 9.873701188759438e-05, "loss": 0.2519, "step": 439 }, { "epoch": 0.3300825206301575, "grad_norm": 0.360885888338089, "learning_rate": 9.872234633561509e-05, "loss": 0.3446, "step": 440 }, { "epoch": 0.33083270817704424, "grad_norm": 0.2632591724395752, "learning_rate": 9.87075972296096e-05, "loss": 0.2831, "step": 441 }, { "epoch": 0.33158289572393096, "grad_norm": 0.35226765275001526, "learning_rate": 9.86927645948712e-05, "loss": 0.2767, "step": 442 }, { "epoch": 0.33233308327081773, "grad_norm": 0.28218087553977966, "learning_rate": 9.867784845683637e-05, "loss": 0.2635, "step": 443 }, { "epoch": 0.33308327081770445, "grad_norm": 0.31194448471069336, "learning_rate": 9.866284884108481e-05, "loss": 0.2954, "step": 444 }, { "epoch": 0.33383345836459116, "grad_norm": 0.3088156282901764, "learning_rate": 9.864776577333941e-05, "loss": 0.2477, "step": 445 }, { "epoch": 0.3345836459114779, "grad_norm": 0.2747879922389984, "learning_rate": 9.863259927946613e-05, "loss": 0.2917, "step": 446 }, { "epoch": 0.3353338334583646, "grad_norm": 0.33235102891921997, "learning_rate": 9.861734938547405e-05, "loss": 0.2979, "step": 447 }, { "epoch": 0.3360840210052513, "grad_norm": 0.38837602734565735, "learning_rate": 9.860201611751518e-05, "loss": 0.2509, "step": 448 }, { "epoch": 0.336834208552138, "grad_norm": 0.28050798177719116, "learning_rate": 9.858659950188458e-05, "loss": 0.2704, "step": 449 }, { "epoch": 0.33758439609902474, "grad_norm": 0.2844732403755188, "learning_rate": 9.857109956502027e-05, "loss": 0.2862, "step": 450 }, { "epoch": 0.33833458364591146, "grad_norm": 0.33200013637542725, "learning_rate": 9.855551633350306e-05, "loss": 0.327, "step": 451 }, { "epoch": 0.3390847711927982, "grad_norm": 0.2736620306968689, "learning_rate": 9.853984983405668e-05, "loss": 0.3351, "step": 452 }, { "epoch": 0.33983495873968494, "grad_norm": 0.3338417708873749, "learning_rate": 9.852410009354766e-05, "loss": 0.2484, "step": 453 }, { "epoch": 0.34058514628657166, "grad_norm": 0.25424468517303467, "learning_rate": 9.850826713898521e-05, "loss": 0.2259, "step": 454 }, { "epoch": 0.3413353338334584, "grad_norm": 0.22853153944015503, "learning_rate": 9.849235099752132e-05, "loss": 0.2329, "step": 455 }, { "epoch": 0.3420855213803451, "grad_norm": 0.26735737919807434, "learning_rate": 9.847635169645058e-05, "loss": 0.2774, "step": 456 }, { "epoch": 0.3428357089272318, "grad_norm": 0.24959413707256317, "learning_rate": 9.846026926321024e-05, "loss": 0.2421, "step": 457 }, { "epoch": 0.3435858964741185, "grad_norm": 0.25345903635025024, "learning_rate": 9.844410372538006e-05, "loss": 0.2319, "step": 458 }, { "epoch": 0.34433608402100524, "grad_norm": 0.23889240622520447, "learning_rate": 9.842785511068239e-05, "loss": 0.2507, "step": 459 }, { "epoch": 0.34508627156789196, "grad_norm": 0.24956288933753967, "learning_rate": 9.841152344698197e-05, "loss": 0.2554, "step": 460 }, { "epoch": 0.34583645911477867, "grad_norm": 0.3410578668117523, "learning_rate": 9.8395108762286e-05, "loss": 0.2699, "step": 461 }, { "epoch": 0.34658664666166544, "grad_norm": 0.2648273706436157, "learning_rate": 9.837861108474404e-05, "loss": 0.263, "step": 462 }, { "epoch": 0.34733683420855216, "grad_norm": 0.24982066452503204, "learning_rate": 9.8362030442648e-05, "loss": 0.253, "step": 463 }, { "epoch": 0.3480870217554389, "grad_norm": 0.2919140160083771, "learning_rate": 9.834536686443204e-05, "loss": 0.2245, "step": 464 }, { "epoch": 0.3488372093023256, "grad_norm": 0.2518826127052307, "learning_rate": 9.832862037867257e-05, "loss": 0.231, "step": 465 }, { "epoch": 0.3495873968492123, "grad_norm": 0.3234107494354248, "learning_rate": 9.831179101408813e-05, "loss": 0.2647, "step": 466 }, { "epoch": 0.350337584396099, "grad_norm": 0.23380203545093536, "learning_rate": 9.829487879953946e-05, "loss": 0.2789, "step": 467 }, { "epoch": 0.35108777194298574, "grad_norm": 0.2975447475910187, "learning_rate": 9.827788376402932e-05, "loss": 0.3421, "step": 468 }, { "epoch": 0.35183795948987245, "grad_norm": 0.319738507270813, "learning_rate": 9.826080593670253e-05, "loss": 0.2735, "step": 469 }, { "epoch": 0.35258814703675917, "grad_norm": 0.396254301071167, "learning_rate": 9.82436453468459e-05, "loss": 0.2994, "step": 470 }, { "epoch": 0.3533383345836459, "grad_norm": 0.31047672033309937, "learning_rate": 9.822640202388812e-05, "loss": 0.2775, "step": 471 }, { "epoch": 0.35408852213053266, "grad_norm": 0.2754264175891876, "learning_rate": 9.820907599739979e-05, "loss": 0.2688, "step": 472 }, { "epoch": 0.3548387096774194, "grad_norm": 0.2777048647403717, "learning_rate": 9.819166729709336e-05, "loss": 0.3014, "step": 473 }, { "epoch": 0.3555888972243061, "grad_norm": 0.31954237818717957, "learning_rate": 9.817417595282304e-05, "loss": 0.2834, "step": 474 }, { "epoch": 0.3563390847711928, "grad_norm": 0.2917940616607666, "learning_rate": 9.815660199458476e-05, "loss": 0.2573, "step": 475 }, { "epoch": 0.3570892723180795, "grad_norm": 0.3305354714393616, "learning_rate": 9.81389454525161e-05, "loss": 0.3357, "step": 476 }, { "epoch": 0.35783945986496624, "grad_norm": 0.31601205468177795, "learning_rate": 9.812120635689632e-05, "loss": 0.3022, "step": 477 }, { "epoch": 0.35858964741185295, "grad_norm": 0.27898257970809937, "learning_rate": 9.810338473814621e-05, "loss": 0.3131, "step": 478 }, { "epoch": 0.35933983495873967, "grad_norm": 0.39138540625572205, "learning_rate": 9.808548062682812e-05, "loss": 0.3154, "step": 479 }, { "epoch": 0.3600900225056264, "grad_norm": 0.2780033051967621, "learning_rate": 9.80674940536458e-05, "loss": 0.2905, "step": 480 }, { "epoch": 0.36084021005251316, "grad_norm": 0.22239302098751068, "learning_rate": 9.804942504944445e-05, "loss": 0.2584, "step": 481 }, { "epoch": 0.36159039759939987, "grad_norm": 0.34206798672676086, "learning_rate": 9.803127364521067e-05, "loss": 0.2838, "step": 482 }, { "epoch": 0.3623405851462866, "grad_norm": 0.2698298692703247, "learning_rate": 9.801303987207229e-05, "loss": 0.2857, "step": 483 }, { "epoch": 0.3630907726931733, "grad_norm": 0.4095413088798523, "learning_rate": 9.799472376129846e-05, "loss": 0.2772, "step": 484 }, { "epoch": 0.36384096024006, "grad_norm": 0.35254210233688354, "learning_rate": 9.79763253442995e-05, "loss": 0.3212, "step": 485 }, { "epoch": 0.36459114778694673, "grad_norm": 0.2972910702228546, "learning_rate": 9.795784465262689e-05, "loss": 0.2726, "step": 486 }, { "epoch": 0.36534133533383345, "grad_norm": 0.3424765467643738, "learning_rate": 9.79392817179732e-05, "loss": 0.2948, "step": 487 }, { "epoch": 0.36609152288072017, "grad_norm": 0.23481765389442444, "learning_rate": 9.792063657217201e-05, "loss": 0.2457, "step": 488 }, { "epoch": 0.3668417104276069, "grad_norm": 0.2270536720752716, "learning_rate": 9.790190924719793e-05, "loss": 0.2437, "step": 489 }, { "epoch": 0.3675918979744936, "grad_norm": 0.3036023676395416, "learning_rate": 9.788309977516648e-05, "loss": 0.2708, "step": 490 }, { "epoch": 0.36834208552138037, "grad_norm": 0.3582788109779358, "learning_rate": 9.786420818833404e-05, "loss": 0.249, "step": 491 }, { "epoch": 0.3690922730682671, "grad_norm": 0.2789169251918793, "learning_rate": 9.784523451909782e-05, "loss": 0.2816, "step": 492 }, { "epoch": 0.3698424606151538, "grad_norm": 0.2663949429988861, "learning_rate": 9.78261787999958e-05, "loss": 0.362, "step": 493 }, { "epoch": 0.3705926481620405, "grad_norm": 0.29689744114875793, "learning_rate": 9.780704106370667e-05, "loss": 0.284, "step": 494 }, { "epoch": 0.37134283570892723, "grad_norm": 0.22535526752471924, "learning_rate": 9.778782134304976e-05, "loss": 0.2513, "step": 495 }, { "epoch": 0.37209302325581395, "grad_norm": 0.25983208417892456, "learning_rate": 9.776851967098499e-05, "loss": 0.2976, "step": 496 }, { "epoch": 0.37284321080270066, "grad_norm": 0.2814142107963562, "learning_rate": 9.774913608061282e-05, "loss": 0.2353, "step": 497 }, { "epoch": 0.3735933983495874, "grad_norm": 0.26901301741600037, "learning_rate": 9.772967060517421e-05, "loss": 0.3025, "step": 498 }, { "epoch": 0.3743435858964741, "grad_norm": 0.3545876145362854, "learning_rate": 9.771012327805055e-05, "loss": 0.2421, "step": 499 }, { "epoch": 0.37509377344336087, "grad_norm": 0.2955469787120819, "learning_rate": 9.769049413276355e-05, "loss": 0.3223, "step": 500 }, { "epoch": 0.3758439609902476, "grad_norm": 0.2930402159690857, "learning_rate": 9.767078320297528e-05, "loss": 0.2443, "step": 501 }, { "epoch": 0.3765941485371343, "grad_norm": 0.2729968726634979, "learning_rate": 9.765099052248805e-05, "loss": 0.2767, "step": 502 }, { "epoch": 0.377344336084021, "grad_norm": 0.31001028418540955, "learning_rate": 9.763111612524434e-05, "loss": 0.2931, "step": 503 }, { "epoch": 0.37809452363090773, "grad_norm": 0.27120062708854675, "learning_rate": 9.761116004532679e-05, "loss": 0.2337, "step": 504 }, { "epoch": 0.37884471117779445, "grad_norm": 0.2932368516921997, "learning_rate": 9.759112231695811e-05, "loss": 0.2888, "step": 505 }, { "epoch": 0.37959489872468116, "grad_norm": 0.3159938156604767, "learning_rate": 9.757100297450103e-05, "loss": 0.3497, "step": 506 }, { "epoch": 0.3803450862715679, "grad_norm": 0.33126944303512573, "learning_rate": 9.755080205245826e-05, "loss": 0.2826, "step": 507 }, { "epoch": 0.3810952738184546, "grad_norm": 0.32868897914886475, "learning_rate": 9.753051958547238e-05, "loss": 0.28, "step": 508 }, { "epoch": 0.3818454613653413, "grad_norm": 0.25271499156951904, "learning_rate": 9.751015560832582e-05, "loss": 0.2488, "step": 509 }, { "epoch": 0.3825956489122281, "grad_norm": 0.33580100536346436, "learning_rate": 9.748971015594078e-05, "loss": 0.2886, "step": 510 }, { "epoch": 0.3833458364591148, "grad_norm": 0.4098048508167267, "learning_rate": 9.746918326337923e-05, "loss": 0.3096, "step": 511 }, { "epoch": 0.3840960240060015, "grad_norm": 0.2540065348148346, "learning_rate": 9.744857496584274e-05, "loss": 0.2809, "step": 512 }, { "epoch": 0.38484621155288823, "grad_norm": 0.28278815746307373, "learning_rate": 9.742788529867255e-05, "loss": 0.2711, "step": 513 }, { "epoch": 0.38559639909977494, "grad_norm": 0.21623049676418304, "learning_rate": 9.740711429734936e-05, "loss": 0.2205, "step": 514 }, { "epoch": 0.38634658664666166, "grad_norm": 0.3278290629386902, "learning_rate": 9.738626199749341e-05, "loss": 0.2724, "step": 515 }, { "epoch": 0.3870967741935484, "grad_norm": 0.2619601786136627, "learning_rate": 9.736532843486433e-05, "loss": 0.3044, "step": 516 }, { "epoch": 0.3878469617404351, "grad_norm": 0.42307472229003906, "learning_rate": 9.734431364536114e-05, "loss": 0.2562, "step": 517 }, { "epoch": 0.3885971492873218, "grad_norm": 0.25051334500312805, "learning_rate": 9.732321766502213e-05, "loss": 0.279, "step": 518 }, { "epoch": 0.3893473368342086, "grad_norm": 0.20372985303401947, "learning_rate": 9.730204053002481e-05, "loss": 0.2296, "step": 519 }, { "epoch": 0.3900975243810953, "grad_norm": 0.3203490078449249, "learning_rate": 9.728078227668588e-05, "loss": 0.3393, "step": 520 }, { "epoch": 0.390847711927982, "grad_norm": 0.41421225666999817, "learning_rate": 9.725944294146119e-05, "loss": 0.3731, "step": 521 }, { "epoch": 0.3915978994748687, "grad_norm": 0.31539928913116455, "learning_rate": 9.723802256094555e-05, "loss": 0.3192, "step": 522 }, { "epoch": 0.39234808702175544, "grad_norm": 0.3954058587551117, "learning_rate": 9.721652117187283e-05, "loss": 0.3582, "step": 523 }, { "epoch": 0.39309827456864216, "grad_norm": 0.2879020869731903, "learning_rate": 9.71949388111158e-05, "loss": 0.2914, "step": 524 }, { "epoch": 0.3938484621155289, "grad_norm": 0.2849350571632385, "learning_rate": 9.717327551568608e-05, "loss": 0.2624, "step": 525 }, { "epoch": 0.3945986496624156, "grad_norm": 0.2885470390319824, "learning_rate": 9.715153132273407e-05, "loss": 0.2016, "step": 526 }, { "epoch": 0.3953488372093023, "grad_norm": 0.32981085777282715, "learning_rate": 9.712970626954893e-05, "loss": 0.2761, "step": 527 }, { "epoch": 0.396099024756189, "grad_norm": 0.332764208316803, "learning_rate": 9.71078003935585e-05, "loss": 0.3502, "step": 528 }, { "epoch": 0.3968492123030758, "grad_norm": 0.22229288518428802, "learning_rate": 9.708581373232917e-05, "loss": 0.2554, "step": 529 }, { "epoch": 0.3975993998499625, "grad_norm": 0.3743436932563782, "learning_rate": 9.70637463235659e-05, "loss": 0.3037, "step": 530 }, { "epoch": 0.3983495873968492, "grad_norm": 0.2351418286561966, "learning_rate": 9.704159820511214e-05, "loss": 0.3301, "step": 531 }, { "epoch": 0.39909977494373594, "grad_norm": 0.3410119414329529, "learning_rate": 9.701936941494971e-05, "loss": 0.3306, "step": 532 }, { "epoch": 0.39984996249062266, "grad_norm": 0.23885366320610046, "learning_rate": 9.699705999119882e-05, "loss": 0.3161, "step": 533 }, { "epoch": 0.4006001500375094, "grad_norm": 0.30339616537094116, "learning_rate": 9.697466997211793e-05, "loss": 0.1762, "step": 534 }, { "epoch": 0.4013503375843961, "grad_norm": 0.29741978645324707, "learning_rate": 9.69521993961037e-05, "loss": 0.2536, "step": 535 }, { "epoch": 0.4021005251312828, "grad_norm": 0.28487157821655273, "learning_rate": 9.692964830169098e-05, "loss": 0.2793, "step": 536 }, { "epoch": 0.4028507126781695, "grad_norm": 0.24954158067703247, "learning_rate": 9.690701672755266e-05, "loss": 0.2472, "step": 537 }, { "epoch": 0.4036009002250563, "grad_norm": 0.3365628719329834, "learning_rate": 9.688430471249967e-05, "loss": 0.2434, "step": 538 }, { "epoch": 0.404351087771943, "grad_norm": 0.24111148715019226, "learning_rate": 9.686151229548088e-05, "loss": 0.2532, "step": 539 }, { "epoch": 0.4051012753188297, "grad_norm": 0.28043311834335327, "learning_rate": 9.683863951558301e-05, "loss": 0.2728, "step": 540 }, { "epoch": 0.40585146286571644, "grad_norm": 0.20682062208652496, "learning_rate": 9.681568641203068e-05, "loss": 0.1739, "step": 541 }, { "epoch": 0.40660165041260315, "grad_norm": 0.2974221408367157, "learning_rate": 9.679265302418615e-05, "loss": 0.2802, "step": 542 }, { "epoch": 0.40735183795948987, "grad_norm": 0.3568771779537201, "learning_rate": 9.676953939154945e-05, "loss": 0.2848, "step": 543 }, { "epoch": 0.4081020255063766, "grad_norm": 0.39375755190849304, "learning_rate": 9.674634555375817e-05, "loss": 0.2358, "step": 544 }, { "epoch": 0.4088522130532633, "grad_norm": 0.34941914677619934, "learning_rate": 9.672307155058744e-05, "loss": 0.3377, "step": 545 }, { "epoch": 0.40960240060015, "grad_norm": 0.3066091537475586, "learning_rate": 9.669971742194992e-05, "loss": 0.3402, "step": 546 }, { "epoch": 0.41035258814703673, "grad_norm": 0.3906515836715698, "learning_rate": 9.667628320789562e-05, "loss": 0.3077, "step": 547 }, { "epoch": 0.4111027756939235, "grad_norm": 0.2789679169654846, "learning_rate": 9.665276894861188e-05, "loss": 0.2044, "step": 548 }, { "epoch": 0.4118529632408102, "grad_norm": 0.29728278517723083, "learning_rate": 9.66291746844234e-05, "loss": 0.2647, "step": 549 }, { "epoch": 0.41260315078769694, "grad_norm": 0.35674142837524414, "learning_rate": 9.660550045579199e-05, "loss": 0.2706, "step": 550 }, { "epoch": 0.41335333833458365, "grad_norm": 0.3953751027584076, "learning_rate": 9.65817463033166e-05, "loss": 0.3023, "step": 551 }, { "epoch": 0.41410352588147037, "grad_norm": 0.3035467863082886, "learning_rate": 9.655791226773331e-05, "loss": 0.2606, "step": 552 }, { "epoch": 0.4148537134283571, "grad_norm": 0.35518065094947815, "learning_rate": 9.65339983899151e-05, "loss": 0.3022, "step": 553 }, { "epoch": 0.4156039009752438, "grad_norm": 0.3414997458457947, "learning_rate": 9.651000471087193e-05, "loss": 0.3626, "step": 554 }, { "epoch": 0.4163540885221305, "grad_norm": 0.25935447216033936, "learning_rate": 9.64859312717506e-05, "loss": 0.2159, "step": 555 }, { "epoch": 0.41710427606901723, "grad_norm": 0.23959723114967346, "learning_rate": 9.64617781138347e-05, "loss": 0.2237, "step": 556 }, { "epoch": 0.41785446361590395, "grad_norm": 0.31431931257247925, "learning_rate": 9.643754527854451e-05, "loss": 0.332, "step": 557 }, { "epoch": 0.4186046511627907, "grad_norm": 0.347904771566391, "learning_rate": 9.641323280743693e-05, "loss": 0.2882, "step": 558 }, { "epoch": 0.41935483870967744, "grad_norm": 0.29322245717048645, "learning_rate": 9.638884074220548e-05, "loss": 0.2197, "step": 559 }, { "epoch": 0.42010502625656415, "grad_norm": 0.2820701599121094, "learning_rate": 9.636436912468015e-05, "loss": 0.2737, "step": 560 }, { "epoch": 0.42085521380345087, "grad_norm": 0.2378447949886322, "learning_rate": 9.633981799682735e-05, "loss": 0.2428, "step": 561 }, { "epoch": 0.4216054013503376, "grad_norm": 0.29645460844039917, "learning_rate": 9.631518740074985e-05, "loss": 0.2278, "step": 562 }, { "epoch": 0.4223555888972243, "grad_norm": 0.22625969350337982, "learning_rate": 9.629047737868669e-05, "loss": 0.1824, "step": 563 }, { "epoch": 0.423105776444111, "grad_norm": 0.23257173597812653, "learning_rate": 9.626568797301311e-05, "loss": 0.3062, "step": 564 }, { "epoch": 0.42385596399099773, "grad_norm": 0.278300017118454, "learning_rate": 9.624081922624053e-05, "loss": 0.2409, "step": 565 }, { "epoch": 0.42460615153788445, "grad_norm": 0.2456604391336441, "learning_rate": 9.621587118101638e-05, "loss": 0.2582, "step": 566 }, { "epoch": 0.4253563390847712, "grad_norm": 0.2717861831188202, "learning_rate": 9.619084388012412e-05, "loss": 0.2292, "step": 567 }, { "epoch": 0.42610652663165793, "grad_norm": 0.33589014410972595, "learning_rate": 9.616573736648308e-05, "loss": 0.2585, "step": 568 }, { "epoch": 0.42685671417854465, "grad_norm": 0.2769697606563568, "learning_rate": 9.61405516831485e-05, "loss": 0.258, "step": 569 }, { "epoch": 0.42760690172543137, "grad_norm": 0.309330016374588, "learning_rate": 9.61152868733113e-05, "loss": 0.2601, "step": 570 }, { "epoch": 0.4283570892723181, "grad_norm": 0.2562524676322937, "learning_rate": 9.608994298029818e-05, "loss": 0.2335, "step": 571 }, { "epoch": 0.4291072768192048, "grad_norm": 0.3424158990383148, "learning_rate": 9.60645200475714e-05, "loss": 0.3127, "step": 572 }, { "epoch": 0.4298574643660915, "grad_norm": 0.25529053807258606, "learning_rate": 9.603901811872877e-05, "loss": 0.1994, "step": 573 }, { "epoch": 0.43060765191297823, "grad_norm": 0.2934305667877197, "learning_rate": 9.601343723750363e-05, "loss": 0.3099, "step": 574 }, { "epoch": 0.43135783945986494, "grad_norm": 0.18441471457481384, "learning_rate": 9.598777744776464e-05, "loss": 0.2191, "step": 575 }, { "epoch": 0.43210802700675166, "grad_norm": 0.33996301889419556, "learning_rate": 9.596203879351582e-05, "loss": 0.2777, "step": 576 }, { "epoch": 0.43285821455363843, "grad_norm": 0.24609936773777008, "learning_rate": 9.593622131889643e-05, "loss": 0.2934, "step": 577 }, { "epoch": 0.43360840210052515, "grad_norm": 0.20088337361812592, "learning_rate": 9.591032506818089e-05, "loss": 0.237, "step": 578 }, { "epoch": 0.43435858964741186, "grad_norm": 0.3693728744983673, "learning_rate": 9.588435008577873e-05, "loss": 0.3334, "step": 579 }, { "epoch": 0.4351087771942986, "grad_norm": 0.2686242461204529, "learning_rate": 9.585829641623448e-05, "loss": 0.2713, "step": 580 }, { "epoch": 0.4358589647411853, "grad_norm": 0.26710695028305054, "learning_rate": 9.583216410422762e-05, "loss": 0.2617, "step": 581 }, { "epoch": 0.436609152288072, "grad_norm": 0.34262925386428833, "learning_rate": 9.580595319457249e-05, "loss": 0.2383, "step": 582 }, { "epoch": 0.4373593398349587, "grad_norm": 0.4435969889163971, "learning_rate": 9.577966373221823e-05, "loss": 0.2858, "step": 583 }, { "epoch": 0.43810952738184544, "grad_norm": 0.3884657323360443, "learning_rate": 9.575329576224868e-05, "loss": 0.2469, "step": 584 }, { "epoch": 0.43885971492873216, "grad_norm": 0.31032776832580566, "learning_rate": 9.572684932988227e-05, "loss": 0.3202, "step": 585 }, { "epoch": 0.43960990247561893, "grad_norm": 0.25603368878364563, "learning_rate": 9.570032448047208e-05, "loss": 0.2978, "step": 586 }, { "epoch": 0.44036009002250565, "grad_norm": 0.30342474579811096, "learning_rate": 9.567372125950559e-05, "loss": 0.2205, "step": 587 }, { "epoch": 0.44111027756939236, "grad_norm": 0.22529205679893494, "learning_rate": 9.564703971260472e-05, "loss": 0.2007, "step": 588 }, { "epoch": 0.4418604651162791, "grad_norm": 0.2549872398376465, "learning_rate": 9.562027988552567e-05, "loss": 0.2246, "step": 589 }, { "epoch": 0.4426106526631658, "grad_norm": 0.27988582849502563, "learning_rate": 9.559344182415891e-05, "loss": 0.2739, "step": 590 }, { "epoch": 0.4433608402100525, "grad_norm": 0.3312973976135254, "learning_rate": 9.55665255745291e-05, "loss": 0.324, "step": 591 }, { "epoch": 0.4441110277569392, "grad_norm": 0.2787535786628723, "learning_rate": 9.553953118279496e-05, "loss": 0.2623, "step": 592 }, { "epoch": 0.44486121530382594, "grad_norm": 0.35470905900001526, "learning_rate": 9.551245869524916e-05, "loss": 0.3339, "step": 593 }, { "epoch": 0.44561140285071266, "grad_norm": 0.3010650873184204, "learning_rate": 9.54853081583184e-05, "loss": 0.3119, "step": 594 }, { "epoch": 0.4463615903975994, "grad_norm": 0.2525995671749115, "learning_rate": 9.545807961856317e-05, "loss": 0.2064, "step": 595 }, { "epoch": 0.44711177794448614, "grad_norm": 0.361225426197052, "learning_rate": 9.543077312267773e-05, "loss": 0.2644, "step": 596 }, { "epoch": 0.44786196549137286, "grad_norm": 0.2855580151081085, "learning_rate": 9.540338871749002e-05, "loss": 0.2632, "step": 597 }, { "epoch": 0.4486121530382596, "grad_norm": 0.36146941781044006, "learning_rate": 9.537592644996162e-05, "loss": 0.2862, "step": 598 }, { "epoch": 0.4493623405851463, "grad_norm": 0.23781441152095795, "learning_rate": 9.534838636718759e-05, "loss": 0.2398, "step": 599 }, { "epoch": 0.450112528132033, "grad_norm": 0.32831722497940063, "learning_rate": 9.532076851639649e-05, "loss": 0.2113, "step": 600 }, { "epoch": 0.450112528132033, "eval_loss": 0.27472740411758423, "eval_runtime": 8.9101, "eval_samples_per_second": 6.061, "eval_steps_per_second": 1.571, "step": 600 }, { "epoch": 0.4508627156789197, "grad_norm": 0.2581571042537689, "learning_rate": 9.529307294495018e-05, "loss": 0.2547, "step": 601 }, { "epoch": 0.45161290322580644, "grad_norm": 0.29526248574256897, "learning_rate": 9.526529970034386e-05, "loss": 0.2675, "step": 602 }, { "epoch": 0.45236309077269315, "grad_norm": 0.25838950276374817, "learning_rate": 9.52374488302059e-05, "loss": 0.2623, "step": 603 }, { "epoch": 0.45311327831957987, "grad_norm": 0.2898143231868744, "learning_rate": 9.52095203822978e-05, "loss": 0.2626, "step": 604 }, { "epoch": 0.45386346586646664, "grad_norm": 0.24708092212677002, "learning_rate": 9.518151440451411e-05, "loss": 0.2359, "step": 605 }, { "epoch": 0.45461365341335336, "grad_norm": 0.2658713161945343, "learning_rate": 9.515343094488232e-05, "loss": 0.2554, "step": 606 }, { "epoch": 0.4553638409602401, "grad_norm": 0.26659634709358215, "learning_rate": 9.51252700515628e-05, "loss": 0.2148, "step": 607 }, { "epoch": 0.4561140285071268, "grad_norm": 0.26295262575149536, "learning_rate": 9.509703177284869e-05, "loss": 0.3697, "step": 608 }, { "epoch": 0.4568642160540135, "grad_norm": 0.3031051456928253, "learning_rate": 9.506871615716587e-05, "loss": 0.2944, "step": 609 }, { "epoch": 0.4576144036009002, "grad_norm": 0.24120275676250458, "learning_rate": 9.504032325307284e-05, "loss": 0.1932, "step": 610 }, { "epoch": 0.45836459114778694, "grad_norm": 0.37356826663017273, "learning_rate": 9.501185310926062e-05, "loss": 0.2596, "step": 611 }, { "epoch": 0.45911477869467365, "grad_norm": 0.31004658341407776, "learning_rate": 9.498330577455273e-05, "loss": 0.2592, "step": 612 }, { "epoch": 0.45986496624156037, "grad_norm": 0.2976832389831543, "learning_rate": 9.495468129790499e-05, "loss": 0.3079, "step": 613 }, { "epoch": 0.4606151537884471, "grad_norm": 0.2784198522567749, "learning_rate": 9.49259797284056e-05, "loss": 0.1958, "step": 614 }, { "epoch": 0.46136534133533386, "grad_norm": 0.2875315248966217, "learning_rate": 9.489720111527492e-05, "loss": 0.242, "step": 615 }, { "epoch": 0.46211552888222057, "grad_norm": 0.2550128996372223, "learning_rate": 9.486834550786543e-05, "loss": 0.2051, "step": 616 }, { "epoch": 0.4628657164291073, "grad_norm": 0.30639031529426575, "learning_rate": 9.483941295566165e-05, "loss": 0.2496, "step": 617 }, { "epoch": 0.463615903975994, "grad_norm": 0.28445613384246826, "learning_rate": 9.481040350828006e-05, "loss": 0.278, "step": 618 }, { "epoch": 0.4643660915228807, "grad_norm": 0.294940322637558, "learning_rate": 9.4781317215469e-05, "loss": 0.2285, "step": 619 }, { "epoch": 0.46511627906976744, "grad_norm": 0.3264240026473999, "learning_rate": 9.475215412710864e-05, "loss": 0.2532, "step": 620 }, { "epoch": 0.46586646661665415, "grad_norm": 0.2780837416648865, "learning_rate": 9.472291429321075e-05, "loss": 0.2367, "step": 621 }, { "epoch": 0.46661665416354087, "grad_norm": 0.23927810788154602, "learning_rate": 9.469359776391879e-05, "loss": 0.219, "step": 622 }, { "epoch": 0.4673668417104276, "grad_norm": 0.33325260877609253, "learning_rate": 9.466420458950773e-05, "loss": 0.3334, "step": 623 }, { "epoch": 0.46811702925731435, "grad_norm": 0.3050619959831238, "learning_rate": 9.463473482038395e-05, "loss": 0.2977, "step": 624 }, { "epoch": 0.46886721680420107, "grad_norm": 0.2789558172225952, "learning_rate": 9.46051885070852e-05, "loss": 0.2675, "step": 625 }, { "epoch": 0.4696174043510878, "grad_norm": 0.3214525580406189, "learning_rate": 9.457556570028052e-05, "loss": 0.259, "step": 626 }, { "epoch": 0.4703675918979745, "grad_norm": 0.3081679344177246, "learning_rate": 9.454586645077011e-05, "loss": 0.2339, "step": 627 }, { "epoch": 0.4711177794448612, "grad_norm": 0.3348385989665985, "learning_rate": 9.451609080948522e-05, "loss": 0.2965, "step": 628 }, { "epoch": 0.47186796699174793, "grad_norm": 0.2554117739200592, "learning_rate": 9.448623882748817e-05, "loss": 0.2642, "step": 629 }, { "epoch": 0.47261815453863465, "grad_norm": 0.34293505549430847, "learning_rate": 9.445631055597217e-05, "loss": 0.32, "step": 630 }, { "epoch": 0.47336834208552137, "grad_norm": 0.26327767968177795, "learning_rate": 9.442630604626126e-05, "loss": 0.2524, "step": 631 }, { "epoch": 0.4741185296324081, "grad_norm": 0.2363734245300293, "learning_rate": 9.43962253498102e-05, "loss": 0.2521, "step": 632 }, { "epoch": 0.4748687171792948, "grad_norm": 0.36404547095298767, "learning_rate": 9.436606851820444e-05, "loss": 0.2858, "step": 633 }, { "epoch": 0.47561890472618157, "grad_norm": 0.4077342748641968, "learning_rate": 9.433583560315999e-05, "loss": 0.3064, "step": 634 }, { "epoch": 0.4763690922730683, "grad_norm": 0.2777729332447052, "learning_rate": 9.430552665652328e-05, "loss": 0.2188, "step": 635 }, { "epoch": 0.477119279819955, "grad_norm": 0.27807092666625977, "learning_rate": 9.427514173027121e-05, "loss": 0.2445, "step": 636 }, { "epoch": 0.4778694673668417, "grad_norm": 0.2851824164390564, "learning_rate": 9.424468087651092e-05, "loss": 0.2747, "step": 637 }, { "epoch": 0.47861965491372843, "grad_norm": 0.2869377136230469, "learning_rate": 9.421414414747978e-05, "loss": 0.22, "step": 638 }, { "epoch": 0.47936984246061515, "grad_norm": 0.28115132451057434, "learning_rate": 9.418353159554526e-05, "loss": 0.2953, "step": 639 }, { "epoch": 0.48012003000750186, "grad_norm": 0.389658659696579, "learning_rate": 9.415284327320489e-05, "loss": 0.2955, "step": 640 }, { "epoch": 0.4808702175543886, "grad_norm": 0.30045390129089355, "learning_rate": 9.41220792330861e-05, "loss": 0.2905, "step": 641 }, { "epoch": 0.4816204051012753, "grad_norm": 0.30258139967918396, "learning_rate": 9.40912395279462e-05, "loss": 0.2948, "step": 642 }, { "epoch": 0.48237059264816207, "grad_norm": 0.3461766541004181, "learning_rate": 9.406032421067224e-05, "loss": 0.2316, "step": 643 }, { "epoch": 0.4831207801950488, "grad_norm": 0.2818823456764221, "learning_rate": 9.402933333428097e-05, "loss": 0.2812, "step": 644 }, { "epoch": 0.4838709677419355, "grad_norm": 0.23972608149051666, "learning_rate": 9.399826695191868e-05, "loss": 0.239, "step": 645 }, { "epoch": 0.4846211552888222, "grad_norm": 0.31343311071395874, "learning_rate": 9.396712511686114e-05, "loss": 0.2654, "step": 646 }, { "epoch": 0.48537134283570893, "grad_norm": 0.25434574484825134, "learning_rate": 9.393590788251354e-05, "loss": 0.2157, "step": 647 }, { "epoch": 0.48612153038259565, "grad_norm": 0.2621150016784668, "learning_rate": 9.390461530241037e-05, "loss": 0.2181, "step": 648 }, { "epoch": 0.48687171792948236, "grad_norm": 0.350813627243042, "learning_rate": 9.38732474302153e-05, "loss": 0.2597, "step": 649 }, { "epoch": 0.4876219054763691, "grad_norm": 0.39908158779144287, "learning_rate": 9.384180431972119e-05, "loss": 0.304, "step": 650 }, { "epoch": 0.4883720930232558, "grad_norm": 0.26132452487945557, "learning_rate": 9.381028602484984e-05, "loss": 0.2511, "step": 651 }, { "epoch": 0.4891222805701425, "grad_norm": 0.2970663011074066, "learning_rate": 9.377869259965202e-05, "loss": 0.2238, "step": 652 }, { "epoch": 0.4898724681170293, "grad_norm": 0.28565019369125366, "learning_rate": 9.374702409830736e-05, "loss": 0.3645, "step": 653 }, { "epoch": 0.490622655663916, "grad_norm": 0.3153075575828552, "learning_rate": 9.37152805751242e-05, "loss": 0.2659, "step": 654 }, { "epoch": 0.4913728432108027, "grad_norm": 0.2861831486225128, "learning_rate": 9.36834620845396e-05, "loss": 0.2955, "step": 655 }, { "epoch": 0.49212303075768943, "grad_norm": 0.28252729773521423, "learning_rate": 9.365156868111908e-05, "loss": 0.3266, "step": 656 }, { "epoch": 0.49287321830457614, "grad_norm": 0.30953922867774963, "learning_rate": 9.361960041955672e-05, "loss": 0.2451, "step": 657 }, { "epoch": 0.49362340585146286, "grad_norm": 0.2793737053871155, "learning_rate": 9.358755735467494e-05, "loss": 0.2968, "step": 658 }, { "epoch": 0.4943735933983496, "grad_norm": 0.24006298184394836, "learning_rate": 9.355543954142446e-05, "loss": 0.2565, "step": 659 }, { "epoch": 0.4951237809452363, "grad_norm": 0.31040674448013306, "learning_rate": 9.352324703488412e-05, "loss": 0.3355, "step": 660 }, { "epoch": 0.495873968492123, "grad_norm": 0.3101734220981598, "learning_rate": 9.349097989026093e-05, "loss": 0.2642, "step": 661 }, { "epoch": 0.4966241560390098, "grad_norm": 0.3329352140426636, "learning_rate": 9.345863816288985e-05, "loss": 0.2841, "step": 662 }, { "epoch": 0.4973743435858965, "grad_norm": 0.29761093854904175, "learning_rate": 9.342622190823378e-05, "loss": 0.2779, "step": 663 }, { "epoch": 0.4981245311327832, "grad_norm": 0.33474627137184143, "learning_rate": 9.339373118188338e-05, "loss": 0.2666, "step": 664 }, { "epoch": 0.4988747186796699, "grad_norm": 0.2793816924095154, "learning_rate": 9.336116603955707e-05, "loss": 0.2585, "step": 665 }, { "epoch": 0.49962490622655664, "grad_norm": 0.30487409234046936, "learning_rate": 9.332852653710084e-05, "loss": 0.2338, "step": 666 }, { "epoch": 0.5003750937734434, "grad_norm": 0.39107322692871094, "learning_rate": 9.329581273048822e-05, "loss": 0.2263, "step": 667 }, { "epoch": 0.5011252813203301, "grad_norm": 0.3005673289299011, "learning_rate": 9.32630246758202e-05, "loss": 0.302, "step": 668 }, { "epoch": 0.5018754688672168, "grad_norm": 0.21857699751853943, "learning_rate": 9.323016242932504e-05, "loss": 0.3005, "step": 669 }, { "epoch": 0.5026256564141035, "grad_norm": 0.28041893243789673, "learning_rate": 9.319722604735825e-05, "loss": 0.2447, "step": 670 }, { "epoch": 0.5033758439609902, "grad_norm": 0.2783823311328888, "learning_rate": 9.31642155864025e-05, "loss": 0.2583, "step": 671 }, { "epoch": 0.5041260315078769, "grad_norm": 0.3389221131801605, "learning_rate": 9.313113110306748e-05, "loss": 0.2692, "step": 672 }, { "epoch": 0.5048762190547637, "grad_norm": 0.297484815120697, "learning_rate": 9.309797265408979e-05, "loss": 0.352, "step": 673 }, { "epoch": 0.5056264066016504, "grad_norm": 0.3628078103065491, "learning_rate": 9.306474029633294e-05, "loss": 0.3329, "step": 674 }, { "epoch": 0.5063765941485371, "grad_norm": 0.2761705815792084, "learning_rate": 9.303143408678716e-05, "loss": 0.2447, "step": 675 }, { "epoch": 0.5071267816954238, "grad_norm": 0.29910406470298767, "learning_rate": 9.299805408256928e-05, "loss": 0.2287, "step": 676 }, { "epoch": 0.5078769692423106, "grad_norm": 0.2823033630847931, "learning_rate": 9.296460034092274e-05, "loss": 0.2287, "step": 677 }, { "epoch": 0.5086271567891973, "grad_norm": 0.2124030739068985, "learning_rate": 9.293107291921741e-05, "loss": 0.2355, "step": 678 }, { "epoch": 0.5093773443360841, "grad_norm": 0.2779684066772461, "learning_rate": 9.289747187494952e-05, "loss": 0.1939, "step": 679 }, { "epoch": 0.5101275318829708, "grad_norm": 0.38770848512649536, "learning_rate": 9.286379726574155e-05, "loss": 0.272, "step": 680 }, { "epoch": 0.5108777194298575, "grad_norm": 0.37286779284477234, "learning_rate": 9.283004914934215e-05, "loss": 0.2886, "step": 681 }, { "epoch": 0.5116279069767442, "grad_norm": 0.28744640946388245, "learning_rate": 9.2796227583626e-05, "loss": 0.3765, "step": 682 }, { "epoch": 0.5123780945236309, "grad_norm": 0.29213958978652954, "learning_rate": 9.276233262659375e-05, "loss": 0.2808, "step": 683 }, { "epoch": 0.5131282820705176, "grad_norm": 0.25803303718566895, "learning_rate": 9.272836433637193e-05, "loss": 0.2612, "step": 684 }, { "epoch": 0.5138784696174044, "grad_norm": 0.34655997157096863, "learning_rate": 9.269432277121281e-05, "loss": 0.2144, "step": 685 }, { "epoch": 0.5146286571642911, "grad_norm": 0.2729996144771576, "learning_rate": 9.266020798949433e-05, "loss": 0.2166, "step": 686 }, { "epoch": 0.5153788447111778, "grad_norm": 0.23743626475334167, "learning_rate": 9.262602004971996e-05, "loss": 0.2857, "step": 687 }, { "epoch": 0.5161290322580645, "grad_norm": 0.3129785358905792, "learning_rate": 9.259175901051867e-05, "loss": 0.2801, "step": 688 }, { "epoch": 0.5168792198049512, "grad_norm": 0.3642384707927704, "learning_rate": 9.255742493064474e-05, "loss": 0.274, "step": 689 }, { "epoch": 0.5176294073518379, "grad_norm": 0.28450801968574524, "learning_rate": 9.252301786897776e-05, "loss": 0.263, "step": 690 }, { "epoch": 0.5183795948987246, "grad_norm": 0.3433120846748352, "learning_rate": 9.248853788452247e-05, "loss": 0.3857, "step": 691 }, { "epoch": 0.5191297824456114, "grad_norm": 0.3061552047729492, "learning_rate": 9.24539850364086e-05, "loss": 0.304, "step": 692 }, { "epoch": 0.5198799699924981, "grad_norm": 0.29494166374206543, "learning_rate": 9.241935938389093e-05, "loss": 0.2529, "step": 693 }, { "epoch": 0.5206301575393848, "grad_norm": 0.303411602973938, "learning_rate": 9.238466098634902e-05, "loss": 0.2832, "step": 694 }, { "epoch": 0.5213803450862715, "grad_norm": 0.35831764340400696, "learning_rate": 9.234988990328719e-05, "loss": 0.2817, "step": 695 }, { "epoch": 0.5221305326331583, "grad_norm": 0.27082374691963196, "learning_rate": 9.231504619433445e-05, "loss": 0.2441, "step": 696 }, { "epoch": 0.522880720180045, "grad_norm": 0.30853259563446045, "learning_rate": 9.228012991924433e-05, "loss": 0.3138, "step": 697 }, { "epoch": 0.5236309077269318, "grad_norm": 0.2823306620121002, "learning_rate": 9.224514113789477e-05, "loss": 0.2368, "step": 698 }, { "epoch": 0.5243810952738185, "grad_norm": 0.381224662065506, "learning_rate": 9.221007991028814e-05, "loss": 0.2894, "step": 699 }, { "epoch": 0.5251312828207052, "grad_norm": 0.2432308942079544, "learning_rate": 9.217494629655094e-05, "loss": 0.227, "step": 700 }, { "epoch": 0.5258814703675919, "grad_norm": 0.26613369584083557, "learning_rate": 9.213974035693389e-05, "loss": 0.3241, "step": 701 }, { "epoch": 0.5266316579144786, "grad_norm": 0.2758384943008423, "learning_rate": 9.21044621518117e-05, "loss": 0.2577, "step": 702 }, { "epoch": 0.5273818454613654, "grad_norm": 0.2661236524581909, "learning_rate": 9.206911174168301e-05, "loss": 0.2637, "step": 703 }, { "epoch": 0.5281320330082521, "grad_norm": 0.28129100799560547, "learning_rate": 9.20336891871703e-05, "loss": 0.2723, "step": 704 }, { "epoch": 0.5288822205551388, "grad_norm": 0.2967759370803833, "learning_rate": 9.199819454901977e-05, "loss": 0.2465, "step": 705 }, { "epoch": 0.5296324081020255, "grad_norm": 0.37147876620292664, "learning_rate": 9.196262788810121e-05, "loss": 0.3489, "step": 706 }, { "epoch": 0.5303825956489122, "grad_norm": 0.27410486340522766, "learning_rate": 9.192698926540795e-05, "loss": 0.2794, "step": 707 }, { "epoch": 0.5311327831957989, "grad_norm": 0.27895089983940125, "learning_rate": 9.189127874205674e-05, "loss": 0.2129, "step": 708 }, { "epoch": 0.5318829707426856, "grad_norm": 0.28979605436325073, "learning_rate": 9.185549637928758e-05, "loss": 0.2133, "step": 709 }, { "epoch": 0.5326331582895724, "grad_norm": 0.27718594670295715, "learning_rate": 9.181964223846371e-05, "loss": 0.3042, "step": 710 }, { "epoch": 0.5333833458364591, "grad_norm": 0.333857923746109, "learning_rate": 9.178371638107146e-05, "loss": 0.2824, "step": 711 }, { "epoch": 0.5341335333833458, "grad_norm": 0.3143302798271179, "learning_rate": 9.174771886872011e-05, "loss": 0.229, "step": 712 }, { "epoch": 0.5348837209302325, "grad_norm": 0.2759588062763214, "learning_rate": 9.17116497631419e-05, "loss": 0.3105, "step": 713 }, { "epoch": 0.5356339084771192, "grad_norm": 0.30073073506355286, "learning_rate": 9.167550912619173e-05, "loss": 0.2929, "step": 714 }, { "epoch": 0.536384096024006, "grad_norm": 0.29393187165260315, "learning_rate": 9.16392970198473e-05, "loss": 0.2899, "step": 715 }, { "epoch": 0.5371342835708928, "grad_norm": 0.2438785582780838, "learning_rate": 9.160301350620875e-05, "loss": 0.206, "step": 716 }, { "epoch": 0.5378844711177795, "grad_norm": 0.22543703019618988, "learning_rate": 9.156665864749876e-05, "loss": 0.2288, "step": 717 }, { "epoch": 0.5386346586646662, "grad_norm": 0.20743077993392944, "learning_rate": 9.153023250606234e-05, "loss": 0.2489, "step": 718 }, { "epoch": 0.5393848462115529, "grad_norm": 0.27999505400657654, "learning_rate": 9.14937351443667e-05, "loss": 0.2127, "step": 719 }, { "epoch": 0.5401350337584396, "grad_norm": 0.2830229699611664, "learning_rate": 9.145716662500126e-05, "loss": 0.2065, "step": 720 }, { "epoch": 0.5408852213053263, "grad_norm": 0.29549163579940796, "learning_rate": 9.142052701067741e-05, "loss": 0.3447, "step": 721 }, { "epoch": 0.5416354088522131, "grad_norm": 0.2853567600250244, "learning_rate": 9.13838163642285e-05, "loss": 0.3106, "step": 722 }, { "epoch": 0.5423855963990998, "grad_norm": 0.21336734294891357, "learning_rate": 9.134703474860963e-05, "loss": 0.2213, "step": 723 }, { "epoch": 0.5431357839459865, "grad_norm": 0.22916728258132935, "learning_rate": 9.13101822268977e-05, "loss": 0.2089, "step": 724 }, { "epoch": 0.5438859714928732, "grad_norm": 0.3816883862018585, "learning_rate": 9.127325886229115e-05, "loss": 0.2787, "step": 725 }, { "epoch": 0.5446361590397599, "grad_norm": 0.29940110445022583, "learning_rate": 9.123626471810988e-05, "loss": 0.2561, "step": 726 }, { "epoch": 0.5453863465866466, "grad_norm": 0.29712194204330444, "learning_rate": 9.119919985779521e-05, "loss": 0.309, "step": 727 }, { "epoch": 0.5461365341335334, "grad_norm": 0.25624704360961914, "learning_rate": 9.116206434490976e-05, "loss": 0.3543, "step": 728 }, { "epoch": 0.5468867216804201, "grad_norm": 0.26873889565467834, "learning_rate": 9.112485824313726e-05, "loss": 0.3298, "step": 729 }, { "epoch": 0.5476369092273068, "grad_norm": 0.24775317311286926, "learning_rate": 9.10875816162825e-05, "loss": 0.2725, "step": 730 }, { "epoch": 0.5483870967741935, "grad_norm": 0.2539280652999878, "learning_rate": 9.105023452827121e-05, "loss": 0.258, "step": 731 }, { "epoch": 0.5491372843210802, "grad_norm": 0.3034622371196747, "learning_rate": 9.101281704315002e-05, "loss": 0.2496, "step": 732 }, { "epoch": 0.5498874718679669, "grad_norm": 0.2789207100868225, "learning_rate": 9.097532922508619e-05, "loss": 0.355, "step": 733 }, { "epoch": 0.5506376594148538, "grad_norm": 0.32782891392707825, "learning_rate": 9.093777113836765e-05, "loss": 0.2776, "step": 734 }, { "epoch": 0.5513878469617405, "grad_norm": 0.3012196719646454, "learning_rate": 9.090014284740283e-05, "loss": 0.3481, "step": 735 }, { "epoch": 0.5521380345086272, "grad_norm": 0.22016215324401855, "learning_rate": 9.086244441672052e-05, "loss": 0.265, "step": 736 }, { "epoch": 0.5528882220555139, "grad_norm": 0.27907922863960266, "learning_rate": 9.082467591096982e-05, "loss": 0.2397, "step": 737 }, { "epoch": 0.5536384096024006, "grad_norm": 0.3009335398674011, "learning_rate": 9.078683739492002e-05, "loss": 0.2335, "step": 738 }, { "epoch": 0.5543885971492873, "grad_norm": 0.30347198247909546, "learning_rate": 9.074892893346043e-05, "loss": 0.3381, "step": 739 }, { "epoch": 0.5551387846961741, "grad_norm": 0.28463301062583923, "learning_rate": 9.071095059160035e-05, "loss": 0.2203, "step": 740 }, { "epoch": 0.5558889722430608, "grad_norm": 0.25159740447998047, "learning_rate": 9.067290243446887e-05, "loss": 0.2678, "step": 741 }, { "epoch": 0.5566391597899475, "grad_norm": 0.27210533618927, "learning_rate": 9.063478452731484e-05, "loss": 0.2437, "step": 742 }, { "epoch": 0.5573893473368342, "grad_norm": 0.35878896713256836, "learning_rate": 9.059659693550673e-05, "loss": 0.3248, "step": 743 }, { "epoch": 0.5581395348837209, "grad_norm": 0.24176593124866486, "learning_rate": 9.055833972453249e-05, "loss": 0.3086, "step": 744 }, { "epoch": 0.5588897224306076, "grad_norm": 0.30327391624450684, "learning_rate": 9.052001295999947e-05, "loss": 0.3327, "step": 745 }, { "epoch": 0.5596399099774944, "grad_norm": 0.273722380399704, "learning_rate": 9.048161670763429e-05, "loss": 0.329, "step": 746 }, { "epoch": 0.5603900975243811, "grad_norm": 0.3546513020992279, "learning_rate": 9.044315103328276e-05, "loss": 0.2303, "step": 747 }, { "epoch": 0.5611402850712678, "grad_norm": 0.25301119685173035, "learning_rate": 9.04046160029097e-05, "loss": 0.2643, "step": 748 }, { "epoch": 0.5618904726181545, "grad_norm": 0.2752533257007599, "learning_rate": 9.036601168259893e-05, "loss": 0.232, "step": 749 }, { "epoch": 0.5626406601650412, "grad_norm": 0.25641700625419617, "learning_rate": 9.032733813855301e-05, "loss": 0.3012, "step": 750 }, { "epoch": 0.5633908477119279, "grad_norm": 0.2964957058429718, "learning_rate": 9.02885954370933e-05, "loss": 0.3502, "step": 751 }, { "epoch": 0.5641410352588146, "grad_norm": 0.41227367520332336, "learning_rate": 9.02497836446597e-05, "loss": 0.3433, "step": 752 }, { "epoch": 0.5648912228057015, "grad_norm": 0.28575795888900757, "learning_rate": 9.021090282781059e-05, "loss": 0.2641, "step": 753 }, { "epoch": 0.5656414103525882, "grad_norm": 0.278216153383255, "learning_rate": 9.01719530532228e-05, "loss": 0.326, "step": 754 }, { "epoch": 0.5663915978994749, "grad_norm": 0.2690160572528839, "learning_rate": 9.01329343876913e-05, "loss": 0.246, "step": 755 }, { "epoch": 0.5671417854463616, "grad_norm": 0.2619618773460388, "learning_rate": 9.009384689812928e-05, "loss": 0.2476, "step": 756 }, { "epoch": 0.5678919729932483, "grad_norm": 0.27449509501457214, "learning_rate": 9.005469065156795e-05, "loss": 0.2226, "step": 757 }, { "epoch": 0.568642160540135, "grad_norm": 0.29348188638687134, "learning_rate": 9.00154657151564e-05, "loss": 0.2416, "step": 758 }, { "epoch": 0.5693923480870218, "grad_norm": 0.2771177589893341, "learning_rate": 8.997617215616154e-05, "loss": 0.3042, "step": 759 }, { "epoch": 0.5701425356339085, "grad_norm": 0.32715073227882385, "learning_rate": 8.993681004196797e-05, "loss": 0.2581, "step": 760 }, { "epoch": 0.5708927231807952, "grad_norm": 0.2621978521347046, "learning_rate": 8.989737944007781e-05, "loss": 0.2244, "step": 761 }, { "epoch": 0.5716429107276819, "grad_norm": 0.244574636220932, "learning_rate": 8.985788041811068e-05, "loss": 0.333, "step": 762 }, { "epoch": 0.5723930982745686, "grad_norm": 0.3070138692855835, "learning_rate": 8.981831304380348e-05, "loss": 0.2875, "step": 763 }, { "epoch": 0.5731432858214554, "grad_norm": 0.3449687361717224, "learning_rate": 8.97786773850104e-05, "loss": 0.2056, "step": 764 }, { "epoch": 0.5738934733683421, "grad_norm": 0.24818167090415955, "learning_rate": 8.973897350970269e-05, "loss": 0.2045, "step": 765 }, { "epoch": 0.5746436609152288, "grad_norm": 0.27504634857177734, "learning_rate": 8.969920148596857e-05, "loss": 0.3329, "step": 766 }, { "epoch": 0.5753938484621155, "grad_norm": 0.2633123993873596, "learning_rate": 8.965936138201314e-05, "loss": 0.2901, "step": 767 }, { "epoch": 0.5761440360090022, "grad_norm": 0.2722806930541992, "learning_rate": 8.961945326615829e-05, "loss": 0.2607, "step": 768 }, { "epoch": 0.5768942235558889, "grad_norm": 0.23863765597343445, "learning_rate": 8.957947720684246e-05, "loss": 0.2646, "step": 769 }, { "epoch": 0.5776444111027756, "grad_norm": 0.27626338601112366, "learning_rate": 8.953943327262066e-05, "loss": 0.2729, "step": 770 }, { "epoch": 0.5783945986496624, "grad_norm": 0.26283955574035645, "learning_rate": 8.949932153216434e-05, "loss": 0.2819, "step": 771 }, { "epoch": 0.5791447861965492, "grad_norm": 0.2279903143644333, "learning_rate": 8.945914205426116e-05, "loss": 0.2747, "step": 772 }, { "epoch": 0.5798949737434359, "grad_norm": 0.2379155457019806, "learning_rate": 8.941889490781494e-05, "loss": 0.2495, "step": 773 }, { "epoch": 0.5806451612903226, "grad_norm": 0.3073385953903198, "learning_rate": 8.937858016184563e-05, "loss": 0.2644, "step": 774 }, { "epoch": 0.5813953488372093, "grad_norm": 0.24979422986507416, "learning_rate": 8.933819788548899e-05, "loss": 0.2188, "step": 775 }, { "epoch": 0.582145536384096, "grad_norm": 0.1512099653482437, "learning_rate": 8.92977481479967e-05, "loss": 0.1693, "step": 776 }, { "epoch": 0.5828957239309828, "grad_norm": 0.3009202778339386, "learning_rate": 8.925723101873603e-05, "loss": 0.3092, "step": 777 }, { "epoch": 0.5836459114778695, "grad_norm": 0.24724484980106354, "learning_rate": 8.92166465671899e-05, "loss": 0.195, "step": 778 }, { "epoch": 0.5843960990247562, "grad_norm": 0.2952735722064972, "learning_rate": 8.917599486295664e-05, "loss": 0.2446, "step": 779 }, { "epoch": 0.5851462865716429, "grad_norm": 0.3050011098384857, "learning_rate": 8.913527597574991e-05, "loss": 0.2833, "step": 780 }, { "epoch": 0.5858964741185296, "grad_norm": 0.31963202357292175, "learning_rate": 8.90944899753986e-05, "loss": 0.2435, "step": 781 }, { "epoch": 0.5866466616654163, "grad_norm": 0.37558919191360474, "learning_rate": 8.905363693184668e-05, "loss": 0.3003, "step": 782 }, { "epoch": 0.5873968492123031, "grad_norm": 0.30881357192993164, "learning_rate": 8.901271691515309e-05, "loss": 0.2959, "step": 783 }, { "epoch": 0.5881470367591898, "grad_norm": 0.23410387337207794, "learning_rate": 8.897172999549165e-05, "loss": 0.2259, "step": 784 }, { "epoch": 0.5888972243060765, "grad_norm": 0.24389761686325073, "learning_rate": 8.893067624315088e-05, "loss": 0.2225, "step": 785 }, { "epoch": 0.5896474118529632, "grad_norm": 0.3342288136482239, "learning_rate": 8.888955572853392e-05, "loss": 0.3336, "step": 786 }, { "epoch": 0.5903975993998499, "grad_norm": 0.2823770046234131, "learning_rate": 8.884836852215841e-05, "loss": 0.238, "step": 787 }, { "epoch": 0.5911477869467366, "grad_norm": 0.3033815324306488, "learning_rate": 8.880711469465635e-05, "loss": 0.2496, "step": 788 }, { "epoch": 0.5918979744936234, "grad_norm": 0.24962513148784637, "learning_rate": 8.876579431677398e-05, "loss": 0.2578, "step": 789 }, { "epoch": 0.5926481620405101, "grad_norm": 0.23458802700042725, "learning_rate": 8.87244074593717e-05, "loss": 0.2654, "step": 790 }, { "epoch": 0.5933983495873969, "grad_norm": 0.27628204226493835, "learning_rate": 8.868295419342389e-05, "loss": 0.2126, "step": 791 }, { "epoch": 0.5941485371342836, "grad_norm": 0.335840106010437, "learning_rate": 8.86414345900188e-05, "loss": 0.2563, "step": 792 }, { "epoch": 0.5948987246811703, "grad_norm": 0.32570695877075195, "learning_rate": 8.859984872035849e-05, "loss": 0.2657, "step": 793 }, { "epoch": 0.595648912228057, "grad_norm": 0.25517165660858154, "learning_rate": 8.85581966557586e-05, "loss": 0.2202, "step": 794 }, { "epoch": 0.5963990997749438, "grad_norm": 0.2630554735660553, "learning_rate": 8.851647846764835e-05, "loss": 0.272, "step": 795 }, { "epoch": 0.5971492873218305, "grad_norm": 0.3294072449207306, "learning_rate": 8.847469422757031e-05, "loss": 0.3173, "step": 796 }, { "epoch": 0.5978994748687172, "grad_norm": 0.22169524431228638, "learning_rate": 8.843284400718033e-05, "loss": 0.1874, "step": 797 }, { "epoch": 0.5986496624156039, "grad_norm": 0.3307165205478668, "learning_rate": 8.839092787824743e-05, "loss": 0.2701, "step": 798 }, { "epoch": 0.5993998499624906, "grad_norm": 0.23371872305870056, "learning_rate": 8.834894591265364e-05, "loss": 0.2446, "step": 799 }, { "epoch": 0.6001500375093773, "grad_norm": 0.290356308221817, "learning_rate": 8.830689818239388e-05, "loss": 0.2523, "step": 800 }, { "epoch": 0.6001500375093773, "eval_loss": 0.26833051443099976, "eval_runtime": 8.9047, "eval_samples_per_second": 6.064, "eval_steps_per_second": 1.572, "step": 800 }, { "epoch": 0.6009002250562641, "grad_norm": 0.34519678354263306, "learning_rate": 8.826478475957589e-05, "loss": 0.2472, "step": 801 }, { "epoch": 0.6016504126031508, "grad_norm": 0.21608851850032806, "learning_rate": 8.822260571642005e-05, "loss": 0.2589, "step": 802 }, { "epoch": 0.6024006001500375, "grad_norm": 0.22945556044578552, "learning_rate": 8.818036112525924e-05, "loss": 0.2431, "step": 803 }, { "epoch": 0.6031507876969242, "grad_norm": 0.21437807381153107, "learning_rate": 8.813805105853879e-05, "loss": 0.241, "step": 804 }, { "epoch": 0.6039009752438109, "grad_norm": 0.26581358909606934, "learning_rate": 8.809567558881628e-05, "loss": 0.2223, "step": 805 }, { "epoch": 0.6046511627906976, "grad_norm": 0.28238964080810547, "learning_rate": 8.805323478876149e-05, "loss": 0.2899, "step": 806 }, { "epoch": 0.6054013503375844, "grad_norm": 0.283059298992157, "learning_rate": 8.80107287311562e-05, "loss": 0.2376, "step": 807 }, { "epoch": 0.6061515378844711, "grad_norm": 0.3178575038909912, "learning_rate": 8.796815748889413e-05, "loss": 0.2286, "step": 808 }, { "epoch": 0.6069017254313578, "grad_norm": 0.2260517179965973, "learning_rate": 8.792552113498073e-05, "loss": 0.2545, "step": 809 }, { "epoch": 0.6076519129782446, "grad_norm": 0.3416258990764618, "learning_rate": 8.788281974253318e-05, "loss": 0.2832, "step": 810 }, { "epoch": 0.6084021005251313, "grad_norm": 0.264175683259964, "learning_rate": 8.784005338478017e-05, "loss": 0.2471, "step": 811 }, { "epoch": 0.609152288072018, "grad_norm": 0.30896416306495667, "learning_rate": 8.779722213506178e-05, "loss": 0.268, "step": 812 }, { "epoch": 0.6099024756189048, "grad_norm": 0.28913983702659607, "learning_rate": 8.775432606682937e-05, "loss": 0.3257, "step": 813 }, { "epoch": 0.6106526631657915, "grad_norm": 0.32982000708580017, "learning_rate": 8.77113652536455e-05, "loss": 0.2375, "step": 814 }, { "epoch": 0.6114028507126782, "grad_norm": 0.36035236716270447, "learning_rate": 8.766833976918371e-05, "loss": 0.2613, "step": 815 }, { "epoch": 0.6121530382595649, "grad_norm": 0.2983340620994568, "learning_rate": 8.76252496872285e-05, "loss": 0.2773, "step": 816 }, { "epoch": 0.6129032258064516, "grad_norm": 0.17204448580741882, "learning_rate": 8.758209508167508e-05, "loss": 0.2613, "step": 817 }, { "epoch": 0.6136534133533383, "grad_norm": 0.29837167263031006, "learning_rate": 8.753887602652937e-05, "loss": 0.2662, "step": 818 }, { "epoch": 0.614403600900225, "grad_norm": 0.26355665922164917, "learning_rate": 8.74955925959078e-05, "loss": 0.2409, "step": 819 }, { "epoch": 0.6151537884471118, "grad_norm": 0.27990904450416565, "learning_rate": 8.745224486403718e-05, "loss": 0.2697, "step": 820 }, { "epoch": 0.6159039759939985, "grad_norm": 0.31860336661338806, "learning_rate": 8.74088329052546e-05, "loss": 0.2409, "step": 821 }, { "epoch": 0.6166541635408852, "grad_norm": 0.328995943069458, "learning_rate": 8.73653567940073e-05, "loss": 0.2822, "step": 822 }, { "epoch": 0.6174043510877719, "grad_norm": 0.31269708275794983, "learning_rate": 8.732181660485252e-05, "loss": 0.27, "step": 823 }, { "epoch": 0.6181545386346586, "grad_norm": 0.461117684841156, "learning_rate": 8.727821241245742e-05, "loss": 0.2456, "step": 824 }, { "epoch": 0.6189047261815454, "grad_norm": 0.29472312331199646, "learning_rate": 8.723454429159888e-05, "loss": 0.294, "step": 825 }, { "epoch": 0.6196549137284321, "grad_norm": 0.29525622725486755, "learning_rate": 8.719081231716341e-05, "loss": 0.2652, "step": 826 }, { "epoch": 0.6204051012753188, "grad_norm": 0.24446353316307068, "learning_rate": 8.714701656414708e-05, "loss": 0.2713, "step": 827 }, { "epoch": 0.6211552888222055, "grad_norm": 0.2875005304813385, "learning_rate": 8.710315710765526e-05, "loss": 0.2526, "step": 828 }, { "epoch": 0.6219054763690923, "grad_norm": 0.3742363154888153, "learning_rate": 8.705923402290261e-05, "loss": 0.252, "step": 829 }, { "epoch": 0.622655663915979, "grad_norm": 0.25650063157081604, "learning_rate": 8.701524738521291e-05, "loss": 0.2074, "step": 830 }, { "epoch": 0.6234058514628658, "grad_norm": 0.2750839591026306, "learning_rate": 8.697119727001887e-05, "loss": 0.2688, "step": 831 }, { "epoch": 0.6241560390097525, "grad_norm": 0.2552943229675293, "learning_rate": 8.692708375286217e-05, "loss": 0.2287, "step": 832 }, { "epoch": 0.6249062265566392, "grad_norm": 0.2949307858943939, "learning_rate": 8.688290690939307e-05, "loss": 0.2476, "step": 833 }, { "epoch": 0.6256564141035259, "grad_norm": 0.20152868330478668, "learning_rate": 8.683866681537054e-05, "loss": 0.1892, "step": 834 }, { "epoch": 0.6264066016504126, "grad_norm": 0.26337313652038574, "learning_rate": 8.679436354666202e-05, "loss": 0.2096, "step": 835 }, { "epoch": 0.6271567891972993, "grad_norm": 0.316845178604126, "learning_rate": 8.67499971792432e-05, "loss": 0.2697, "step": 836 }, { "epoch": 0.627906976744186, "grad_norm": 0.2452125996351242, "learning_rate": 8.670556778919805e-05, "loss": 0.2514, "step": 837 }, { "epoch": 0.6286571642910728, "grad_norm": 0.29365503787994385, "learning_rate": 8.666107545271859e-05, "loss": 0.2668, "step": 838 }, { "epoch": 0.6294073518379595, "grad_norm": 0.27912577986717224, "learning_rate": 8.661652024610482e-05, "loss": 0.2482, "step": 839 }, { "epoch": 0.6301575393848462, "grad_norm": 0.2778094410896301, "learning_rate": 8.657190224576453e-05, "loss": 0.2728, "step": 840 }, { "epoch": 0.6309077269317329, "grad_norm": 0.2547931969165802, "learning_rate": 8.652722152821318e-05, "loss": 0.2539, "step": 841 }, { "epoch": 0.6316579144786196, "grad_norm": 0.2820577025413513, "learning_rate": 8.64824781700738e-05, "loss": 0.2166, "step": 842 }, { "epoch": 0.6324081020255063, "grad_norm": 0.28078949451446533, "learning_rate": 8.643767224807685e-05, "loss": 0.3142, "step": 843 }, { "epoch": 0.6331582895723931, "grad_norm": 0.3312782049179077, "learning_rate": 8.639280383906008e-05, "loss": 0.2356, "step": 844 }, { "epoch": 0.6339084771192798, "grad_norm": 0.27808159589767456, "learning_rate": 8.634787301996839e-05, "loss": 0.2136, "step": 845 }, { "epoch": 0.6346586646661665, "grad_norm": 0.2727961242198944, "learning_rate": 8.630287986785368e-05, "loss": 0.1901, "step": 846 }, { "epoch": 0.6354088522130532, "grad_norm": 0.34746474027633667, "learning_rate": 8.625782445987483e-05, "loss": 0.3142, "step": 847 }, { "epoch": 0.63615903975994, "grad_norm": 0.23802009224891663, "learning_rate": 8.621270687329738e-05, "loss": 0.1884, "step": 848 }, { "epoch": 0.6369092273068268, "grad_norm": 0.21691298484802246, "learning_rate": 8.616752718549359e-05, "loss": 0.2158, "step": 849 }, { "epoch": 0.6376594148537135, "grad_norm": 0.3197733163833618, "learning_rate": 8.612228547394215e-05, "loss": 0.2978, "step": 850 }, { "epoch": 0.6384096024006002, "grad_norm": 0.30178794264793396, "learning_rate": 8.607698181622814e-05, "loss": 0.213, "step": 851 }, { "epoch": 0.6391597899474869, "grad_norm": 0.324343204498291, "learning_rate": 8.603161629004287e-05, "loss": 0.2416, "step": 852 }, { "epoch": 0.6399099774943736, "grad_norm": 0.31799623370170593, "learning_rate": 8.598618897318375e-05, "loss": 0.2649, "step": 853 }, { "epoch": 0.6406601650412603, "grad_norm": 0.3543478846549988, "learning_rate": 8.594069994355419e-05, "loss": 0.2132, "step": 854 }, { "epoch": 0.641410352588147, "grad_norm": 0.30568501353263855, "learning_rate": 8.589514927916336e-05, "loss": 0.2947, "step": 855 }, { "epoch": 0.6421605401350338, "grad_norm": 0.2198677510023117, "learning_rate": 8.584953705812615e-05, "loss": 0.2411, "step": 856 }, { "epoch": 0.6429107276819205, "grad_norm": 0.26765403151512146, "learning_rate": 8.580386335866308e-05, "loss": 0.1456, "step": 857 }, { "epoch": 0.6436609152288072, "grad_norm": 0.25498613715171814, "learning_rate": 8.575812825909998e-05, "loss": 0.2451, "step": 858 }, { "epoch": 0.6444111027756939, "grad_norm": 0.32217881083488464, "learning_rate": 8.57123318378681e-05, "loss": 0.3026, "step": 859 }, { "epoch": 0.6451612903225806, "grad_norm": 0.2847852408885956, "learning_rate": 8.566647417350378e-05, "loss": 0.2448, "step": 860 }, { "epoch": 0.6459114778694673, "grad_norm": 0.3279675245285034, "learning_rate": 8.562055534464838e-05, "loss": 0.2553, "step": 861 }, { "epoch": 0.6466616654163541, "grad_norm": 0.29539749026298523, "learning_rate": 8.557457543004819e-05, "loss": 0.3104, "step": 862 }, { "epoch": 0.6474118529632408, "grad_norm": 0.3447539806365967, "learning_rate": 8.552853450855422e-05, "loss": 0.3031, "step": 863 }, { "epoch": 0.6481620405101275, "grad_norm": 0.27442842721939087, "learning_rate": 8.548243265912213e-05, "loss": 0.2461, "step": 864 }, { "epoch": 0.6489122280570142, "grad_norm": 0.27341389656066895, "learning_rate": 8.543626996081202e-05, "loss": 0.2243, "step": 865 }, { "epoch": 0.6496624156039009, "grad_norm": 0.38139811158180237, "learning_rate": 8.539004649278841e-05, "loss": 0.3369, "step": 866 }, { "epoch": 0.6504126031507877, "grad_norm": 0.27363166213035583, "learning_rate": 8.534376233432e-05, "loss": 0.2966, "step": 867 }, { "epoch": 0.6511627906976745, "grad_norm": 0.2441435158252716, "learning_rate": 8.529741756477953e-05, "loss": 0.2409, "step": 868 }, { "epoch": 0.6519129782445612, "grad_norm": 0.22724738717079163, "learning_rate": 8.525101226364374e-05, "loss": 0.2845, "step": 869 }, { "epoch": 0.6526631657914479, "grad_norm": 0.2747040092945099, "learning_rate": 8.520454651049313e-05, "loss": 0.1951, "step": 870 }, { "epoch": 0.6534133533383346, "grad_norm": 0.3179396390914917, "learning_rate": 8.51580203850119e-05, "loss": 0.2267, "step": 871 }, { "epoch": 0.6541635408852213, "grad_norm": 0.3324615955352783, "learning_rate": 8.511143396698781e-05, "loss": 0.2716, "step": 872 }, { "epoch": 0.654913728432108, "grad_norm": 0.358310729265213, "learning_rate": 8.506478733631193e-05, "loss": 0.2745, "step": 873 }, { "epoch": 0.6556639159789948, "grad_norm": 0.2879120409488678, "learning_rate": 8.501808057297866e-05, "loss": 0.2439, "step": 874 }, { "epoch": 0.6564141035258815, "grad_norm": 0.25415655970573425, "learning_rate": 8.49713137570855e-05, "loss": 0.2575, "step": 875 }, { "epoch": 0.6571642910727682, "grad_norm": 0.2501605451107025, "learning_rate": 8.492448696883292e-05, "loss": 0.2446, "step": 876 }, { "epoch": 0.6579144786196549, "grad_norm": 0.33058109879493713, "learning_rate": 8.487760028852427e-05, "loss": 0.3324, "step": 877 }, { "epoch": 0.6586646661665416, "grad_norm": 0.29319003224372864, "learning_rate": 8.483065379656558e-05, "loss": 0.2927, "step": 878 }, { "epoch": 0.6594148537134283, "grad_norm": 0.2485966831445694, "learning_rate": 8.478364757346546e-05, "loss": 0.2174, "step": 879 }, { "epoch": 0.660165041260315, "grad_norm": 0.2414620816707611, "learning_rate": 8.473658169983496e-05, "loss": 0.2303, "step": 880 }, { "epoch": 0.6609152288072018, "grad_norm": 0.2715598940849304, "learning_rate": 8.468945625638742e-05, "loss": 0.2476, "step": 881 }, { "epoch": 0.6616654163540885, "grad_norm": 0.2986600697040558, "learning_rate": 8.464227132393831e-05, "loss": 0.263, "step": 882 }, { "epoch": 0.6624156039009752, "grad_norm": 0.2964460253715515, "learning_rate": 8.459502698340519e-05, "loss": 0.2936, "step": 883 }, { "epoch": 0.6631657914478619, "grad_norm": 0.24268998205661774, "learning_rate": 8.45477233158074e-05, "loss": 0.2434, "step": 884 }, { "epoch": 0.6639159789947486, "grad_norm": 0.22852426767349243, "learning_rate": 8.450036040226612e-05, "loss": 0.222, "step": 885 }, { "epoch": 0.6646661665416355, "grad_norm": 0.24842998385429382, "learning_rate": 8.445293832400402e-05, "loss": 0.2513, "step": 886 }, { "epoch": 0.6654163540885222, "grad_norm": 0.28640031814575195, "learning_rate": 8.440545716234538e-05, "loss": 0.2175, "step": 887 }, { "epoch": 0.6661665416354089, "grad_norm": 0.259583443403244, "learning_rate": 8.435791699871564e-05, "loss": 0.28, "step": 888 }, { "epoch": 0.6669167291822956, "grad_norm": 0.30513501167297363, "learning_rate": 8.431031791464155e-05, "loss": 0.3325, "step": 889 }, { "epoch": 0.6676669167291823, "grad_norm": 0.2527770698070526, "learning_rate": 8.426265999175081e-05, "loss": 0.1847, "step": 890 }, { "epoch": 0.668417104276069, "grad_norm": 0.319124311208725, "learning_rate": 8.421494331177208e-05, "loss": 0.323, "step": 891 }, { "epoch": 0.6691672918229558, "grad_norm": 0.3217097520828247, "learning_rate": 8.41671679565348e-05, "loss": 0.2655, "step": 892 }, { "epoch": 0.6699174793698425, "grad_norm": 0.29677343368530273, "learning_rate": 8.411933400796896e-05, "loss": 0.3703, "step": 893 }, { "epoch": 0.6706676669167292, "grad_norm": 0.31900718808174133, "learning_rate": 8.407144154810509e-05, "loss": 0.3596, "step": 894 }, { "epoch": 0.6714178544636159, "grad_norm": 0.2391609400510788, "learning_rate": 8.402349065907403e-05, "loss": 0.2321, "step": 895 }, { "epoch": 0.6721680420105026, "grad_norm": 0.3079070746898651, "learning_rate": 8.397548142310685e-05, "loss": 0.1943, "step": 896 }, { "epoch": 0.6729182295573893, "grad_norm": 0.23926346004009247, "learning_rate": 8.392741392253465e-05, "loss": 0.2397, "step": 897 }, { "epoch": 0.673668417104276, "grad_norm": 0.35179442167282104, "learning_rate": 8.387928823978846e-05, "loss": 0.2233, "step": 898 }, { "epoch": 0.6744186046511628, "grad_norm": 0.22909726202487946, "learning_rate": 8.383110445739907e-05, "loss": 0.2151, "step": 899 }, { "epoch": 0.6751687921980495, "grad_norm": 0.25112900137901306, "learning_rate": 8.378286265799698e-05, "loss": 0.2804, "step": 900 }, { "epoch": 0.6759189797449362, "grad_norm": 0.2630389630794525, "learning_rate": 8.373456292431206e-05, "loss": 0.2374, "step": 901 }, { "epoch": 0.6766691672918229, "grad_norm": 0.28805676102638245, "learning_rate": 8.368620533917363e-05, "loss": 0.2434, "step": 902 }, { "epoch": 0.6774193548387096, "grad_norm": 0.21624280512332916, "learning_rate": 8.363778998551018e-05, "loss": 0.3084, "step": 903 }, { "epoch": 0.6781695423855963, "grad_norm": 0.2578418552875519, "learning_rate": 8.358931694634928e-05, "loss": 0.2151, "step": 904 }, { "epoch": 0.6789197299324832, "grad_norm": 0.26006704568862915, "learning_rate": 8.35407863048174e-05, "loss": 0.2519, "step": 905 }, { "epoch": 0.6796699174793699, "grad_norm": 0.2581077814102173, "learning_rate": 8.349219814413984e-05, "loss": 0.2621, "step": 906 }, { "epoch": 0.6804201050262566, "grad_norm": 0.3084411025047302, "learning_rate": 8.344355254764049e-05, "loss": 0.2539, "step": 907 }, { "epoch": 0.6811702925731433, "grad_norm": 0.2617936134338379, "learning_rate": 8.339484959874178e-05, "loss": 0.2942, "step": 908 }, { "epoch": 0.68192048012003, "grad_norm": 0.25313594937324524, "learning_rate": 8.334608938096443e-05, "loss": 0.2748, "step": 909 }, { "epoch": 0.6826706676669168, "grad_norm": 0.24148371815681458, "learning_rate": 8.329727197792744e-05, "loss": 0.2272, "step": 910 }, { "epoch": 0.6834208552138035, "grad_norm": 0.21879245340824127, "learning_rate": 8.324839747334787e-05, "loss": 0.1974, "step": 911 }, { "epoch": 0.6841710427606902, "grad_norm": 0.35614484548568726, "learning_rate": 8.319946595104065e-05, "loss": 0.2716, "step": 912 }, { "epoch": 0.6849212303075769, "grad_norm": 0.2506597638130188, "learning_rate": 8.315047749491851e-05, "loss": 0.2038, "step": 913 }, { "epoch": 0.6856714178544636, "grad_norm": 0.2553820013999939, "learning_rate": 8.310143218899187e-05, "loss": 0.2139, "step": 914 }, { "epoch": 0.6864216054013503, "grad_norm": 0.24824173748493195, "learning_rate": 8.305233011736857e-05, "loss": 0.3225, "step": 915 }, { "epoch": 0.687171792948237, "grad_norm": 0.23854871094226837, "learning_rate": 8.300317136425385e-05, "loss": 0.1826, "step": 916 }, { "epoch": 0.6879219804951238, "grad_norm": 0.3177955448627472, "learning_rate": 8.295395601395011e-05, "loss": 0.2729, "step": 917 }, { "epoch": 0.6886721680420105, "grad_norm": 0.29884910583496094, "learning_rate": 8.290468415085683e-05, "loss": 0.2764, "step": 918 }, { "epoch": 0.6894223555888972, "grad_norm": 0.3076704442501068, "learning_rate": 8.285535585947042e-05, "loss": 0.3194, "step": 919 }, { "epoch": 0.6901725431357839, "grad_norm": 0.2731105387210846, "learning_rate": 8.280597122438404e-05, "loss": 0.2367, "step": 920 }, { "epoch": 0.6909227306826706, "grad_norm": 0.3746616542339325, "learning_rate": 8.275653033028745e-05, "loss": 0.3509, "step": 921 }, { "epoch": 0.6916729182295573, "grad_norm": 0.26702332496643066, "learning_rate": 8.270703326196696e-05, "loss": 0.29, "step": 922 }, { "epoch": 0.6924231057764441, "grad_norm": 0.24746784567832947, "learning_rate": 8.265748010430513e-05, "loss": 0.2098, "step": 923 }, { "epoch": 0.6931732933233309, "grad_norm": 0.2709972858428955, "learning_rate": 8.260787094228076e-05, "loss": 0.2246, "step": 924 }, { "epoch": 0.6939234808702176, "grad_norm": 0.26061001420021057, "learning_rate": 8.255820586096867e-05, "loss": 0.3135, "step": 925 }, { "epoch": 0.6946736684171043, "grad_norm": 0.32399269938468933, "learning_rate": 8.25084849455396e-05, "loss": 0.3295, "step": 926 }, { "epoch": 0.695423855963991, "grad_norm": 0.2459634691476822, "learning_rate": 8.245870828126e-05, "loss": 0.1834, "step": 927 }, { "epoch": 0.6961740435108777, "grad_norm": 0.3163287341594696, "learning_rate": 8.240887595349197e-05, "loss": 0.2447, "step": 928 }, { "epoch": 0.6969242310577645, "grad_norm": 0.289944589138031, "learning_rate": 8.235898804769303e-05, "loss": 0.2907, "step": 929 }, { "epoch": 0.6976744186046512, "grad_norm": 0.30337268114089966, "learning_rate": 8.230904464941604e-05, "loss": 0.2985, "step": 930 }, { "epoch": 0.6984246061515379, "grad_norm": 0.271065354347229, "learning_rate": 8.225904584430901e-05, "loss": 0.252, "step": 931 }, { "epoch": 0.6991747936984246, "grad_norm": 0.2904837429523468, "learning_rate": 8.220899171811495e-05, "loss": 0.2492, "step": 932 }, { "epoch": 0.6999249812453113, "grad_norm": 0.2502482235431671, "learning_rate": 8.215888235667176e-05, "loss": 0.2753, "step": 933 }, { "epoch": 0.700675168792198, "grad_norm": 0.40906867384910583, "learning_rate": 8.210871784591207e-05, "loss": 0.3002, "step": 934 }, { "epoch": 0.7014253563390848, "grad_norm": 0.22791269421577454, "learning_rate": 8.205849827186308e-05, "loss": 0.1735, "step": 935 }, { "epoch": 0.7021755438859715, "grad_norm": 0.2665943205356598, "learning_rate": 8.200822372064641e-05, "loss": 0.2111, "step": 936 }, { "epoch": 0.7029257314328582, "grad_norm": 0.30841535329818726, "learning_rate": 8.195789427847796e-05, "loss": 0.2448, "step": 937 }, { "epoch": 0.7036759189797449, "grad_norm": 0.21702636778354645, "learning_rate": 8.190751003166778e-05, "loss": 0.165, "step": 938 }, { "epoch": 0.7044261065266316, "grad_norm": 0.22894582152366638, "learning_rate": 8.185707106661986e-05, "loss": 0.2677, "step": 939 }, { "epoch": 0.7051762940735183, "grad_norm": 0.3047623038291931, "learning_rate": 8.18065774698321e-05, "loss": 0.2932, "step": 940 }, { "epoch": 0.705926481620405, "grad_norm": 0.433017373085022, "learning_rate": 8.175602932789601e-05, "loss": 0.2638, "step": 941 }, { "epoch": 0.7066766691672918, "grad_norm": 0.32063257694244385, "learning_rate": 8.17054267274967e-05, "loss": 0.2583, "step": 942 }, { "epoch": 0.7074268567141786, "grad_norm": 0.4075561463832855, "learning_rate": 8.165476975541264e-05, "loss": 0.263, "step": 943 }, { "epoch": 0.7081770442610653, "grad_norm": 0.321635901927948, "learning_rate": 8.160405849851556e-05, "loss": 0.2313, "step": 944 }, { "epoch": 0.708927231807952, "grad_norm": 0.2614186108112335, "learning_rate": 8.155329304377025e-05, "loss": 0.2543, "step": 945 }, { "epoch": 0.7096774193548387, "grad_norm": 0.25979527831077576, "learning_rate": 8.150247347823448e-05, "loss": 0.2763, "step": 946 }, { "epoch": 0.7104276069017255, "grad_norm": 0.25194698572158813, "learning_rate": 8.145159988905879e-05, "loss": 0.2243, "step": 947 }, { "epoch": 0.7111777944486122, "grad_norm": 0.2760258615016937, "learning_rate": 8.140067236348638e-05, "loss": 0.2639, "step": 948 }, { "epoch": 0.7119279819954989, "grad_norm": 0.2803248465061188, "learning_rate": 8.134969098885294e-05, "loss": 0.3051, "step": 949 }, { "epoch": 0.7126781695423856, "grad_norm": 0.2724289000034332, "learning_rate": 8.129865585258653e-05, "loss": 0.2458, "step": 950 }, { "epoch": 0.7134283570892723, "grad_norm": 0.3022490441799164, "learning_rate": 8.124756704220735e-05, "loss": 0.3454, "step": 951 }, { "epoch": 0.714178544636159, "grad_norm": 0.33581092953681946, "learning_rate": 8.11964246453277e-05, "loss": 0.2579, "step": 952 }, { "epoch": 0.7149287321830458, "grad_norm": 0.26205509901046753, "learning_rate": 8.114522874965174e-05, "loss": 0.2925, "step": 953 }, { "epoch": 0.7156789197299325, "grad_norm": 0.3290042579174042, "learning_rate": 8.10939794429754e-05, "loss": 0.2787, "step": 954 }, { "epoch": 0.7164291072768192, "grad_norm": 0.29176750779151917, "learning_rate": 8.10426768131862e-05, "loss": 0.2299, "step": 955 }, { "epoch": 0.7171792948237059, "grad_norm": 0.21612803637981415, "learning_rate": 8.099132094826308e-05, "loss": 0.2261, "step": 956 }, { "epoch": 0.7179294823705926, "grad_norm": 0.24385301768779755, "learning_rate": 8.093991193627631e-05, "loss": 0.3345, "step": 957 }, { "epoch": 0.7186796699174793, "grad_norm": 0.24991290271282196, "learning_rate": 8.088844986538727e-05, "loss": 0.2025, "step": 958 }, { "epoch": 0.719429857464366, "grad_norm": 0.24456940591335297, "learning_rate": 8.083693482384836e-05, "loss": 0.2581, "step": 959 }, { "epoch": 0.7201800450112528, "grad_norm": 0.2605403661727905, "learning_rate": 8.078536690000278e-05, "loss": 0.2653, "step": 960 }, { "epoch": 0.7209302325581395, "grad_norm": 0.23132088780403137, "learning_rate": 8.073374618228445e-05, "loss": 0.1823, "step": 961 }, { "epoch": 0.7216804201050263, "grad_norm": 0.236076220870018, "learning_rate": 8.068207275921782e-05, "loss": 0.2388, "step": 962 }, { "epoch": 0.722430607651913, "grad_norm": 0.2721904516220093, "learning_rate": 8.063034671941774e-05, "loss": 0.2288, "step": 963 }, { "epoch": 0.7231807951987997, "grad_norm": 0.30902594327926636, "learning_rate": 8.057856815158924e-05, "loss": 0.2452, "step": 964 }, { "epoch": 0.7239309827456865, "grad_norm": 0.26528143882751465, "learning_rate": 8.05267371445275e-05, "loss": 0.2436, "step": 965 }, { "epoch": 0.7246811702925732, "grad_norm": 0.2860455811023712, "learning_rate": 8.047485378711756e-05, "loss": 0.2664, "step": 966 }, { "epoch": 0.7254313578394599, "grad_norm": 0.23292917013168335, "learning_rate": 8.042291816833429e-05, "loss": 0.2097, "step": 967 }, { "epoch": 0.7261815453863466, "grad_norm": 0.3331770598888397, "learning_rate": 8.037093037724216e-05, "loss": 0.2709, "step": 968 }, { "epoch": 0.7269317329332333, "grad_norm": 0.27733784914016724, "learning_rate": 8.031889050299511e-05, "loss": 0.2141, "step": 969 }, { "epoch": 0.72768192048012, "grad_norm": 0.27033716440200806, "learning_rate": 8.02667986348364e-05, "loss": 0.3367, "step": 970 }, { "epoch": 0.7284321080270068, "grad_norm": 0.2400401085615158, "learning_rate": 8.021465486209846e-05, "loss": 0.2257, "step": 971 }, { "epoch": 0.7291822955738935, "grad_norm": 0.22652189433574677, "learning_rate": 8.016245927420272e-05, "loss": 0.2274, "step": 972 }, { "epoch": 0.7299324831207802, "grad_norm": 0.33732274174690247, "learning_rate": 8.011021196065946e-05, "loss": 0.2572, "step": 973 }, { "epoch": 0.7306826706676669, "grad_norm": 0.2512076795101166, "learning_rate": 8.005791301106769e-05, "loss": 0.2736, "step": 974 }, { "epoch": 0.7314328582145536, "grad_norm": 0.25896620750427246, "learning_rate": 8.000556251511498e-05, "loss": 0.3207, "step": 975 }, { "epoch": 0.7321830457614403, "grad_norm": 0.23951098322868347, "learning_rate": 7.995316056257723e-05, "loss": 0.2321, "step": 976 }, { "epoch": 0.732933233308327, "grad_norm": 0.25237876176834106, "learning_rate": 7.990070724331866e-05, "loss": 0.2611, "step": 977 }, { "epoch": 0.7336834208552138, "grad_norm": 0.28328949213027954, "learning_rate": 7.984820264729156e-05, "loss": 0.2512, "step": 978 }, { "epoch": 0.7344336084021005, "grad_norm": 0.2247048020362854, "learning_rate": 7.979564686453612e-05, "loss": 0.2632, "step": 979 }, { "epoch": 0.7351837959489872, "grad_norm": 0.2904275059700012, "learning_rate": 7.974303998518031e-05, "loss": 0.3359, "step": 980 }, { "epoch": 0.735933983495874, "grad_norm": 0.2315652072429657, "learning_rate": 7.96903820994398e-05, "loss": 0.256, "step": 981 }, { "epoch": 0.7366841710427607, "grad_norm": 0.2848539650440216, "learning_rate": 7.963767329761762e-05, "loss": 0.2929, "step": 982 }, { "epoch": 0.7374343585896475, "grad_norm": 0.22695893049240112, "learning_rate": 7.958491367010423e-05, "loss": 0.2178, "step": 983 }, { "epoch": 0.7381845461365342, "grad_norm": 0.24621844291687012, "learning_rate": 7.953210330737718e-05, "loss": 0.2559, "step": 984 }, { "epoch": 0.7389347336834209, "grad_norm": 0.2657919228076935, "learning_rate": 7.947924230000102e-05, "loss": 0.2682, "step": 985 }, { "epoch": 0.7396849212303076, "grad_norm": 0.22935669124126434, "learning_rate": 7.942633073862718e-05, "loss": 0.2669, "step": 986 }, { "epoch": 0.7404351087771943, "grad_norm": 0.28396075963974, "learning_rate": 7.937336871399379e-05, "loss": 0.3223, "step": 987 }, { "epoch": 0.741185296324081, "grad_norm": 0.32090240716934204, "learning_rate": 7.932035631692549e-05, "loss": 0.2472, "step": 988 }, { "epoch": 0.7419354838709677, "grad_norm": 0.27513542771339417, "learning_rate": 7.926729363833335e-05, "loss": 0.2672, "step": 989 }, { "epoch": 0.7426856714178545, "grad_norm": 0.31878089904785156, "learning_rate": 7.921418076921461e-05, "loss": 0.2614, "step": 990 }, { "epoch": 0.7434358589647412, "grad_norm": 0.2965988218784332, "learning_rate": 7.916101780065263e-05, "loss": 0.2689, "step": 991 }, { "epoch": 0.7441860465116279, "grad_norm": 0.311419278383255, "learning_rate": 7.910780482381665e-05, "loss": 0.2473, "step": 992 }, { "epoch": 0.7449362340585146, "grad_norm": 0.36029312014579773, "learning_rate": 7.905454192996169e-05, "loss": 0.2344, "step": 993 }, { "epoch": 0.7456864216054013, "grad_norm": 0.3848575949668884, "learning_rate": 7.900122921042837e-05, "loss": 0.3087, "step": 994 }, { "epoch": 0.746436609152288, "grad_norm": 0.3068178594112396, "learning_rate": 7.894786675664277e-05, "loss": 0.2743, "step": 995 }, { "epoch": 0.7471867966991748, "grad_norm": 0.24920549988746643, "learning_rate": 7.88944546601162e-05, "loss": 0.2853, "step": 996 }, { "epoch": 0.7479369842460615, "grad_norm": 0.31349292397499084, "learning_rate": 7.884099301244519e-05, "loss": 0.2586, "step": 997 }, { "epoch": 0.7486871717929482, "grad_norm": 0.21929319202899933, "learning_rate": 7.878748190531118e-05, "loss": 0.246, "step": 998 }, { "epoch": 0.7494373593398349, "grad_norm": 0.26347535848617554, "learning_rate": 7.873392143048047e-05, "loss": 0.2614, "step": 999 }, { "epoch": 0.7501875468867217, "grad_norm": 0.29184189438819885, "learning_rate": 7.868031167980397e-05, "loss": 0.2713, "step": 1000 }, { "epoch": 0.7501875468867217, "eval_loss": 0.26416653394699097, "eval_runtime": 8.8819, "eval_samples_per_second": 6.08, "eval_steps_per_second": 1.576, "step": 1000 }, { "epoch": 0.7509377344336085, "grad_norm": 0.3440553843975067, "learning_rate": 7.862665274521712e-05, "loss": 0.2905, "step": 1001 }, { "epoch": 0.7516879219804952, "grad_norm": 0.30128660798072815, "learning_rate": 7.857294471873975e-05, "loss": 0.2207, "step": 1002 }, { "epoch": 0.7524381095273819, "grad_norm": 0.31816792488098145, "learning_rate": 7.851918769247582e-05, "loss": 0.3119, "step": 1003 }, { "epoch": 0.7531882970742686, "grad_norm": 0.2909788191318512, "learning_rate": 7.846538175861332e-05, "loss": 0.2661, "step": 1004 }, { "epoch": 0.7539384846211553, "grad_norm": 0.27092036604881287, "learning_rate": 7.841152700942413e-05, "loss": 0.24, "step": 1005 }, { "epoch": 0.754688672168042, "grad_norm": 0.2513846158981323, "learning_rate": 7.835762353726386e-05, "loss": 0.3248, "step": 1006 }, { "epoch": 0.7554388597149287, "grad_norm": 0.2131604552268982, "learning_rate": 7.830367143457165e-05, "loss": 0.284, "step": 1007 }, { "epoch": 0.7561890472618155, "grad_norm": 0.25292253494262695, "learning_rate": 7.824967079387002e-05, "loss": 0.2109, "step": 1008 }, { "epoch": 0.7569392348087022, "grad_norm": 0.3120124638080597, "learning_rate": 7.81956217077648e-05, "loss": 0.2341, "step": 1009 }, { "epoch": 0.7576894223555889, "grad_norm": 0.2651812732219696, "learning_rate": 7.814152426894478e-05, "loss": 0.2467, "step": 1010 }, { "epoch": 0.7584396099024756, "grad_norm": 0.24526168406009674, "learning_rate": 7.808737857018182e-05, "loss": 0.2539, "step": 1011 }, { "epoch": 0.7591897974493623, "grad_norm": 0.2588599920272827, "learning_rate": 7.803318470433042e-05, "loss": 0.1995, "step": 1012 }, { "epoch": 0.759939984996249, "grad_norm": 0.29522594809532166, "learning_rate": 7.797894276432772e-05, "loss": 0.2959, "step": 1013 }, { "epoch": 0.7606901725431358, "grad_norm": 0.2584953308105469, "learning_rate": 7.792465284319332e-05, "loss": 0.3029, "step": 1014 }, { "epoch": 0.7614403600900225, "grad_norm": 0.26034507155418396, "learning_rate": 7.787031503402907e-05, "loss": 0.248, "step": 1015 }, { "epoch": 0.7621905476369092, "grad_norm": 0.335578054189682, "learning_rate": 7.781592943001899e-05, "loss": 0.3122, "step": 1016 }, { "epoch": 0.7629407351837959, "grad_norm": 0.26463964581489563, "learning_rate": 7.776149612442899e-05, "loss": 0.3212, "step": 1017 }, { "epoch": 0.7636909227306826, "grad_norm": 0.22560244798660278, "learning_rate": 7.770701521060688e-05, "loss": 0.2458, "step": 1018 }, { "epoch": 0.7644411102775694, "grad_norm": 0.2402961254119873, "learning_rate": 7.765248678198203e-05, "loss": 0.1538, "step": 1019 }, { "epoch": 0.7651912978244562, "grad_norm": 0.24556198716163635, "learning_rate": 7.759791093206534e-05, "loss": 0.2735, "step": 1020 }, { "epoch": 0.7659414853713429, "grad_norm": 0.3314375877380371, "learning_rate": 7.754328775444903e-05, "loss": 0.2326, "step": 1021 }, { "epoch": 0.7666916729182296, "grad_norm": 0.32800862193107605, "learning_rate": 7.748861734280643e-05, "loss": 0.3225, "step": 1022 }, { "epoch": 0.7674418604651163, "grad_norm": 0.3003426194190979, "learning_rate": 7.743389979089196e-05, "loss": 0.3309, "step": 1023 }, { "epoch": 0.768192048012003, "grad_norm": 0.27405473589897156, "learning_rate": 7.737913519254079e-05, "loss": 0.2536, "step": 1024 }, { "epoch": 0.7689422355588897, "grad_norm": 0.26868969202041626, "learning_rate": 7.732432364166884e-05, "loss": 0.2806, "step": 1025 }, { "epoch": 0.7696924231057765, "grad_norm": 0.3379047214984894, "learning_rate": 7.726946523227251e-05, "loss": 0.2465, "step": 1026 }, { "epoch": 0.7704426106526632, "grad_norm": 0.31727099418640137, "learning_rate": 7.721456005842861e-05, "loss": 0.3189, "step": 1027 }, { "epoch": 0.7711927981995499, "grad_norm": 0.2712079882621765, "learning_rate": 7.715960821429404e-05, "loss": 0.2469, "step": 1028 }, { "epoch": 0.7719429857464366, "grad_norm": 0.2974828779697418, "learning_rate": 7.710460979410585e-05, "loss": 0.2515, "step": 1029 }, { "epoch": 0.7726931732933233, "grad_norm": 0.376665860414505, "learning_rate": 7.704956489218091e-05, "loss": 0.2736, "step": 1030 }, { "epoch": 0.77344336084021, "grad_norm": 0.34059056639671326, "learning_rate": 7.699447360291576e-05, "loss": 0.2851, "step": 1031 }, { "epoch": 0.7741935483870968, "grad_norm": 0.3448140025138855, "learning_rate": 7.69393360207866e-05, "loss": 0.2469, "step": 1032 }, { "epoch": 0.7749437359339835, "grad_norm": 0.24518583714962006, "learning_rate": 7.688415224034893e-05, "loss": 0.2051, "step": 1033 }, { "epoch": 0.7756939234808702, "grad_norm": 0.22350285947322845, "learning_rate": 7.682892235623749e-05, "loss": 0.2652, "step": 1034 }, { "epoch": 0.7764441110277569, "grad_norm": 0.29983019828796387, "learning_rate": 7.67736464631661e-05, "loss": 0.2462, "step": 1035 }, { "epoch": 0.7771942985746436, "grad_norm": 0.2750382721424103, "learning_rate": 7.671832465592746e-05, "loss": 0.1932, "step": 1036 }, { "epoch": 0.7779444861215303, "grad_norm": 0.2465767115354538, "learning_rate": 7.666295702939305e-05, "loss": 0.237, "step": 1037 }, { "epoch": 0.7786946736684172, "grad_norm": 0.25808414816856384, "learning_rate": 7.660754367851286e-05, "loss": 0.2019, "step": 1038 }, { "epoch": 0.7794448612153039, "grad_norm": 0.2413925975561142, "learning_rate": 7.655208469831536e-05, "loss": 0.2228, "step": 1039 }, { "epoch": 0.7801950487621906, "grad_norm": 0.37567564845085144, "learning_rate": 7.649658018390725e-05, "loss": 0.3298, "step": 1040 }, { "epoch": 0.7809452363090773, "grad_norm": 0.22226759791374207, "learning_rate": 7.644103023047327e-05, "loss": 0.2855, "step": 1041 }, { "epoch": 0.781695423855964, "grad_norm": 0.2970743775367737, "learning_rate": 7.638543493327613e-05, "loss": 0.2134, "step": 1042 }, { "epoch": 0.7824456114028507, "grad_norm": 0.2538881003856659, "learning_rate": 7.63297943876563e-05, "loss": 0.2424, "step": 1043 }, { "epoch": 0.7831957989497375, "grad_norm": 0.2798883020877838, "learning_rate": 7.627410868903184e-05, "loss": 0.3043, "step": 1044 }, { "epoch": 0.7839459864966242, "grad_norm": 0.2579958140850067, "learning_rate": 7.621837793289824e-05, "loss": 0.2347, "step": 1045 }, { "epoch": 0.7846961740435109, "grad_norm": 0.24593886733055115, "learning_rate": 7.616260221482825e-05, "loss": 0.2434, "step": 1046 }, { "epoch": 0.7854463615903976, "grad_norm": 0.32042503356933594, "learning_rate": 7.610678163047174e-05, "loss": 0.2967, "step": 1047 }, { "epoch": 0.7861965491372843, "grad_norm": 0.2840851843357086, "learning_rate": 7.60509162755555e-05, "loss": 0.265, "step": 1048 }, { "epoch": 0.786946736684171, "grad_norm": 0.2497265487909317, "learning_rate": 7.599500624588314e-05, "loss": 0.1897, "step": 1049 }, { "epoch": 0.7876969242310577, "grad_norm": 0.21462185680866241, "learning_rate": 7.593905163733484e-05, "loss": 0.2307, "step": 1050 }, { "epoch": 0.7884471117779445, "grad_norm": 0.3675479590892792, "learning_rate": 7.588305254586724e-05, "loss": 0.3776, "step": 1051 }, { "epoch": 0.7891972993248312, "grad_norm": 0.26289623975753784, "learning_rate": 7.582700906751325e-05, "loss": 0.2615, "step": 1052 }, { "epoch": 0.7899474868717179, "grad_norm": 0.26494017243385315, "learning_rate": 7.577092129838197e-05, "loss": 0.2265, "step": 1053 }, { "epoch": 0.7906976744186046, "grad_norm": 0.22506411373615265, "learning_rate": 7.571478933465836e-05, "loss": 0.2708, "step": 1054 }, { "epoch": 0.7914478619654913, "grad_norm": 0.3298030495643616, "learning_rate": 7.565861327260322e-05, "loss": 0.2711, "step": 1055 }, { "epoch": 0.792198049512378, "grad_norm": 0.3718823194503784, "learning_rate": 7.560239320855296e-05, "loss": 0.3884, "step": 1056 }, { "epoch": 0.7929482370592649, "grad_norm": 0.36073753237724304, "learning_rate": 7.554612923891946e-05, "loss": 0.2833, "step": 1057 }, { "epoch": 0.7936984246061516, "grad_norm": 0.33475157618522644, "learning_rate": 7.548982146018988e-05, "loss": 0.2591, "step": 1058 }, { "epoch": 0.7944486121530383, "grad_norm": 0.37227001786231995, "learning_rate": 7.543346996892654e-05, "loss": 0.3286, "step": 1059 }, { "epoch": 0.795198799699925, "grad_norm": 0.31355008482933044, "learning_rate": 7.537707486176667e-05, "loss": 0.2025, "step": 1060 }, { "epoch": 0.7959489872468117, "grad_norm": 0.26683029532432556, "learning_rate": 7.532063623542231e-05, "loss": 0.2674, "step": 1061 }, { "epoch": 0.7966991747936985, "grad_norm": 0.3263293504714966, "learning_rate": 7.52641541866802e-05, "loss": 0.2357, "step": 1062 }, { "epoch": 0.7974493623405852, "grad_norm": 0.3189033269882202, "learning_rate": 7.520762881240147e-05, "loss": 0.3415, "step": 1063 }, { "epoch": 0.7981995498874719, "grad_norm": 0.2913949489593506, "learning_rate": 7.515106020952156e-05, "loss": 0.2247, "step": 1064 }, { "epoch": 0.7989497374343586, "grad_norm": 0.27122050523757935, "learning_rate": 7.509444847505005e-05, "loss": 0.2832, "step": 1065 }, { "epoch": 0.7996999249812453, "grad_norm": 0.29842230677604675, "learning_rate": 7.503779370607049e-05, "loss": 0.2772, "step": 1066 }, { "epoch": 0.800450112528132, "grad_norm": 0.25025674700737, "learning_rate": 7.498109599974024e-05, "loss": 0.2357, "step": 1067 }, { "epoch": 0.8012003000750187, "grad_norm": 0.25898095965385437, "learning_rate": 7.49243554532903e-05, "loss": 0.3134, "step": 1068 }, { "epoch": 0.8019504876219055, "grad_norm": 0.22235994040966034, "learning_rate": 7.486757216402509e-05, "loss": 0.1637, "step": 1069 }, { "epoch": 0.8027006751687922, "grad_norm": 0.3074110448360443, "learning_rate": 7.481074622932236e-05, "loss": 0.2169, "step": 1070 }, { "epoch": 0.8034508627156789, "grad_norm": 0.3002496063709259, "learning_rate": 7.475387774663302e-05, "loss": 0.1921, "step": 1071 }, { "epoch": 0.8042010502625656, "grad_norm": 0.26039600372314453, "learning_rate": 7.469696681348088e-05, "loss": 0.2323, "step": 1072 }, { "epoch": 0.8049512378094523, "grad_norm": 0.4236557185649872, "learning_rate": 7.464001352746263e-05, "loss": 0.3752, "step": 1073 }, { "epoch": 0.805701425356339, "grad_norm": 0.23451805114746094, "learning_rate": 7.45830179862475e-05, "loss": 0.2218, "step": 1074 }, { "epoch": 0.8064516129032258, "grad_norm": 0.22463178634643555, "learning_rate": 7.452598028757729e-05, "loss": 0.1871, "step": 1075 }, { "epoch": 0.8072018004501126, "grad_norm": 0.33912789821624756, "learning_rate": 7.446890052926598e-05, "loss": 0.2543, "step": 1076 }, { "epoch": 0.8079519879969993, "grad_norm": 0.24198795855045319, "learning_rate": 7.441177880919976e-05, "loss": 0.1981, "step": 1077 }, { "epoch": 0.808702175543886, "grad_norm": 0.34430012106895447, "learning_rate": 7.435461522533674e-05, "loss": 0.2374, "step": 1078 }, { "epoch": 0.8094523630907727, "grad_norm": 0.37674805521965027, "learning_rate": 7.429740987570686e-05, "loss": 0.2688, "step": 1079 }, { "epoch": 0.8102025506376594, "grad_norm": 0.3720530569553375, "learning_rate": 7.424016285841165e-05, "loss": 0.2712, "step": 1080 }, { "epoch": 0.8109527381845462, "grad_norm": 0.21259364485740662, "learning_rate": 7.41828742716241e-05, "loss": 0.3053, "step": 1081 }, { "epoch": 0.8117029257314329, "grad_norm": 0.32373929023742676, "learning_rate": 7.41255442135885e-05, "loss": 0.2953, "step": 1082 }, { "epoch": 0.8124531132783196, "grad_norm": 0.30722734332084656, "learning_rate": 7.406817278262027e-05, "loss": 0.2333, "step": 1083 }, { "epoch": 0.8132033008252063, "grad_norm": 0.2306872308254242, "learning_rate": 7.401076007710575e-05, "loss": 0.2608, "step": 1084 }, { "epoch": 0.813953488372093, "grad_norm": 0.2906799614429474, "learning_rate": 7.395330619550207e-05, "loss": 0.2595, "step": 1085 }, { "epoch": 0.8147036759189797, "grad_norm": 0.31546592712402344, "learning_rate": 7.3895811236337e-05, "loss": 0.251, "step": 1086 }, { "epoch": 0.8154538634658665, "grad_norm": 0.28102412819862366, "learning_rate": 7.38382752982087e-05, "loss": 0.2525, "step": 1087 }, { "epoch": 0.8162040510127532, "grad_norm": 0.4119090735912323, "learning_rate": 7.378069847978568e-05, "loss": 0.3066, "step": 1088 }, { "epoch": 0.8169542385596399, "grad_norm": 0.3258497714996338, "learning_rate": 7.372308087980647e-05, "loss": 0.3176, "step": 1089 }, { "epoch": 0.8177044261065266, "grad_norm": 0.28876015543937683, "learning_rate": 7.366542259707962e-05, "loss": 0.2148, "step": 1090 }, { "epoch": 0.8184546136534133, "grad_norm": 0.27026915550231934, "learning_rate": 7.360772373048338e-05, "loss": 0.2472, "step": 1091 }, { "epoch": 0.8192048012003, "grad_norm": 0.32744088768959045, "learning_rate": 7.354998437896565e-05, "loss": 0.2961, "step": 1092 }, { "epoch": 0.8199549887471868, "grad_norm": 0.2807987928390503, "learning_rate": 7.349220464154371e-05, "loss": 0.2197, "step": 1093 }, { "epoch": 0.8207051762940735, "grad_norm": 0.28608086705207825, "learning_rate": 7.343438461730411e-05, "loss": 0.3032, "step": 1094 }, { "epoch": 0.8214553638409603, "grad_norm": 0.23416754603385925, "learning_rate": 7.337652440540252e-05, "loss": 0.3419, "step": 1095 }, { "epoch": 0.822205551387847, "grad_norm": 0.2615034282207489, "learning_rate": 7.331862410506353e-05, "loss": 0.2539, "step": 1096 }, { "epoch": 0.8229557389347337, "grad_norm": 0.2757169306278229, "learning_rate": 7.32606838155804e-05, "loss": 0.2266, "step": 1097 }, { "epoch": 0.8237059264816204, "grad_norm": 0.27804407477378845, "learning_rate": 7.320270363631505e-05, "loss": 0.2481, "step": 1098 }, { "epoch": 0.8244561140285072, "grad_norm": 0.3440892994403839, "learning_rate": 7.314468366669777e-05, "loss": 0.3103, "step": 1099 }, { "epoch": 0.8252063015753939, "grad_norm": 0.2704513967037201, "learning_rate": 7.30866240062271e-05, "loss": 0.1665, "step": 1100 }, { "epoch": 0.8259564891222806, "grad_norm": 0.23068906366825104, "learning_rate": 7.302852475446963e-05, "loss": 0.2824, "step": 1101 }, { "epoch": 0.8267066766691673, "grad_norm": 0.23840470612049103, "learning_rate": 7.297038601105988e-05, "loss": 0.2282, "step": 1102 }, { "epoch": 0.827456864216054, "grad_norm": 0.28294798731803894, "learning_rate": 7.291220787570005e-05, "loss": 0.216, "step": 1103 }, { "epoch": 0.8282070517629407, "grad_norm": 0.2676393389701843, "learning_rate": 7.28539904481599e-05, "loss": 0.2478, "step": 1104 }, { "epoch": 0.8289572393098275, "grad_norm": 0.17676688730716705, "learning_rate": 7.279573382827662e-05, "loss": 0.2181, "step": 1105 }, { "epoch": 0.8297074268567142, "grad_norm": 0.24498018622398376, "learning_rate": 7.273743811595454e-05, "loss": 0.2442, "step": 1106 }, { "epoch": 0.8304576144036009, "grad_norm": 0.28664886951446533, "learning_rate": 7.267910341116512e-05, "loss": 0.2597, "step": 1107 }, { "epoch": 0.8312078019504876, "grad_norm": 0.26148414611816406, "learning_rate": 7.262072981394656e-05, "loss": 0.1969, "step": 1108 }, { "epoch": 0.8319579894973743, "grad_norm": 0.2754013240337372, "learning_rate": 7.256231742440389e-05, "loss": 0.2706, "step": 1109 }, { "epoch": 0.832708177044261, "grad_norm": 0.2794703245162964, "learning_rate": 7.25038663427086e-05, "loss": 0.2606, "step": 1110 }, { "epoch": 0.8334583645911477, "grad_norm": 0.31487202644348145, "learning_rate": 7.24453766690985e-05, "loss": 0.1782, "step": 1111 }, { "epoch": 0.8342085521380345, "grad_norm": 0.2968665361404419, "learning_rate": 7.238684850387765e-05, "loss": 0.2316, "step": 1112 }, { "epoch": 0.8349587396849212, "grad_norm": 0.2559606432914734, "learning_rate": 7.232828194741611e-05, "loss": 0.245, "step": 1113 }, { "epoch": 0.8357089272318079, "grad_norm": 0.27775752544403076, "learning_rate": 7.226967710014971e-05, "loss": 0.2604, "step": 1114 }, { "epoch": 0.8364591147786947, "grad_norm": 0.26683878898620605, "learning_rate": 7.221103406258003e-05, "loss": 0.2017, "step": 1115 }, { "epoch": 0.8372093023255814, "grad_norm": 0.25626155734062195, "learning_rate": 7.215235293527409e-05, "loss": 0.2776, "step": 1116 }, { "epoch": 0.8379594898724682, "grad_norm": 0.27310711145401, "learning_rate": 7.209363381886423e-05, "loss": 0.292, "step": 1117 }, { "epoch": 0.8387096774193549, "grad_norm": 0.2527594566345215, "learning_rate": 7.203487681404798e-05, "loss": 0.2395, "step": 1118 }, { "epoch": 0.8394598649662416, "grad_norm": 0.2429451048374176, "learning_rate": 7.19760820215878e-05, "loss": 0.2424, "step": 1119 }, { "epoch": 0.8402100525131283, "grad_norm": 0.18625757098197937, "learning_rate": 7.191724954231098e-05, "loss": 0.1674, "step": 1120 }, { "epoch": 0.840960240060015, "grad_norm": 0.1871057152748108, "learning_rate": 7.185837947710943e-05, "loss": 0.2047, "step": 1121 }, { "epoch": 0.8417104276069017, "grad_norm": 0.23940734565258026, "learning_rate": 7.17994719269395e-05, "loss": 0.2301, "step": 1122 }, { "epoch": 0.8424606151537885, "grad_norm": 0.24877877533435822, "learning_rate": 7.174052699282183e-05, "loss": 0.2979, "step": 1123 }, { "epoch": 0.8432108027006752, "grad_norm": 0.2933540940284729, "learning_rate": 7.168154477584123e-05, "loss": 0.2547, "step": 1124 }, { "epoch": 0.8439609902475619, "grad_norm": 0.3226040303707123, "learning_rate": 7.162252537714633e-05, "loss": 0.2286, "step": 1125 }, { "epoch": 0.8447111777944486, "grad_norm": 0.3179830312728882, "learning_rate": 7.156346889794962e-05, "loss": 0.3145, "step": 1126 }, { "epoch": 0.8454613653413353, "grad_norm": 0.3769233226776123, "learning_rate": 7.150437543952715e-05, "loss": 0.3069, "step": 1127 }, { "epoch": 0.846211552888222, "grad_norm": 0.19375602900981903, "learning_rate": 7.144524510321837e-05, "loss": 0.241, "step": 1128 }, { "epoch": 0.8469617404351087, "grad_norm": 0.2979907691478729, "learning_rate": 7.138607799042598e-05, "loss": 0.2922, "step": 1129 }, { "epoch": 0.8477119279819955, "grad_norm": 0.2792617380619049, "learning_rate": 7.132687420261576e-05, "loss": 0.2185, "step": 1130 }, { "epoch": 0.8484621155288822, "grad_norm": 0.33341464400291443, "learning_rate": 7.126763384131638e-05, "loss": 0.3246, "step": 1131 }, { "epoch": 0.8492123030757689, "grad_norm": 0.243754044175148, "learning_rate": 7.120835700811923e-05, "loss": 0.2949, "step": 1132 }, { "epoch": 0.8499624906226556, "grad_norm": 0.22130246460437775, "learning_rate": 7.114904380467823e-05, "loss": 0.2456, "step": 1133 }, { "epoch": 0.8507126781695424, "grad_norm": 0.2978169620037079, "learning_rate": 7.108969433270968e-05, "loss": 0.2645, "step": 1134 }, { "epoch": 0.8514628657164292, "grad_norm": 0.25530266761779785, "learning_rate": 7.10303086939921e-05, "loss": 0.2555, "step": 1135 }, { "epoch": 0.8522130532633159, "grad_norm": 0.30415529012680054, "learning_rate": 7.097088699036598e-05, "loss": 0.3204, "step": 1136 }, { "epoch": 0.8529632408102026, "grad_norm": 0.2876017093658447, "learning_rate": 7.091142932373371e-05, "loss": 0.2134, "step": 1137 }, { "epoch": 0.8537134283570893, "grad_norm": 0.30672404170036316, "learning_rate": 7.085193579605935e-05, "loss": 0.2726, "step": 1138 }, { "epoch": 0.854463615903976, "grad_norm": 0.21835702657699585, "learning_rate": 7.079240650936843e-05, "loss": 0.2365, "step": 1139 }, { "epoch": 0.8552138034508627, "grad_norm": 0.21209684014320374, "learning_rate": 7.07328415657478e-05, "loss": 0.1858, "step": 1140 }, { "epoch": 0.8559639909977494, "grad_norm": 0.33733153343200684, "learning_rate": 7.067324106734548e-05, "loss": 0.3017, "step": 1141 }, { "epoch": 0.8567141785446362, "grad_norm": 0.32906374335289, "learning_rate": 7.061360511637045e-05, "loss": 0.2123, "step": 1142 }, { "epoch": 0.8574643660915229, "grad_norm": 0.2283150851726532, "learning_rate": 7.055393381509253e-05, "loss": 0.2146, "step": 1143 }, { "epoch": 0.8582145536384096, "grad_norm": 0.2341356873512268, "learning_rate": 7.049422726584206e-05, "loss": 0.277, "step": 1144 }, { "epoch": 0.8589647411852963, "grad_norm": 0.2874937951564789, "learning_rate": 7.043448557100995e-05, "loss": 0.2869, "step": 1145 }, { "epoch": 0.859714928732183, "grad_norm": 0.2907133400440216, "learning_rate": 7.037470883304731e-05, "loss": 0.2044, "step": 1146 }, { "epoch": 0.8604651162790697, "grad_norm": 0.26814940571784973, "learning_rate": 7.031489715446535e-05, "loss": 0.2609, "step": 1147 }, { "epoch": 0.8612153038259565, "grad_norm": 0.303264856338501, "learning_rate": 7.02550506378352e-05, "loss": 0.3456, "step": 1148 }, { "epoch": 0.8619654913728432, "grad_norm": 0.28112706542015076, "learning_rate": 7.019516938578777e-05, "loss": 0.2156, "step": 1149 }, { "epoch": 0.8627156789197299, "grad_norm": 0.30424219369888306, "learning_rate": 7.013525350101348e-05, "loss": 0.2671, "step": 1150 }, { "epoch": 0.8634658664666166, "grad_norm": 0.23765511810779572, "learning_rate": 7.00753030862622e-05, "loss": 0.2211, "step": 1151 }, { "epoch": 0.8642160540135033, "grad_norm": 0.22158914804458618, "learning_rate": 7.001531824434299e-05, "loss": 0.2422, "step": 1152 }, { "epoch": 0.8649662415603901, "grad_norm": 0.35320600867271423, "learning_rate": 6.995529907812391e-05, "loss": 0.2431, "step": 1153 }, { "epoch": 0.8657164291072769, "grad_norm": 0.22154858708381653, "learning_rate": 6.989524569053196e-05, "loss": 0.2126, "step": 1154 }, { "epoch": 0.8664666166541636, "grad_norm": 0.31103944778442383, "learning_rate": 6.983515818455275e-05, "loss": 0.1509, "step": 1155 }, { "epoch": 0.8672168042010503, "grad_norm": 0.32523053884506226, "learning_rate": 6.977503666323048e-05, "loss": 0.2811, "step": 1156 }, { "epoch": 0.867966991747937, "grad_norm": 0.3360098600387573, "learning_rate": 6.971488122966758e-05, "loss": 0.3739, "step": 1157 }, { "epoch": 0.8687171792948237, "grad_norm": 0.2449420690536499, "learning_rate": 6.965469198702475e-05, "loss": 0.2496, "step": 1158 }, { "epoch": 0.8694673668417104, "grad_norm": 0.2142074555158615, "learning_rate": 6.95944690385206e-05, "loss": 0.193, "step": 1159 }, { "epoch": 0.8702175543885972, "grad_norm": 0.29441049695014954, "learning_rate": 6.953421248743154e-05, "loss": 0.3053, "step": 1160 }, { "epoch": 0.8709677419354839, "grad_norm": 0.32524898648262024, "learning_rate": 6.947392243709163e-05, "loss": 0.2367, "step": 1161 }, { "epoch": 0.8717179294823706, "grad_norm": 0.2872486412525177, "learning_rate": 6.941359899089238e-05, "loss": 0.1867, "step": 1162 }, { "epoch": 0.8724681170292573, "grad_norm": 0.22578471899032593, "learning_rate": 6.935324225228254e-05, "loss": 0.3038, "step": 1163 }, { "epoch": 0.873218304576144, "grad_norm": 0.2831272780895233, "learning_rate": 6.929285232476797e-05, "loss": 0.2382, "step": 1164 }, { "epoch": 0.8739684921230307, "grad_norm": 0.2321145236492157, "learning_rate": 6.923242931191148e-05, "loss": 0.2151, "step": 1165 }, { "epoch": 0.8747186796699175, "grad_norm": 0.3738405406475067, "learning_rate": 6.917197331733257e-05, "loss": 0.2743, "step": 1166 }, { "epoch": 0.8754688672168042, "grad_norm": 0.2705344557762146, "learning_rate": 6.911148444470731e-05, "loss": 0.2608, "step": 1167 }, { "epoch": 0.8762190547636909, "grad_norm": 0.27389052510261536, "learning_rate": 6.905096279776819e-05, "loss": 0.2705, "step": 1168 }, { "epoch": 0.8769692423105776, "grad_norm": 0.17423346638679504, "learning_rate": 6.899040848030384e-05, "loss": 0.1937, "step": 1169 }, { "epoch": 0.8777194298574643, "grad_norm": 0.23306900262832642, "learning_rate": 6.892982159615895e-05, "loss": 0.2464, "step": 1170 }, { "epoch": 0.878469617404351, "grad_norm": 0.3401232063770294, "learning_rate": 6.886920224923408e-05, "loss": 0.2743, "step": 1171 }, { "epoch": 0.8792198049512379, "grad_norm": 0.27245357632637024, "learning_rate": 6.880855054348543e-05, "loss": 0.1972, "step": 1172 }, { "epoch": 0.8799699924981246, "grad_norm": 0.24177445471286774, "learning_rate": 6.874786658292472e-05, "loss": 0.202, "step": 1173 }, { "epoch": 0.8807201800450113, "grad_norm": 0.2444036602973938, "learning_rate": 6.868715047161896e-05, "loss": 0.229, "step": 1174 }, { "epoch": 0.881470367591898, "grad_norm": 0.29559674859046936, "learning_rate": 6.862640231369029e-05, "loss": 0.2985, "step": 1175 }, { "epoch": 0.8822205551387847, "grad_norm": 0.23746810853481293, "learning_rate": 6.856562221331581e-05, "loss": 0.2351, "step": 1176 }, { "epoch": 0.8829707426856714, "grad_norm": 0.2819749414920807, "learning_rate": 6.850481027472743e-05, "loss": 0.3295, "step": 1177 }, { "epoch": 0.8837209302325582, "grad_norm": 0.2149581015110016, "learning_rate": 6.844396660221164e-05, "loss": 0.2116, "step": 1178 }, { "epoch": 0.8844711177794449, "grad_norm": 0.2274528592824936, "learning_rate": 6.838309130010933e-05, "loss": 0.2069, "step": 1179 }, { "epoch": 0.8852213053263316, "grad_norm": 0.25971266627311707, "learning_rate": 6.832218447281566e-05, "loss": 0.2757, "step": 1180 }, { "epoch": 0.8859714928732183, "grad_norm": 0.22781336307525635, "learning_rate": 6.826124622477981e-05, "loss": 0.2717, "step": 1181 }, { "epoch": 0.886721680420105, "grad_norm": 0.3223855793476105, "learning_rate": 6.820027666050493e-05, "loss": 0.2021, "step": 1182 }, { "epoch": 0.8874718679669917, "grad_norm": 0.3653661012649536, "learning_rate": 6.813927588454778e-05, "loss": 0.314, "step": 1183 }, { "epoch": 0.8882220555138785, "grad_norm": 0.2787640690803528, "learning_rate": 6.80782440015187e-05, "loss": 0.2692, "step": 1184 }, { "epoch": 0.8889722430607652, "grad_norm": 0.3384714424610138, "learning_rate": 6.801718111608133e-05, "loss": 0.2606, "step": 1185 }, { "epoch": 0.8897224306076519, "grad_norm": 0.29886987805366516, "learning_rate": 6.795608733295254e-05, "loss": 0.3186, "step": 1186 }, { "epoch": 0.8904726181545386, "grad_norm": 0.21844054758548737, "learning_rate": 6.789496275690215e-05, "loss": 0.2071, "step": 1187 }, { "epoch": 0.8912228057014253, "grad_norm": 0.2613053619861603, "learning_rate": 6.783380749275277e-05, "loss": 0.2384, "step": 1188 }, { "epoch": 0.891972993248312, "grad_norm": 0.21696613729000092, "learning_rate": 6.777262164537966e-05, "loss": 0.2299, "step": 1189 }, { "epoch": 0.8927231807951987, "grad_norm": 0.27892565727233887, "learning_rate": 6.771140531971054e-05, "loss": 0.2445, "step": 1190 }, { "epoch": 0.8934733683420856, "grad_norm": 0.28729599714279175, "learning_rate": 6.765015862072536e-05, "loss": 0.23, "step": 1191 }, { "epoch": 0.8942235558889723, "grad_norm": 0.22692418098449707, "learning_rate": 6.758888165345619e-05, "loss": 0.2369, "step": 1192 }, { "epoch": 0.894973743435859, "grad_norm": 0.31195831298828125, "learning_rate": 6.7527574522987e-05, "loss": 0.2981, "step": 1193 }, { "epoch": 0.8957239309827457, "grad_norm": 0.2865544855594635, "learning_rate": 6.746623733445346e-05, "loss": 0.2101, "step": 1194 }, { "epoch": 0.8964741185296324, "grad_norm": 0.2803462743759155, "learning_rate": 6.740487019304282e-05, "loss": 0.2789, "step": 1195 }, { "epoch": 0.8972243060765192, "grad_norm": 0.2356065958738327, "learning_rate": 6.734347320399369e-05, "loss": 0.2803, "step": 1196 }, { "epoch": 0.8979744936234059, "grad_norm": 0.17872647941112518, "learning_rate": 6.728204647259586e-05, "loss": 0.2303, "step": 1197 }, { "epoch": 0.8987246811702926, "grad_norm": 0.26151472330093384, "learning_rate": 6.722059010419013e-05, "loss": 0.2759, "step": 1198 }, { "epoch": 0.8994748687171793, "grad_norm": 0.3112211525440216, "learning_rate": 6.715910420416809e-05, "loss": 0.3342, "step": 1199 }, { "epoch": 0.900225056264066, "grad_norm": 0.271685391664505, "learning_rate": 6.709758887797205e-05, "loss": 0.2373, "step": 1200 }, { "epoch": 0.900225056264066, "eval_loss": 0.2598603069782257, "eval_runtime": 8.9181, "eval_samples_per_second": 6.055, "eval_steps_per_second": 1.57, "step": 1200 }, { "epoch": 0.9009752438109527, "grad_norm": 0.36755096912384033, "learning_rate": 6.703604423109468e-05, "loss": 0.3079, "step": 1201 }, { "epoch": 0.9017254313578394, "grad_norm": 0.24592749774456024, "learning_rate": 6.697447036907904e-05, "loss": 0.206, "step": 1202 }, { "epoch": 0.9024756189047262, "grad_norm": 0.20737814903259277, "learning_rate": 6.691286739751824e-05, "loss": 0.1791, "step": 1203 }, { "epoch": 0.9032258064516129, "grad_norm": 0.2510600686073303, "learning_rate": 6.685123542205526e-05, "loss": 0.2507, "step": 1204 }, { "epoch": 0.9039759939984996, "grad_norm": 0.2614128887653351, "learning_rate": 6.678957454838292e-05, "loss": 0.2241, "step": 1205 }, { "epoch": 0.9047261815453863, "grad_norm": 0.34144389629364014, "learning_rate": 6.672788488224352e-05, "loss": 0.2477, "step": 1206 }, { "epoch": 0.905476369092273, "grad_norm": 0.3296656906604767, "learning_rate": 6.666616652942878e-05, "loss": 0.2297, "step": 1207 }, { "epoch": 0.9062265566391597, "grad_norm": 0.2896101176738739, "learning_rate": 6.660441959577958e-05, "loss": 0.2248, "step": 1208 }, { "epoch": 0.9069767441860465, "grad_norm": 0.2831351161003113, "learning_rate": 6.654264418718584e-05, "loss": 0.3077, "step": 1209 }, { "epoch": 0.9077269317329333, "grad_norm": 0.342795729637146, "learning_rate": 6.64808404095863e-05, "loss": 0.2984, "step": 1210 }, { "epoch": 0.90847711927982, "grad_norm": 0.2785213589668274, "learning_rate": 6.641900836896835e-05, "loss": 0.2479, "step": 1211 }, { "epoch": 0.9092273068267067, "grad_norm": 0.2722512483596802, "learning_rate": 6.635714817136785e-05, "loss": 0.2104, "step": 1212 }, { "epoch": 0.9099774943735934, "grad_norm": 0.24673239886760712, "learning_rate": 6.629525992286898e-05, "loss": 0.2659, "step": 1213 }, { "epoch": 0.9107276819204801, "grad_norm": 0.34213677048683167, "learning_rate": 6.623334372960393e-05, "loss": 0.3561, "step": 1214 }, { "epoch": 0.9114778694673669, "grad_norm": 0.25264623761177063, "learning_rate": 6.617139969775292e-05, "loss": 0.195, "step": 1215 }, { "epoch": 0.9122280570142536, "grad_norm": 0.3294694721698761, "learning_rate": 6.610942793354387e-05, "loss": 0.2046, "step": 1216 }, { "epoch": 0.9129782445611403, "grad_norm": 0.28122833371162415, "learning_rate": 6.604742854325222e-05, "loss": 0.2408, "step": 1217 }, { "epoch": 0.913728432108027, "grad_norm": 0.3307669758796692, "learning_rate": 6.598540163320084e-05, "loss": 0.2677, "step": 1218 }, { "epoch": 0.9144786196549137, "grad_norm": 0.30026087164878845, "learning_rate": 6.592334730975975e-05, "loss": 0.2768, "step": 1219 }, { "epoch": 0.9152288072018004, "grad_norm": 0.3331785798072815, "learning_rate": 6.586126567934605e-05, "loss": 0.3338, "step": 1220 }, { "epoch": 0.9159789947486872, "grad_norm": 0.257973849773407, "learning_rate": 6.57991568484236e-05, "loss": 0.2231, "step": 1221 }, { "epoch": 0.9167291822955739, "grad_norm": 0.29099372029304504, "learning_rate": 6.573702092350292e-05, "loss": 0.273, "step": 1222 }, { "epoch": 0.9174793698424606, "grad_norm": 0.3015498220920563, "learning_rate": 6.567485801114099e-05, "loss": 0.2315, "step": 1223 }, { "epoch": 0.9182295573893473, "grad_norm": 0.25092631578445435, "learning_rate": 6.561266821794111e-05, "loss": 0.1749, "step": 1224 }, { "epoch": 0.918979744936234, "grad_norm": 0.27579358220100403, "learning_rate": 6.555045165055263e-05, "loss": 0.3009, "step": 1225 }, { "epoch": 0.9197299324831207, "grad_norm": 0.2755884528160095, "learning_rate": 6.548820841567086e-05, "loss": 0.2425, "step": 1226 }, { "epoch": 0.9204801200300075, "grad_norm": 0.2558373808860779, "learning_rate": 6.54259386200368e-05, "loss": 0.2306, "step": 1227 }, { "epoch": 0.9212303075768942, "grad_norm": 0.29067474603652954, "learning_rate": 6.536364237043703e-05, "loss": 0.2378, "step": 1228 }, { "epoch": 0.921980495123781, "grad_norm": 0.26360759139060974, "learning_rate": 6.530131977370348e-05, "loss": 0.2353, "step": 1229 }, { "epoch": 0.9227306826706677, "grad_norm": 0.2438310980796814, "learning_rate": 6.523897093671326e-05, "loss": 0.2077, "step": 1230 }, { "epoch": 0.9234808702175544, "grad_norm": 0.2443327158689499, "learning_rate": 6.51765959663885e-05, "loss": 0.2597, "step": 1231 }, { "epoch": 0.9242310577644411, "grad_norm": 0.3418941795825958, "learning_rate": 6.511419496969612e-05, "loss": 0.2763, "step": 1232 }, { "epoch": 0.9249812453113279, "grad_norm": 0.2596498131752014, "learning_rate": 6.505176805364767e-05, "loss": 0.235, "step": 1233 }, { "epoch": 0.9257314328582146, "grad_norm": 0.2161298394203186, "learning_rate": 6.498931532529921e-05, "loss": 0.2537, "step": 1234 }, { "epoch": 0.9264816204051013, "grad_norm": 0.27278441190719604, "learning_rate": 6.492683689175098e-05, "loss": 0.1916, "step": 1235 }, { "epoch": 0.927231807951988, "grad_norm": 0.285061776638031, "learning_rate": 6.486433286014734e-05, "loss": 0.1955, "step": 1236 }, { "epoch": 0.9279819954988747, "grad_norm": 0.26830658316612244, "learning_rate": 6.480180333767658e-05, "loss": 0.2397, "step": 1237 }, { "epoch": 0.9287321830457614, "grad_norm": 0.2572755515575409, "learning_rate": 6.473924843157065e-05, "loss": 0.193, "step": 1238 }, { "epoch": 0.9294823705926482, "grad_norm": 0.2306939661502838, "learning_rate": 6.467666824910505e-05, "loss": 0.2779, "step": 1239 }, { "epoch": 0.9302325581395349, "grad_norm": 0.22929872572422028, "learning_rate": 6.461406289759862e-05, "loss": 0.2361, "step": 1240 }, { "epoch": 0.9309827456864216, "grad_norm": 0.33751994371414185, "learning_rate": 6.455143248441342e-05, "loss": 0.2717, "step": 1241 }, { "epoch": 0.9317329332333083, "grad_norm": 0.23822185397148132, "learning_rate": 6.44887771169544e-05, "loss": 0.1773, "step": 1242 }, { "epoch": 0.932483120780195, "grad_norm": 0.3233415484428406, "learning_rate": 6.442609690266937e-05, "loss": 0.2439, "step": 1243 }, { "epoch": 0.9332333083270817, "grad_norm": 0.40737825632095337, "learning_rate": 6.436339194904872e-05, "loss": 0.2608, "step": 1244 }, { "epoch": 0.9339834958739685, "grad_norm": 0.2380070835351944, "learning_rate": 6.430066236362524e-05, "loss": 0.2062, "step": 1245 }, { "epoch": 0.9347336834208552, "grad_norm": 0.27065709233283997, "learning_rate": 6.423790825397404e-05, "loss": 0.2224, "step": 1246 }, { "epoch": 0.9354838709677419, "grad_norm": 0.2996170222759247, "learning_rate": 6.417512972771219e-05, "loss": 0.1962, "step": 1247 }, { "epoch": 0.9362340585146287, "grad_norm": 0.32225918769836426, "learning_rate": 6.411232689249873e-05, "loss": 0.2565, "step": 1248 }, { "epoch": 0.9369842460615154, "grad_norm": 0.33466240763664246, "learning_rate": 6.40494998560343e-05, "loss": 0.2385, "step": 1249 }, { "epoch": 0.9377344336084021, "grad_norm": 0.3060718774795532, "learning_rate": 6.39866487260611e-05, "loss": 0.2307, "step": 1250 }, { "epoch": 0.9384846211552889, "grad_norm": 0.214109405875206, "learning_rate": 6.392377361036262e-05, "loss": 0.1679, "step": 1251 }, { "epoch": 0.9392348087021756, "grad_norm": 0.20549508929252625, "learning_rate": 6.386087461676351e-05, "loss": 0.273, "step": 1252 }, { "epoch": 0.9399849962490623, "grad_norm": 0.23964238166809082, "learning_rate": 6.379795185312933e-05, "loss": 0.2535, "step": 1253 }, { "epoch": 0.940735183795949, "grad_norm": 0.3147329092025757, "learning_rate": 6.373500542736643e-05, "loss": 0.2692, "step": 1254 }, { "epoch": 0.9414853713428357, "grad_norm": 0.2459629476070404, "learning_rate": 6.367203544742171e-05, "loss": 0.1808, "step": 1255 }, { "epoch": 0.9422355588897224, "grad_norm": 0.3093070685863495, "learning_rate": 6.360904202128252e-05, "loss": 0.237, "step": 1256 }, { "epoch": 0.9429857464366092, "grad_norm": 0.3342586159706116, "learning_rate": 6.354602525697638e-05, "loss": 0.259, "step": 1257 }, { "epoch": 0.9437359339834959, "grad_norm": 0.2829574644565582, "learning_rate": 6.348298526257082e-05, "loss": 0.2552, "step": 1258 }, { "epoch": 0.9444861215303826, "grad_norm": 0.3656161427497864, "learning_rate": 6.341992214617323e-05, "loss": 0.289, "step": 1259 }, { "epoch": 0.9452363090772693, "grad_norm": 0.23185093700885773, "learning_rate": 6.335683601593062e-05, "loss": 0.2433, "step": 1260 }, { "epoch": 0.945986496624156, "grad_norm": 0.22676430642604828, "learning_rate": 6.329372698002954e-05, "loss": 0.2134, "step": 1261 }, { "epoch": 0.9467366841710427, "grad_norm": 0.31350377202033997, "learning_rate": 6.323059514669571e-05, "loss": 0.287, "step": 1262 }, { "epoch": 0.9474868717179294, "grad_norm": 0.30896708369255066, "learning_rate": 6.316744062419409e-05, "loss": 0.2961, "step": 1263 }, { "epoch": 0.9482370592648162, "grad_norm": 0.2619584798812866, "learning_rate": 6.310426352082838e-05, "loss": 0.264, "step": 1264 }, { "epoch": 0.9489872468117029, "grad_norm": 0.25245243310928345, "learning_rate": 6.304106394494116e-05, "loss": 0.2124, "step": 1265 }, { "epoch": 0.9497374343585896, "grad_norm": 0.27320316433906555, "learning_rate": 6.297784200491343e-05, "loss": 0.2831, "step": 1266 }, { "epoch": 0.9504876219054764, "grad_norm": 0.33607780933380127, "learning_rate": 6.291459780916463e-05, "loss": 0.2788, "step": 1267 }, { "epoch": 0.9512378094523631, "grad_norm": 0.2973000705242157, "learning_rate": 6.285133146615228e-05, "loss": 0.2681, "step": 1268 }, { "epoch": 0.9519879969992499, "grad_norm": 0.29907122254371643, "learning_rate": 6.278804308437198e-05, "loss": 0.2634, "step": 1269 }, { "epoch": 0.9527381845461366, "grad_norm": 0.33518245816230774, "learning_rate": 6.272473277235703e-05, "loss": 0.2505, "step": 1270 }, { "epoch": 0.9534883720930233, "grad_norm": 0.2538456916809082, "learning_rate": 6.266140063867843e-05, "loss": 0.2678, "step": 1271 }, { "epoch": 0.95423855963991, "grad_norm": 0.21485751867294312, "learning_rate": 6.25980467919445e-05, "loss": 0.2727, "step": 1272 }, { "epoch": 0.9549887471867967, "grad_norm": 0.2709319591522217, "learning_rate": 6.253467134080088e-05, "loss": 0.2432, "step": 1273 }, { "epoch": 0.9557389347336834, "grad_norm": 0.30399203300476074, "learning_rate": 6.247127439393023e-05, "loss": 0.2487, "step": 1274 }, { "epoch": 0.9564891222805701, "grad_norm": 0.23047830164432526, "learning_rate": 6.240785606005206e-05, "loss": 0.3044, "step": 1275 }, { "epoch": 0.9572393098274569, "grad_norm": 0.25030508637428284, "learning_rate": 6.234441644792256e-05, "loss": 0.2194, "step": 1276 }, { "epoch": 0.9579894973743436, "grad_norm": 0.3227004110813141, "learning_rate": 6.228095566633443e-05, "loss": 0.3042, "step": 1277 }, { "epoch": 0.9587396849212303, "grad_norm": 0.2680698335170746, "learning_rate": 6.221747382411667e-05, "loss": 0.1731, "step": 1278 }, { "epoch": 0.959489872468117, "grad_norm": 0.3874387741088867, "learning_rate": 6.215397103013436e-05, "loss": 0.2465, "step": 1279 }, { "epoch": 0.9602400600150037, "grad_norm": 0.2634330987930298, "learning_rate": 6.209044739328858e-05, "loss": 0.2374, "step": 1280 }, { "epoch": 0.9609902475618904, "grad_norm": 0.24593226611614227, "learning_rate": 6.202690302251606e-05, "loss": 0.2324, "step": 1281 }, { "epoch": 0.9617404351087772, "grad_norm": 0.2061801552772522, "learning_rate": 6.196333802678918e-05, "loss": 0.1982, "step": 1282 }, { "epoch": 0.9624906226556639, "grad_norm": 0.33368608355522156, "learning_rate": 6.189975251511562e-05, "loss": 0.227, "step": 1283 }, { "epoch": 0.9632408102025506, "grad_norm": 0.2840694487094879, "learning_rate": 6.18361465965383e-05, "loss": 0.2915, "step": 1284 }, { "epoch": 0.9639909977494373, "grad_norm": 0.3025999367237091, "learning_rate": 6.177252038013509e-05, "loss": 0.2788, "step": 1285 }, { "epoch": 0.9647411852963241, "grad_norm": 0.20990639925003052, "learning_rate": 6.170887397501868e-05, "loss": 0.2055, "step": 1286 }, { "epoch": 0.9654913728432108, "grad_norm": 0.32666540145874023, "learning_rate": 6.16452074903364e-05, "loss": 0.3379, "step": 1287 }, { "epoch": 0.9662415603900976, "grad_norm": 0.33180001378059387, "learning_rate": 6.158152103527e-05, "loss": 0.2663, "step": 1288 }, { "epoch": 0.9669917479369843, "grad_norm": 0.2985075116157532, "learning_rate": 6.151781471903548e-05, "loss": 0.2519, "step": 1289 }, { "epoch": 0.967741935483871, "grad_norm": 0.27229228615760803, "learning_rate": 6.14540886508829e-05, "loss": 0.2966, "step": 1290 }, { "epoch": 0.9684921230307577, "grad_norm": 0.28812769055366516, "learning_rate": 6.139034294009617e-05, "loss": 0.2897, "step": 1291 }, { "epoch": 0.9692423105776444, "grad_norm": 0.2783651351928711, "learning_rate": 6.132657769599293e-05, "loss": 0.2035, "step": 1292 }, { "epoch": 0.9699924981245311, "grad_norm": 0.3414267897605896, "learning_rate": 6.126279302792429e-05, "loss": 0.2734, "step": 1293 }, { "epoch": 0.9707426856714179, "grad_norm": 0.2517719566822052, "learning_rate": 6.119898904527468e-05, "loss": 0.2525, "step": 1294 }, { "epoch": 0.9714928732183046, "grad_norm": 0.23830890655517578, "learning_rate": 6.113516585746164e-05, "loss": 0.2661, "step": 1295 }, { "epoch": 0.9722430607651913, "grad_norm": 0.2728438377380371, "learning_rate": 6.107132357393563e-05, "loss": 0.2573, "step": 1296 }, { "epoch": 0.972993248312078, "grad_norm": 0.3221016824245453, "learning_rate": 6.100746230417993e-05, "loss": 0.2056, "step": 1297 }, { "epoch": 0.9737434358589647, "grad_norm": 0.2769758701324463, "learning_rate": 6.0943582157710285e-05, "loss": 0.2128, "step": 1298 }, { "epoch": 0.9744936234058514, "grad_norm": 0.3126924932003021, "learning_rate": 6.0879683244074894e-05, "loss": 0.2755, "step": 1299 }, { "epoch": 0.9752438109527382, "grad_norm": 0.3210766911506653, "learning_rate": 6.0815765672854065e-05, "loss": 0.2112, "step": 1300 }, { "epoch": 0.9759939984996249, "grad_norm": 0.20561034977436066, "learning_rate": 6.0751829553660155e-05, "loss": 0.2048, "step": 1301 }, { "epoch": 0.9767441860465116, "grad_norm": 0.19861342012882233, "learning_rate": 6.06878749961373e-05, "loss": 0.2706, "step": 1302 }, { "epoch": 0.9774943735933983, "grad_norm": 0.2890970706939697, "learning_rate": 6.0623902109961295e-05, "loss": 0.23, "step": 1303 }, { "epoch": 0.978244561140285, "grad_norm": 0.30523234605789185, "learning_rate": 6.055991100483932e-05, "loss": 0.3465, "step": 1304 }, { "epoch": 0.9789947486871718, "grad_norm": 0.3009622395038605, "learning_rate": 6.0495901790509836e-05, "loss": 0.3458, "step": 1305 }, { "epoch": 0.9797449362340586, "grad_norm": 0.25227123498916626, "learning_rate": 6.043187457674231e-05, "loss": 0.2125, "step": 1306 }, { "epoch": 0.9804951237809453, "grad_norm": 0.3086055517196655, "learning_rate": 6.0367829473337136e-05, "loss": 0.3105, "step": 1307 }, { "epoch": 0.981245311327832, "grad_norm": 0.26335278153419495, "learning_rate": 6.0303766590125365e-05, "loss": 0.1658, "step": 1308 }, { "epoch": 0.9819954988747187, "grad_norm": 0.22805826365947723, "learning_rate": 6.02396860369685e-05, "loss": 0.2507, "step": 1309 }, { "epoch": 0.9827456864216054, "grad_norm": 0.2978416085243225, "learning_rate": 6.0175587923758416e-05, "loss": 0.2122, "step": 1310 }, { "epoch": 0.9834958739684921, "grad_norm": 0.34618857502937317, "learning_rate": 6.0111472360417044e-05, "loss": 0.2802, "step": 1311 }, { "epoch": 0.9842460615153789, "grad_norm": 0.32733967900276184, "learning_rate": 6.004733945689628e-05, "loss": 0.2413, "step": 1312 }, { "epoch": 0.9849962490622656, "grad_norm": 0.22149665653705597, "learning_rate": 5.998318932317771e-05, "loss": 0.2147, "step": 1313 }, { "epoch": 0.9857464366091523, "grad_norm": 0.41897204518318176, "learning_rate": 5.991902206927252e-05, "loss": 0.2688, "step": 1314 }, { "epoch": 0.986496624156039, "grad_norm": 0.2500593662261963, "learning_rate": 5.985483780522122e-05, "loss": 0.2292, "step": 1315 }, { "epoch": 0.9872468117029257, "grad_norm": 0.32280322909355164, "learning_rate": 5.9790636641093523e-05, "loss": 0.3009, "step": 1316 }, { "epoch": 0.9879969992498124, "grad_norm": 0.27554023265838623, "learning_rate": 5.972641868698805e-05, "loss": 0.2485, "step": 1317 }, { "epoch": 0.9887471867966992, "grad_norm": 0.3174436092376709, "learning_rate": 5.966218405303234e-05, "loss": 0.2725, "step": 1318 }, { "epoch": 0.9894973743435859, "grad_norm": 0.2680257260799408, "learning_rate": 5.959793284938242e-05, "loss": 0.2764, "step": 1319 }, { "epoch": 0.9902475618904726, "grad_norm": 0.3296760320663452, "learning_rate": 5.953366518622279e-05, "loss": 0.3521, "step": 1320 }, { "epoch": 0.9909977494373593, "grad_norm": 0.25383758544921875, "learning_rate": 5.946938117376616e-05, "loss": 0.2864, "step": 1321 }, { "epoch": 0.991747936984246, "grad_norm": 0.2520928382873535, "learning_rate": 5.940508092225328e-05, "loss": 0.2333, "step": 1322 }, { "epoch": 0.9924981245311327, "grad_norm": 0.30443039536476135, "learning_rate": 5.9340764541952755e-05, "loss": 0.2918, "step": 1323 }, { "epoch": 0.9932483120780196, "grad_norm": 0.2217380404472351, "learning_rate": 5.9276432143160835e-05, "loss": 0.2491, "step": 1324 }, { "epoch": 0.9939984996249063, "grad_norm": 0.29230329394340515, "learning_rate": 5.921208383620126e-05, "loss": 0.2034, "step": 1325 }, { "epoch": 0.994748687171793, "grad_norm": 0.40205276012420654, "learning_rate": 5.9147719731425034e-05, "loss": 0.2485, "step": 1326 }, { "epoch": 0.9954988747186797, "grad_norm": 0.23298341035842896, "learning_rate": 5.908333993921027e-05, "loss": 0.281, "step": 1327 }, { "epoch": 0.9962490622655664, "grad_norm": 0.19305676221847534, "learning_rate": 5.901894456996196e-05, "loss": 0.2128, "step": 1328 }, { "epoch": 0.9969992498124531, "grad_norm": 0.3036314845085144, "learning_rate": 5.895453373411182e-05, "loss": 0.3395, "step": 1329 }, { "epoch": 0.9977494373593399, "grad_norm": 0.27853089570999146, "learning_rate": 5.889010754211809e-05, "loss": 0.3564, "step": 1330 }, { "epoch": 0.9984996249062266, "grad_norm": 0.2961403429508209, "learning_rate": 5.882566610446534e-05, "loss": 0.2251, "step": 1331 }, { "epoch": 0.9992498124531133, "grad_norm": 0.3144397735595703, "learning_rate": 5.8761209531664306e-05, "loss": 0.3625, "step": 1332 }, { "epoch": 1.0, "grad_norm": 0.2760504186153412, "learning_rate": 5.869673793425168e-05, "loss": 0.1879, "step": 1333 }, { "epoch": 1.0007501875468867, "grad_norm": 0.33402565121650696, "learning_rate": 5.863225142278985e-05, "loss": 0.1947, "step": 1334 }, { "epoch": 1.0015003750937734, "grad_norm": 0.28173115849494934, "learning_rate": 5.856775010786687e-05, "loss": 0.2051, "step": 1335 }, { "epoch": 1.0022505626406601, "grad_norm": 0.27882272005081177, "learning_rate": 5.850323410009614e-05, "loss": 0.1844, "step": 1336 }, { "epoch": 1.0030007501875469, "grad_norm": 0.32667404413223267, "learning_rate": 5.8438703510116256e-05, "loss": 0.2003, "step": 1337 }, { "epoch": 1.0037509377344336, "grad_norm": 0.26928389072418213, "learning_rate": 5.8374158448590823e-05, "loss": 0.2062, "step": 1338 }, { "epoch": 1.0045011252813203, "grad_norm": 0.23898983001708984, "learning_rate": 5.830959902620826e-05, "loss": 0.2232, "step": 1339 }, { "epoch": 1.005251312828207, "grad_norm": 0.3427899181842804, "learning_rate": 5.824502535368164e-05, "loss": 0.1574, "step": 1340 }, { "epoch": 1.0060015003750937, "grad_norm": 0.22753161191940308, "learning_rate": 5.818043754174843e-05, "loss": 0.1787, "step": 1341 }, { "epoch": 1.0067516879219804, "grad_norm": 0.3660281002521515, "learning_rate": 5.81158357011704e-05, "loss": 0.2594, "step": 1342 }, { "epoch": 1.0075018754688672, "grad_norm": 0.2773469090461731, "learning_rate": 5.80512199427333e-05, "loss": 0.1739, "step": 1343 }, { "epoch": 1.0082520630157539, "grad_norm": 0.3064981698989868, "learning_rate": 5.798659037724683e-05, "loss": 0.1924, "step": 1344 }, { "epoch": 1.0090022505626406, "grad_norm": 0.24929401278495789, "learning_rate": 5.792194711554429e-05, "loss": 0.2109, "step": 1345 }, { "epoch": 1.0097524381095273, "grad_norm": 0.2475636899471283, "learning_rate": 5.7857290268482555e-05, "loss": 0.2571, "step": 1346 }, { "epoch": 1.010502625656414, "grad_norm": 0.30824771523475647, "learning_rate": 5.779261994694173e-05, "loss": 0.1635, "step": 1347 }, { "epoch": 1.0112528132033007, "grad_norm": 0.4603348970413208, "learning_rate": 5.772793626182506e-05, "loss": 0.2863, "step": 1348 }, { "epoch": 1.0120030007501875, "grad_norm": 0.3176836669445038, "learning_rate": 5.766323932405866e-05, "loss": 0.2291, "step": 1349 }, { "epoch": 1.0127531882970742, "grad_norm": 0.36271700263023376, "learning_rate": 5.7598529244591436e-05, "loss": 0.25, "step": 1350 }, { "epoch": 1.0135033758439609, "grad_norm": 0.3972218334674835, "learning_rate": 5.7533806134394806e-05, "loss": 0.2172, "step": 1351 }, { "epoch": 1.0142535633908478, "grad_norm": 0.3361879289150238, "learning_rate": 5.746907010446252e-05, "loss": 0.2055, "step": 1352 }, { "epoch": 1.0150037509377345, "grad_norm": 0.2929416000843048, "learning_rate": 5.740432126581049e-05, "loss": 0.2371, "step": 1353 }, { "epoch": 1.0157539384846213, "grad_norm": 0.34861284494400024, "learning_rate": 5.73395597294766e-05, "loss": 0.1919, "step": 1354 }, { "epoch": 1.016504126031508, "grad_norm": 0.2553778290748596, "learning_rate": 5.727478560652053e-05, "loss": 0.1499, "step": 1355 }, { "epoch": 1.0172543135783947, "grad_norm": 0.31026890873908997, "learning_rate": 5.7209999008023496e-05, "loss": 0.2095, "step": 1356 }, { "epoch": 1.0180045011252814, "grad_norm": 0.3212050795555115, "learning_rate": 5.7145200045088156e-05, "loss": 0.2985, "step": 1357 }, { "epoch": 1.0187546886721681, "grad_norm": 0.2565504312515259, "learning_rate": 5.7080388828838324e-05, "loss": 0.1559, "step": 1358 }, { "epoch": 1.0195048762190548, "grad_norm": 0.28624406456947327, "learning_rate": 5.701556547041888e-05, "loss": 0.2788, "step": 1359 }, { "epoch": 1.0202550637659416, "grad_norm": 0.4151807427406311, "learning_rate": 5.695073008099547e-05, "loss": 0.2322, "step": 1360 }, { "epoch": 1.0210052513128283, "grad_norm": 0.2993619441986084, "learning_rate": 5.688588277175444e-05, "loss": 0.1903, "step": 1361 }, { "epoch": 1.021755438859715, "grad_norm": 0.30110669136047363, "learning_rate": 5.6821023653902517e-05, "loss": 0.166, "step": 1362 }, { "epoch": 1.0225056264066017, "grad_norm": 0.3241769075393677, "learning_rate": 5.675615283866671e-05, "loss": 0.1856, "step": 1363 }, { "epoch": 1.0232558139534884, "grad_norm": 0.3174424469470978, "learning_rate": 5.669127043729406e-05, "loss": 0.2064, "step": 1364 }, { "epoch": 1.0240060015003751, "grad_norm": 0.2527333199977875, "learning_rate": 5.662637656105152e-05, "loss": 0.1563, "step": 1365 }, { "epoch": 1.0247561890472618, "grad_norm": 0.33262643218040466, "learning_rate": 5.6561471321225676e-05, "loss": 0.2834, "step": 1366 }, { "epoch": 1.0255063765941486, "grad_norm": 0.3247932195663452, "learning_rate": 5.649655482912265e-05, "loss": 0.2635, "step": 1367 }, { "epoch": 1.0262565641410353, "grad_norm": 0.3623289167881012, "learning_rate": 5.6431627196067816e-05, "loss": 0.1758, "step": 1368 }, { "epoch": 1.027006751687922, "grad_norm": 0.29462680220603943, "learning_rate": 5.636668853340567e-05, "loss": 0.2198, "step": 1369 }, { "epoch": 1.0277569392348087, "grad_norm": 0.2121676653623581, "learning_rate": 5.6301738952499636e-05, "loss": 0.2098, "step": 1370 }, { "epoch": 1.0285071267816954, "grad_norm": 0.31966498494148254, "learning_rate": 5.623677856473183e-05, "loss": 0.2668, "step": 1371 }, { "epoch": 1.0292573143285821, "grad_norm": 0.2599949538707733, "learning_rate": 5.617180748150295e-05, "loss": 0.2006, "step": 1372 }, { "epoch": 1.0300075018754689, "grad_norm": 0.2898136079311371, "learning_rate": 5.6106825814231953e-05, "loss": 0.1894, "step": 1373 }, { "epoch": 1.0307576894223556, "grad_norm": 0.23856934905052185, "learning_rate": 5.604183367435606e-05, "loss": 0.2407, "step": 1374 }, { "epoch": 1.0315078769692423, "grad_norm": 0.4207324981689453, "learning_rate": 5.597683117333036e-05, "loss": 0.2568, "step": 1375 }, { "epoch": 1.032258064516129, "grad_norm": 0.23304231464862823, "learning_rate": 5.591181842262776e-05, "loss": 0.1106, "step": 1376 }, { "epoch": 1.0330082520630157, "grad_norm": 0.34589719772338867, "learning_rate": 5.584679553373869e-05, "loss": 0.1995, "step": 1377 }, { "epoch": 1.0337584396099024, "grad_norm": 0.23905061185359955, "learning_rate": 5.578176261817104e-05, "loss": 0.2638, "step": 1378 }, { "epoch": 1.0345086271567892, "grad_norm": 0.25565803050994873, "learning_rate": 5.571671978744983e-05, "loss": 0.2251, "step": 1379 }, { "epoch": 1.0352588147036759, "grad_norm": 0.2593427002429962, "learning_rate": 5.565166715311711e-05, "loss": 0.2231, "step": 1380 }, { "epoch": 1.0360090022505626, "grad_norm": 0.25450852513313293, "learning_rate": 5.558660482673177e-05, "loss": 0.2294, "step": 1381 }, { "epoch": 1.0367591897974493, "grad_norm": 0.27342140674591064, "learning_rate": 5.552153291986927e-05, "loss": 0.214, "step": 1382 }, { "epoch": 1.037509377344336, "grad_norm": 0.19418363273143768, "learning_rate": 5.5456451544121523e-05, "loss": 0.2198, "step": 1383 }, { "epoch": 1.0382595648912227, "grad_norm": 0.3223038613796234, "learning_rate": 5.5391360811096684e-05, "loss": 0.2599, "step": 1384 }, { "epoch": 1.0390097524381094, "grad_norm": 0.25424689054489136, "learning_rate": 5.5326260832418955e-05, "loss": 0.226, "step": 1385 }, { "epoch": 1.0397599399849962, "grad_norm": 0.3329003155231476, "learning_rate": 5.526115171972838e-05, "loss": 0.218, "step": 1386 }, { "epoch": 1.0405101275318829, "grad_norm": 0.25894811749458313, "learning_rate": 5.5196033584680675e-05, "loss": 0.1618, "step": 1387 }, { "epoch": 1.0412603150787696, "grad_norm": 0.2783038914203644, "learning_rate": 5.5130906538947034e-05, "loss": 0.172, "step": 1388 }, { "epoch": 1.0420105026256563, "grad_norm": 0.4169522821903229, "learning_rate": 5.506577069421395e-05, "loss": 0.2396, "step": 1389 }, { "epoch": 1.042760690172543, "grad_norm": 0.3108487129211426, "learning_rate": 5.5000626162182944e-05, "loss": 0.2429, "step": 1390 }, { "epoch": 1.04351087771943, "grad_norm": 0.2856238782405853, "learning_rate": 5.49354730545705e-05, "loss": 0.2127, "step": 1391 }, { "epoch": 1.0442610652663167, "grad_norm": 0.35207676887512207, "learning_rate": 5.487031148310775e-05, "loss": 0.2434, "step": 1392 }, { "epoch": 1.0450112528132034, "grad_norm": 0.2355295866727829, "learning_rate": 5.480514155954042e-05, "loss": 0.2243, "step": 1393 }, { "epoch": 1.04576144036009, "grad_norm": 0.2742708921432495, "learning_rate": 5.4739963395628456e-05, "loss": 0.2101, "step": 1394 }, { "epoch": 1.0465116279069768, "grad_norm": 0.3036324977874756, "learning_rate": 5.4674777103146045e-05, "loss": 0.2651, "step": 1395 }, { "epoch": 1.0472618154538635, "grad_norm": 0.3571079671382904, "learning_rate": 5.460958279388122e-05, "loss": 0.2343, "step": 1396 }, { "epoch": 1.0480120030007503, "grad_norm": 0.2640590965747833, "learning_rate": 5.4544380579635824e-05, "loss": 0.1899, "step": 1397 }, { "epoch": 1.048762190547637, "grad_norm": 0.3048892915248871, "learning_rate": 5.447917057222523e-05, "loss": 0.2617, "step": 1398 }, { "epoch": 1.0495123780945237, "grad_norm": 0.2911650240421295, "learning_rate": 5.441395288347818e-05, "loss": 0.1871, "step": 1399 }, { "epoch": 1.0502625656414104, "grad_norm": 0.30502086877822876, "learning_rate": 5.434872762523658e-05, "loss": 0.1968, "step": 1400 }, { "epoch": 1.0502625656414104, "eval_loss": 0.2604842782020569, "eval_runtime": 8.9194, "eval_samples_per_second": 6.054, "eval_steps_per_second": 1.57, "step": 1400 }, { "epoch": 1.0510127531882971, "grad_norm": 0.27795174717903137, "learning_rate": 5.4283494909355314e-05, "loss": 0.253, "step": 1401 }, { "epoch": 1.0517629407351838, "grad_norm": 0.30377012491226196, "learning_rate": 5.42182548477021e-05, "loss": 0.2899, "step": 1402 }, { "epoch": 1.0525131282820706, "grad_norm": 0.3513970673084259, "learning_rate": 5.41530075521572e-05, "loss": 0.2303, "step": 1403 }, { "epoch": 1.0532633158289573, "grad_norm": 0.3081195652484894, "learning_rate": 5.4087753134613294e-05, "loss": 0.2682, "step": 1404 }, { "epoch": 1.054013503375844, "grad_norm": 0.35805124044418335, "learning_rate": 5.40224917069753e-05, "loss": 0.2584, "step": 1405 }, { "epoch": 1.0547636909227307, "grad_norm": 0.3005058765411377, "learning_rate": 5.3957223381160126e-05, "loss": 0.2093, "step": 1406 }, { "epoch": 1.0555138784696174, "grad_norm": 0.32510605454444885, "learning_rate": 5.389194826909653e-05, "loss": 0.2089, "step": 1407 }, { "epoch": 1.0562640660165041, "grad_norm": 0.2908082902431488, "learning_rate": 5.382666648272489e-05, "loss": 0.2999, "step": 1408 }, { "epoch": 1.0570142535633908, "grad_norm": 0.32218775153160095, "learning_rate": 5.3761378133997044e-05, "loss": 0.2994, "step": 1409 }, { "epoch": 1.0577644411102776, "grad_norm": 0.338564932346344, "learning_rate": 5.3696083334876105e-05, "loss": 0.2252, "step": 1410 }, { "epoch": 1.0585146286571643, "grad_norm": 0.26838916540145874, "learning_rate": 5.363078219733619e-05, "loss": 0.171, "step": 1411 }, { "epoch": 1.059264816204051, "grad_norm": 0.3963358998298645, "learning_rate": 5.3565474833362353e-05, "loss": 0.2773, "step": 1412 }, { "epoch": 1.0600150037509377, "grad_norm": 0.2668197453022003, "learning_rate": 5.3500161354950274e-05, "loss": 0.1872, "step": 1413 }, { "epoch": 1.0607651912978244, "grad_norm": 0.31688767671585083, "learning_rate": 5.3434841874106124e-05, "loss": 0.2555, "step": 1414 }, { "epoch": 1.0615153788447111, "grad_norm": 0.23720906674861908, "learning_rate": 5.3369516502846396e-05, "loss": 0.2132, "step": 1415 }, { "epoch": 1.0622655663915979, "grad_norm": 0.252228707075119, "learning_rate": 5.330418535319768e-05, "loss": 0.1539, "step": 1416 }, { "epoch": 1.0630157539384846, "grad_norm": 0.34244251251220703, "learning_rate": 5.323884853719645e-05, "loss": 0.226, "step": 1417 }, { "epoch": 1.0637659414853713, "grad_norm": 0.262812077999115, "learning_rate": 5.31735061668889e-05, "loss": 0.2007, "step": 1418 }, { "epoch": 1.064516129032258, "grad_norm": 0.3474026918411255, "learning_rate": 5.3108158354330795e-05, "loss": 0.2983, "step": 1419 }, { "epoch": 1.0652663165791447, "grad_norm": 0.24028290808200836, "learning_rate": 5.304280521158716e-05, "loss": 0.1307, "step": 1420 }, { "epoch": 1.0660165041260314, "grad_norm": 0.3442460000514984, "learning_rate": 5.2977446850732236e-05, "loss": 0.2066, "step": 1421 }, { "epoch": 1.0667666916729182, "grad_norm": 0.3726935088634491, "learning_rate": 5.291208338384913e-05, "loss": 0.1685, "step": 1422 }, { "epoch": 1.0675168792198049, "grad_norm": 0.34708118438720703, "learning_rate": 5.2846714923029795e-05, "loss": 0.2844, "step": 1423 }, { "epoch": 1.0682670667666916, "grad_norm": 0.373830109834671, "learning_rate": 5.278134158037469e-05, "loss": 0.2609, "step": 1424 }, { "epoch": 1.0690172543135783, "grad_norm": 0.31111639738082886, "learning_rate": 5.2715963467992656e-05, "loss": 0.1802, "step": 1425 }, { "epoch": 1.069767441860465, "grad_norm": 0.3632708191871643, "learning_rate": 5.265058069800072e-05, "loss": 0.308, "step": 1426 }, { "epoch": 1.0705176294073517, "grad_norm": 0.32613417506217957, "learning_rate": 5.258519338252389e-05, "loss": 0.2788, "step": 1427 }, { "epoch": 1.0712678169542387, "grad_norm": 0.31477367877960205, "learning_rate": 5.251980163369499e-05, "loss": 0.218, "step": 1428 }, { "epoch": 1.0720180045011252, "grad_norm": 0.3604421317577362, "learning_rate": 5.24544055636544e-05, "loss": 0.2364, "step": 1429 }, { "epoch": 1.072768192048012, "grad_norm": 0.4178027808666229, "learning_rate": 5.2389005284549954e-05, "loss": 0.2949, "step": 1430 }, { "epoch": 1.0735183795948988, "grad_norm": 0.2535725235939026, "learning_rate": 5.232360090853671e-05, "loss": 0.2092, "step": 1431 }, { "epoch": 1.0742685671417855, "grad_norm": 0.39206013083457947, "learning_rate": 5.225819254777671e-05, "loss": 0.2549, "step": 1432 }, { "epoch": 1.0750187546886723, "grad_norm": 0.28637292981147766, "learning_rate": 5.219278031443886e-05, "loss": 0.203, "step": 1433 }, { "epoch": 1.075768942235559, "grad_norm": 0.35033726692199707, "learning_rate": 5.21273643206987e-05, "loss": 0.2826, "step": 1434 }, { "epoch": 1.0765191297824457, "grad_norm": 0.3011876940727234, "learning_rate": 5.206194467873822e-05, "loss": 0.1982, "step": 1435 }, { "epoch": 1.0772693173293324, "grad_norm": 0.2880212962627411, "learning_rate": 5.1996521500745645e-05, "loss": 0.2128, "step": 1436 }, { "epoch": 1.0780195048762191, "grad_norm": 0.3639679551124573, "learning_rate": 5.19310948989153e-05, "loss": 0.2395, "step": 1437 }, { "epoch": 1.0787696924231058, "grad_norm": 0.3012164235115051, "learning_rate": 5.186566498544737e-05, "loss": 0.1596, "step": 1438 }, { "epoch": 1.0795198799699925, "grad_norm": 0.24250833690166473, "learning_rate": 5.18002318725477e-05, "loss": 0.1785, "step": 1439 }, { "epoch": 1.0802700675168793, "grad_norm": 0.34617021679878235, "learning_rate": 5.173479567242765e-05, "loss": 0.1948, "step": 1440 }, { "epoch": 1.081020255063766, "grad_norm": 0.3333530128002167, "learning_rate": 5.1669356497303835e-05, "loss": 0.2248, "step": 1441 }, { "epoch": 1.0817704426106527, "grad_norm": 0.3230828642845154, "learning_rate": 5.1603914459398016e-05, "loss": 0.2278, "step": 1442 }, { "epoch": 1.0825206301575394, "grad_norm": 0.33597928285598755, "learning_rate": 5.153846967093684e-05, "loss": 0.3137, "step": 1443 }, { "epoch": 1.0832708177044261, "grad_norm": 0.32051336765289307, "learning_rate": 5.1473022244151684e-05, "loss": 0.292, "step": 1444 }, { "epoch": 1.0840210052513128, "grad_norm": 0.26457923650741577, "learning_rate": 5.140757229127842e-05, "loss": 0.149, "step": 1445 }, { "epoch": 1.0847711927981996, "grad_norm": 0.31593549251556396, "learning_rate": 5.1342119924557275e-05, "loss": 0.2387, "step": 1446 }, { "epoch": 1.0855213803450863, "grad_norm": 0.2970251142978668, "learning_rate": 5.127666525623264e-05, "loss": 0.2159, "step": 1447 }, { "epoch": 1.086271567891973, "grad_norm": 0.3575063943862915, "learning_rate": 5.121120839855279e-05, "loss": 0.243, "step": 1448 }, { "epoch": 1.0870217554388597, "grad_norm": 0.4525129199028015, "learning_rate": 5.114574946376982e-05, "loss": 0.2552, "step": 1449 }, { "epoch": 1.0877719429857464, "grad_norm": 0.34076669812202454, "learning_rate": 5.1080288564139325e-05, "loss": 0.2632, "step": 1450 }, { "epoch": 1.0885221305326331, "grad_norm": 0.3594658076763153, "learning_rate": 5.101482581192033e-05, "loss": 0.167, "step": 1451 }, { "epoch": 1.0892723180795199, "grad_norm": 0.2983902394771576, "learning_rate": 5.0949361319374996e-05, "loss": 0.2458, "step": 1452 }, { "epoch": 1.0900225056264066, "grad_norm": 0.40579554438591003, "learning_rate": 5.0883895198768494e-05, "loss": 0.2002, "step": 1453 }, { "epoch": 1.0907726931732933, "grad_norm": 0.31652334332466125, "learning_rate": 5.0818427562368764e-05, "loss": 0.2236, "step": 1454 }, { "epoch": 1.09152288072018, "grad_norm": 0.21786466240882874, "learning_rate": 5.0752958522446356e-05, "loss": 0.2323, "step": 1455 }, { "epoch": 1.0922730682670667, "grad_norm": 0.2560447156429291, "learning_rate": 5.0687488191274215e-05, "loss": 0.2505, "step": 1456 }, { "epoch": 1.0930232558139534, "grad_norm": 0.3443382680416107, "learning_rate": 5.0622016681127526e-05, "loss": 0.1932, "step": 1457 }, { "epoch": 1.0937734433608401, "grad_norm": 0.29815614223480225, "learning_rate": 5.055654410428349e-05, "loss": 0.1756, "step": 1458 }, { "epoch": 1.0945236309077269, "grad_norm": 0.2818271815776825, "learning_rate": 5.0491070573021116e-05, "loss": 0.2027, "step": 1459 }, { "epoch": 1.0952738184546136, "grad_norm": 0.23081719875335693, "learning_rate": 5.0425596199621064e-05, "loss": 0.2578, "step": 1460 }, { "epoch": 1.0960240060015003, "grad_norm": 0.3082742989063263, "learning_rate": 5.036012109636543e-05, "loss": 0.2214, "step": 1461 }, { "epoch": 1.096774193548387, "grad_norm": 0.3191351890563965, "learning_rate": 5.0294645375537594e-05, "loss": 0.2165, "step": 1462 }, { "epoch": 1.0975243810952737, "grad_norm": 0.36559221148490906, "learning_rate": 5.022916914942195e-05, "loss": 0.2847, "step": 1463 }, { "epoch": 1.0982745686421604, "grad_norm": 0.3442118465900421, "learning_rate": 5.0163692530303774e-05, "loss": 0.176, "step": 1464 }, { "epoch": 1.0990247561890472, "grad_norm": 0.35876166820526123, "learning_rate": 5.009821563046903e-05, "loss": 0.2182, "step": 1465 }, { "epoch": 1.0997749437359339, "grad_norm": 0.3490297198295593, "learning_rate": 5.003273856220415e-05, "loss": 0.2383, "step": 1466 }, { "epoch": 1.1005251312828208, "grad_norm": 0.29343780875205994, "learning_rate": 4.996726143779586e-05, "loss": 0.2136, "step": 1467 }, { "epoch": 1.1012753188297075, "grad_norm": 0.3284321129322052, "learning_rate": 4.990178436953099e-05, "loss": 0.2888, "step": 1468 }, { "epoch": 1.1020255063765942, "grad_norm": 0.40511828660964966, "learning_rate": 4.9836307469696244e-05, "loss": 0.275, "step": 1469 }, { "epoch": 1.102775693923481, "grad_norm": 0.2825699746608734, "learning_rate": 4.9770830850578075e-05, "loss": 0.2215, "step": 1470 }, { "epoch": 1.1035258814703677, "grad_norm": 0.3898945450782776, "learning_rate": 4.9705354624462424e-05, "loss": 0.2447, "step": 1471 }, { "epoch": 1.1042760690172544, "grad_norm": 0.33155012130737305, "learning_rate": 4.963987890363458e-05, "loss": 0.2458, "step": 1472 }, { "epoch": 1.105026256564141, "grad_norm": 0.28734290599823, "learning_rate": 4.957440380037896e-05, "loss": 0.2174, "step": 1473 }, { "epoch": 1.1057764441110278, "grad_norm": 0.26184868812561035, "learning_rate": 4.9508929426978896e-05, "loss": 0.2058, "step": 1474 }, { "epoch": 1.1065266316579145, "grad_norm": 0.31342172622680664, "learning_rate": 4.944345589571651e-05, "loss": 0.2196, "step": 1475 }, { "epoch": 1.1072768192048013, "grad_norm": 0.2987440228462219, "learning_rate": 4.937798331887248e-05, "loss": 0.2249, "step": 1476 }, { "epoch": 1.108027006751688, "grad_norm": 0.30080920457839966, "learning_rate": 4.931251180872579e-05, "loss": 0.1635, "step": 1477 }, { "epoch": 1.1087771942985747, "grad_norm": 0.35609471797943115, "learning_rate": 4.9247041477553656e-05, "loss": 0.2873, "step": 1478 }, { "epoch": 1.1095273818454614, "grad_norm": 0.3109179437160492, "learning_rate": 4.9181572437631255e-05, "loss": 0.2349, "step": 1479 }, { "epoch": 1.1102775693923481, "grad_norm": 0.2964603006839752, "learning_rate": 4.911610480123151e-05, "loss": 0.2546, "step": 1480 }, { "epoch": 1.1110277569392348, "grad_norm": 0.299516886472702, "learning_rate": 4.905063868062501e-05, "loss": 0.2206, "step": 1481 }, { "epoch": 1.1117779444861215, "grad_norm": 0.2337196320295334, "learning_rate": 4.898517418807968e-05, "loss": 0.2387, "step": 1482 }, { "epoch": 1.1125281320330083, "grad_norm": 0.22706818580627441, "learning_rate": 4.891971143586069e-05, "loss": 0.2249, "step": 1483 }, { "epoch": 1.113278319579895, "grad_norm": 0.29159751534461975, "learning_rate": 4.88542505362302e-05, "loss": 0.222, "step": 1484 }, { "epoch": 1.1140285071267817, "grad_norm": 0.3284156620502472, "learning_rate": 4.878879160144723e-05, "loss": 0.2338, "step": 1485 }, { "epoch": 1.1147786946736684, "grad_norm": 0.31191563606262207, "learning_rate": 4.872333474376739e-05, "loss": 0.2364, "step": 1486 }, { "epoch": 1.1155288822205551, "grad_norm": 0.3978080153465271, "learning_rate": 4.865788007544274e-05, "loss": 0.2258, "step": 1487 }, { "epoch": 1.1162790697674418, "grad_norm": 0.33552274107933044, "learning_rate": 4.859242770872158e-05, "loss": 0.167, "step": 1488 }, { "epoch": 1.1170292573143286, "grad_norm": 0.30755186080932617, "learning_rate": 4.852697775584833e-05, "loss": 0.2003, "step": 1489 }, { "epoch": 1.1177794448612153, "grad_norm": 0.27730605006217957, "learning_rate": 4.846153032906316e-05, "loss": 0.2357, "step": 1490 }, { "epoch": 1.118529632408102, "grad_norm": 0.3002850115299225, "learning_rate": 4.8396085540601995e-05, "loss": 0.2574, "step": 1491 }, { "epoch": 1.1192798199549887, "grad_norm": 0.30860385298728943, "learning_rate": 4.833064350269617e-05, "loss": 0.2082, "step": 1492 }, { "epoch": 1.1200300075018754, "grad_norm": 0.34558695554733276, "learning_rate": 4.826520432757236e-05, "loss": 0.1993, "step": 1493 }, { "epoch": 1.1207801950487621, "grad_norm": 0.3402259945869446, "learning_rate": 4.8199768127452314e-05, "loss": 0.2531, "step": 1494 }, { "epoch": 1.1215303825956489, "grad_norm": 0.440168559551239, "learning_rate": 4.813433501455264e-05, "loss": 0.2465, "step": 1495 }, { "epoch": 1.1222805701425356, "grad_norm": 0.4254785478115082, "learning_rate": 4.806890510108471e-05, "loss": 0.2786, "step": 1496 }, { "epoch": 1.1230307576894223, "grad_norm": 0.3467138707637787, "learning_rate": 4.800347849925437e-05, "loss": 0.2507, "step": 1497 }, { "epoch": 1.123780945236309, "grad_norm": 0.36541232466697693, "learning_rate": 4.793805532126181e-05, "loss": 0.2629, "step": 1498 }, { "epoch": 1.1245311327831957, "grad_norm": 0.35487833619117737, "learning_rate": 4.787263567930132e-05, "loss": 0.2318, "step": 1499 }, { "epoch": 1.1252813203300824, "grad_norm": 0.3629397451877594, "learning_rate": 4.780721968556115e-05, "loss": 0.2275, "step": 1500 }, { "epoch": 1.1260315078769692, "grad_norm": 0.2651798725128174, "learning_rate": 4.774180745222331e-05, "loss": 0.1663, "step": 1501 }, { "epoch": 1.1267816954238559, "grad_norm": 0.35256990790367126, "learning_rate": 4.7676399091463296e-05, "loss": 0.2764, "step": 1502 }, { "epoch": 1.1275318829707426, "grad_norm": 0.34462007880210876, "learning_rate": 4.7610994715450044e-05, "loss": 0.2701, "step": 1503 }, { "epoch": 1.1282820705176295, "grad_norm": 0.3185523748397827, "learning_rate": 4.754559443634561e-05, "loss": 0.2205, "step": 1504 }, { "epoch": 1.129032258064516, "grad_norm": 0.28199151158332825, "learning_rate": 4.748019836630503e-05, "loss": 0.2337, "step": 1505 }, { "epoch": 1.129782445611403, "grad_norm": 0.2722025513648987, "learning_rate": 4.7414806617476124e-05, "loss": 0.2214, "step": 1506 }, { "epoch": 1.1305326331582897, "grad_norm": 0.34087133407592773, "learning_rate": 4.7349419301999294e-05, "loss": 0.2256, "step": 1507 }, { "epoch": 1.1312828207051764, "grad_norm": 0.3303645849227905, "learning_rate": 4.7284036532007356e-05, "loss": 0.2442, "step": 1508 }, { "epoch": 1.132033008252063, "grad_norm": 0.2852473258972168, "learning_rate": 4.721865841962533e-05, "loss": 0.2062, "step": 1509 }, { "epoch": 1.1327831957989498, "grad_norm": 0.2546837329864502, "learning_rate": 4.715328507697021e-05, "loss": 0.3009, "step": 1510 }, { "epoch": 1.1335333833458365, "grad_norm": 0.2938421666622162, "learning_rate": 4.7087916616150886e-05, "loss": 0.2746, "step": 1511 }, { "epoch": 1.1342835708927232, "grad_norm": 0.302015095949173, "learning_rate": 4.702255314926779e-05, "loss": 0.2343, "step": 1512 }, { "epoch": 1.13503375843961, "grad_norm": 0.2973228991031647, "learning_rate": 4.695719478841286e-05, "loss": 0.2026, "step": 1513 }, { "epoch": 1.1357839459864967, "grad_norm": 0.335861474275589, "learning_rate": 4.6891841645669224e-05, "loss": 0.2225, "step": 1514 }, { "epoch": 1.1365341335333834, "grad_norm": 0.3774379789829254, "learning_rate": 4.6826493833111104e-05, "loss": 0.24, "step": 1515 }, { "epoch": 1.13728432108027, "grad_norm": 0.3847174644470215, "learning_rate": 4.676115146280356e-05, "loss": 0.2469, "step": 1516 }, { "epoch": 1.1380345086271568, "grad_norm": 0.31384989619255066, "learning_rate": 4.669581464680233e-05, "loss": 0.1479, "step": 1517 }, { "epoch": 1.1387846961740435, "grad_norm": 0.3265465795993805, "learning_rate": 4.66304834971536e-05, "loss": 0.2661, "step": 1518 }, { "epoch": 1.1395348837209303, "grad_norm": 0.2602500021457672, "learning_rate": 4.656515812589389e-05, "loss": 0.2515, "step": 1519 }, { "epoch": 1.140285071267817, "grad_norm": 0.31318238377571106, "learning_rate": 4.6499838645049744e-05, "loss": 0.2524, "step": 1520 }, { "epoch": 1.1410352588147037, "grad_norm": 0.30664822459220886, "learning_rate": 4.643452516663766e-05, "loss": 0.2338, "step": 1521 }, { "epoch": 1.1417854463615904, "grad_norm": 0.3168707489967346, "learning_rate": 4.636921780266381e-05, "loss": 0.311, "step": 1522 }, { "epoch": 1.1425356339084771, "grad_norm": 0.2585882246494293, "learning_rate": 4.63039166651239e-05, "loss": 0.2098, "step": 1523 }, { "epoch": 1.1432858214553638, "grad_norm": 0.3347552716732025, "learning_rate": 4.623862186600297e-05, "loss": 0.2213, "step": 1524 }, { "epoch": 1.1440360090022506, "grad_norm": 0.29252713918685913, "learning_rate": 4.617333351727513e-05, "loss": 0.2283, "step": 1525 }, { "epoch": 1.1447861965491373, "grad_norm": 0.2644529938697815, "learning_rate": 4.61080517309035e-05, "loss": 0.2446, "step": 1526 }, { "epoch": 1.145536384096024, "grad_norm": 0.34511175751686096, "learning_rate": 4.604277661883989e-05, "loss": 0.2518, "step": 1527 }, { "epoch": 1.1462865716429107, "grad_norm": 0.2935892343521118, "learning_rate": 4.5977508293024726e-05, "loss": 0.1382, "step": 1528 }, { "epoch": 1.1470367591897974, "grad_norm": 0.3596172332763672, "learning_rate": 4.591224686538672e-05, "loss": 0.1923, "step": 1529 }, { "epoch": 1.1477869467366841, "grad_norm": 0.3269699513912201, "learning_rate": 4.584699244784281e-05, "loss": 0.2184, "step": 1530 }, { "epoch": 1.1485371342835708, "grad_norm": 0.2846202552318573, "learning_rate": 4.578174515229789e-05, "loss": 0.3049, "step": 1531 }, { "epoch": 1.1492873218304576, "grad_norm": 0.32833126187324524, "learning_rate": 4.5716505090644684e-05, "loss": 0.2426, "step": 1532 }, { "epoch": 1.1500375093773443, "grad_norm": 0.30242830514907837, "learning_rate": 4.5651272374763423e-05, "loss": 0.2377, "step": 1533 }, { "epoch": 1.150787696924231, "grad_norm": 0.3805653154850006, "learning_rate": 4.558604711652183e-05, "loss": 0.2207, "step": 1534 }, { "epoch": 1.1515378844711177, "grad_norm": 0.4061774015426636, "learning_rate": 4.552082942777478e-05, "loss": 0.2246, "step": 1535 }, { "epoch": 1.1522880720180044, "grad_norm": 0.3513825535774231, "learning_rate": 4.545561942036418e-05, "loss": 0.2423, "step": 1536 }, { "epoch": 1.1530382595648911, "grad_norm": 0.33843132853507996, "learning_rate": 4.5390417206118784e-05, "loss": 0.2495, "step": 1537 }, { "epoch": 1.1537884471117779, "grad_norm": 0.24668993055820465, "learning_rate": 4.5325222896853966e-05, "loss": 0.2251, "step": 1538 }, { "epoch": 1.1545386346586646, "grad_norm": 0.3061322867870331, "learning_rate": 4.5260036604371556e-05, "loss": 0.2461, "step": 1539 }, { "epoch": 1.1552888222055513, "grad_norm": 0.31666457653045654, "learning_rate": 4.51948584404596e-05, "loss": 0.217, "step": 1540 }, { "epoch": 1.1560390097524382, "grad_norm": 0.24307827651500702, "learning_rate": 4.5129688516892264e-05, "loss": 0.1922, "step": 1541 }, { "epoch": 1.1567891972993247, "grad_norm": 0.29883265495300293, "learning_rate": 4.506452694542953e-05, "loss": 0.2438, "step": 1542 }, { "epoch": 1.1575393848462117, "grad_norm": 0.35469886660575867, "learning_rate": 4.499937383781708e-05, "loss": 0.2011, "step": 1543 }, { "epoch": 1.1582895723930982, "grad_norm": 0.35888004302978516, "learning_rate": 4.493422930578605e-05, "loss": 0.2463, "step": 1544 }, { "epoch": 1.159039759939985, "grad_norm": 0.3095334470272064, "learning_rate": 4.486909346105296e-05, "loss": 0.1891, "step": 1545 }, { "epoch": 1.1597899474868718, "grad_norm": 0.31609460711479187, "learning_rate": 4.480396641531932e-05, "loss": 0.1868, "step": 1546 }, { "epoch": 1.1605401350337585, "grad_norm": 0.3092728853225708, "learning_rate": 4.4738848280271626e-05, "loss": 0.2501, "step": 1547 }, { "epoch": 1.1612903225806452, "grad_norm": 0.2874087393283844, "learning_rate": 4.467373916758105e-05, "loss": 0.1611, "step": 1548 }, { "epoch": 1.162040510127532, "grad_norm": 0.3200473487377167, "learning_rate": 4.460863918890333e-05, "loss": 0.2813, "step": 1549 }, { "epoch": 1.1627906976744187, "grad_norm": 0.40106692910194397, "learning_rate": 4.454354845587849e-05, "loss": 0.2249, "step": 1550 }, { "epoch": 1.1635408852213054, "grad_norm": 0.2511223554611206, "learning_rate": 4.4478467080130734e-05, "loss": 0.2666, "step": 1551 }, { "epoch": 1.164291072768192, "grad_norm": 0.3830283284187317, "learning_rate": 4.4413395173268243e-05, "loss": 0.2399, "step": 1552 }, { "epoch": 1.1650412603150788, "grad_norm": 0.3684549927711487, "learning_rate": 4.43483328468829e-05, "loss": 0.2329, "step": 1553 }, { "epoch": 1.1657914478619655, "grad_norm": 0.3301328718662262, "learning_rate": 4.4283280212550194e-05, "loss": 0.2429, "step": 1554 }, { "epoch": 1.1665416354088523, "grad_norm": 0.2988622486591339, "learning_rate": 4.421823738182898e-05, "loss": 0.1914, "step": 1555 }, { "epoch": 1.167291822955739, "grad_norm": 0.34864240884780884, "learning_rate": 4.4153204466261334e-05, "loss": 0.213, "step": 1556 }, { "epoch": 1.1680420105026257, "grad_norm": 0.33677372336387634, "learning_rate": 4.408818157737227e-05, "loss": 0.2056, "step": 1557 }, { "epoch": 1.1687921980495124, "grad_norm": 0.3577517867088318, "learning_rate": 4.402316882666964e-05, "loss": 0.2738, "step": 1558 }, { "epoch": 1.1695423855963991, "grad_norm": 0.3997240662574768, "learning_rate": 4.395816632564393e-05, "loss": 0.1965, "step": 1559 }, { "epoch": 1.1702925731432858, "grad_norm": 0.287601113319397, "learning_rate": 4.3893174185768045e-05, "loss": 0.1605, "step": 1560 }, { "epoch": 1.1710427606901725, "grad_norm": 0.36766675114631653, "learning_rate": 4.382819251849707e-05, "loss": 0.2126, "step": 1561 }, { "epoch": 1.1717929482370593, "grad_norm": 0.30259934067726135, "learning_rate": 4.376322143526818e-05, "loss": 0.1832, "step": 1562 }, { "epoch": 1.172543135783946, "grad_norm": 0.3860498070716858, "learning_rate": 4.3698261047500376e-05, "loss": 0.2641, "step": 1563 }, { "epoch": 1.1732933233308327, "grad_norm": 0.3383405804634094, "learning_rate": 4.3633311466594345e-05, "loss": 0.1834, "step": 1564 }, { "epoch": 1.1740435108777194, "grad_norm": 0.3454212546348572, "learning_rate": 4.3568372803932195e-05, "loss": 0.1976, "step": 1565 }, { "epoch": 1.1747936984246061, "grad_norm": 0.3357439935207367, "learning_rate": 4.3503445170877354e-05, "loss": 0.2089, "step": 1566 }, { "epoch": 1.1755438859714928, "grad_norm": 0.2818272113800049, "learning_rate": 4.343852867877433e-05, "loss": 0.1804, "step": 1567 }, { "epoch": 1.1762940735183796, "grad_norm": 0.2606286108493805, "learning_rate": 4.3373623438948496e-05, "loss": 0.2279, "step": 1568 }, { "epoch": 1.1770442610652663, "grad_norm": 0.2464844435453415, "learning_rate": 4.330872956270596e-05, "loss": 0.1606, "step": 1569 }, { "epoch": 1.177794448612153, "grad_norm": 0.2835439443588257, "learning_rate": 4.324384716133332e-05, "loss": 0.2294, "step": 1570 }, { "epoch": 1.1785446361590397, "grad_norm": 0.36301788687705994, "learning_rate": 4.317897634609751e-05, "loss": 0.2665, "step": 1571 }, { "epoch": 1.1792948237059264, "grad_norm": 0.29482296109199524, "learning_rate": 4.3114117228245565e-05, "loss": 0.1975, "step": 1572 }, { "epoch": 1.1800450112528131, "grad_norm": 0.3220256567001343, "learning_rate": 4.304926991900453e-05, "loss": 0.251, "step": 1573 }, { "epoch": 1.1807951987996999, "grad_norm": 0.3773668706417084, "learning_rate": 4.298443452958113e-05, "loss": 0.1572, "step": 1574 }, { "epoch": 1.1815453863465866, "grad_norm": 0.4250320792198181, "learning_rate": 4.291961117116168e-05, "loss": 0.2585, "step": 1575 }, { "epoch": 1.1822955738934733, "grad_norm": 0.35499510169029236, "learning_rate": 4.285479995491185e-05, "loss": 0.2169, "step": 1576 }, { "epoch": 1.18304576144036, "grad_norm": 0.3106638193130493, "learning_rate": 4.279000099197651e-05, "loss": 0.2468, "step": 1577 }, { "epoch": 1.1837959489872467, "grad_norm": 0.24931307137012482, "learning_rate": 4.272521439347947e-05, "loss": 0.238, "step": 1578 }, { "epoch": 1.1845461365341334, "grad_norm": 0.27592089772224426, "learning_rate": 4.26604402705234e-05, "loss": 0.1597, "step": 1579 }, { "epoch": 1.1852963240810204, "grad_norm": 0.3505455553531647, "learning_rate": 4.259567873418952e-05, "loss": 0.1117, "step": 1580 }, { "epoch": 1.1860465116279069, "grad_norm": 0.4154384136199951, "learning_rate": 4.25309298955375e-05, "loss": 0.2749, "step": 1581 }, { "epoch": 1.1867966991747938, "grad_norm": 0.3487173914909363, "learning_rate": 4.246619386560521e-05, "loss": 0.1995, "step": 1582 }, { "epoch": 1.1875468867216805, "grad_norm": 0.27134397625923157, "learning_rate": 4.240147075540858e-05, "loss": 0.1322, "step": 1583 }, { "epoch": 1.1882970742685672, "grad_norm": 0.26783376932144165, "learning_rate": 4.233676067594137e-05, "loss": 0.2292, "step": 1584 }, { "epoch": 1.189047261815454, "grad_norm": 0.3791578710079193, "learning_rate": 4.227206373817497e-05, "loss": 0.1862, "step": 1585 }, { "epoch": 1.1897974493623407, "grad_norm": 0.28652194142341614, "learning_rate": 4.220738005305827e-05, "loss": 0.2503, "step": 1586 }, { "epoch": 1.1905476369092274, "grad_norm": 0.2522481381893158, "learning_rate": 4.214270973151745e-05, "loss": 0.262, "step": 1587 }, { "epoch": 1.191297824456114, "grad_norm": 0.3348347544670105, "learning_rate": 4.207805288445571e-05, "loss": 0.2399, "step": 1588 }, { "epoch": 1.1920480120030008, "grad_norm": 0.32416749000549316, "learning_rate": 4.201340962275318e-05, "loss": 0.2305, "step": 1589 }, { "epoch": 1.1927981995498875, "grad_norm": 0.3049074113368988, "learning_rate": 4.194878005726671e-05, "loss": 0.2095, "step": 1590 }, { "epoch": 1.1935483870967742, "grad_norm": 0.3129787743091583, "learning_rate": 4.1884164298829615e-05, "loss": 0.2388, "step": 1591 }, { "epoch": 1.194298574643661, "grad_norm": 0.38045230507850647, "learning_rate": 4.181956245825158e-05, "loss": 0.2383, "step": 1592 }, { "epoch": 1.1950487621905477, "grad_norm": 0.2652301490306854, "learning_rate": 4.1754974646318365e-05, "loss": 0.1843, "step": 1593 }, { "epoch": 1.1957989497374344, "grad_norm": 0.34247884154319763, "learning_rate": 4.1690400973791756e-05, "loss": 0.2269, "step": 1594 }, { "epoch": 1.196549137284321, "grad_norm": 0.25492122769355774, "learning_rate": 4.1625841551409195e-05, "loss": 0.2118, "step": 1595 }, { "epoch": 1.1972993248312078, "grad_norm": 0.3387736976146698, "learning_rate": 4.156129648988376e-05, "loss": 0.2626, "step": 1596 }, { "epoch": 1.1980495123780945, "grad_norm": 0.20954130589962006, "learning_rate": 4.149676589990388e-05, "loss": 0.1888, "step": 1597 }, { "epoch": 1.1987996999249813, "grad_norm": 0.42928799986839294, "learning_rate": 4.143224989213315e-05, "loss": 0.2289, "step": 1598 }, { "epoch": 1.199549887471868, "grad_norm": 0.3360680639743805, "learning_rate": 4.136774857721017e-05, "loss": 0.3024, "step": 1599 }, { "epoch": 1.2003000750187547, "grad_norm": 0.3290533125400543, "learning_rate": 4.130326206574834e-05, "loss": 0.2904, "step": 1600 }, { "epoch": 1.2003000750187547, "eval_loss": 0.25874844193458557, "eval_runtime": 8.9451, "eval_samples_per_second": 6.037, "eval_steps_per_second": 1.565, "step": 1600 }, { "epoch": 1.2010502625656414, "grad_norm": 0.3028523325920105, "learning_rate": 4.1238790468335685e-05, "loss": 0.173, "step": 1601 }, { "epoch": 1.2018004501125281, "grad_norm": 0.322441041469574, "learning_rate": 4.117433389553466e-05, "loss": 0.3002, "step": 1602 }, { "epoch": 1.2025506376594148, "grad_norm": 0.26059845089912415, "learning_rate": 4.1109892457881924e-05, "loss": 0.2288, "step": 1603 }, { "epoch": 1.2033008252063015, "grad_norm": 0.3151003122329712, "learning_rate": 4.1045466265888195e-05, "loss": 0.1825, "step": 1604 }, { "epoch": 1.2040510127531883, "grad_norm": 0.3421390950679779, "learning_rate": 4.0981055430038055e-05, "loss": 0.1617, "step": 1605 }, { "epoch": 1.204801200300075, "grad_norm": 0.39860203862190247, "learning_rate": 4.091666006078974e-05, "loss": 0.3145, "step": 1606 }, { "epoch": 1.2055513878469617, "grad_norm": 0.2723231017589569, "learning_rate": 4.085228026857498e-05, "loss": 0.2015, "step": 1607 }, { "epoch": 1.2063015753938484, "grad_norm": 0.3173131048679352, "learning_rate": 4.0787916163798743e-05, "loss": 0.2074, "step": 1608 }, { "epoch": 1.2070517629407351, "grad_norm": 0.35263362526893616, "learning_rate": 4.0723567856839184e-05, "loss": 0.2674, "step": 1609 }, { "epoch": 1.2078019504876218, "grad_norm": 0.3426976501941681, "learning_rate": 4.0659235458047264e-05, "loss": 0.1719, "step": 1610 }, { "epoch": 1.2085521380345086, "grad_norm": 0.32557350397109985, "learning_rate": 4.0594919077746734e-05, "loss": 0.2011, "step": 1611 }, { "epoch": 1.2093023255813953, "grad_norm": 0.2930859923362732, "learning_rate": 4.053061882623386e-05, "loss": 0.2423, "step": 1612 }, { "epoch": 1.210052513128282, "grad_norm": 0.37576591968536377, "learning_rate": 4.0466334813777216e-05, "loss": 0.1786, "step": 1613 }, { "epoch": 1.2108027006751687, "grad_norm": 0.3251000642776489, "learning_rate": 4.040206715061758e-05, "loss": 0.2122, "step": 1614 }, { "epoch": 1.2115528882220554, "grad_norm": 0.35406678915023804, "learning_rate": 4.033781594696767e-05, "loss": 0.2607, "step": 1615 }, { "epoch": 1.2123030757689421, "grad_norm": 0.27920711040496826, "learning_rate": 4.027358131301194e-05, "loss": 0.2313, "step": 1616 }, { "epoch": 1.213053263315829, "grad_norm": 0.27117228507995605, "learning_rate": 4.0209363358906495e-05, "loss": 0.1612, "step": 1617 }, { "epoch": 1.2138034508627156, "grad_norm": 0.299973726272583, "learning_rate": 4.014516219477878e-05, "loss": 0.2811, "step": 1618 }, { "epoch": 1.2145536384096025, "grad_norm": 0.2633667290210724, "learning_rate": 4.008097793072749e-05, "loss": 0.2112, "step": 1619 }, { "epoch": 1.215303825956489, "grad_norm": 0.28035491704940796, "learning_rate": 4.00168106768223e-05, "loss": 0.1997, "step": 1620 }, { "epoch": 1.216054013503376, "grad_norm": 0.30824849009513855, "learning_rate": 3.9952660543103734e-05, "loss": 0.1695, "step": 1621 }, { "epoch": 1.2168042010502627, "grad_norm": 0.35905876755714417, "learning_rate": 3.988852763958297e-05, "loss": 0.2632, "step": 1622 }, { "epoch": 1.2175543885971494, "grad_norm": 0.2960232198238373, "learning_rate": 3.9824412076241595e-05, "loss": 0.2805, "step": 1623 }, { "epoch": 1.218304576144036, "grad_norm": 0.36951473355293274, "learning_rate": 3.9760313963031516e-05, "loss": 0.1972, "step": 1624 }, { "epoch": 1.2190547636909228, "grad_norm": 0.2783275246620178, "learning_rate": 3.9696233409874654e-05, "loss": 0.1716, "step": 1625 }, { "epoch": 1.2198049512378095, "grad_norm": 0.39426150918006897, "learning_rate": 3.963217052666287e-05, "loss": 0.1725, "step": 1626 }, { "epoch": 1.2205551387846962, "grad_norm": 0.29598405957221985, "learning_rate": 3.956812542325769e-05, "loss": 0.1915, "step": 1627 }, { "epoch": 1.221305326331583, "grad_norm": 0.2572273313999176, "learning_rate": 3.950409820949018e-05, "loss": 0.2429, "step": 1628 }, { "epoch": 1.2220555138784697, "grad_norm": 0.3635823130607605, "learning_rate": 3.9440088995160676e-05, "loss": 0.2518, "step": 1629 }, { "epoch": 1.2228057014253564, "grad_norm": 0.3049802780151367, "learning_rate": 3.937609789003871e-05, "loss": 0.1985, "step": 1630 }, { "epoch": 1.223555888972243, "grad_norm": 0.2385801523923874, "learning_rate": 3.93121250038627e-05, "loss": 0.203, "step": 1631 }, { "epoch": 1.2243060765191298, "grad_norm": 0.35034415125846863, "learning_rate": 3.924817044633985e-05, "loss": 0.224, "step": 1632 }, { "epoch": 1.2250562640660165, "grad_norm": 0.4914926290512085, "learning_rate": 3.9184234327145954e-05, "loss": 0.207, "step": 1633 }, { "epoch": 1.2258064516129032, "grad_norm": 0.24380230903625488, "learning_rate": 3.912031675592512e-05, "loss": 0.2343, "step": 1634 }, { "epoch": 1.22655663915979, "grad_norm": 0.3949087858200073, "learning_rate": 3.905641784228972e-05, "loss": 0.2338, "step": 1635 }, { "epoch": 1.2273068267066767, "grad_norm": 0.2681834399700165, "learning_rate": 3.899253769582008e-05, "loss": 0.1728, "step": 1636 }, { "epoch": 1.2280570142535634, "grad_norm": 0.38118577003479004, "learning_rate": 3.8928676426064376e-05, "loss": 0.2491, "step": 1637 }, { "epoch": 1.22880720180045, "grad_norm": 0.29324421286582947, "learning_rate": 3.886483414253838e-05, "loss": 0.1746, "step": 1638 }, { "epoch": 1.2295573893473368, "grad_norm": 0.3867463767528534, "learning_rate": 3.880101095472535e-05, "loss": 0.2428, "step": 1639 }, { "epoch": 1.2303075768942235, "grad_norm": 0.2596743702888489, "learning_rate": 3.873720697207572e-05, "loss": 0.2147, "step": 1640 }, { "epoch": 1.2310577644411103, "grad_norm": 0.33100491762161255, "learning_rate": 3.867342230400707e-05, "loss": 0.2638, "step": 1641 }, { "epoch": 1.231807951987997, "grad_norm": 0.28941938281059265, "learning_rate": 3.860965705990383e-05, "loss": 0.2122, "step": 1642 }, { "epoch": 1.2325581395348837, "grad_norm": 0.3421730101108551, "learning_rate": 3.8545911349117114e-05, "loss": 0.2169, "step": 1643 }, { "epoch": 1.2333083270817704, "grad_norm": 0.3948231041431427, "learning_rate": 3.848218528096452e-05, "loss": 0.2738, "step": 1644 }, { "epoch": 1.2340585146286571, "grad_norm": 0.41763341426849365, "learning_rate": 3.841847896473001e-05, "loss": 0.1986, "step": 1645 }, { "epoch": 1.2348087021755438, "grad_norm": 0.28536278009414673, "learning_rate": 3.83547925096636e-05, "loss": 0.2047, "step": 1646 }, { "epoch": 1.2355588897224306, "grad_norm": 0.30205661058425903, "learning_rate": 3.829112602498132e-05, "loss": 0.2131, "step": 1647 }, { "epoch": 1.2363090772693173, "grad_norm": 0.34370771050453186, "learning_rate": 3.822747961986493e-05, "loss": 0.2711, "step": 1648 }, { "epoch": 1.237059264816204, "grad_norm": 0.2822244167327881, "learning_rate": 3.816385340346171e-05, "loss": 0.1753, "step": 1649 }, { "epoch": 1.2378094523630907, "grad_norm": 0.3295425474643707, "learning_rate": 3.81002474848844e-05, "loss": 0.1562, "step": 1650 }, { "epoch": 1.2385596399099774, "grad_norm": 0.30464744567871094, "learning_rate": 3.803666197321084e-05, "loss": 0.2531, "step": 1651 }, { "epoch": 1.2393098274568641, "grad_norm": 0.406931608915329, "learning_rate": 3.797309697748396e-05, "loss": 0.3144, "step": 1652 }, { "epoch": 1.2400600150037508, "grad_norm": 0.39187508821487427, "learning_rate": 3.7909552606711454e-05, "loss": 0.1803, "step": 1653 }, { "epoch": 1.2408102025506376, "grad_norm": 0.3150406777858734, "learning_rate": 3.784602896986566e-05, "loss": 0.2105, "step": 1654 }, { "epoch": 1.2415603900975243, "grad_norm": 0.36202991008758545, "learning_rate": 3.778252617588334e-05, "loss": 0.2163, "step": 1655 }, { "epoch": 1.2423105776444112, "grad_norm": 0.3692854344844818, "learning_rate": 3.771904433366557e-05, "loss": 0.1807, "step": 1656 }, { "epoch": 1.2430607651912977, "grad_norm": 0.3354456126689911, "learning_rate": 3.7655583552077446e-05, "loss": 0.214, "step": 1657 }, { "epoch": 1.2438109527381846, "grad_norm": 0.3666786551475525, "learning_rate": 3.7592143939947955e-05, "loss": 0.2147, "step": 1658 }, { "epoch": 1.2445611402850711, "grad_norm": 0.31586048007011414, "learning_rate": 3.7528725606069774e-05, "loss": 0.2673, "step": 1659 }, { "epoch": 1.245311327831958, "grad_norm": 0.27692165970802307, "learning_rate": 3.746532865919913e-05, "loss": 0.1749, "step": 1660 }, { "epoch": 1.2460615153788448, "grad_norm": 0.3557927906513214, "learning_rate": 3.740195320805551e-05, "loss": 0.2551, "step": 1661 }, { "epoch": 1.2468117029257315, "grad_norm": 0.3752962350845337, "learning_rate": 3.733859936132158e-05, "loss": 0.1956, "step": 1662 }, { "epoch": 1.2475618904726182, "grad_norm": 0.334614098072052, "learning_rate": 3.727526722764297e-05, "loss": 0.2075, "step": 1663 }, { "epoch": 1.248312078019505, "grad_norm": 0.24263255298137665, "learning_rate": 3.7211956915628035e-05, "loss": 0.1474, "step": 1664 }, { "epoch": 1.2490622655663917, "grad_norm": 0.3994738757610321, "learning_rate": 3.7148668533847744e-05, "loss": 0.2009, "step": 1665 }, { "epoch": 1.2498124531132784, "grad_norm": 0.4036114811897278, "learning_rate": 3.7085402190835406e-05, "loss": 0.2401, "step": 1666 }, { "epoch": 1.250562640660165, "grad_norm": 0.34436261653900146, "learning_rate": 3.702215799508659e-05, "loss": 0.2813, "step": 1667 }, { "epoch": 1.2513128282070518, "grad_norm": 0.26914671063423157, "learning_rate": 3.695893605505887e-05, "loss": 0.2153, "step": 1668 }, { "epoch": 1.2520630157539385, "grad_norm": 0.2780751585960388, "learning_rate": 3.689573647917162e-05, "loss": 0.1698, "step": 1669 }, { "epoch": 1.2528132033008252, "grad_norm": 0.39220017194747925, "learning_rate": 3.683255937580592e-05, "loss": 0.225, "step": 1670 }, { "epoch": 1.253563390847712, "grad_norm": 0.35116371512413025, "learning_rate": 3.6769404853304276e-05, "loss": 0.2709, "step": 1671 }, { "epoch": 1.2543135783945987, "grad_norm": 0.35009950399398804, "learning_rate": 3.670627301997047e-05, "loss": 0.1848, "step": 1672 }, { "epoch": 1.2550637659414854, "grad_norm": 0.31623223423957825, "learning_rate": 3.664316398406939e-05, "loss": 0.2186, "step": 1673 }, { "epoch": 1.255813953488372, "grad_norm": 0.4009574055671692, "learning_rate": 3.658007785382679e-05, "loss": 0.2725, "step": 1674 }, { "epoch": 1.2565641410352588, "grad_norm": 0.28646835684776306, "learning_rate": 3.65170147374292e-05, "loss": 0.2516, "step": 1675 }, { "epoch": 1.2573143285821455, "grad_norm": 0.4005051851272583, "learning_rate": 3.645397474302363e-05, "loss": 0.1576, "step": 1676 }, { "epoch": 1.2580645161290323, "grad_norm": 0.36446449160575867, "learning_rate": 3.639095797871748e-05, "loss": 0.1789, "step": 1677 }, { "epoch": 1.258814703675919, "grad_norm": 0.2981589138507843, "learning_rate": 3.63279645525783e-05, "loss": 0.2501, "step": 1678 }, { "epoch": 1.2595648912228057, "grad_norm": 0.257861852645874, "learning_rate": 3.626499457263359e-05, "loss": 0.1846, "step": 1679 }, { "epoch": 1.2603150787696924, "grad_norm": 0.41239506006240845, "learning_rate": 3.620204814687069e-05, "loss": 0.1938, "step": 1680 }, { "epoch": 1.2610652663165791, "grad_norm": 0.25087255239486694, "learning_rate": 3.61391253832365e-05, "loss": 0.2001, "step": 1681 }, { "epoch": 1.2618154538634658, "grad_norm": 0.3214680552482605, "learning_rate": 3.607622638963739e-05, "loss": 0.2981, "step": 1682 }, { "epoch": 1.2625656414103525, "grad_norm": 0.31270381808280945, "learning_rate": 3.601335127393889e-05, "loss": 0.1991, "step": 1683 }, { "epoch": 1.2633158289572393, "grad_norm": 0.35805413126945496, "learning_rate": 3.59505001439657e-05, "loss": 0.2264, "step": 1684 }, { "epoch": 1.264066016504126, "grad_norm": 0.25953367352485657, "learning_rate": 3.588767310750127e-05, "loss": 0.2948, "step": 1685 }, { "epoch": 1.2648162040510127, "grad_norm": 0.3666065037250519, "learning_rate": 3.5824870272287815e-05, "loss": 0.2517, "step": 1686 }, { "epoch": 1.2655663915978994, "grad_norm": 0.33552369475364685, "learning_rate": 3.576209174602597e-05, "loss": 0.2029, "step": 1687 }, { "epoch": 1.2663165791447861, "grad_norm": 0.3431728482246399, "learning_rate": 3.569933763637477e-05, "loss": 0.2087, "step": 1688 }, { "epoch": 1.2670667666916728, "grad_norm": 0.33877378702163696, "learning_rate": 3.56366080509513e-05, "loss": 0.1633, "step": 1689 }, { "epoch": 1.2678169542385596, "grad_norm": 0.39214563369750977, "learning_rate": 3.557390309733065e-05, "loss": 0.2726, "step": 1690 }, { "epoch": 1.2685671417854465, "grad_norm": 0.2867389917373657, "learning_rate": 3.551122288304561e-05, "loss": 0.225, "step": 1691 }, { "epoch": 1.269317329332333, "grad_norm": 0.334683358669281, "learning_rate": 3.544856751558659e-05, "loss": 0.2405, "step": 1692 }, { "epoch": 1.27006751687922, "grad_norm": 0.3424105942249298, "learning_rate": 3.538593710240139e-05, "loss": 0.2589, "step": 1693 }, { "epoch": 1.2708177044261064, "grad_norm": 0.37541893124580383, "learning_rate": 3.532333175089498e-05, "loss": 0.2109, "step": 1694 }, { "epoch": 1.2715678919729934, "grad_norm": 0.40160587430000305, "learning_rate": 3.526075156842938e-05, "loss": 0.3089, "step": 1695 }, { "epoch": 1.2723180795198799, "grad_norm": 0.3662906885147095, "learning_rate": 3.519819666232345e-05, "loss": 0.1578, "step": 1696 }, { "epoch": 1.2730682670667668, "grad_norm": 0.3027971386909485, "learning_rate": 3.5135667139852654e-05, "loss": 0.1838, "step": 1697 }, { "epoch": 1.2738184546136533, "grad_norm": 0.3352603018283844, "learning_rate": 3.507316310824902e-05, "loss": 0.183, "step": 1698 }, { "epoch": 1.2745686421605402, "grad_norm": 0.3270041048526764, "learning_rate": 3.50106846747008e-05, "loss": 0.2409, "step": 1699 }, { "epoch": 1.275318829707427, "grad_norm": 0.3709702491760254, "learning_rate": 3.4948231946352314e-05, "loss": 0.2814, "step": 1700 }, { "epoch": 1.2760690172543137, "grad_norm": 0.29025861620903015, "learning_rate": 3.488580503030389e-05, "loss": 0.2238, "step": 1701 }, { "epoch": 1.2768192048012004, "grad_norm": 0.3695102035999298, "learning_rate": 3.482340403361151e-05, "loss": 0.2426, "step": 1702 }, { "epoch": 1.277569392348087, "grad_norm": 0.32657161355018616, "learning_rate": 3.4761029063286745e-05, "loss": 0.1939, "step": 1703 }, { "epoch": 1.2783195798949738, "grad_norm": 0.27680882811546326, "learning_rate": 3.4698680226296526e-05, "loss": 0.2135, "step": 1704 }, { "epoch": 1.2790697674418605, "grad_norm": 0.3367266356945038, "learning_rate": 3.4636357629562986e-05, "loss": 0.2187, "step": 1705 }, { "epoch": 1.2798199549887472, "grad_norm": 0.308275043964386, "learning_rate": 3.457406137996321e-05, "loss": 0.1857, "step": 1706 }, { "epoch": 1.280570142535634, "grad_norm": 0.29198160767555237, "learning_rate": 3.4511791584329154e-05, "loss": 0.3026, "step": 1707 }, { "epoch": 1.2813203300825207, "grad_norm": 0.24216660857200623, "learning_rate": 3.4449548349447394e-05, "loss": 0.233, "step": 1708 }, { "epoch": 1.2820705176294074, "grad_norm": 0.4202030301094055, "learning_rate": 3.438733178205892e-05, "loss": 0.1917, "step": 1709 }, { "epoch": 1.282820705176294, "grad_norm": 0.3119998872280121, "learning_rate": 3.4325141988859046e-05, "loss": 0.2834, "step": 1710 }, { "epoch": 1.2835708927231808, "grad_norm": 0.3263137638568878, "learning_rate": 3.426297907649711e-05, "loss": 0.1609, "step": 1711 }, { "epoch": 1.2843210802700675, "grad_norm": 0.37646692991256714, "learning_rate": 3.4200843151576414e-05, "loss": 0.2668, "step": 1712 }, { "epoch": 1.2850712678169542, "grad_norm": 0.3361072242259979, "learning_rate": 3.413873432065394e-05, "loss": 0.1937, "step": 1713 }, { "epoch": 1.285821455363841, "grad_norm": 0.3917451500892639, "learning_rate": 3.407665269024024e-05, "loss": 0.2462, "step": 1714 }, { "epoch": 1.2865716429107277, "grad_norm": 0.3620074391365051, "learning_rate": 3.401459836679917e-05, "loss": 0.2752, "step": 1715 }, { "epoch": 1.2873218304576144, "grad_norm": 0.26227065920829773, "learning_rate": 3.39525714567478e-05, "loss": 0.2556, "step": 1716 }, { "epoch": 1.288072018004501, "grad_norm": 0.31470125913619995, "learning_rate": 3.389057206645614e-05, "loss": 0.2384, "step": 1717 }, { "epoch": 1.2888222055513878, "grad_norm": 0.3931310772895813, "learning_rate": 3.382860030224708e-05, "loss": 0.1958, "step": 1718 }, { "epoch": 1.2895723930982745, "grad_norm": 0.3045668303966522, "learning_rate": 3.3766656270396074e-05, "loss": 0.2842, "step": 1719 }, { "epoch": 1.2903225806451613, "grad_norm": 0.27477169036865234, "learning_rate": 3.3704740077131036e-05, "loss": 0.2084, "step": 1720 }, { "epoch": 1.291072768192048, "grad_norm": 0.30683931708335876, "learning_rate": 3.3642851828632155e-05, "loss": 0.2382, "step": 1721 }, { "epoch": 1.2918229557389347, "grad_norm": 0.2966896593570709, "learning_rate": 3.3580991631031656e-05, "loss": 0.2281, "step": 1722 }, { "epoch": 1.2925731432858214, "grad_norm": 0.31389838457107544, "learning_rate": 3.3519159590413715e-05, "loss": 0.1876, "step": 1723 }, { "epoch": 1.2933233308327081, "grad_norm": 0.3378860652446747, "learning_rate": 3.345735581281417e-05, "loss": 0.3381, "step": 1724 }, { "epoch": 1.2940735183795948, "grad_norm": 0.24462836980819702, "learning_rate": 3.339558040422042e-05, "loss": 0.2389, "step": 1725 }, { "epoch": 1.2948237059264815, "grad_norm": 0.3047317862510681, "learning_rate": 3.333383347057123e-05, "loss": 0.2233, "step": 1726 }, { "epoch": 1.2955738934733683, "grad_norm": 0.38822922110557556, "learning_rate": 3.3272115117756476e-05, "loss": 0.2069, "step": 1727 }, { "epoch": 1.296324081020255, "grad_norm": 0.3567216992378235, "learning_rate": 3.3210425451617074e-05, "loss": 0.2219, "step": 1728 }, { "epoch": 1.2970742685671417, "grad_norm": 0.35570627450942993, "learning_rate": 3.314876457794474e-05, "loss": 0.2269, "step": 1729 }, { "epoch": 1.2978244561140286, "grad_norm": 0.33582448959350586, "learning_rate": 3.3087132602481774e-05, "loss": 0.2029, "step": 1730 }, { "epoch": 1.2985746436609151, "grad_norm": 0.2409912496805191, "learning_rate": 3.302552963092096e-05, "loss": 0.1881, "step": 1731 }, { "epoch": 1.299324831207802, "grad_norm": 0.34440815448760986, "learning_rate": 3.296395576890532e-05, "loss": 0.3006, "step": 1732 }, { "epoch": 1.3000750187546886, "grad_norm": 0.32237693667411804, "learning_rate": 3.290241112202797e-05, "loss": 0.1647, "step": 1733 }, { "epoch": 1.3008252063015755, "grad_norm": 0.3539527654647827, "learning_rate": 3.284089579583192e-05, "loss": 0.2647, "step": 1734 }, { "epoch": 1.301575393848462, "grad_norm": 0.28776368498802185, "learning_rate": 3.2779409895809886e-05, "loss": 0.1766, "step": 1735 }, { "epoch": 1.302325581395349, "grad_norm": 0.30008983612060547, "learning_rate": 3.2717953527404155e-05, "loss": 0.2011, "step": 1736 }, { "epoch": 1.3030757689422354, "grad_norm": 0.3417651057243347, "learning_rate": 3.265652679600631e-05, "loss": 0.2519, "step": 1737 }, { "epoch": 1.3038259564891224, "grad_norm": 0.28882378339767456, "learning_rate": 3.25951298069572e-05, "loss": 0.2032, "step": 1738 }, { "epoch": 1.304576144036009, "grad_norm": 0.3196262717247009, "learning_rate": 3.253376266554655e-05, "loss": 0.2374, "step": 1739 }, { "epoch": 1.3053263315828958, "grad_norm": 0.3278460204601288, "learning_rate": 3.247242547701301e-05, "loss": 0.2147, "step": 1740 }, { "epoch": 1.3060765191297825, "grad_norm": 0.3536241948604584, "learning_rate": 3.241111834654382e-05, "loss": 0.2328, "step": 1741 }, { "epoch": 1.3068267066766692, "grad_norm": 0.3421139121055603, "learning_rate": 3.234984137927464e-05, "loss": 0.267, "step": 1742 }, { "epoch": 1.307576894223556, "grad_norm": 0.3087572157382965, "learning_rate": 3.228859468028946e-05, "loss": 0.2864, "step": 1743 }, { "epoch": 1.3083270817704427, "grad_norm": 0.3381185233592987, "learning_rate": 3.222737835462034e-05, "loss": 0.2571, "step": 1744 }, { "epoch": 1.3090772693173294, "grad_norm": 0.2770289182662964, "learning_rate": 3.216619250724724e-05, "loss": 0.1701, "step": 1745 }, { "epoch": 1.309827456864216, "grad_norm": 0.35542744398117065, "learning_rate": 3.2105037243097866e-05, "loss": 0.2337, "step": 1746 }, { "epoch": 1.3105776444111028, "grad_norm": 0.3424038290977478, "learning_rate": 3.2043912667047465e-05, "loss": 0.2138, "step": 1747 }, { "epoch": 1.3113278319579895, "grad_norm": 0.3233925402164459, "learning_rate": 3.198281888391869e-05, "loss": 0.19, "step": 1748 }, { "epoch": 1.3120780195048762, "grad_norm": 0.32215288281440735, "learning_rate": 3.192175599848133e-05, "loss": 0.1922, "step": 1749 }, { "epoch": 1.312828207051763, "grad_norm": 0.39430102705955505, "learning_rate": 3.1860724115452234e-05, "loss": 0.2278, "step": 1750 }, { "epoch": 1.3135783945986497, "grad_norm": 0.30198490619659424, "learning_rate": 3.179972333949509e-05, "loss": 0.1704, "step": 1751 }, { "epoch": 1.3143285821455364, "grad_norm": 0.35161876678466797, "learning_rate": 3.173875377522019e-05, "loss": 0.2016, "step": 1752 }, { "epoch": 1.315078769692423, "grad_norm": 0.3158937692642212, "learning_rate": 3.167781552718435e-05, "loss": 0.2518, "step": 1753 }, { "epoch": 1.3158289572393098, "grad_norm": 0.2586856782436371, "learning_rate": 3.161690869989068e-05, "loss": 0.1947, "step": 1754 }, { "epoch": 1.3165791447861965, "grad_norm": 0.32662808895111084, "learning_rate": 3.155603339778837e-05, "loss": 0.2225, "step": 1755 }, { "epoch": 1.3173293323330832, "grad_norm": 0.35891592502593994, "learning_rate": 3.149518972527257e-05, "loss": 0.2898, "step": 1756 }, { "epoch": 1.31807951987997, "grad_norm": 0.4343377351760864, "learning_rate": 3.1434377786684197e-05, "loss": 0.2103, "step": 1757 }, { "epoch": 1.3188297074268567, "grad_norm": 0.3335869610309601, "learning_rate": 3.137359768630972e-05, "loss": 0.2294, "step": 1758 }, { "epoch": 1.3195798949737434, "grad_norm": 0.4104772210121155, "learning_rate": 3.131284952838106e-05, "loss": 0.2294, "step": 1759 }, { "epoch": 1.32033008252063, "grad_norm": 0.2743198275566101, "learning_rate": 3.125213341707528e-05, "loss": 0.1892, "step": 1760 }, { "epoch": 1.3210802700675168, "grad_norm": 0.36215102672576904, "learning_rate": 3.1191449456514575e-05, "loss": 0.1975, "step": 1761 }, { "epoch": 1.3218304576144035, "grad_norm": 0.28209322690963745, "learning_rate": 3.113079775076593e-05, "loss": 0.2332, "step": 1762 }, { "epoch": 1.3225806451612903, "grad_norm": 0.2658754587173462, "learning_rate": 3.107017840384107e-05, "loss": 0.2277, "step": 1763 }, { "epoch": 1.323330832708177, "grad_norm": 0.3233780562877655, "learning_rate": 3.100959151969619e-05, "loss": 0.2439, "step": 1764 }, { "epoch": 1.3240810202550637, "grad_norm": 0.3709869682788849, "learning_rate": 3.0949037202231826e-05, "loss": 0.1724, "step": 1765 }, { "epoch": 1.3248312078019504, "grad_norm": 0.4281843304634094, "learning_rate": 3.08885155552927e-05, "loss": 0.2598, "step": 1766 }, { "epoch": 1.3255813953488373, "grad_norm": 0.376985102891922, "learning_rate": 3.082802668266743e-05, "loss": 0.2105, "step": 1767 }, { "epoch": 1.3263315828957238, "grad_norm": 0.2816842496395111, "learning_rate": 3.076757068808852e-05, "loss": 0.2006, "step": 1768 }, { "epoch": 1.3270817704426108, "grad_norm": 0.3276558816432953, "learning_rate": 3.070714767523203e-05, "loss": 0.2462, "step": 1769 }, { "epoch": 1.3278319579894973, "grad_norm": 0.3480972945690155, "learning_rate": 3.0646757747717475e-05, "loss": 0.1941, "step": 1770 }, { "epoch": 1.3285821455363842, "grad_norm": 0.3278963565826416, "learning_rate": 3.0586401009107636e-05, "loss": 0.2187, "step": 1771 }, { "epoch": 1.3293323330832707, "grad_norm": 0.3244778513908386, "learning_rate": 3.0526077562908386e-05, "loss": 0.2788, "step": 1772 }, { "epoch": 1.3300825206301576, "grad_norm": 0.3286433815956116, "learning_rate": 3.0465787512568466e-05, "loss": 0.2963, "step": 1773 }, { "epoch": 1.3308327081770441, "grad_norm": 0.38392579555511475, "learning_rate": 3.040553096147942e-05, "loss": 0.25, "step": 1774 }, { "epoch": 1.331582895723931, "grad_norm": 0.2530438005924225, "learning_rate": 3.0345308012975255e-05, "loss": 0.1932, "step": 1775 }, { "epoch": 1.3323330832708178, "grad_norm": 0.4070311486721039, "learning_rate": 3.0285118770332428e-05, "loss": 0.269, "step": 1776 }, { "epoch": 1.3330832708177045, "grad_norm": 0.3199729323387146, "learning_rate": 3.022496333676954e-05, "loss": 0.211, "step": 1777 }, { "epoch": 1.3338334583645912, "grad_norm": 0.33081933856010437, "learning_rate": 3.0164841815447263e-05, "loss": 0.2237, "step": 1778 }, { "epoch": 1.334583645911478, "grad_norm": 0.3206513524055481, "learning_rate": 3.0104754309468066e-05, "loss": 0.2123, "step": 1779 }, { "epoch": 1.3353338334583646, "grad_norm": 0.37083613872528076, "learning_rate": 3.00447009218761e-05, "loss": 0.2819, "step": 1780 }, { "epoch": 1.3360840210052514, "grad_norm": 0.29029735922813416, "learning_rate": 2.9984681755657017e-05, "loss": 0.1727, "step": 1781 }, { "epoch": 1.336834208552138, "grad_norm": 0.24736320972442627, "learning_rate": 2.9924696913737792e-05, "loss": 0.2197, "step": 1782 }, { "epoch": 1.3375843960990248, "grad_norm": 0.33381155133247375, "learning_rate": 2.986474649898651e-05, "loss": 0.203, "step": 1783 }, { "epoch": 1.3383345836459115, "grad_norm": 0.34021586179733276, "learning_rate": 2.9804830614212242e-05, "loss": 0.2118, "step": 1784 }, { "epoch": 1.3390847711927982, "grad_norm": 0.3193535804748535, "learning_rate": 2.9744949362164798e-05, "loss": 0.2557, "step": 1785 }, { "epoch": 1.339834958739685, "grad_norm": 0.30929240584373474, "learning_rate": 2.9685102845534658e-05, "loss": 0.2024, "step": 1786 }, { "epoch": 1.3405851462865717, "grad_norm": 0.2783874273300171, "learning_rate": 2.9625291166952702e-05, "loss": 0.2294, "step": 1787 }, { "epoch": 1.3413353338334584, "grad_norm": 0.3557001054286957, "learning_rate": 2.956551442899005e-05, "loss": 0.3239, "step": 1788 }, { "epoch": 1.342085521380345, "grad_norm": 0.36473843455314636, "learning_rate": 2.9505772734157948e-05, "loss": 0.3257, "step": 1789 }, { "epoch": 1.3428357089272318, "grad_norm": 0.3535447418689728, "learning_rate": 2.9446066184907495e-05, "loss": 0.2417, "step": 1790 }, { "epoch": 1.3435858964741185, "grad_norm": 0.3848086893558502, "learning_rate": 2.9386394883629565e-05, "loss": 0.2241, "step": 1791 }, { "epoch": 1.3443360840210052, "grad_norm": 0.30586060881614685, "learning_rate": 2.932675893265454e-05, "loss": 0.219, "step": 1792 }, { "epoch": 1.345086271567892, "grad_norm": 0.2651546001434326, "learning_rate": 2.926715843425223e-05, "loss": 0.2538, "step": 1793 }, { "epoch": 1.3458364591147787, "grad_norm": 0.23828212916851044, "learning_rate": 2.9207593490631592e-05, "loss": 0.2154, "step": 1794 }, { "epoch": 1.3465866466616654, "grad_norm": 0.2617068588733673, "learning_rate": 2.914806420394064e-05, "loss": 0.2672, "step": 1795 }, { "epoch": 1.347336834208552, "grad_norm": 0.3360888361930847, "learning_rate": 2.908857067626629e-05, "loss": 0.2385, "step": 1796 }, { "epoch": 1.3480870217554388, "grad_norm": 0.315958172082901, "learning_rate": 2.902911300963403e-05, "loss": 0.2553, "step": 1797 }, { "epoch": 1.3488372093023255, "grad_norm": 0.2911926507949829, "learning_rate": 2.8969691306007918e-05, "loss": 0.1942, "step": 1798 }, { "epoch": 1.3495873968492123, "grad_norm": 0.23689843714237213, "learning_rate": 2.891030566729032e-05, "loss": 0.1774, "step": 1799 }, { "epoch": 1.350337584396099, "grad_norm": 0.24161942303180695, "learning_rate": 2.8850956195321795e-05, "loss": 0.1625, "step": 1800 }, { "epoch": 1.350337584396099, "eval_loss": 0.25722581148147583, "eval_runtime": 8.9329, "eval_samples_per_second": 6.045, "eval_steps_per_second": 1.567, "step": 1800 }, { "epoch": 1.3510877719429857, "grad_norm": 0.41473543643951416, "learning_rate": 2.8791642991880784e-05, "loss": 0.2724, "step": 1801 }, { "epoch": 1.3518379594898724, "grad_norm": 0.3009076416492462, "learning_rate": 2.873236615868362e-05, "loss": 0.2044, "step": 1802 }, { "epoch": 1.3525881470367591, "grad_norm": 0.36390992999076843, "learning_rate": 2.8673125797384243e-05, "loss": 0.2532, "step": 1803 }, { "epoch": 1.3533383345836458, "grad_norm": 0.30089640617370605, "learning_rate": 2.8613922009574024e-05, "loss": 0.2362, "step": 1804 }, { "epoch": 1.3540885221305325, "grad_norm": 0.3343108594417572, "learning_rate": 2.8554754896781656e-05, "loss": 0.1879, "step": 1805 }, { "epoch": 1.3548387096774195, "grad_norm": 0.36372244358062744, "learning_rate": 2.8495624560472866e-05, "loss": 0.1702, "step": 1806 }, { "epoch": 1.355588897224306, "grad_norm": 0.35814642906188965, "learning_rate": 2.843653110205039e-05, "loss": 0.1352, "step": 1807 }, { "epoch": 1.356339084771193, "grad_norm": 0.3516104519367218, "learning_rate": 2.8377474622853683e-05, "loss": 0.2469, "step": 1808 }, { "epoch": 1.3570892723180794, "grad_norm": 0.32612282037734985, "learning_rate": 2.8318455224158786e-05, "loss": 0.2378, "step": 1809 }, { "epoch": 1.3578394598649663, "grad_norm": 0.35120633244514465, "learning_rate": 2.8259473007178163e-05, "loss": 0.2378, "step": 1810 }, { "epoch": 1.3585896474118528, "grad_norm": 0.3522869348526001, "learning_rate": 2.8200528073060507e-05, "loss": 0.1773, "step": 1811 }, { "epoch": 1.3593398349587398, "grad_norm": 0.3047502040863037, "learning_rate": 2.814162052289058e-05, "loss": 0.1724, "step": 1812 }, { "epoch": 1.3600900225056263, "grad_norm": 0.2902930974960327, "learning_rate": 2.8082750457689033e-05, "loss": 0.1961, "step": 1813 }, { "epoch": 1.3608402100525132, "grad_norm": 0.3911548852920532, "learning_rate": 2.8023917978412207e-05, "loss": 0.2121, "step": 1814 }, { "epoch": 1.3615903975994, "grad_norm": 0.2949431240558624, "learning_rate": 2.7965123185952023e-05, "loss": 0.215, "step": 1815 }, { "epoch": 1.3623405851462866, "grad_norm": 0.36813151836395264, "learning_rate": 2.7906366181135775e-05, "loss": 0.2667, "step": 1816 }, { "epoch": 1.3630907726931734, "grad_norm": 0.35193634033203125, "learning_rate": 2.7847647064725924e-05, "loss": 0.1806, "step": 1817 }, { "epoch": 1.36384096024006, "grad_norm": 0.3044644892215729, "learning_rate": 2.778896593741999e-05, "loss": 0.2537, "step": 1818 }, { "epoch": 1.3645911477869468, "grad_norm": 0.33335423469543457, "learning_rate": 2.77303228998503e-05, "loss": 0.1629, "step": 1819 }, { "epoch": 1.3653413353338335, "grad_norm": 0.40530210733413696, "learning_rate": 2.7671718052583908e-05, "loss": 0.2511, "step": 1820 }, { "epoch": 1.3660915228807202, "grad_norm": 0.25957563519477844, "learning_rate": 2.7613151496122347e-05, "loss": 0.1896, "step": 1821 }, { "epoch": 1.366841710427607, "grad_norm": 0.23524042963981628, "learning_rate": 2.7554623330901524e-05, "loss": 0.2478, "step": 1822 }, { "epoch": 1.3675918979744937, "grad_norm": 0.39039161801338196, "learning_rate": 2.749613365729141e-05, "loss": 0.2396, "step": 1823 }, { "epoch": 1.3683420855213804, "grad_norm": 0.3564348816871643, "learning_rate": 2.7437682575596104e-05, "loss": 0.2319, "step": 1824 }, { "epoch": 1.369092273068267, "grad_norm": 0.35391780734062195, "learning_rate": 2.7379270186053428e-05, "loss": 0.2418, "step": 1825 }, { "epoch": 1.3698424606151538, "grad_norm": 0.2859008014202118, "learning_rate": 2.7320896588834903e-05, "loss": 0.2701, "step": 1826 }, { "epoch": 1.3705926481620405, "grad_norm": 0.3732033371925354, "learning_rate": 2.7262561884045457e-05, "loss": 0.1893, "step": 1827 }, { "epoch": 1.3713428357089272, "grad_norm": 0.3259146809577942, "learning_rate": 2.720426617172339e-05, "loss": 0.2057, "step": 1828 }, { "epoch": 1.372093023255814, "grad_norm": 0.3049114942550659, "learning_rate": 2.71460095518401e-05, "loss": 0.2404, "step": 1829 }, { "epoch": 1.3728432108027007, "grad_norm": 0.31050169467926025, "learning_rate": 2.708779212429996e-05, "loss": 0.2019, "step": 1830 }, { "epoch": 1.3735933983495874, "grad_norm": 0.3692934513092041, "learning_rate": 2.702961398894014e-05, "loss": 0.1626, "step": 1831 }, { "epoch": 1.374343585896474, "grad_norm": 0.46135446429252625, "learning_rate": 2.6971475245530375e-05, "loss": 0.227, "step": 1832 }, { "epoch": 1.3750937734433608, "grad_norm": 0.4024883806705475, "learning_rate": 2.6913375993772915e-05, "loss": 0.2335, "step": 1833 }, { "epoch": 1.3758439609902475, "grad_norm": 0.37783777713775635, "learning_rate": 2.6855316333302237e-05, "loss": 0.2351, "step": 1834 }, { "epoch": 1.3765941485371342, "grad_norm": 0.3174808621406555, "learning_rate": 2.6797296363684977e-05, "loss": 0.1854, "step": 1835 }, { "epoch": 1.377344336084021, "grad_norm": 0.45114609599113464, "learning_rate": 2.6739316184419622e-05, "loss": 0.2718, "step": 1836 }, { "epoch": 1.3780945236309077, "grad_norm": 0.47988560795783997, "learning_rate": 2.6681375894936472e-05, "loss": 0.1861, "step": 1837 }, { "epoch": 1.3788447111777944, "grad_norm": 0.30619314312934875, "learning_rate": 2.662347559459746e-05, "loss": 0.2678, "step": 1838 }, { "epoch": 1.379594898724681, "grad_norm": 0.32252827286720276, "learning_rate": 2.6565615382695896e-05, "loss": 0.1908, "step": 1839 }, { "epoch": 1.3803450862715678, "grad_norm": 0.29635876417160034, "learning_rate": 2.6507795358456307e-05, "loss": 0.2572, "step": 1840 }, { "epoch": 1.3810952738184545, "grad_norm": 0.3219766616821289, "learning_rate": 2.6450015621034362e-05, "loss": 0.2274, "step": 1841 }, { "epoch": 1.3818454613653413, "grad_norm": 0.31852519512176514, "learning_rate": 2.6392276269516613e-05, "loss": 0.1499, "step": 1842 }, { "epoch": 1.3825956489122282, "grad_norm": 0.34567901492118835, "learning_rate": 2.63345774029204e-05, "loss": 0.2621, "step": 1843 }, { "epoch": 1.3833458364591147, "grad_norm": 0.4169693887233734, "learning_rate": 2.6276919120193543e-05, "loss": 0.2631, "step": 1844 }, { "epoch": 1.3840960240060016, "grad_norm": 0.3173653483390808, "learning_rate": 2.621930152021434e-05, "loss": 0.1962, "step": 1845 }, { "epoch": 1.3848462115528881, "grad_norm": 0.4584205448627472, "learning_rate": 2.6161724701791306e-05, "loss": 0.2604, "step": 1846 }, { "epoch": 1.385596399099775, "grad_norm": 0.36192214488983154, "learning_rate": 2.6104188763663018e-05, "loss": 0.202, "step": 1847 }, { "epoch": 1.3863465866466615, "grad_norm": 0.3180589973926544, "learning_rate": 2.604669380449795e-05, "loss": 0.1948, "step": 1848 }, { "epoch": 1.3870967741935485, "grad_norm": 0.45792898535728455, "learning_rate": 2.598923992289427e-05, "loss": 0.2038, "step": 1849 }, { "epoch": 1.387846961740435, "grad_norm": 0.2872127294540405, "learning_rate": 2.5931827217379746e-05, "loss": 0.218, "step": 1850 }, { "epoch": 1.388597149287322, "grad_norm": 0.2964303195476532, "learning_rate": 2.5874455786411505e-05, "loss": 0.1825, "step": 1851 }, { "epoch": 1.3893473368342086, "grad_norm": 0.2897481918334961, "learning_rate": 2.5817125728375912e-05, "loss": 0.2107, "step": 1852 }, { "epoch": 1.3900975243810954, "grad_norm": 0.3183579444885254, "learning_rate": 2.5759837141588362e-05, "loss": 0.2041, "step": 1853 }, { "epoch": 1.390847711927982, "grad_norm": 0.3782413601875305, "learning_rate": 2.5702590124293147e-05, "loss": 0.2073, "step": 1854 }, { "epoch": 1.3915978994748688, "grad_norm": 0.31032872200012207, "learning_rate": 2.5645384774663262e-05, "loss": 0.21, "step": 1855 }, { "epoch": 1.3923480870217555, "grad_norm": 0.391583651304245, "learning_rate": 2.5588221190800264e-05, "loss": 0.1955, "step": 1856 }, { "epoch": 1.3930982745686422, "grad_norm": 0.4930109977722168, "learning_rate": 2.5531099470734038e-05, "loss": 0.2927, "step": 1857 }, { "epoch": 1.393848462115529, "grad_norm": 0.3764902353286743, "learning_rate": 2.5474019712422724e-05, "loss": 0.1726, "step": 1858 }, { "epoch": 1.3945986496624156, "grad_norm": 0.44849470257759094, "learning_rate": 2.541698201375249e-05, "loss": 0.211, "step": 1859 }, { "epoch": 1.3953488372093024, "grad_norm": 0.3235580325126648, "learning_rate": 2.5359986472537373e-05, "loss": 0.165, "step": 1860 }, { "epoch": 1.396099024756189, "grad_norm": 0.26657378673553467, "learning_rate": 2.530303318651913e-05, "loss": 0.1881, "step": 1861 }, { "epoch": 1.3968492123030758, "grad_norm": 0.3232589066028595, "learning_rate": 2.5246122253366998e-05, "loss": 0.2283, "step": 1862 }, { "epoch": 1.3975993998499625, "grad_norm": 0.2692256271839142, "learning_rate": 2.5189253770677644e-05, "loss": 0.2033, "step": 1863 }, { "epoch": 1.3983495873968492, "grad_norm": 0.3684447705745697, "learning_rate": 2.5132427835974926e-05, "loss": 0.1845, "step": 1864 }, { "epoch": 1.399099774943736, "grad_norm": 0.2801147401332855, "learning_rate": 2.507564454670971e-05, "loss": 0.2214, "step": 1865 }, { "epoch": 1.3998499624906227, "grad_norm": 0.30190205574035645, "learning_rate": 2.5018904000259757e-05, "loss": 0.1831, "step": 1866 }, { "epoch": 1.4006001500375094, "grad_norm": 0.2702212333679199, "learning_rate": 2.4962206293929512e-05, "loss": 0.1652, "step": 1867 }, { "epoch": 1.401350337584396, "grad_norm": 0.3676742911338806, "learning_rate": 2.490555152494996e-05, "loss": 0.1526, "step": 1868 }, { "epoch": 1.4021005251312828, "grad_norm": 0.3497639298439026, "learning_rate": 2.4848939790478463e-05, "loss": 0.2298, "step": 1869 }, { "epoch": 1.4028507126781695, "grad_norm": 0.34031257033348083, "learning_rate": 2.4792371187598544e-05, "loss": 0.2201, "step": 1870 }, { "epoch": 1.4036009002250562, "grad_norm": 0.2810254395008087, "learning_rate": 2.4735845813319804e-05, "loss": 0.1864, "step": 1871 }, { "epoch": 1.404351087771943, "grad_norm": 0.38328075408935547, "learning_rate": 2.4679363764577683e-05, "loss": 0.2883, "step": 1872 }, { "epoch": 1.4051012753188297, "grad_norm": 0.3316168785095215, "learning_rate": 2.462292513823336e-05, "loss": 0.2215, "step": 1873 }, { "epoch": 1.4058514628657164, "grad_norm": 0.3287341594696045, "learning_rate": 2.4566530031073486e-05, "loss": 0.208, "step": 1874 }, { "epoch": 1.406601650412603, "grad_norm": 0.27530941367149353, "learning_rate": 2.451017853981013e-05, "loss": 0.1208, "step": 1875 }, { "epoch": 1.4073518379594898, "grad_norm": 0.3174988627433777, "learning_rate": 2.4453870761080554e-05, "loss": 0.3047, "step": 1876 }, { "epoch": 1.4081020255063765, "grad_norm": 0.2633969187736511, "learning_rate": 2.4397606791447052e-05, "loss": 0.1775, "step": 1877 }, { "epoch": 1.4088522130532632, "grad_norm": 0.3297993838787079, "learning_rate": 2.4341386727396793e-05, "loss": 0.2797, "step": 1878 }, { "epoch": 1.40960240060015, "grad_norm": 0.3476991653442383, "learning_rate": 2.4285210665341646e-05, "loss": 0.206, "step": 1879 }, { "epoch": 1.4103525881470367, "grad_norm": 0.4210616648197174, "learning_rate": 2.422907870161803e-05, "loss": 0.234, "step": 1880 }, { "epoch": 1.4111027756939234, "grad_norm": 0.4128344655036926, "learning_rate": 2.4172990932486733e-05, "loss": 0.2026, "step": 1881 }, { "epoch": 1.4118529632408103, "grad_norm": 0.4745548963546753, "learning_rate": 2.4116947454132782e-05, "loss": 0.2572, "step": 1882 }, { "epoch": 1.4126031507876968, "grad_norm": 0.2751515507698059, "learning_rate": 2.4060948362665176e-05, "loss": 0.2157, "step": 1883 }, { "epoch": 1.4133533383345838, "grad_norm": 0.3571735918521881, "learning_rate": 2.4004993754116867e-05, "loss": 0.1886, "step": 1884 }, { "epoch": 1.4141035258814703, "grad_norm": 0.3857848048210144, "learning_rate": 2.39490837244445e-05, "loss": 0.1931, "step": 1885 }, { "epoch": 1.4148537134283572, "grad_norm": 0.29611849784851074, "learning_rate": 2.389321836952828e-05, "loss": 0.2361, "step": 1886 }, { "epoch": 1.4156039009752437, "grad_norm": 0.34168103337287903, "learning_rate": 2.383739778517176e-05, "loss": 0.2174, "step": 1887 }, { "epoch": 1.4163540885221306, "grad_norm": 0.37992724776268005, "learning_rate": 2.3781622067101767e-05, "loss": 0.3052, "step": 1888 }, { "epoch": 1.4171042760690171, "grad_norm": 0.3205695152282715, "learning_rate": 2.372589131096816e-05, "loss": 0.2213, "step": 1889 }, { "epoch": 1.417854463615904, "grad_norm": 0.2661772072315216, "learning_rate": 2.36702056123437e-05, "loss": 0.2236, "step": 1890 }, { "epoch": 1.4186046511627908, "grad_norm": 0.4081861972808838, "learning_rate": 2.3614565066723892e-05, "loss": 0.216, "step": 1891 }, { "epoch": 1.4193548387096775, "grad_norm": 0.3451412320137024, "learning_rate": 2.355896976952674e-05, "loss": 0.2138, "step": 1892 }, { "epoch": 1.4201050262565642, "grad_norm": 0.3408620059490204, "learning_rate": 2.350341981609276e-05, "loss": 0.2655, "step": 1893 }, { "epoch": 1.420855213803451, "grad_norm": 0.3166339695453644, "learning_rate": 2.344791530168465e-05, "loss": 0.2268, "step": 1894 }, { "epoch": 1.4216054013503376, "grad_norm": 0.3005989193916321, "learning_rate": 2.339245632148715e-05, "loss": 0.27, "step": 1895 }, { "epoch": 1.4223555888972244, "grad_norm": 0.2609424889087677, "learning_rate": 2.3337042970606965e-05, "loss": 0.1967, "step": 1896 }, { "epoch": 1.423105776444111, "grad_norm": 0.2880832850933075, "learning_rate": 2.3281675344072545e-05, "loss": 0.2484, "step": 1897 }, { "epoch": 1.4238559639909978, "grad_norm": 0.3153662383556366, "learning_rate": 2.3226353536833907e-05, "loss": 0.228, "step": 1898 }, { "epoch": 1.4246061515378845, "grad_norm": 0.32784128189086914, "learning_rate": 2.317107764376253e-05, "loss": 0.2305, "step": 1899 }, { "epoch": 1.4253563390847712, "grad_norm": 0.41642847657203674, "learning_rate": 2.3115847759651082e-05, "loss": 0.2077, "step": 1900 }, { "epoch": 1.426106526631658, "grad_norm": 0.3231176733970642, "learning_rate": 2.3060663979213404e-05, "loss": 0.2268, "step": 1901 }, { "epoch": 1.4268567141785446, "grad_norm": 0.36550119519233704, "learning_rate": 2.300552639708423e-05, "loss": 0.2018, "step": 1902 }, { "epoch": 1.4276069017254314, "grad_norm": 0.29330140352249146, "learning_rate": 2.2950435107819124e-05, "loss": 0.2411, "step": 1903 }, { "epoch": 1.428357089272318, "grad_norm": 0.3829742670059204, "learning_rate": 2.2895390205894164e-05, "loss": 0.2384, "step": 1904 }, { "epoch": 1.4291072768192048, "grad_norm": 0.35479775071144104, "learning_rate": 2.2840391785705967e-05, "loss": 0.2191, "step": 1905 }, { "epoch": 1.4298574643660915, "grad_norm": 0.39314618706703186, "learning_rate": 2.278543994157139e-05, "loss": 0.2413, "step": 1906 }, { "epoch": 1.4306076519129782, "grad_norm": 0.3135644793510437, "learning_rate": 2.2730534767727483e-05, "loss": 0.251, "step": 1907 }, { "epoch": 1.431357839459865, "grad_norm": 0.25173741579055786, "learning_rate": 2.267567635833116e-05, "loss": 0.3515, "step": 1908 }, { "epoch": 1.4321080270067517, "grad_norm": 0.4123310148715973, "learning_rate": 2.2620864807459213e-05, "loss": 0.2731, "step": 1909 }, { "epoch": 1.4328582145536384, "grad_norm": 0.28756436705589294, "learning_rate": 2.2566100209108048e-05, "loss": 0.1985, "step": 1910 }, { "epoch": 1.433608402100525, "grad_norm": 0.4153062701225281, "learning_rate": 2.2511382657193565e-05, "loss": 0.2538, "step": 1911 }, { "epoch": 1.4343585896474118, "grad_norm": 0.2938479483127594, "learning_rate": 2.2456712245550993e-05, "loss": 0.2213, "step": 1912 }, { "epoch": 1.4351087771942985, "grad_norm": 0.23668654263019562, "learning_rate": 2.2402089067934668e-05, "loss": 0.1779, "step": 1913 }, { "epoch": 1.4358589647411852, "grad_norm": 0.3860184848308563, "learning_rate": 2.2347513218017974e-05, "loss": 0.2108, "step": 1914 }, { "epoch": 1.436609152288072, "grad_norm": 0.4423319399356842, "learning_rate": 2.2292984789393122e-05, "loss": 0.182, "step": 1915 }, { "epoch": 1.4373593398349587, "grad_norm": 0.24182984232902527, "learning_rate": 2.2238503875571028e-05, "loss": 0.1448, "step": 1916 }, { "epoch": 1.4381095273818454, "grad_norm": 0.26014232635498047, "learning_rate": 2.218407056998104e-05, "loss": 0.296, "step": 1917 }, { "epoch": 1.438859714928732, "grad_norm": 0.4221387803554535, "learning_rate": 2.2129684965970948e-05, "loss": 0.1733, "step": 1918 }, { "epoch": 1.439609902475619, "grad_norm": 0.2776692807674408, "learning_rate": 2.2075347156806697e-05, "loss": 0.2244, "step": 1919 }, { "epoch": 1.4403600900225055, "grad_norm": 0.3943135440349579, "learning_rate": 2.2021057235672288e-05, "loss": 0.2609, "step": 1920 }, { "epoch": 1.4411102775693925, "grad_norm": 0.36778298020362854, "learning_rate": 2.1966815295669585e-05, "loss": 0.1791, "step": 1921 }, { "epoch": 1.441860465116279, "grad_norm": 0.2615351676940918, "learning_rate": 2.1912621429818177e-05, "loss": 0.1771, "step": 1922 }, { "epoch": 1.442610652663166, "grad_norm": 0.33867865800857544, "learning_rate": 2.18584757310552e-05, "loss": 0.2722, "step": 1923 }, { "epoch": 1.4433608402100524, "grad_norm": 0.32349616289138794, "learning_rate": 2.1804378292235224e-05, "loss": 0.2611, "step": 1924 }, { "epoch": 1.4441110277569393, "grad_norm": 0.3319915533065796, "learning_rate": 2.1750329206129988e-05, "loss": 0.2043, "step": 1925 }, { "epoch": 1.4448612153038258, "grad_norm": 0.36895647644996643, "learning_rate": 2.1696328565428364e-05, "loss": 0.2349, "step": 1926 }, { "epoch": 1.4456114028507128, "grad_norm": 0.3146461844444275, "learning_rate": 2.1642376462736148e-05, "loss": 0.2044, "step": 1927 }, { "epoch": 1.4463615903975993, "grad_norm": 0.49012184143066406, "learning_rate": 2.158847299057587e-05, "loss": 0.2385, "step": 1928 }, { "epoch": 1.4471117779444862, "grad_norm": 0.34117910265922546, "learning_rate": 2.1534618241386705e-05, "loss": 0.2012, "step": 1929 }, { "epoch": 1.447861965491373, "grad_norm": 0.36037197709083557, "learning_rate": 2.14808123075242e-05, "loss": 0.2857, "step": 1930 }, { "epoch": 1.4486121530382596, "grad_norm": 0.35831284523010254, "learning_rate": 2.1427055281260255e-05, "loss": 0.24, "step": 1931 }, { "epoch": 1.4493623405851463, "grad_norm": 0.39922547340393066, "learning_rate": 2.1373347254782882e-05, "loss": 0.2261, "step": 1932 }, { "epoch": 1.450112528132033, "grad_norm": 0.317452073097229, "learning_rate": 2.1319688320196048e-05, "loss": 0.1706, "step": 1933 }, { "epoch": 1.4508627156789198, "grad_norm": 0.2653583586215973, "learning_rate": 2.1266078569519542e-05, "loss": 0.1961, "step": 1934 }, { "epoch": 1.4516129032258065, "grad_norm": 0.36667782068252563, "learning_rate": 2.121251809468882e-05, "loss": 0.2666, "step": 1935 }, { "epoch": 1.4523630907726932, "grad_norm": 0.2900708317756653, "learning_rate": 2.1159006987554807e-05, "loss": 0.2216, "step": 1936 }, { "epoch": 1.45311327831958, "grad_norm": 0.2978344261646271, "learning_rate": 2.1105545339883808e-05, "loss": 0.1963, "step": 1937 }, { "epoch": 1.4538634658664666, "grad_norm": 0.3685671091079712, "learning_rate": 2.1052133243357253e-05, "loss": 0.2809, "step": 1938 }, { "epoch": 1.4546136534133534, "grad_norm": 0.42794328927993774, "learning_rate": 2.0998770789571636e-05, "loss": 0.213, "step": 1939 }, { "epoch": 1.45536384096024, "grad_norm": 0.3327684998512268, "learning_rate": 2.0945458070038315e-05, "loss": 0.3008, "step": 1940 }, { "epoch": 1.4561140285071268, "grad_norm": 0.4629894495010376, "learning_rate": 2.0892195176183354e-05, "loss": 0.2239, "step": 1941 }, { "epoch": 1.4568642160540135, "grad_norm": 0.4106677770614624, "learning_rate": 2.083898219934739e-05, "loss": 0.2512, "step": 1942 }, { "epoch": 1.4576144036009002, "grad_norm": 0.3300548493862152, "learning_rate": 2.0785819230785398e-05, "loss": 0.2263, "step": 1943 }, { "epoch": 1.458364591147787, "grad_norm": 0.26039597392082214, "learning_rate": 2.073270636166666e-05, "loss": 0.2011, "step": 1944 }, { "epoch": 1.4591147786946737, "grad_norm": 0.3882303833961487, "learning_rate": 2.0679643683074513e-05, "loss": 0.2114, "step": 1945 }, { "epoch": 1.4598649662415604, "grad_norm": 0.3750239312648773, "learning_rate": 2.0626631286006236e-05, "loss": 0.2403, "step": 1946 }, { "epoch": 1.460615153788447, "grad_norm": 0.3504992723464966, "learning_rate": 2.0573669261372847e-05, "loss": 0.2055, "step": 1947 }, { "epoch": 1.4613653413353338, "grad_norm": 0.3679874539375305, "learning_rate": 2.052075769999899e-05, "loss": 0.2141, "step": 1948 }, { "epoch": 1.4621155288822205, "grad_norm": 0.2993316054344177, "learning_rate": 2.046789669262283e-05, "loss": 0.1837, "step": 1949 }, { "epoch": 1.4628657164291072, "grad_norm": 0.3158014714717865, "learning_rate": 2.0415086329895784e-05, "loss": 0.169, "step": 1950 }, { "epoch": 1.463615903975994, "grad_norm": 0.3342015743255615, "learning_rate": 2.0362326702382384e-05, "loss": 0.1902, "step": 1951 }, { "epoch": 1.4643660915228807, "grad_norm": 0.33370962738990784, "learning_rate": 2.0309617900560218e-05, "loss": 0.2134, "step": 1952 }, { "epoch": 1.4651162790697674, "grad_norm": 0.3770402669906616, "learning_rate": 2.0256960014819692e-05, "loss": 0.2102, "step": 1953 }, { "epoch": 1.465866466616654, "grad_norm": 0.35789042711257935, "learning_rate": 2.020435313546391e-05, "loss": 0.1705, "step": 1954 }, { "epoch": 1.4666166541635408, "grad_norm": 0.4119642972946167, "learning_rate": 2.0151797352708457e-05, "loss": 0.3186, "step": 1955 }, { "epoch": 1.4673668417104275, "grad_norm": 0.2800465226173401, "learning_rate": 2.0099292756681343e-05, "loss": 0.2165, "step": 1956 }, { "epoch": 1.4681170292573142, "grad_norm": 0.29550519585609436, "learning_rate": 2.0046839437422772e-05, "loss": 0.2343, "step": 1957 }, { "epoch": 1.4688672168042012, "grad_norm": 0.34560781717300415, "learning_rate": 1.999443748488503e-05, "loss": 0.2276, "step": 1958 }, { "epoch": 1.4696174043510877, "grad_norm": 0.2832637131214142, "learning_rate": 1.9942086988932323e-05, "loss": 0.2054, "step": 1959 }, { "epoch": 1.4703675918979746, "grad_norm": 0.23919551074504852, "learning_rate": 1.9889788039340558e-05, "loss": 0.1571, "step": 1960 }, { "epoch": 1.471117779444861, "grad_norm": 0.3006112277507782, "learning_rate": 1.9837540725797305e-05, "loss": 0.2092, "step": 1961 }, { "epoch": 1.471867966991748, "grad_norm": 0.3058153986930847, "learning_rate": 1.9785345137901533e-05, "loss": 0.2278, "step": 1962 }, { "epoch": 1.4726181545386345, "grad_norm": 0.3452884554862976, "learning_rate": 1.9733201365163607e-05, "loss": 0.2052, "step": 1963 }, { "epoch": 1.4733683420855215, "grad_norm": 0.3478454351425171, "learning_rate": 1.968110949700489e-05, "loss": 0.2532, "step": 1964 }, { "epoch": 1.474118529632408, "grad_norm": 0.21480776369571686, "learning_rate": 1.962906962275784e-05, "loss": 0.1848, "step": 1965 }, { "epoch": 1.474868717179295, "grad_norm": 0.3528771698474884, "learning_rate": 1.9577081831665707e-05, "loss": 0.251, "step": 1966 }, { "epoch": 1.4756189047261816, "grad_norm": 0.25388941168785095, "learning_rate": 1.9525146212882456e-05, "loss": 0.2686, "step": 1967 }, { "epoch": 1.4763690922730683, "grad_norm": 0.22556820511817932, "learning_rate": 1.9473262855472517e-05, "loss": 0.1748, "step": 1968 }, { "epoch": 1.477119279819955, "grad_norm": 0.2955077290534973, "learning_rate": 1.942143184841077e-05, "loss": 0.206, "step": 1969 }, { "epoch": 1.4778694673668418, "grad_norm": 0.30799224972724915, "learning_rate": 1.9369653280582273e-05, "loss": 0.2037, "step": 1970 }, { "epoch": 1.4786196549137285, "grad_norm": 0.3539981544017792, "learning_rate": 1.931792724078218e-05, "loss": 0.2245, "step": 1971 }, { "epoch": 1.4793698424606152, "grad_norm": 0.43906956911087036, "learning_rate": 1.9266253817715575e-05, "loss": 0.2677, "step": 1972 }, { "epoch": 1.480120030007502, "grad_norm": 0.4066217243671417, "learning_rate": 1.921463309999724e-05, "loss": 0.2146, "step": 1973 }, { "epoch": 1.4808702175543886, "grad_norm": 0.3831619620323181, "learning_rate": 1.9163065176151662e-05, "loss": 0.1843, "step": 1974 }, { "epoch": 1.4816204051012754, "grad_norm": 0.3045404553413391, "learning_rate": 1.9111550134612738e-05, "loss": 0.2187, "step": 1975 }, { "epoch": 1.482370592648162, "grad_norm": 0.34106945991516113, "learning_rate": 1.9060088063723696e-05, "loss": 0.1845, "step": 1976 }, { "epoch": 1.4831207801950488, "grad_norm": 0.3167369067668915, "learning_rate": 1.900867905173692e-05, "loss": 0.2037, "step": 1977 }, { "epoch": 1.4838709677419355, "grad_norm": 0.26099687814712524, "learning_rate": 1.8957323186813803e-05, "loss": 0.1547, "step": 1978 }, { "epoch": 1.4846211552888222, "grad_norm": 0.3368515968322754, "learning_rate": 1.8906020557024597e-05, "loss": 0.1915, "step": 1979 }, { "epoch": 1.485371342835709, "grad_norm": 0.41900205612182617, "learning_rate": 1.885477125034827e-05, "loss": 0.1945, "step": 1980 }, { "epoch": 1.4861215303825956, "grad_norm": 0.43290483951568604, "learning_rate": 1.8803575354672315e-05, "loss": 0.1748, "step": 1981 }, { "epoch": 1.4868717179294824, "grad_norm": 0.25294435024261475, "learning_rate": 1.8752432957792654e-05, "loss": 0.1965, "step": 1982 }, { "epoch": 1.487621905476369, "grad_norm": 0.3728505074977875, "learning_rate": 1.8701344147413474e-05, "loss": 0.3021, "step": 1983 }, { "epoch": 1.4883720930232558, "grad_norm": 0.3190241754055023, "learning_rate": 1.8650309011147053e-05, "loss": 0.1666, "step": 1984 }, { "epoch": 1.4891222805701425, "grad_norm": 0.3899969756603241, "learning_rate": 1.8599327636513636e-05, "loss": 0.2134, "step": 1985 }, { "epoch": 1.4898724681170292, "grad_norm": 0.38135841488838196, "learning_rate": 1.8548400110941228e-05, "loss": 0.2188, "step": 1986 }, { "epoch": 1.490622655663916, "grad_norm": 0.449079304933548, "learning_rate": 1.8497526521765534e-05, "loss": 0.2216, "step": 1987 }, { "epoch": 1.4913728432108027, "grad_norm": 0.32195788621902466, "learning_rate": 1.844670695622976e-05, "loss": 0.1569, "step": 1988 }, { "epoch": 1.4921230307576894, "grad_norm": 0.34831491112709045, "learning_rate": 1.8395941501484464e-05, "loss": 0.1985, "step": 1989 }, { "epoch": 1.492873218304576, "grad_norm": 0.31190118193626404, "learning_rate": 1.8345230244587354e-05, "loss": 0.2628, "step": 1990 }, { "epoch": 1.4936234058514628, "grad_norm": 0.34048858284950256, "learning_rate": 1.829457327250329e-05, "loss": 0.2442, "step": 1991 }, { "epoch": 1.4943735933983495, "grad_norm": 0.4425194263458252, "learning_rate": 1.8243970672103982e-05, "loss": 0.2096, "step": 1992 }, { "epoch": 1.4951237809452362, "grad_norm": 0.38266193866729736, "learning_rate": 1.8193422530167914e-05, "loss": 0.2041, "step": 1993 }, { "epoch": 1.495873968492123, "grad_norm": 0.2893502414226532, "learning_rate": 1.8142928933380142e-05, "loss": 0.2097, "step": 1994 }, { "epoch": 1.49662415603901, "grad_norm": 0.2854366898536682, "learning_rate": 1.8092489968332233e-05, "loss": 0.1897, "step": 1995 }, { "epoch": 1.4973743435858964, "grad_norm": 0.3061712086200714, "learning_rate": 1.804210572152204e-05, "loss": 0.2297, "step": 1996 }, { "epoch": 1.4981245311327833, "grad_norm": 0.342894047498703, "learning_rate": 1.7991776279353604e-05, "loss": 0.2423, "step": 1997 }, { "epoch": 1.4988747186796698, "grad_norm": 0.26828625798225403, "learning_rate": 1.794150172813693e-05, "loss": 0.1421, "step": 1998 }, { "epoch": 1.4996249062265568, "grad_norm": 0.4540550410747528, "learning_rate": 1.7891282154087934e-05, "loss": 0.2692, "step": 1999 }, { "epoch": 1.5003750937734432, "grad_norm": 0.3485063910484314, "learning_rate": 1.7841117643328246e-05, "loss": 0.2277, "step": 2000 }, { "epoch": 1.5003750937734432, "eval_loss": 0.25594252347946167, "eval_runtime": 8.9007, "eval_samples_per_second": 6.067, "eval_steps_per_second": 1.573, "step": 2000 }, { "epoch": 1.5011252813203302, "grad_norm": 0.3182351887226105, "learning_rate": 1.779100828188506e-05, "loss": 0.2186, "step": 2001 }, { "epoch": 1.5018754688672167, "grad_norm": 0.37051379680633545, "learning_rate": 1.774095415569102e-05, "loss": 0.2292, "step": 2002 }, { "epoch": 1.5026256564141036, "grad_norm": 0.3527226746082306, "learning_rate": 1.7690955350583976e-05, "loss": 0.2379, "step": 2003 }, { "epoch": 1.50337584396099, "grad_norm": 0.3646087050437927, "learning_rate": 1.764101195230696e-05, "loss": 0.2213, "step": 2004 }, { "epoch": 1.504126031507877, "grad_norm": 0.4253525137901306, "learning_rate": 1.7591124046508045e-05, "loss": 0.2463, "step": 2005 }, { "epoch": 1.5048762190547635, "grad_norm": 0.32970163226127625, "learning_rate": 1.7541291718740012e-05, "loss": 0.2293, "step": 2006 }, { "epoch": 1.5056264066016505, "grad_norm": 0.37259411811828613, "learning_rate": 1.7491515054460418e-05, "loss": 0.1964, "step": 2007 }, { "epoch": 1.506376594148537, "grad_norm": 0.2639153003692627, "learning_rate": 1.7441794139031337e-05, "loss": 0.1743, "step": 2008 }, { "epoch": 1.507126781695424, "grad_norm": 0.43189552426338196, "learning_rate": 1.7392129057719246e-05, "loss": 0.2564, "step": 2009 }, { "epoch": 1.5078769692423106, "grad_norm": 0.3170628547668457, "learning_rate": 1.7342519895694886e-05, "loss": 0.2569, "step": 2010 }, { "epoch": 1.5086271567891973, "grad_norm": 0.3008267879486084, "learning_rate": 1.7292966738033057e-05, "loss": 0.1974, "step": 2011 }, { "epoch": 1.509377344336084, "grad_norm": 0.4293673634529114, "learning_rate": 1.7243469669712546e-05, "loss": 0.2531, "step": 2012 }, { "epoch": 1.5101275318829708, "grad_norm": 0.3296526372432709, "learning_rate": 1.7194028775615966e-05, "loss": 0.1832, "step": 2013 }, { "epoch": 1.5108777194298575, "grad_norm": 0.3830187916755676, "learning_rate": 1.714464414052958e-05, "loss": 0.2444, "step": 2014 }, { "epoch": 1.5116279069767442, "grad_norm": 0.3339605927467346, "learning_rate": 1.7095315849143184e-05, "loss": 0.2259, "step": 2015 }, { "epoch": 1.512378094523631, "grad_norm": 0.41907432675361633, "learning_rate": 1.704604398604991e-05, "loss": 0.2034, "step": 2016 }, { "epoch": 1.5131282820705176, "grad_norm": 0.41604459285736084, "learning_rate": 1.6996828635746165e-05, "loss": 0.2115, "step": 2017 }, { "epoch": 1.5138784696174044, "grad_norm": 0.3704230487346649, "learning_rate": 1.6947669882631434e-05, "loss": 0.2025, "step": 2018 }, { "epoch": 1.514628657164291, "grad_norm": 0.4509468674659729, "learning_rate": 1.6898567811008135e-05, "loss": 0.276, "step": 2019 }, { "epoch": 1.5153788447111778, "grad_norm": 0.36796435713768005, "learning_rate": 1.684952250508149e-05, "loss": 0.1706, "step": 2020 }, { "epoch": 1.5161290322580645, "grad_norm": 0.3223189115524292, "learning_rate": 1.6800534048959364e-05, "loss": 0.1649, "step": 2021 }, { "epoch": 1.5168792198049512, "grad_norm": 0.32763442397117615, "learning_rate": 1.6751602526652133e-05, "loss": 0.214, "step": 2022 }, { "epoch": 1.517629407351838, "grad_norm": 0.45092734694480896, "learning_rate": 1.6702728022072562e-05, "loss": 0.2432, "step": 2023 }, { "epoch": 1.5183795948987246, "grad_norm": 0.2848072052001953, "learning_rate": 1.665391061903558e-05, "loss": 0.1366, "step": 2024 }, { "epoch": 1.5191297824456114, "grad_norm": 0.3958592116832733, "learning_rate": 1.660515040125824e-05, "loss": 0.249, "step": 2025 }, { "epoch": 1.519879969992498, "grad_norm": 0.2600467801094055, "learning_rate": 1.6556447452359512e-05, "loss": 0.2003, "step": 2026 }, { "epoch": 1.5206301575393848, "grad_norm": 0.39340925216674805, "learning_rate": 1.6507801855860177e-05, "loss": 0.2802, "step": 2027 }, { "epoch": 1.5213803450862715, "grad_norm": 0.3376513719558716, "learning_rate": 1.645921369518261e-05, "loss": 0.2325, "step": 2028 }, { "epoch": 1.5221305326331582, "grad_norm": 0.45349448919296265, "learning_rate": 1.6410683053650737e-05, "loss": 0.1742, "step": 2029 }, { "epoch": 1.5228807201800452, "grad_norm": 0.43773627281188965, "learning_rate": 1.636221001448983e-05, "loss": 0.3062, "step": 2030 }, { "epoch": 1.5236309077269317, "grad_norm": 0.3786463141441345, "learning_rate": 1.631379466082638e-05, "loss": 0.2353, "step": 2031 }, { "epoch": 1.5243810952738186, "grad_norm": 0.21817055344581604, "learning_rate": 1.626543707568795e-05, "loss": 0.2108, "step": 2032 }, { "epoch": 1.525131282820705, "grad_norm": 0.31064313650131226, "learning_rate": 1.6217137342003036e-05, "loss": 0.1744, "step": 2033 }, { "epoch": 1.525881470367592, "grad_norm": 0.4497145116329193, "learning_rate": 1.616889554260092e-05, "loss": 0.2613, "step": 2034 }, { "epoch": 1.5266316579144785, "grad_norm": 0.30002138018608093, "learning_rate": 1.6120711760211548e-05, "loss": 0.2053, "step": 2035 }, { "epoch": 1.5273818454613655, "grad_norm": 0.2787644565105438, "learning_rate": 1.607258607746537e-05, "loss": 0.1907, "step": 2036 }, { "epoch": 1.528132033008252, "grad_norm": 0.36335381865501404, "learning_rate": 1.602451857689316e-05, "loss": 0.237, "step": 2037 }, { "epoch": 1.528882220555139, "grad_norm": 0.3274303674697876, "learning_rate": 1.5976509340925977e-05, "loss": 0.1726, "step": 2038 }, { "epoch": 1.5296324081020254, "grad_norm": 0.2659587264060974, "learning_rate": 1.5928558451894914e-05, "loss": 0.2018, "step": 2039 }, { "epoch": 1.5303825956489123, "grad_norm": 0.3835706114768982, "learning_rate": 1.588066599203106e-05, "loss": 0.2103, "step": 2040 }, { "epoch": 1.5311327831957988, "grad_norm": 0.37673234939575195, "learning_rate": 1.583283204346521e-05, "loss": 0.2133, "step": 2041 }, { "epoch": 1.5318829707426858, "grad_norm": 0.4497091472148895, "learning_rate": 1.5785056688227916e-05, "loss": 0.3204, "step": 2042 }, { "epoch": 1.5326331582895723, "grad_norm": 0.35026296973228455, "learning_rate": 1.5737340008249202e-05, "loss": 0.175, "step": 2043 }, { "epoch": 1.5333833458364592, "grad_norm": 0.35697436332702637, "learning_rate": 1.5689682085358465e-05, "loss": 0.2022, "step": 2044 }, { "epoch": 1.5341335333833457, "grad_norm": 0.4025813341140747, "learning_rate": 1.564208300128438e-05, "loss": 0.216, "step": 2045 }, { "epoch": 1.5348837209302326, "grad_norm": 0.35970011353492737, "learning_rate": 1.5594542837654625e-05, "loss": 0.257, "step": 2046 }, { "epoch": 1.5356339084771191, "grad_norm": 0.3831564486026764, "learning_rate": 1.554706167599596e-05, "loss": 0.257, "step": 2047 }, { "epoch": 1.536384096024006, "grad_norm": 0.37232697010040283, "learning_rate": 1.5499639597733902e-05, "loss": 0.1749, "step": 2048 }, { "epoch": 1.5371342835708928, "grad_norm": 0.3647191524505615, "learning_rate": 1.54522766841926e-05, "loss": 0.211, "step": 2049 }, { "epoch": 1.5378844711177795, "grad_norm": 0.4209310710430145, "learning_rate": 1.540497301659482e-05, "loss": 0.2331, "step": 2050 }, { "epoch": 1.5386346586646662, "grad_norm": 0.3934963643550873, "learning_rate": 1.5357728676061685e-05, "loss": 0.2682, "step": 2051 }, { "epoch": 1.539384846211553, "grad_norm": 0.41596686840057373, "learning_rate": 1.5310543743612582e-05, "loss": 0.2795, "step": 2052 }, { "epoch": 1.5401350337584396, "grad_norm": 0.37060457468032837, "learning_rate": 1.526341830016505e-05, "loss": 0.2078, "step": 2053 }, { "epoch": 1.5408852213053263, "grad_norm": 0.3526898920536041, "learning_rate": 1.5216352426534548e-05, "loss": 0.2168, "step": 2054 }, { "epoch": 1.541635408852213, "grad_norm": 0.32955822348594666, "learning_rate": 1.5169346203434425e-05, "loss": 0.193, "step": 2055 }, { "epoch": 1.5423855963990998, "grad_norm": 0.4373268783092499, "learning_rate": 1.5122399711475732e-05, "loss": 0.2515, "step": 2056 }, { "epoch": 1.5431357839459865, "grad_norm": 0.31679531931877136, "learning_rate": 1.50755130311671e-05, "loss": 0.1793, "step": 2057 }, { "epoch": 1.5438859714928732, "grad_norm": 0.39237284660339355, "learning_rate": 1.502868624291452e-05, "loss": 0.2033, "step": 2058 }, { "epoch": 1.54463615903976, "grad_norm": 0.32702842354774475, "learning_rate": 1.4981919427021357e-05, "loss": 0.2067, "step": 2059 }, { "epoch": 1.5453863465866466, "grad_norm": 0.32684507966041565, "learning_rate": 1.493521266368807e-05, "loss": 0.2391, "step": 2060 }, { "epoch": 1.5461365341335334, "grad_norm": 0.3501066267490387, "learning_rate": 1.4888566033012201e-05, "loss": 0.2183, "step": 2061 }, { "epoch": 1.54688672168042, "grad_norm": 0.37345078587532043, "learning_rate": 1.4841979614988094e-05, "loss": 0.2229, "step": 2062 }, { "epoch": 1.5476369092273068, "grad_norm": 0.25095558166503906, "learning_rate": 1.4795453489506878e-05, "loss": 0.1812, "step": 2063 }, { "epoch": 1.5483870967741935, "grad_norm": 0.26450151205062866, "learning_rate": 1.4748987736356273e-05, "loss": 0.2022, "step": 2064 }, { "epoch": 1.5491372843210802, "grad_norm": 0.35751715302467346, "learning_rate": 1.4702582435220475e-05, "loss": 0.2385, "step": 2065 }, { "epoch": 1.549887471867967, "grad_norm": 0.3760606646537781, "learning_rate": 1.4656237665680017e-05, "loss": 0.2079, "step": 2066 }, { "epoch": 1.5506376594148539, "grad_norm": 0.4015029966831207, "learning_rate": 1.4609953507211593e-05, "loss": 0.2519, "step": 2067 }, { "epoch": 1.5513878469617404, "grad_norm": 0.3060472309589386, "learning_rate": 1.4563730039187984e-05, "loss": 0.2281, "step": 2068 }, { "epoch": 1.5521380345086273, "grad_norm": 0.2860056161880493, "learning_rate": 1.4517567340877886e-05, "loss": 0.2175, "step": 2069 }, { "epoch": 1.5528882220555138, "grad_norm": 0.4113752841949463, "learning_rate": 1.4471465491445802e-05, "loss": 0.2054, "step": 2070 }, { "epoch": 1.5536384096024007, "grad_norm": 0.4366605877876282, "learning_rate": 1.4425424569951822e-05, "loss": 0.1709, "step": 2071 }, { "epoch": 1.5543885971492872, "grad_norm": 0.23527930676937103, "learning_rate": 1.4379444655351626e-05, "loss": 0.2358, "step": 2072 }, { "epoch": 1.5551387846961742, "grad_norm": 0.43676328659057617, "learning_rate": 1.4333525826496224e-05, "loss": 0.2364, "step": 2073 }, { "epoch": 1.5558889722430607, "grad_norm": 0.34080198407173157, "learning_rate": 1.4287668162131896e-05, "loss": 0.2343, "step": 2074 }, { "epoch": 1.5566391597899476, "grad_norm": 0.3275124430656433, "learning_rate": 1.4241871740900014e-05, "loss": 0.2047, "step": 2075 }, { "epoch": 1.557389347336834, "grad_norm": 0.36881810426712036, "learning_rate": 1.4196136641336932e-05, "loss": 0.2486, "step": 2076 }, { "epoch": 1.558139534883721, "grad_norm": 0.3420529067516327, "learning_rate": 1.4150462941873843e-05, "loss": 0.2218, "step": 2077 }, { "epoch": 1.5588897224306075, "grad_norm": 0.43109914660453796, "learning_rate": 1.410485072083666e-05, "loss": 0.1602, "step": 2078 }, { "epoch": 1.5596399099774945, "grad_norm": 0.31198281049728394, "learning_rate": 1.4059300056445823e-05, "loss": 0.2014, "step": 2079 }, { "epoch": 1.560390097524381, "grad_norm": 0.29856041073799133, "learning_rate": 1.4013811026816243e-05, "loss": 0.1596, "step": 2080 }, { "epoch": 1.561140285071268, "grad_norm": 0.3845573961734772, "learning_rate": 1.3968383709957133e-05, "loss": 0.1913, "step": 2081 }, { "epoch": 1.5618904726181544, "grad_norm": 0.32687148451805115, "learning_rate": 1.3923018183771868e-05, "loss": 0.1416, "step": 2082 }, { "epoch": 1.5626406601650413, "grad_norm": 0.385017991065979, "learning_rate": 1.3877714526057872e-05, "loss": 0.2452, "step": 2083 }, { "epoch": 1.5633908477119278, "grad_norm": 0.3449662923812866, "learning_rate": 1.3832472814506425e-05, "loss": 0.1894, "step": 2084 }, { "epoch": 1.5641410352588148, "grad_norm": 0.8420486450195312, "learning_rate": 1.3787293126702622e-05, "loss": 0.2489, "step": 2085 }, { "epoch": 1.5648912228057015, "grad_norm": 0.361682653427124, "learning_rate": 1.3742175540125179e-05, "loss": 0.2762, "step": 2086 }, { "epoch": 1.5656414103525882, "grad_norm": 0.36209341883659363, "learning_rate": 1.3697120132146318e-05, "loss": 0.222, "step": 2087 }, { "epoch": 1.566391597899475, "grad_norm": 0.32256630063056946, "learning_rate": 1.3652126980031627e-05, "loss": 0.1842, "step": 2088 }, { "epoch": 1.5671417854463616, "grad_norm": 0.33997461199760437, "learning_rate": 1.3607196160939927e-05, "loss": 0.2412, "step": 2089 }, { "epoch": 1.5678919729932483, "grad_norm": 0.3712252974510193, "learning_rate": 1.3562327751923149e-05, "loss": 0.2206, "step": 2090 }, { "epoch": 1.568642160540135, "grad_norm": 0.38602781295776367, "learning_rate": 1.351752182992621e-05, "loss": 0.3031, "step": 2091 }, { "epoch": 1.5693923480870218, "grad_norm": 0.3721514046192169, "learning_rate": 1.3472778471786829e-05, "loss": 0.2288, "step": 2092 }, { "epoch": 1.5701425356339085, "grad_norm": 0.35960397124290466, "learning_rate": 1.3428097754235475e-05, "loss": 0.2978, "step": 2093 }, { "epoch": 1.5708927231807952, "grad_norm": 0.2877357602119446, "learning_rate": 1.3383479753895174e-05, "loss": 0.2632, "step": 2094 }, { "epoch": 1.571642910727682, "grad_norm": 0.39095762372016907, "learning_rate": 1.33389245472814e-05, "loss": 0.1874, "step": 2095 }, { "epoch": 1.5723930982745686, "grad_norm": 0.3488343358039856, "learning_rate": 1.3294432210801966e-05, "loss": 0.2023, "step": 2096 }, { "epoch": 1.5731432858214554, "grad_norm": 0.3617469072341919, "learning_rate": 1.3250002820756819e-05, "loss": 0.2206, "step": 2097 }, { "epoch": 1.573893473368342, "grad_norm": 0.36220860481262207, "learning_rate": 1.3205636453338e-05, "loss": 0.2197, "step": 2098 }, { "epoch": 1.5746436609152288, "grad_norm": 0.26773908734321594, "learning_rate": 1.316133318462946e-05, "loss": 0.1965, "step": 2099 }, { "epoch": 1.5753938484621155, "grad_norm": 0.3782847225666046, "learning_rate": 1.3117093090606958e-05, "loss": 0.1869, "step": 2100 }, { "epoch": 1.5761440360090022, "grad_norm": 0.3348628580570221, "learning_rate": 1.3072916247137861e-05, "loss": 0.1393, "step": 2101 }, { "epoch": 1.576894223555889, "grad_norm": 0.3963795006275177, "learning_rate": 1.302880272998112e-05, "loss": 0.2461, "step": 2102 }, { "epoch": 1.5776444111027756, "grad_norm": 0.42578253149986267, "learning_rate": 1.29847526147871e-05, "loss": 0.1464, "step": 2103 }, { "epoch": 1.5783945986496624, "grad_norm": 0.36926746368408203, "learning_rate": 1.2940765977097402e-05, "loss": 0.1603, "step": 2104 }, { "epoch": 1.579144786196549, "grad_norm": 0.38307133316993713, "learning_rate": 1.2896842892344751e-05, "loss": 0.2608, "step": 2105 }, { "epoch": 1.579894973743436, "grad_norm": 0.3213891088962555, "learning_rate": 1.2852983435852928e-05, "loss": 0.2512, "step": 2106 }, { "epoch": 1.5806451612903225, "grad_norm": 0.3578799068927765, "learning_rate": 1.2809187682836588e-05, "loss": 0.2118, "step": 2107 }, { "epoch": 1.5813953488372094, "grad_norm": 0.31538939476013184, "learning_rate": 1.2765455708401142e-05, "loss": 0.2371, "step": 2108 }, { "epoch": 1.582145536384096, "grad_norm": 0.3526604175567627, "learning_rate": 1.2721787587542595e-05, "loss": 0.2342, "step": 2109 }, { "epoch": 1.5828957239309829, "grad_norm": 0.38238978385925293, "learning_rate": 1.2678183395147485e-05, "loss": 0.2605, "step": 2110 }, { "epoch": 1.5836459114778694, "grad_norm": 0.35559824109077454, "learning_rate": 1.2634643205992707e-05, "loss": 0.2112, "step": 2111 }, { "epoch": 1.5843960990247563, "grad_norm": 0.35218286514282227, "learning_rate": 1.2591167094745404e-05, "loss": 0.2874, "step": 2112 }, { "epoch": 1.5851462865716428, "grad_norm": 0.257209450006485, "learning_rate": 1.2547755135962841e-05, "loss": 0.2147, "step": 2113 }, { "epoch": 1.5858964741185297, "grad_norm": 0.3194068372249603, "learning_rate": 1.2504407404092217e-05, "loss": 0.2215, "step": 2114 }, { "epoch": 1.5866466616654162, "grad_norm": 0.31058770418167114, "learning_rate": 1.2461123973470634e-05, "loss": 0.2089, "step": 2115 }, { "epoch": 1.5873968492123032, "grad_norm": 0.39986902475357056, "learning_rate": 1.2417904918324913e-05, "loss": 0.205, "step": 2116 }, { "epoch": 1.5881470367591897, "grad_norm": 0.3535638749599457, "learning_rate": 1.237475031277151e-05, "loss": 0.2356, "step": 2117 }, { "epoch": 1.5888972243060766, "grad_norm": 0.2762316167354584, "learning_rate": 1.2331660230816288e-05, "loss": 0.2068, "step": 2118 }, { "epoch": 1.589647411852963, "grad_norm": 0.3583389222621918, "learning_rate": 1.2288634746354505e-05, "loss": 0.1878, "step": 2119 }, { "epoch": 1.59039759939985, "grad_norm": 0.4379098415374756, "learning_rate": 1.2245673933170626e-05, "loss": 0.2569, "step": 2120 }, { "epoch": 1.5911477869467365, "grad_norm": 0.3194253742694855, "learning_rate": 1.2202777864938236e-05, "loss": 0.2012, "step": 2121 }, { "epoch": 1.5918979744936235, "grad_norm": 0.44894564151763916, "learning_rate": 1.2159946615219836e-05, "loss": 0.1785, "step": 2122 }, { "epoch": 1.59264816204051, "grad_norm": 0.3323229253292084, "learning_rate": 1.211718025746682e-05, "loss": 0.1642, "step": 2123 }, { "epoch": 1.593398349587397, "grad_norm": 0.4309782087802887, "learning_rate": 1.2074478865019273e-05, "loss": 0.1867, "step": 2124 }, { "epoch": 1.5941485371342836, "grad_norm": 0.30987805128097534, "learning_rate": 1.2031842511105885e-05, "loss": 0.1568, "step": 2125 }, { "epoch": 1.5948987246811703, "grad_norm": 0.3701426386833191, "learning_rate": 1.1989271268843815e-05, "loss": 0.204, "step": 2126 }, { "epoch": 1.595648912228057, "grad_norm": 0.47456595301628113, "learning_rate": 1.1946765211238526e-05, "loss": 0.2443, "step": 2127 }, { "epoch": 1.5963990997749438, "grad_norm": 0.3984193503856659, "learning_rate": 1.1904324411183731e-05, "loss": 0.2759, "step": 2128 }, { "epoch": 1.5971492873218305, "grad_norm": 0.33830997347831726, "learning_rate": 1.1861948941461226e-05, "loss": 0.2794, "step": 2129 }, { "epoch": 1.5978994748687172, "grad_norm": 0.2814776599407196, "learning_rate": 1.1819638874740769e-05, "loss": 0.1763, "step": 2130 }, { "epoch": 1.598649662415604, "grad_norm": 0.45382824540138245, "learning_rate": 1.1777394283579956e-05, "loss": 0.1946, "step": 2131 }, { "epoch": 1.5993998499624906, "grad_norm": 0.42319226264953613, "learning_rate": 1.1735215240424102e-05, "loss": 0.243, "step": 2132 }, { "epoch": 1.6001500375093773, "grad_norm": 0.3408633768558502, "learning_rate": 1.1693101817606117e-05, "loss": 0.2138, "step": 2133 }, { "epoch": 1.600900225056264, "grad_norm": 0.33623549342155457, "learning_rate": 1.165105408734638e-05, "loss": 0.1459, "step": 2134 }, { "epoch": 1.6016504126031508, "grad_norm": 0.41078945994377136, "learning_rate": 1.1609072121752584e-05, "loss": 0.2741, "step": 2135 }, { "epoch": 1.6024006001500375, "grad_norm": 0.32982394099235535, "learning_rate": 1.1567155992819678e-05, "loss": 0.2097, "step": 2136 }, { "epoch": 1.6031507876969242, "grad_norm": 0.4031613767147064, "learning_rate": 1.15253057724297e-05, "loss": 0.2591, "step": 2137 }, { "epoch": 1.603900975243811, "grad_norm": 0.3800208866596222, "learning_rate": 1.1483521532351654e-05, "loss": 0.2496, "step": 2138 }, { "epoch": 1.6046511627906976, "grad_norm": 0.2940170168876648, "learning_rate": 1.144180334424141e-05, "loss": 0.1898, "step": 2139 }, { "epoch": 1.6054013503375844, "grad_norm": 0.3772093653678894, "learning_rate": 1.1400151279641525e-05, "loss": 0.2053, "step": 2140 }, { "epoch": 1.606151537884471, "grad_norm": 0.30234789848327637, "learning_rate": 1.1358565409981203e-05, "loss": 0.2177, "step": 2141 }, { "epoch": 1.6069017254313578, "grad_norm": 0.35745713114738464, "learning_rate": 1.1317045806576121e-05, "loss": 0.1932, "step": 2142 }, { "epoch": 1.6076519129782447, "grad_norm": 0.33750462532043457, "learning_rate": 1.12755925406283e-05, "loss": 0.2516, "step": 2143 }, { "epoch": 1.6084021005251312, "grad_norm": 0.501732587814331, "learning_rate": 1.1234205683226012e-05, "loss": 0.2311, "step": 2144 }, { "epoch": 1.6091522880720182, "grad_norm": 0.24356506764888763, "learning_rate": 1.1192885305343648e-05, "loss": 0.2265, "step": 2145 }, { "epoch": 1.6099024756189046, "grad_norm": 0.2592124044895172, "learning_rate": 1.1151631477841584e-05, "loss": 0.2219, "step": 2146 }, { "epoch": 1.6106526631657916, "grad_norm": 0.3934110105037689, "learning_rate": 1.1110444271466086e-05, "loss": 0.1857, "step": 2147 }, { "epoch": 1.611402850712678, "grad_norm": 0.2999211847782135, "learning_rate": 1.1069323756849126e-05, "loss": 0.224, "step": 2148 }, { "epoch": 1.612153038259565, "grad_norm": 0.4355531334877014, "learning_rate": 1.102827000450835e-05, "loss": 0.2511, "step": 2149 }, { "epoch": 1.6129032258064515, "grad_norm": 0.371836394071579, "learning_rate": 1.0987283084846905e-05, "loss": 0.219, "step": 2150 }, { "epoch": 1.6136534133533385, "grad_norm": 0.2819121181964874, "learning_rate": 1.0946363068153343e-05, "loss": 0.1753, "step": 2151 }, { "epoch": 1.614403600900225, "grad_norm": 0.3656623959541321, "learning_rate": 1.0905510024601423e-05, "loss": 0.2401, "step": 2152 }, { "epoch": 1.6151537884471119, "grad_norm": 0.3601517379283905, "learning_rate": 1.0864724024250106e-05, "loss": 0.1948, "step": 2153 }, { "epoch": 1.6159039759939984, "grad_norm": 0.2885972857475281, "learning_rate": 1.0824005137043375e-05, "loss": 0.1392, "step": 2154 }, { "epoch": 1.6166541635408853, "grad_norm": 0.2865145206451416, "learning_rate": 1.0783353432810106e-05, "loss": 0.2303, "step": 2155 }, { "epoch": 1.6174043510877718, "grad_norm": 0.30970925092697144, "learning_rate": 1.0742768981263984e-05, "loss": 0.1723, "step": 2156 }, { "epoch": 1.6181545386346587, "grad_norm": 0.2804229259490967, "learning_rate": 1.070225185200331e-05, "loss": 0.1483, "step": 2157 }, { "epoch": 1.6189047261815452, "grad_norm": 0.3012600839138031, "learning_rate": 1.0661802114511005e-05, "loss": 0.2733, "step": 2158 }, { "epoch": 1.6196549137284322, "grad_norm": 0.32413673400878906, "learning_rate": 1.062141983815439e-05, "loss": 0.1876, "step": 2159 }, { "epoch": 1.6204051012753187, "grad_norm": 0.32718247175216675, "learning_rate": 1.0581105092185062e-05, "loss": 0.1885, "step": 2160 }, { "epoch": 1.6211552888222056, "grad_norm": 0.3035251498222351, "learning_rate": 1.0540857945738852e-05, "loss": 0.2163, "step": 2161 }, { "epoch": 1.6219054763690923, "grad_norm": 0.38433384895324707, "learning_rate": 1.0500678467835662e-05, "loss": 0.1924, "step": 2162 }, { "epoch": 1.622655663915979, "grad_norm": 0.37670424580574036, "learning_rate": 1.0460566727379335e-05, "loss": 0.182, "step": 2163 }, { "epoch": 1.6234058514628658, "grad_norm": 0.38290417194366455, "learning_rate": 1.0420522793157567e-05, "loss": 0.1825, "step": 2164 }, { "epoch": 1.6241560390097525, "grad_norm": 0.58870929479599, "learning_rate": 1.038054673384174e-05, "loss": 0.2279, "step": 2165 }, { "epoch": 1.6249062265566392, "grad_norm": 0.36976227164268494, "learning_rate": 1.0340638617986864e-05, "loss": 0.2116, "step": 2166 }, { "epoch": 1.625656414103526, "grad_norm": 0.376412957906723, "learning_rate": 1.030079851403144e-05, "loss": 0.2273, "step": 2167 }, { "epoch": 1.6264066016504126, "grad_norm": 0.3948953151702881, "learning_rate": 1.0261026490297315e-05, "loss": 0.2011, "step": 2168 }, { "epoch": 1.6271567891972993, "grad_norm": 0.2291846126317978, "learning_rate": 1.022132261498961e-05, "loss": 0.1877, "step": 2169 }, { "epoch": 1.627906976744186, "grad_norm": 0.39602792263031006, "learning_rate": 1.0181686956196529e-05, "loss": 0.1793, "step": 2170 }, { "epoch": 1.6286571642910728, "grad_norm": 0.3197188973426819, "learning_rate": 1.0142119581889332e-05, "loss": 0.2435, "step": 2171 }, { "epoch": 1.6294073518379595, "grad_norm": 0.49767956137657166, "learning_rate": 1.0102620559922204e-05, "loss": 0.2957, "step": 2172 }, { "epoch": 1.6301575393848462, "grad_norm": 0.42927104234695435, "learning_rate": 1.0063189958032043e-05, "loss": 0.2328, "step": 2173 }, { "epoch": 1.630907726931733, "grad_norm": 0.3505445420742035, "learning_rate": 1.0023827843838457e-05, "loss": 0.208, "step": 2174 }, { "epoch": 1.6316579144786196, "grad_norm": 0.3092861771583557, "learning_rate": 9.984534284843594e-06, "loss": 0.2283, "step": 2175 }, { "epoch": 1.6324081020255063, "grad_norm": 0.2963518500328064, "learning_rate": 9.945309348432047e-06, "loss": 0.2316, "step": 2176 }, { "epoch": 1.633158289572393, "grad_norm": 0.3202652633190155, "learning_rate": 9.906153101870725e-06, "loss": 0.2731, "step": 2177 }, { "epoch": 1.6339084771192798, "grad_norm": 0.3126668632030487, "learning_rate": 9.867065612308713e-06, "loss": 0.1957, "step": 2178 }, { "epoch": 1.6346586646661665, "grad_norm": 0.2765806019306183, "learning_rate": 9.82804694677722e-06, "loss": 0.1502, "step": 2179 }, { "epoch": 1.6354088522130532, "grad_norm": 0.41856059432029724, "learning_rate": 9.78909717218941e-06, "loss": 0.17, "step": 2180 }, { "epoch": 1.63615903975994, "grad_norm": 0.2817077934741974, "learning_rate": 9.75021635534033e-06, "loss": 0.1641, "step": 2181 }, { "epoch": 1.6369092273068269, "grad_norm": 0.3464756906032562, "learning_rate": 9.711404562906717e-06, "loss": 0.2644, "step": 2182 }, { "epoch": 1.6376594148537134, "grad_norm": 0.36019188165664673, "learning_rate": 9.672661861447002e-06, "loss": 0.2555, "step": 2183 }, { "epoch": 1.6384096024006003, "grad_norm": 0.4182175099849701, "learning_rate": 9.633988317401087e-06, "loss": 0.1853, "step": 2184 }, { "epoch": 1.6391597899474868, "grad_norm": 0.4180493950843811, "learning_rate": 9.595383997090302e-06, "loss": 0.253, "step": 2185 }, { "epoch": 1.6399099774943737, "grad_norm": 0.3464215099811554, "learning_rate": 9.556848966717247e-06, "loss": 0.2271, "step": 2186 }, { "epoch": 1.6406601650412602, "grad_norm": 0.3817156255245209, "learning_rate": 9.518383292365713e-06, "loss": 0.2903, "step": 2187 }, { "epoch": 1.6414103525881472, "grad_norm": 0.32559800148010254, "learning_rate": 9.479987040000538e-06, "loss": 0.2006, "step": 2188 }, { "epoch": 1.6421605401350337, "grad_norm": 0.36530059576034546, "learning_rate": 9.441660275467512e-06, "loss": 0.2383, "step": 2189 }, { "epoch": 1.6429107276819206, "grad_norm": 0.35157638788223267, "learning_rate": 9.403403064493282e-06, "loss": 0.1631, "step": 2190 }, { "epoch": 1.643660915228807, "grad_norm": 0.3439987003803253, "learning_rate": 9.365215472685163e-06, "loss": 0.2231, "step": 2191 }, { "epoch": 1.644411102775694, "grad_norm": 0.35855019092559814, "learning_rate": 9.32709756553114e-06, "loss": 0.2186, "step": 2192 }, { "epoch": 1.6451612903225805, "grad_norm": 0.3391314744949341, "learning_rate": 9.289049408399659e-06, "loss": 0.237, "step": 2193 }, { "epoch": 1.6459114778694675, "grad_norm": 0.3282822370529175, "learning_rate": 9.251071066539579e-06, "loss": 0.2012, "step": 2194 }, { "epoch": 1.646661665416354, "grad_norm": 0.4114097058773041, "learning_rate": 9.21316260507999e-06, "loss": 0.2336, "step": 2195 }, { "epoch": 1.6474118529632409, "grad_norm": 0.3987085521221161, "learning_rate": 9.175324089030185e-06, "loss": 0.23, "step": 2196 }, { "epoch": 1.6481620405101274, "grad_norm": 0.42324626445770264, "learning_rate": 9.137555583279495e-06, "loss": 0.3193, "step": 2197 }, { "epoch": 1.6489122280570143, "grad_norm": 0.4010095000267029, "learning_rate": 9.099857152597185e-06, "loss": 0.2187, "step": 2198 }, { "epoch": 1.6496624156039008, "grad_norm": 0.35067957639694214, "learning_rate": 9.062228861632354e-06, "loss": 0.1995, "step": 2199 }, { "epoch": 1.6504126031507877, "grad_norm": 0.36225444078445435, "learning_rate": 9.024670774913812e-06, "loss": 0.2696, "step": 2200 }, { "epoch": 1.6504126031507877, "eval_loss": 0.2538428008556366, "eval_runtime": 8.8866, "eval_samples_per_second": 6.077, "eval_steps_per_second": 1.575, "step": 2200 }, { "epoch": 1.6511627906976745, "grad_norm": 0.2923382520675659, "learning_rate": 8.987182956849983e-06, "loss": 0.1842, "step": 2201 }, { "epoch": 1.6519129782445612, "grad_norm": 0.30847442150115967, "learning_rate": 8.949765471728789e-06, "loss": 0.1697, "step": 2202 }, { "epoch": 1.652663165791448, "grad_norm": 0.30500367283821106, "learning_rate": 8.912418383717513e-06, "loss": 0.2979, "step": 2203 }, { "epoch": 1.6534133533383346, "grad_norm": 0.23297064006328583, "learning_rate": 8.875141756862749e-06, "loss": 0.1794, "step": 2204 }, { "epoch": 1.6541635408852213, "grad_norm": 0.40768733620643616, "learning_rate": 8.837935655090241e-06, "loss": 0.1939, "step": 2205 }, { "epoch": 1.654913728432108, "grad_norm": 0.3398992419242859, "learning_rate": 8.800800142204779e-06, "loss": 0.1984, "step": 2206 }, { "epoch": 1.6556639159789948, "grad_norm": 0.3460598289966583, "learning_rate": 8.763735281890133e-06, "loss": 0.2771, "step": 2207 }, { "epoch": 1.6564141035258815, "grad_norm": 0.28442656993865967, "learning_rate": 8.726741137708866e-06, "loss": 0.2549, "step": 2208 }, { "epoch": 1.6571642910727682, "grad_norm": 0.40022140741348267, "learning_rate": 8.689817773102293e-06, "loss": 0.1155, "step": 2209 }, { "epoch": 1.657914478619655, "grad_norm": 0.29244744777679443, "learning_rate": 8.65296525139036e-06, "loss": 0.227, "step": 2210 }, { "epoch": 1.6586646661665416, "grad_norm": 0.342029333114624, "learning_rate": 8.616183635771525e-06, "loss": 0.2101, "step": 2211 }, { "epoch": 1.6594148537134283, "grad_norm": 0.3985951840877533, "learning_rate": 8.579472989322602e-06, "loss": 0.2296, "step": 2212 }, { "epoch": 1.660165041260315, "grad_norm": 0.33778664469718933, "learning_rate": 8.542833374998744e-06, "loss": 0.1978, "step": 2213 }, { "epoch": 1.6609152288072018, "grad_norm": 0.461139976978302, "learning_rate": 8.5062648556333e-06, "loss": 0.2249, "step": 2214 }, { "epoch": 1.6616654163540885, "grad_norm": 0.29061359167099, "learning_rate": 8.469767493937681e-06, "loss": 0.2507, "step": 2215 }, { "epoch": 1.6624156039009752, "grad_norm": 0.4071022868156433, "learning_rate": 8.43334135250125e-06, "loss": 0.2883, "step": 2216 }, { "epoch": 1.663165791447862, "grad_norm": 0.2868415117263794, "learning_rate": 8.39698649379126e-06, "loss": 0.1561, "step": 2217 }, { "epoch": 1.6639159789947486, "grad_norm": 0.33006608486175537, "learning_rate": 8.360702980152713e-06, "loss": 0.3168, "step": 2218 }, { "epoch": 1.6646661665416356, "grad_norm": 0.3339625597000122, "learning_rate": 8.32449087380826e-06, "loss": 0.2607, "step": 2219 }, { "epoch": 1.665416354088522, "grad_norm": 0.315928190946579, "learning_rate": 8.288350236858117e-06, "loss": 0.197, "step": 2220 }, { "epoch": 1.666166541635409, "grad_norm": 0.33040204644203186, "learning_rate": 8.252281131279887e-06, "loss": 0.2301, "step": 2221 }, { "epoch": 1.6669167291822955, "grad_norm": 0.3343827724456787, "learning_rate": 8.21628361892855e-06, "loss": 0.1628, "step": 2222 }, { "epoch": 1.6676669167291824, "grad_norm": 0.36361345648765564, "learning_rate": 8.180357761536296e-06, "loss": 0.2152, "step": 2223 }, { "epoch": 1.668417104276069, "grad_norm": 0.31529873609542847, "learning_rate": 8.14450362071244e-06, "loss": 0.2465, "step": 2224 }, { "epoch": 1.6691672918229559, "grad_norm": 0.35468676686286926, "learning_rate": 8.10872125794328e-06, "loss": 0.2177, "step": 2225 }, { "epoch": 1.6699174793698424, "grad_norm": 0.35961002111434937, "learning_rate": 8.073010734592057e-06, "loss": 0.1242, "step": 2226 }, { "epoch": 1.6706676669167293, "grad_norm": 0.4144163131713867, "learning_rate": 8.037372111898789e-06, "loss": 0.2181, "step": 2227 }, { "epoch": 1.6714178544636158, "grad_norm": 0.3460024893283844, "learning_rate": 8.001805450980249e-06, "loss": 0.2671, "step": 2228 }, { "epoch": 1.6721680420105027, "grad_norm": 0.4186900556087494, "learning_rate": 7.966310812829709e-06, "loss": 0.2841, "step": 2229 }, { "epoch": 1.6729182295573892, "grad_norm": 0.2907342314720154, "learning_rate": 7.930888258316998e-06, "loss": 0.1798, "step": 2230 }, { "epoch": 1.6736684171042762, "grad_norm": 0.44431841373443604, "learning_rate": 7.89553784818831e-06, "loss": 0.239, "step": 2231 }, { "epoch": 1.6744186046511627, "grad_norm": 0.29800155758857727, "learning_rate": 7.860259643066126e-06, "loss": 0.1786, "step": 2232 }, { "epoch": 1.6751687921980496, "grad_norm": 0.33217474818229675, "learning_rate": 7.82505370344907e-06, "loss": 0.2263, "step": 2233 }, { "epoch": 1.675918979744936, "grad_norm": 0.2673128843307495, "learning_rate": 7.789920089711871e-06, "loss": 0.2503, "step": 2234 }, { "epoch": 1.676669167291823, "grad_norm": 0.30531173944473267, "learning_rate": 7.754858862105224e-06, "loss": 0.2477, "step": 2235 }, { "epoch": 1.6774193548387095, "grad_norm": 0.37796080112457275, "learning_rate": 7.71987008075568e-06, "loss": 0.2374, "step": 2236 }, { "epoch": 1.6781695423855965, "grad_norm": 0.3416386544704437, "learning_rate": 7.684953805665562e-06, "loss": 0.2278, "step": 2237 }, { "epoch": 1.6789197299324832, "grad_norm": 0.32736822962760925, "learning_rate": 7.65011009671282e-06, "loss": 0.247, "step": 2238 }, { "epoch": 1.67966991747937, "grad_norm": 0.32533955574035645, "learning_rate": 7.615339013651001e-06, "loss": 0.2534, "step": 2239 }, { "epoch": 1.6804201050262566, "grad_norm": 0.2593258023262024, "learning_rate": 7.580640616109081e-06, "loss": 0.158, "step": 2240 }, { "epoch": 1.6811702925731433, "grad_norm": 0.4093160331249237, "learning_rate": 7.546014963591397e-06, "loss": 0.1843, "step": 2241 }, { "epoch": 1.68192048012003, "grad_norm": 0.3519222140312195, "learning_rate": 7.511462115477536e-06, "loss": 0.2322, "step": 2242 }, { "epoch": 1.6826706676669168, "grad_norm": 0.33975446224212646, "learning_rate": 7.476982131022231e-06, "loss": 0.2053, "step": 2243 }, { "epoch": 1.6834208552138035, "grad_norm": 0.33370235562324524, "learning_rate": 7.442575069355256e-06, "loss": 0.2195, "step": 2244 }, { "epoch": 1.6841710427606902, "grad_norm": 0.3285314440727234, "learning_rate": 7.408240989481347e-06, "loss": 0.2242, "step": 2245 }, { "epoch": 1.684921230307577, "grad_norm": 0.4442092478275299, "learning_rate": 7.373979950280046e-06, "loss": 0.2045, "step": 2246 }, { "epoch": 1.6856714178544636, "grad_norm": 0.3377114534378052, "learning_rate": 7.33979201050568e-06, "loss": 0.1552, "step": 2247 }, { "epoch": 1.6864216054013503, "grad_norm": 0.4277175962924957, "learning_rate": 7.3056772287871886e-06, "loss": 0.1574, "step": 2248 }, { "epoch": 1.687171792948237, "grad_norm": 0.29551687836647034, "learning_rate": 7.2716356636280684e-06, "loss": 0.2712, "step": 2249 }, { "epoch": 1.6879219804951238, "grad_norm": 0.4275393486022949, "learning_rate": 7.237667373406259e-06, "loss": 0.2351, "step": 2250 }, { "epoch": 1.6886721680420105, "grad_norm": 0.29831555485725403, "learning_rate": 7.203772416374016e-06, "loss": 0.244, "step": 2251 }, { "epoch": 1.6894223555888972, "grad_norm": 0.34431734681129456, "learning_rate": 7.1699508506578636e-06, "loss": 0.1911, "step": 2252 }, { "epoch": 1.690172543135784, "grad_norm": 0.35310372710227966, "learning_rate": 7.136202734258457e-06, "loss": 0.2516, "step": 2253 }, { "epoch": 1.6909227306826706, "grad_norm": 0.48283082246780396, "learning_rate": 7.1025281250505006e-06, "loss": 0.1762, "step": 2254 }, { "epoch": 1.6916729182295573, "grad_norm": 0.37345579266548157, "learning_rate": 7.0689270807826e-06, "loss": 0.2089, "step": 2255 }, { "epoch": 1.692423105776444, "grad_norm": 0.31663957238197327, "learning_rate": 7.035399659077268e-06, "loss": 0.195, "step": 2256 }, { "epoch": 1.6931732933233308, "grad_norm": 0.4479316473007202, "learning_rate": 7.00194591743073e-06, "loss": 0.1904, "step": 2257 }, { "epoch": 1.6939234808702177, "grad_norm": 0.3440023362636566, "learning_rate": 6.96856591321286e-06, "loss": 0.2065, "step": 2258 }, { "epoch": 1.6946736684171042, "grad_norm": 0.3599484860897064, "learning_rate": 6.9352597036670575e-06, "loss": 0.2312, "step": 2259 }, { "epoch": 1.6954238559639911, "grad_norm": 0.35200509428977966, "learning_rate": 6.902027345910211e-06, "loss": 0.202, "step": 2260 }, { "epoch": 1.6961740435108776, "grad_norm": 0.3329913020133972, "learning_rate": 6.868868896932534e-06, "loss": 0.2137, "step": 2261 }, { "epoch": 1.6969242310577646, "grad_norm": 0.39469048380851746, "learning_rate": 6.835784413597512e-06, "loss": 0.2094, "step": 2262 }, { "epoch": 1.697674418604651, "grad_norm": 0.3366107940673828, "learning_rate": 6.802773952641761e-06, "loss": 0.2223, "step": 2263 }, { "epoch": 1.698424606151538, "grad_norm": 0.35615795850753784, "learning_rate": 6.769837570674975e-06, "loss": 0.2321, "step": 2264 }, { "epoch": 1.6991747936984245, "grad_norm": 0.3236789405345917, "learning_rate": 6.7369753241798114e-06, "loss": 0.2241, "step": 2265 }, { "epoch": 1.6999249812453114, "grad_norm": 0.3297240734100342, "learning_rate": 6.70418726951178e-06, "loss": 0.1835, "step": 2266 }, { "epoch": 1.700675168792198, "grad_norm": 0.3714750111103058, "learning_rate": 6.671473462899181e-06, "loss": 0.2428, "step": 2267 }, { "epoch": 1.7014253563390849, "grad_norm": 0.48665979504585266, "learning_rate": 6.638833960442948e-06, "loss": 0.1987, "step": 2268 }, { "epoch": 1.7021755438859714, "grad_norm": 0.3668679893016815, "learning_rate": 6.606268818116618e-06, "loss": 0.1639, "step": 2269 }, { "epoch": 1.7029257314328583, "grad_norm": 0.41807302832603455, "learning_rate": 6.573778091766219e-06, "loss": 0.1985, "step": 2270 }, { "epoch": 1.7036759189797448, "grad_norm": 0.316962867975235, "learning_rate": 6.541361837110149e-06, "loss": 0.2518, "step": 2271 }, { "epoch": 1.7044261065266317, "grad_norm": 0.31166699528694153, "learning_rate": 6.509020109739078e-06, "loss": 0.224, "step": 2272 }, { "epoch": 1.7051762940735182, "grad_norm": 0.3745969831943512, "learning_rate": 6.476752965115884e-06, "loss": 0.2901, "step": 2273 }, { "epoch": 1.7059264816204052, "grad_norm": 0.34928029775619507, "learning_rate": 6.444560458575544e-06, "loss": 0.2847, "step": 2274 }, { "epoch": 1.7066766691672917, "grad_norm": 0.2967956066131592, "learning_rate": 6.412442645325057e-06, "loss": 0.1913, "step": 2275 }, { "epoch": 1.7074268567141786, "grad_norm": 0.35019880533218384, "learning_rate": 6.38039958044328e-06, "loss": 0.1996, "step": 2276 }, { "epoch": 1.7081770442610653, "grad_norm": 0.3716976046562195, "learning_rate": 6.3484313188809265e-06, "loss": 0.2351, "step": 2277 }, { "epoch": 1.708927231807952, "grad_norm": 0.4160994589328766, "learning_rate": 6.316537915460418e-06, "loss": 0.1666, "step": 2278 }, { "epoch": 1.7096774193548387, "grad_norm": 0.3258497714996338, "learning_rate": 6.284719424875796e-06, "loss": 0.1489, "step": 2279 }, { "epoch": 1.7104276069017255, "grad_norm": 0.29386425018310547, "learning_rate": 6.252975901692659e-06, "loss": 0.2364, "step": 2280 }, { "epoch": 1.7111777944486122, "grad_norm": 0.35271474719047546, "learning_rate": 6.221307400347992e-06, "loss": 0.2149, "step": 2281 }, { "epoch": 1.711927981995499, "grad_norm": 0.28332942724227905, "learning_rate": 6.1897139751501796e-06, "loss": 0.1671, "step": 2282 }, { "epoch": 1.7126781695423856, "grad_norm": 0.3388148546218872, "learning_rate": 6.158195680278816e-06, "loss": 0.2784, "step": 2283 }, { "epoch": 1.7134283570892723, "grad_norm": 0.33405840396881104, "learning_rate": 6.126752569784694e-06, "loss": 0.3069, "step": 2284 }, { "epoch": 1.714178544636159, "grad_norm": 0.4597063958644867, "learning_rate": 6.095384697589635e-06, "loss": 0.1931, "step": 2285 }, { "epoch": 1.7149287321830458, "grad_norm": 0.29094210267066956, "learning_rate": 6.064092117486464e-06, "loss": 0.1426, "step": 2286 }, { "epoch": 1.7156789197299325, "grad_norm": 0.3077509105205536, "learning_rate": 6.032874883138867e-06, "loss": 0.2306, "step": 2287 }, { "epoch": 1.7164291072768192, "grad_norm": 0.3676895797252655, "learning_rate": 6.001733048081337e-06, "loss": 0.1986, "step": 2288 }, { "epoch": 1.717179294823706, "grad_norm": 0.424447625875473, "learning_rate": 5.970666665719033e-06, "loss": 0.2217, "step": 2289 }, { "epoch": 1.7179294823705926, "grad_norm": 0.35484564304351807, "learning_rate": 5.939675789327759e-06, "loss": 0.2166, "step": 2290 }, { "epoch": 1.7186796699174793, "grad_norm": 0.323914498090744, "learning_rate": 5.908760472053809e-06, "loss": 0.2554, "step": 2291 }, { "epoch": 1.719429857464366, "grad_norm": 0.3943474292755127, "learning_rate": 5.877920766913919e-06, "loss": 0.2197, "step": 2292 }, { "epoch": 1.7201800450112528, "grad_norm": 0.36348435282707214, "learning_rate": 5.847156726795133e-06, "loss": 0.2008, "step": 2293 }, { "epoch": 1.7209302325581395, "grad_norm": 0.2975328862667084, "learning_rate": 5.816468404454755e-06, "loss": 0.1869, "step": 2294 }, { "epoch": 1.7216804201050264, "grad_norm": 0.4382227063179016, "learning_rate": 5.7858558525202336e-06, "loss": 0.1629, "step": 2295 }, { "epoch": 1.722430607651913, "grad_norm": 0.3399539291858673, "learning_rate": 5.755319123489083e-06, "loss": 0.2226, "step": 2296 }, { "epoch": 1.7231807951987999, "grad_norm": 0.27254319190979004, "learning_rate": 5.724858269728789e-06, "loss": 0.2371, "step": 2297 }, { "epoch": 1.7239309827456863, "grad_norm": 0.3830123543739319, "learning_rate": 5.694473343476714e-06, "loss": 0.2213, "step": 2298 }, { "epoch": 1.7246811702925733, "grad_norm": 0.4357626140117645, "learning_rate": 5.664164396840016e-06, "loss": 0.1947, "step": 2299 }, { "epoch": 1.7254313578394598, "grad_norm": 0.40720510482788086, "learning_rate": 5.633931481795552e-06, "loss": 0.2668, "step": 2300 }, { "epoch": 1.7261815453863467, "grad_norm": 0.29477742314338684, "learning_rate": 5.603774650189808e-06, "loss": 0.2143, "step": 2301 }, { "epoch": 1.7269317329332332, "grad_norm": 0.3100448250770569, "learning_rate": 5.573693953738751e-06, "loss": 0.185, "step": 2302 }, { "epoch": 1.7276819204801201, "grad_norm": 0.3109095096588135, "learning_rate": 5.543689444027839e-06, "loss": 0.2365, "step": 2303 }, { "epoch": 1.7284321080270066, "grad_norm": 0.37967416644096375, "learning_rate": 5.513761172511833e-06, "loss": 0.1903, "step": 2304 }, { "epoch": 1.7291822955738936, "grad_norm": 0.36920520663261414, "learning_rate": 5.483909190514797e-06, "loss": 0.257, "step": 2305 }, { "epoch": 1.72993248312078, "grad_norm": 0.40135130286216736, "learning_rate": 5.4541335492299115e-06, "loss": 0.2961, "step": 2306 }, { "epoch": 1.730682670667667, "grad_norm": 0.30778589844703674, "learning_rate": 5.424434299719483e-06, "loss": 0.188, "step": 2307 }, { "epoch": 1.7314328582145535, "grad_norm": 0.261007159948349, "learning_rate": 5.394811492914803e-06, "loss": 0.1848, "step": 2308 }, { "epoch": 1.7321830457614404, "grad_norm": 0.3621092438697815, "learning_rate": 5.365265179616063e-06, "loss": 0.195, "step": 2309 }, { "epoch": 1.732933233308327, "grad_norm": 0.37577512860298157, "learning_rate": 5.3357954104922895e-06, "loss": 0.1938, "step": 2310 }, { "epoch": 1.7336834208552139, "grad_norm": 0.31820055842399597, "learning_rate": 5.306402236081209e-06, "loss": 0.2759, "step": 2311 }, { "epoch": 1.7344336084021004, "grad_norm": 0.301273375749588, "learning_rate": 5.277085706789248e-06, "loss": 0.1436, "step": 2312 }, { "epoch": 1.7351837959489873, "grad_norm": 0.31367021799087524, "learning_rate": 5.247845872891371e-06, "loss": 0.1902, "step": 2313 }, { "epoch": 1.735933983495874, "grad_norm": 0.38327717781066895, "learning_rate": 5.218682784530993e-06, "loss": 0.2785, "step": 2314 }, { "epoch": 1.7366841710427607, "grad_norm": 0.3907907009124756, "learning_rate": 5.1895964917199445e-06, "loss": 0.2411, "step": 2315 }, { "epoch": 1.7374343585896475, "grad_norm": 0.3405652940273285, "learning_rate": 5.160587044338355e-06, "loss": 0.201, "step": 2316 }, { "epoch": 1.7381845461365342, "grad_norm": 0.3169878125190735, "learning_rate": 5.131654492134574e-06, "loss": 0.1729, "step": 2317 }, { "epoch": 1.7389347336834209, "grad_norm": 0.3879595696926117, "learning_rate": 5.102798884725091e-06, "loss": 0.1572, "step": 2318 }, { "epoch": 1.7396849212303076, "grad_norm": 0.28907138109207153, "learning_rate": 5.074020271594404e-06, "loss": 0.2395, "step": 2319 }, { "epoch": 1.7404351087771943, "grad_norm": 0.3390837013721466, "learning_rate": 5.045318702095014e-06, "loss": 0.2425, "step": 2320 }, { "epoch": 1.741185296324081, "grad_norm": 0.30991917848587036, "learning_rate": 5.016694225447288e-06, "loss": 0.209, "step": 2321 }, { "epoch": 1.7419354838709677, "grad_norm": 0.2775421738624573, "learning_rate": 4.988146890739381e-06, "loss": 0.222, "step": 2322 }, { "epoch": 1.7426856714178545, "grad_norm": 0.35136184096336365, "learning_rate": 4.959676746927172e-06, "loss": 0.2117, "step": 2323 }, { "epoch": 1.7434358589647412, "grad_norm": 0.39494237303733826, "learning_rate": 4.931283842834139e-06, "loss": 0.2028, "step": 2324 }, { "epoch": 1.744186046511628, "grad_norm": 0.3303692042827606, "learning_rate": 4.902968227151311e-06, "loss": 0.1763, "step": 2325 }, { "epoch": 1.7449362340585146, "grad_norm": 0.3509422242641449, "learning_rate": 4.874729948437218e-06, "loss": 0.1607, "step": 2326 }, { "epoch": 1.7456864216054013, "grad_norm": 0.33338463306427, "learning_rate": 4.846569055117684e-06, "loss": 0.1612, "step": 2327 }, { "epoch": 1.746436609152288, "grad_norm": 0.36400294303894043, "learning_rate": 4.818485595485889e-06, "loss": 0.2183, "step": 2328 }, { "epoch": 1.7471867966991748, "grad_norm": 0.3595266342163086, "learning_rate": 4.790479617702198e-06, "loss": 0.2222, "step": 2329 }, { "epoch": 1.7479369842460615, "grad_norm": 0.31214454770088196, "learning_rate": 4.762551169794105e-06, "loss": 0.2145, "step": 2330 }, { "epoch": 1.7486871717929482, "grad_norm": 0.36894240975379944, "learning_rate": 4.734700299656158e-06, "loss": 0.2172, "step": 2331 }, { "epoch": 1.749437359339835, "grad_norm": 0.2931942641735077, "learning_rate": 4.706927055049837e-06, "loss": 0.2242, "step": 2332 }, { "epoch": 1.7501875468867216, "grad_norm": 0.3917500078678131, "learning_rate": 4.6792314836035304e-06, "loss": 0.2161, "step": 2333 }, { "epoch": 1.7509377344336086, "grad_norm": 0.2804892063140869, "learning_rate": 4.651613632812413e-06, "loss": 0.2659, "step": 2334 }, { "epoch": 1.751687921980495, "grad_norm": 0.31915900111198425, "learning_rate": 4.624073550038399e-06, "loss": 0.1761, "step": 2335 }, { "epoch": 1.752438109527382, "grad_norm": 0.41243162751197815, "learning_rate": 4.596611282509989e-06, "loss": 0.2589, "step": 2336 }, { "epoch": 1.7531882970742685, "grad_norm": 0.30186861753463745, "learning_rate": 4.56922687732228e-06, "loss": 0.2584, "step": 2337 }, { "epoch": 1.7539384846211554, "grad_norm": 0.4310404658317566, "learning_rate": 4.5419203814368376e-06, "loss": 0.2506, "step": 2338 }, { "epoch": 1.754688672168042, "grad_norm": 0.18716438114643097, "learning_rate": 4.514691841681601e-06, "loss": 0.1797, "step": 2339 }, { "epoch": 1.7554388597149289, "grad_norm": 0.42112478613853455, "learning_rate": 4.487541304750848e-06, "loss": 0.1944, "step": 2340 }, { "epoch": 1.7561890472618154, "grad_norm": 0.38693347573280334, "learning_rate": 4.4604688172050605e-06, "loss": 0.1983, "step": 2341 }, { "epoch": 1.7569392348087023, "grad_norm": 0.3854070007801056, "learning_rate": 4.433474425470902e-06, "loss": 0.2492, "step": 2342 }, { "epoch": 1.7576894223555888, "grad_norm": 0.3964751958847046, "learning_rate": 4.406558175841097e-06, "loss": 0.2315, "step": 2343 }, { "epoch": 1.7584396099024757, "grad_norm": 0.34343332052230835, "learning_rate": 4.379720114474351e-06, "loss": 0.1902, "step": 2344 }, { "epoch": 1.7591897974493622, "grad_norm": 0.37681111693382263, "learning_rate": 4.352960287395303e-06, "loss": 0.1974, "step": 2345 }, { "epoch": 1.7599399849962492, "grad_norm": 0.270175039768219, "learning_rate": 4.3262787404944165e-06, "loss": 0.1668, "step": 2346 }, { "epoch": 1.7606901725431356, "grad_norm": 0.30767977237701416, "learning_rate": 4.299675519527929e-06, "loss": 0.2375, "step": 2347 }, { "epoch": 1.7614403600900226, "grad_norm": 0.4973425269126892, "learning_rate": 4.273150670117743e-06, "loss": 0.1777, "step": 2348 }, { "epoch": 1.762190547636909, "grad_norm": 0.33434590697288513, "learning_rate": 4.246704237751342e-06, "loss": 0.2126, "step": 2349 }, { "epoch": 1.762940735183796, "grad_norm": 0.35596445202827454, "learning_rate": 4.220336267781777e-06, "loss": 0.1719, "step": 2350 }, { "epoch": 1.7636909227306825, "grad_norm": 0.30996599793434143, "learning_rate": 4.19404680542751e-06, "loss": 0.2065, "step": 2351 }, { "epoch": 1.7644411102775694, "grad_norm": 0.3474753797054291, "learning_rate": 4.167835895772382e-06, "loss": 0.2159, "step": 2352 }, { "epoch": 1.7651912978244562, "grad_norm": 0.3672126233577728, "learning_rate": 4.141703583765522e-06, "loss": 0.2052, "step": 2353 }, { "epoch": 1.7659414853713429, "grad_norm": 0.3282393217086792, "learning_rate": 4.11564991422127e-06, "loss": 0.1666, "step": 2354 }, { "epoch": 1.7666916729182296, "grad_norm": 0.389382928609848, "learning_rate": 4.0896749318191095e-06, "loss": 0.243, "step": 2355 }, { "epoch": 1.7674418604651163, "grad_norm": 0.40897005796432495, "learning_rate": 4.06377868110358e-06, "loss": 0.224, "step": 2356 }, { "epoch": 1.768192048012003, "grad_norm": 0.37719643115997314, "learning_rate": 4.037961206484186e-06, "loss": 0.2155, "step": 2357 }, { "epoch": 1.7689422355588897, "grad_norm": 0.40078455209732056, "learning_rate": 4.0122225522353675e-06, "loss": 0.2242, "step": 2358 }, { "epoch": 1.7696924231057765, "grad_norm": 0.3573429584503174, "learning_rate": 3.986562762496376e-06, "loss": 0.2686, "step": 2359 }, { "epoch": 1.7704426106526632, "grad_norm": 0.4895203709602356, "learning_rate": 3.9609818812712255e-06, "loss": 0.2953, "step": 2360 }, { "epoch": 1.77119279819955, "grad_norm": 0.31386423110961914, "learning_rate": 3.935479952428611e-06, "loss": 0.1824, "step": 2361 }, { "epoch": 1.7719429857464366, "grad_norm": 0.33440741896629333, "learning_rate": 3.91005701970183e-06, "loss": 0.1598, "step": 2362 }, { "epoch": 1.7726931732933233, "grad_norm": 0.24323497712612152, "learning_rate": 3.8847131266886935e-06, "loss": 0.1748, "step": 2363 }, { "epoch": 1.77344336084021, "grad_norm": 0.29116132855415344, "learning_rate": 3.859448316851505e-06, "loss": 0.208, "step": 2364 }, { "epoch": 1.7741935483870968, "grad_norm": 0.2556542754173279, "learning_rate": 3.834262633516916e-06, "loss": 0.1819, "step": 2365 }, { "epoch": 1.7749437359339835, "grad_norm": 0.3057789206504822, "learning_rate": 3.8091561198758897e-06, "loss": 0.1675, "step": 2366 }, { "epoch": 1.7756939234808702, "grad_norm": 0.44755733013153076, "learning_rate": 3.784128818983618e-06, "loss": 0.2261, "step": 2367 }, { "epoch": 1.776444111027757, "grad_norm": 0.32942378520965576, "learning_rate": 3.7591807737594743e-06, "loss": 0.2139, "step": 2368 }, { "epoch": 1.7771942985746436, "grad_norm": 0.32141467928886414, "learning_rate": 3.734312026986897e-06, "loss": 0.2086, "step": 2369 }, { "epoch": 1.7779444861215303, "grad_norm": 0.2884428799152374, "learning_rate": 3.7095226213133272e-06, "loss": 0.2514, "step": 2370 }, { "epoch": 1.7786946736684173, "grad_norm": 0.3222726583480835, "learning_rate": 3.6848125992501592e-06, "loss": 0.2018, "step": 2371 }, { "epoch": 1.7794448612153038, "grad_norm": 0.30391648411750793, "learning_rate": 3.6601820031726517e-06, "loss": 0.2254, "step": 2372 }, { "epoch": 1.7801950487621907, "grad_norm": 0.3846050500869751, "learning_rate": 3.6356308753198454e-06, "loss": 0.1927, "step": 2373 }, { "epoch": 1.7809452363090772, "grad_norm": 0.32409605383872986, "learning_rate": 3.6111592577945217e-06, "loss": 0.2461, "step": 2374 }, { "epoch": 1.7816954238559641, "grad_norm": 0.2730276882648468, "learning_rate": 3.586767192563073e-06, "loss": 0.2824, "step": 2375 }, { "epoch": 1.7824456114028506, "grad_norm": 0.3694244921207428, "learning_rate": 3.562454721455505e-06, "loss": 0.2582, "step": 2376 }, { "epoch": 1.7831957989497376, "grad_norm": 0.34793657064437866, "learning_rate": 3.538221886165299e-06, "loss": 0.1796, "step": 2377 }, { "epoch": 1.783945986496624, "grad_norm": 0.3481349050998688, "learning_rate": 3.514068728249398e-06, "loss": 0.2048, "step": 2378 }, { "epoch": 1.784696174043511, "grad_norm": 0.35389429330825806, "learning_rate": 3.489995289128073e-06, "loss": 0.2456, "step": 2379 }, { "epoch": 1.7854463615903975, "grad_norm": 0.3702186644077301, "learning_rate": 3.4660016100849126e-06, "loss": 0.2247, "step": 2380 }, { "epoch": 1.7861965491372844, "grad_norm": 0.30923038721084595, "learning_rate": 3.442087732266697e-06, "loss": 0.2087, "step": 2381 }, { "epoch": 1.786946736684171, "grad_norm": 0.3110504448413849, "learning_rate": 3.418253696683399e-06, "loss": 0.1622, "step": 2382 }, { "epoch": 1.7876969242310579, "grad_norm": 0.41231971979141235, "learning_rate": 3.3944995442080185e-06, "loss": 0.2124, "step": 2383 }, { "epoch": 1.7884471117779444, "grad_norm": 0.3276788294315338, "learning_rate": 3.3708253155766033e-06, "loss": 0.264, "step": 2384 }, { "epoch": 1.7891972993248313, "grad_norm": 0.4556190073490143, "learning_rate": 3.347231051388117e-06, "loss": 0.2252, "step": 2385 }, { "epoch": 1.7899474868717178, "grad_norm": 0.34956037998199463, "learning_rate": 3.323716792104403e-06, "loss": 0.1816, "step": 2386 }, { "epoch": 1.7906976744186047, "grad_norm": 0.3242213726043701, "learning_rate": 3.3002825780500957e-06, "loss": 0.2368, "step": 2387 }, { "epoch": 1.7914478619654912, "grad_norm": 0.28366902470588684, "learning_rate": 3.276928449412564e-06, "loss": 0.2244, "step": 2388 }, { "epoch": 1.7921980495123782, "grad_norm": 0.3518024682998657, "learning_rate": 3.253654446241844e-06, "loss": 0.2149, "step": 2389 }, { "epoch": 1.7929482370592649, "grad_norm": 0.34154897928237915, "learning_rate": 3.2304606084505585e-06, "loss": 0.1814, "step": 2390 }, { "epoch": 1.7936984246061516, "grad_norm": 0.47033563256263733, "learning_rate": 3.2073469758138577e-06, "loss": 0.214, "step": 2391 }, { "epoch": 1.7944486121530383, "grad_norm": 0.2690843939781189, "learning_rate": 3.18431358796934e-06, "loss": 0.2531, "step": 2392 }, { "epoch": 1.795198799699925, "grad_norm": 0.29829105734825134, "learning_rate": 3.161360484416992e-06, "loss": 0.1206, "step": 2393 }, { "epoch": 1.7959489872468117, "grad_norm": 0.3099280893802643, "learning_rate": 3.1384877045191384e-06, "loss": 0.2729, "step": 2394 }, { "epoch": 1.7966991747936985, "grad_norm": 0.303629070520401, "learning_rate": 3.1156952875003365e-06, "loss": 0.1985, "step": 2395 }, { "epoch": 1.7974493623405852, "grad_norm": 0.24835608899593353, "learning_rate": 3.0929832724473416e-06, "loss": 0.1918, "step": 2396 }, { "epoch": 1.7981995498874719, "grad_norm": 0.3845706582069397, "learning_rate": 3.0703516983090207e-06, "loss": 0.2349, "step": 2397 }, { "epoch": 1.7989497374343586, "grad_norm": 0.27274611592292786, "learning_rate": 3.0478006038962947e-06, "loss": 0.1678, "step": 2398 }, { "epoch": 1.7996999249812453, "grad_norm": 0.36049699783325195, "learning_rate": 3.0253300278820783e-06, "loss": 0.1818, "step": 2399 }, { "epoch": 1.800450112528132, "grad_norm": 0.358641654253006, "learning_rate": 3.002940008801186e-06, "loss": 0.2377, "step": 2400 }, { "epoch": 1.800450112528132, "eval_loss": 0.25396010279655457, "eval_runtime": 8.9254, "eval_samples_per_second": 6.05, "eval_steps_per_second": 1.569, "step": 2400 }, { "epoch": 1.8012003000750187, "grad_norm": 0.30615970492362976, "learning_rate": 2.9806305850502923e-06, "loss": 0.2132, "step": 2401 }, { "epoch": 1.8019504876219055, "grad_norm": 0.3530374765396118, "learning_rate": 2.9584017948878717e-06, "loss": 0.2207, "step": 2402 }, { "epoch": 1.8027006751687922, "grad_norm": 0.34785839915275574, "learning_rate": 2.9362536764341085e-06, "loss": 0.2373, "step": 2403 }, { "epoch": 1.803450862715679, "grad_norm": 0.3695445656776428, "learning_rate": 2.9141862676708486e-06, "loss": 0.277, "step": 2404 }, { "epoch": 1.8042010502625656, "grad_norm": 0.4388507902622223, "learning_rate": 2.8921996064415147e-06, "loss": 0.3193, "step": 2405 }, { "epoch": 1.8049512378094523, "grad_norm": 0.41571658849716187, "learning_rate": 2.870293730451068e-06, "loss": 0.2818, "step": 2406 }, { "epoch": 1.805701425356339, "grad_norm": 0.3665503263473511, "learning_rate": 2.8484686772659308e-06, "loss": 0.2789, "step": 2407 }, { "epoch": 1.8064516129032258, "grad_norm": 0.26665422320365906, "learning_rate": 2.826724484313925e-06, "loss": 0.1945, "step": 2408 }, { "epoch": 1.8072018004501125, "grad_norm": 0.33174988627433777, "learning_rate": 2.8050611888841947e-06, "loss": 0.1867, "step": 2409 }, { "epoch": 1.8079519879969994, "grad_norm": 0.29281336069107056, "learning_rate": 2.7834788281271616e-06, "loss": 0.1957, "step": 2410 }, { "epoch": 1.808702175543886, "grad_norm": 0.29482296109199524, "learning_rate": 2.7619774390544473e-06, "loss": 0.2111, "step": 2411 }, { "epoch": 1.8094523630907728, "grad_norm": 0.23680563271045685, "learning_rate": 2.740557058538823e-06, "loss": 0.1515, "step": 2412 }, { "epoch": 1.8102025506376593, "grad_norm": 0.40511733293533325, "learning_rate": 2.7192177233141215e-06, "loss": 0.2646, "step": 2413 }, { "epoch": 1.8109527381845463, "grad_norm": 0.32161155343055725, "learning_rate": 2.697959469975203e-06, "loss": 0.1668, "step": 2414 }, { "epoch": 1.8117029257314328, "grad_norm": 0.26846593618392944, "learning_rate": 2.6767823349778843e-06, "loss": 0.2014, "step": 2415 }, { "epoch": 1.8124531132783197, "grad_norm": 0.32262900471687317, "learning_rate": 2.65568635463887e-06, "loss": 0.2743, "step": 2416 }, { "epoch": 1.8132033008252062, "grad_norm": 0.2938326895236969, "learning_rate": 2.634671565135677e-06, "loss": 0.2163, "step": 2417 }, { "epoch": 1.8139534883720931, "grad_norm": 0.32914721965789795, "learning_rate": 2.613738002506605e-06, "loss": 0.2109, "step": 2418 }, { "epoch": 1.8147036759189796, "grad_norm": 0.35728439688682556, "learning_rate": 2.592885702650655e-06, "loss": 0.1927, "step": 2419 }, { "epoch": 1.8154538634658666, "grad_norm": 0.4085077941417694, "learning_rate": 2.572114701327466e-06, "loss": 0.2091, "step": 2420 }, { "epoch": 1.816204051012753, "grad_norm": 0.34688231348991394, "learning_rate": 2.551425034157262e-06, "loss": 0.1396, "step": 2421 }, { "epoch": 1.81695423855964, "grad_norm": 0.3181007504463196, "learning_rate": 2.5308167366207724e-06, "loss": 0.2259, "step": 2422 }, { "epoch": 1.8177044261065265, "grad_norm": 0.3178381025791168, "learning_rate": 2.510289844059216e-06, "loss": 0.1683, "step": 2423 }, { "epoch": 1.8184546136534134, "grad_norm": 0.30615562200546265, "learning_rate": 2.48984439167419e-06, "loss": 0.182, "step": 2424 }, { "epoch": 1.8192048012003, "grad_norm": 0.35434243083000183, "learning_rate": 2.4694804145276305e-06, "loss": 0.245, "step": 2425 }, { "epoch": 1.8199549887471869, "grad_norm": 0.5039822459220886, "learning_rate": 2.449197947541737e-06, "loss": 0.2464, "step": 2426 }, { "epoch": 1.8207051762940734, "grad_norm": 0.35426995158195496, "learning_rate": 2.4289970254989635e-06, "loss": 0.1843, "step": 2427 }, { "epoch": 1.8214553638409603, "grad_norm": 0.3492979407310486, "learning_rate": 2.408877683041888e-06, "loss": 0.2749, "step": 2428 }, { "epoch": 1.822205551387847, "grad_norm": 0.3494691848754883, "learning_rate": 2.388839954673222e-06, "loss": 0.2041, "step": 2429 }, { "epoch": 1.8229557389347337, "grad_norm": 0.30591216683387756, "learning_rate": 2.3688838747556674e-06, "loss": 0.2414, "step": 2430 }, { "epoch": 1.8237059264816204, "grad_norm": 0.37087398767471313, "learning_rate": 2.3490094775119597e-06, "loss": 0.2008, "step": 2431 }, { "epoch": 1.8244561140285072, "grad_norm": 0.2507796585559845, "learning_rate": 2.3292167970247193e-06, "loss": 0.1891, "step": 2432 }, { "epoch": 1.8252063015753939, "grad_norm": 0.347104549407959, "learning_rate": 2.30950586723645e-06, "loss": 0.185, "step": 2433 }, { "epoch": 1.8259564891222806, "grad_norm": 0.3273461163043976, "learning_rate": 2.2898767219494634e-06, "loss": 0.2161, "step": 2434 }, { "epoch": 1.8267066766691673, "grad_norm": 0.32168975472450256, "learning_rate": 2.270329394825793e-06, "loss": 0.2495, "step": 2435 }, { "epoch": 1.827456864216054, "grad_norm": 0.3686228096485138, "learning_rate": 2.2508639193871805e-06, "loss": 0.2557, "step": 2436 }, { "epoch": 1.8282070517629407, "grad_norm": 0.40969139337539673, "learning_rate": 2.2314803290150287e-06, "loss": 0.188, "step": 2437 }, { "epoch": 1.8289572393098275, "grad_norm": 0.3705524206161499, "learning_rate": 2.2121786569502535e-06, "loss": 0.2076, "step": 2438 }, { "epoch": 1.8297074268567142, "grad_norm": 0.35843002796173096, "learning_rate": 2.192958936293338e-06, "loss": 0.2327, "step": 2439 }, { "epoch": 1.8304576144036009, "grad_norm": 0.30722981691360474, "learning_rate": 2.1738212000042e-06, "loss": 0.244, "step": 2440 }, { "epoch": 1.8312078019504876, "grad_norm": 0.3621898889541626, "learning_rate": 2.1547654809021877e-06, "loss": 0.2146, "step": 2441 }, { "epoch": 1.8319579894973743, "grad_norm": 0.3927375376224518, "learning_rate": 2.135791811665977e-06, "loss": 0.2475, "step": 2442 }, { "epoch": 1.832708177044261, "grad_norm": 0.3483969569206238, "learning_rate": 2.1169002248335346e-06, "loss": 0.1764, "step": 2443 }, { "epoch": 1.8334583645911477, "grad_norm": 0.39914941787719727, "learning_rate": 2.098090752802073e-06, "loss": 0.2784, "step": 2444 }, { "epoch": 1.8342085521380345, "grad_norm": 0.338178426027298, "learning_rate": 2.0793634278279907e-06, "loss": 0.2045, "step": 2445 }, { "epoch": 1.8349587396849212, "grad_norm": 0.45415034890174866, "learning_rate": 2.0607182820268133e-06, "loss": 0.2419, "step": 2446 }, { "epoch": 1.835708927231808, "grad_norm": 0.39005470275878906, "learning_rate": 2.042155347373109e-06, "loss": 0.2231, "step": 2447 }, { "epoch": 1.8364591147786946, "grad_norm": 0.43800032138824463, "learning_rate": 2.023674655700497e-06, "loss": 0.1898, "step": 2448 }, { "epoch": 1.8372093023255816, "grad_norm": 0.4081275463104248, "learning_rate": 2.0052762387015424e-06, "loss": 0.1843, "step": 2449 }, { "epoch": 1.837959489872468, "grad_norm": 0.28822067379951477, "learning_rate": 1.986960127927717e-06, "loss": 0.2171, "step": 2450 }, { "epoch": 1.838709677419355, "grad_norm": 0.45534124970436096, "learning_rate": 1.9687263547893407e-06, "loss": 0.2404, "step": 2451 }, { "epoch": 1.8394598649662415, "grad_norm": 0.3228893280029297, "learning_rate": 1.9505749505555503e-06, "loss": 0.1938, "step": 2452 }, { "epoch": 1.8402100525131284, "grad_norm": 0.3348483145236969, "learning_rate": 1.932505946354213e-06, "loss": 0.2523, "step": 2453 }, { "epoch": 1.840960240060015, "grad_norm": 0.31146731972694397, "learning_rate": 1.9145193731718858e-06, "loss": 0.1859, "step": 2454 }, { "epoch": 1.8417104276069018, "grad_norm": 0.4064481556415558, "learning_rate": 1.8966152618537846e-06, "loss": 0.2186, "step": 2455 }, { "epoch": 1.8424606151537883, "grad_norm": 0.3381580114364624, "learning_rate": 1.8787936431036824e-06, "loss": 0.2709, "step": 2456 }, { "epoch": 1.8432108027006753, "grad_norm": 0.2953905165195465, "learning_rate": 1.8610545474839036e-06, "loss": 0.2376, "step": 2457 }, { "epoch": 1.8439609902475618, "grad_norm": 0.3057350516319275, "learning_rate": 1.8433980054152533e-06, "loss": 0.2364, "step": 2458 }, { "epoch": 1.8447111777944487, "grad_norm": 0.31425464153289795, "learning_rate": 1.8258240471769662e-06, "loss": 0.2163, "step": 2459 }, { "epoch": 1.8454613653413352, "grad_norm": 0.40945640206336975, "learning_rate": 1.8083327029066399e-06, "loss": 0.29, "step": 2460 }, { "epoch": 1.8462115528882221, "grad_norm": 0.4909881353378296, "learning_rate": 1.7909240026002138e-06, "loss": 0.2428, "step": 2461 }, { "epoch": 1.8469617404351086, "grad_norm": 0.36628827452659607, "learning_rate": 1.773597976111896e-06, "loss": 0.1954, "step": 2462 }, { "epoch": 1.8477119279819956, "grad_norm": 0.4565563499927521, "learning_rate": 1.7563546531541132e-06, "loss": 0.2404, "step": 2463 }, { "epoch": 1.848462115528882, "grad_norm": 0.358349472284317, "learning_rate": 1.7391940632974667e-06, "loss": 0.1691, "step": 2464 }, { "epoch": 1.849212303075769, "grad_norm": 0.351439505815506, "learning_rate": 1.7221162359706776e-06, "loss": 0.2206, "step": 2465 }, { "epoch": 1.8499624906226555, "grad_norm": 0.34446144104003906, "learning_rate": 1.705121200460541e-06, "loss": 0.1841, "step": 2466 }, { "epoch": 1.8507126781695424, "grad_norm": 0.340604692697525, "learning_rate": 1.6882089859118766e-06, "loss": 0.2097, "step": 2467 }, { "epoch": 1.8514628657164292, "grad_norm": 0.3373919725418091, "learning_rate": 1.6713796213274457e-06, "loss": 0.2067, "step": 2468 }, { "epoch": 1.8522130532633159, "grad_norm": 0.3743683695793152, "learning_rate": 1.6546331355679623e-06, "loss": 0.1991, "step": 2469 }, { "epoch": 1.8529632408102026, "grad_norm": 0.35258856415748596, "learning_rate": 1.6379695573520093e-06, "loss": 0.1785, "step": 2470 }, { "epoch": 1.8537134283570893, "grad_norm": 0.3417801558971405, "learning_rate": 1.621388915255967e-06, "loss": 0.2016, "step": 2471 }, { "epoch": 1.854463615903976, "grad_norm": 0.32723093032836914, "learning_rate": 1.604891237714018e-06, "loss": 0.2418, "step": 2472 }, { "epoch": 1.8552138034508627, "grad_norm": 0.29549065232276917, "learning_rate": 1.5884765530180478e-06, "loss": 0.1473, "step": 2473 }, { "epoch": 1.8559639909977494, "grad_norm": 0.37140610814094543, "learning_rate": 1.5721448893176228e-06, "loss": 0.2205, "step": 2474 }, { "epoch": 1.8567141785446362, "grad_norm": 0.36884841322898865, "learning_rate": 1.5558962746199335e-06, "loss": 0.261, "step": 2475 }, { "epoch": 1.8574643660915229, "grad_norm": 0.2874661982059479, "learning_rate": 1.5397307367897684e-06, "loss": 0.247, "step": 2476 }, { "epoch": 1.8582145536384096, "grad_norm": 0.3127751648426056, "learning_rate": 1.5236483035494297e-06, "loss": 0.2012, "step": 2477 }, { "epoch": 1.8589647411852963, "grad_norm": 0.364347904920578, "learning_rate": 1.5076490024786893e-06, "loss": 0.2367, "step": 2478 }, { "epoch": 1.859714928732183, "grad_norm": 0.31257790327072144, "learning_rate": 1.4917328610147885e-06, "loss": 0.2188, "step": 2479 }, { "epoch": 1.8604651162790697, "grad_norm": 0.40746310353279114, "learning_rate": 1.4758999064523493e-06, "loss": 0.2049, "step": 2480 }, { "epoch": 1.8612153038259565, "grad_norm": 0.36939361691474915, "learning_rate": 1.4601501659433137e-06, "loss": 0.1836, "step": 2481 }, { "epoch": 1.8619654913728432, "grad_norm": 0.3175240755081177, "learning_rate": 1.444483666496943e-06, "loss": 0.2791, "step": 2482 }, { "epoch": 1.86271567891973, "grad_norm": 0.27310073375701904, "learning_rate": 1.4289004349797409e-06, "loss": 0.1671, "step": 2483 }, { "epoch": 1.8634658664666166, "grad_norm": 0.3290778696537018, "learning_rate": 1.4134004981154137e-06, "loss": 0.225, "step": 2484 }, { "epoch": 1.8642160540135033, "grad_norm": 0.29327166080474854, "learning_rate": 1.3979838824848378e-06, "loss": 0.2086, "step": 2485 }, { "epoch": 1.8649662415603903, "grad_norm": 0.3827644884586334, "learning_rate": 1.382650614525971e-06, "loss": 0.217, "step": 2486 }, { "epoch": 1.8657164291072768, "grad_norm": 0.3445426821708679, "learning_rate": 1.3674007205338678e-06, "loss": 0.2236, "step": 2487 }, { "epoch": 1.8664666166541637, "grad_norm": 0.27178671956062317, "learning_rate": 1.3522342266605925e-06, "loss": 0.1751, "step": 2488 }, { "epoch": 1.8672168042010502, "grad_norm": 0.32721954584121704, "learning_rate": 1.3371511589152008e-06, "loss": 0.2406, "step": 2489 }, { "epoch": 1.8679669917479371, "grad_norm": 0.3602909743785858, "learning_rate": 1.3221515431636522e-06, "loss": 0.1945, "step": 2490 }, { "epoch": 1.8687171792948236, "grad_norm": 0.35723233222961426, "learning_rate": 1.307235405128815e-06, "loss": 0.2126, "step": 2491 }, { "epoch": 1.8694673668417106, "grad_norm": 0.3741419017314911, "learning_rate": 1.292402770390394e-06, "loss": 0.2, "step": 2492 }, { "epoch": 1.870217554388597, "grad_norm": 0.33873456716537476, "learning_rate": 1.2776536643849145e-06, "loss": 0.1473, "step": 2493 }, { "epoch": 1.870967741935484, "grad_norm": 0.35103660821914673, "learning_rate": 1.2629881124056274e-06, "loss": 0.2216, "step": 2494 }, { "epoch": 1.8717179294823705, "grad_norm": 0.3368591368198395, "learning_rate": 1.2484061396025038e-06, "loss": 0.1848, "step": 2495 }, { "epoch": 1.8724681170292574, "grad_norm": 0.3353674113750458, "learning_rate": 1.2339077709822067e-06, "loss": 0.194, "step": 2496 }, { "epoch": 1.873218304576144, "grad_norm": 0.3187189996242523, "learning_rate": 1.2194930314080032e-06, "loss": 0.1881, "step": 2497 }, { "epoch": 1.8739684921230308, "grad_norm": 0.4599374830722809, "learning_rate": 1.2051619455997476e-06, "loss": 0.2193, "step": 2498 }, { "epoch": 1.8747186796699173, "grad_norm": 0.5537548661231995, "learning_rate": 1.1909145381338472e-06, "loss": 0.253, "step": 2499 }, { "epoch": 1.8754688672168043, "grad_norm": 0.45125436782836914, "learning_rate": 1.1767508334431964e-06, "loss": 0.2367, "step": 2500 }, { "epoch": 1.8762190547636908, "grad_norm": 0.3273683190345764, "learning_rate": 1.1626708558171606e-06, "loss": 0.2891, "step": 2501 }, { "epoch": 1.8769692423105777, "grad_norm": 0.38621431589126587, "learning_rate": 1.1486746294015193e-06, "loss": 0.2425, "step": 2502 }, { "epoch": 1.8777194298574642, "grad_norm": 0.3916779160499573, "learning_rate": 1.134762178198412e-06, "loss": 0.2471, "step": 2503 }, { "epoch": 1.8784696174043511, "grad_norm": 0.3176916837692261, "learning_rate": 1.1209335260663256e-06, "loss": 0.2403, "step": 2504 }, { "epoch": 1.8792198049512379, "grad_norm": 0.3126286268234253, "learning_rate": 1.1071886967200352e-06, "loss": 0.2759, "step": 2505 }, { "epoch": 1.8799699924981246, "grad_norm": 0.3133660852909088, "learning_rate": 1.0935277137305744e-06, "loss": 0.2779, "step": 2506 }, { "epoch": 1.8807201800450113, "grad_norm": 0.2915734052658081, "learning_rate": 1.0799506005251814e-06, "loss": 0.1872, "step": 2507 }, { "epoch": 1.881470367591898, "grad_norm": 0.3481842577457428, "learning_rate": 1.06645738038727e-06, "loss": 0.2627, "step": 2508 }, { "epoch": 1.8822205551387847, "grad_norm": 0.32397395372390747, "learning_rate": 1.053048076456381e-06, "loss": 0.1572, "step": 2509 }, { "epoch": 1.8829707426856714, "grad_norm": 0.33568888902664185, "learning_rate": 1.0397227117281528e-06, "loss": 0.1785, "step": 2510 }, { "epoch": 1.8837209302325582, "grad_norm": 0.3190779685974121, "learning_rate": 1.0264813090542725e-06, "loss": 0.1698, "step": 2511 }, { "epoch": 1.8844711177794449, "grad_norm": 0.30577874183654785, "learning_rate": 1.0133238911424426e-06, "loss": 0.2602, "step": 2512 }, { "epoch": 1.8852213053263316, "grad_norm": 0.44842997193336487, "learning_rate": 1.0002504805563362e-06, "loss": 0.2379, "step": 2513 }, { "epoch": 1.8859714928732183, "grad_norm": 0.2549665570259094, "learning_rate": 9.872610997155695e-07, "loss": 0.1733, "step": 2514 }, { "epoch": 1.886721680420105, "grad_norm": 0.37826108932495117, "learning_rate": 9.743557708956575e-07, "loss": 0.2145, "step": 2515 }, { "epoch": 1.8874718679669917, "grad_norm": 0.3812573552131653, "learning_rate": 9.615345162279521e-07, "loss": 0.261, "step": 2516 }, { "epoch": 1.8882220555138785, "grad_norm": 0.45195671916007996, "learning_rate": 9.48797357699649e-07, "loss": 0.2812, "step": 2517 }, { "epoch": 1.8889722430607652, "grad_norm": 0.43096110224723816, "learning_rate": 9.361443171537254e-07, "loss": 0.214, "step": 2518 }, { "epoch": 1.8897224306076519, "grad_norm": 0.3526875972747803, "learning_rate": 9.235754162889021e-07, "loss": 0.2404, "step": 2519 }, { "epoch": 1.8904726181545386, "grad_norm": 0.25832411646842957, "learning_rate": 9.110906766595872e-07, "loss": 0.1743, "step": 2520 }, { "epoch": 1.8912228057014253, "grad_norm": 0.3644820749759674, "learning_rate": 8.986901196759046e-07, "loss": 0.2511, "step": 2521 }, { "epoch": 1.891972993248312, "grad_norm": 0.3257949650287628, "learning_rate": 8.863737666035765e-07, "loss": 0.2118, "step": 2522 }, { "epoch": 1.8927231807951987, "grad_norm": 0.3599741458892822, "learning_rate": 8.741416385639412e-07, "loss": 0.1616, "step": 2523 }, { "epoch": 1.8934733683420855, "grad_norm": 0.34863436222076416, "learning_rate": 8.619937565338854e-07, "loss": 0.2051, "step": 2524 }, { "epoch": 1.8942235558889724, "grad_norm": 0.28810715675354004, "learning_rate": 8.499301413458338e-07, "loss": 0.1887, "step": 2525 }, { "epoch": 1.894973743435859, "grad_norm": 0.36942175030708313, "learning_rate": 8.37950813687699e-07, "loss": 0.2159, "step": 2526 }, { "epoch": 1.8957239309827458, "grad_norm": 0.32684582471847534, "learning_rate": 8.26055794102848e-07, "loss": 0.2723, "step": 2527 }, { "epoch": 1.8964741185296323, "grad_norm": 0.41425010561943054, "learning_rate": 8.142451029900744e-07, "loss": 0.2436, "step": 2528 }, { "epoch": 1.8972243060765193, "grad_norm": 0.34928449988365173, "learning_rate": 8.025187606035434e-07, "loss": 0.1988, "step": 2529 }, { "epoch": 1.8979744936234058, "grad_norm": 0.3432803750038147, "learning_rate": 7.908767870527745e-07, "loss": 0.2167, "step": 2530 }, { "epoch": 1.8987246811702927, "grad_norm": 0.350181519985199, "learning_rate": 7.793192023026142e-07, "loss": 0.1758, "step": 2531 }, { "epoch": 1.8994748687171792, "grad_norm": 0.4028658866882324, "learning_rate": 7.678460261731801e-07, "loss": 0.2322, "step": 2532 }, { "epoch": 1.9002250562640661, "grad_norm": 0.45844170451164246, "learning_rate": 7.564572783398339e-07, "loss": 0.2204, "step": 2533 }, { "epoch": 1.9009752438109526, "grad_norm": 0.44057804346084595, "learning_rate": 7.451529783331523e-07, "loss": 0.2149, "step": 2534 }, { "epoch": 1.9017254313578396, "grad_norm": 0.4026328921318054, "learning_rate": 7.339331455389175e-07, "loss": 0.2039, "step": 2535 }, { "epoch": 1.902475618904726, "grad_norm": 0.30719080567359924, "learning_rate": 7.227977991980217e-07, "loss": 0.2063, "step": 2536 }, { "epoch": 1.903225806451613, "grad_norm": 0.3570643365383148, "learning_rate": 7.117469584064895e-07, "loss": 0.1818, "step": 2537 }, { "epoch": 1.9039759939984995, "grad_norm": 0.4022088348865509, "learning_rate": 7.007806421154284e-07, "loss": 0.2155, "step": 2538 }, { "epoch": 1.9047261815453864, "grad_norm": 0.39924356341362, "learning_rate": 6.898988691309893e-07, "loss": 0.202, "step": 2539 }, { "epoch": 1.905476369092273, "grad_norm": 0.38299432396888733, "learning_rate": 6.791016581143395e-07, "loss": 0.1648, "step": 2540 }, { "epoch": 1.9062265566391599, "grad_norm": 0.403477281332016, "learning_rate": 6.683890275816341e-07, "loss": 0.2335, "step": 2541 }, { "epoch": 1.9069767441860463, "grad_norm": 0.3885505795478821, "learning_rate": 6.577609959039776e-07, "loss": 0.2673, "step": 2542 }, { "epoch": 1.9077269317329333, "grad_norm": 0.3334859609603882, "learning_rate": 6.472175813074022e-07, "loss": 0.2111, "step": 2543 }, { "epoch": 1.90847711927982, "grad_norm": 0.3234958052635193, "learning_rate": 6.367588018728166e-07, "loss": 0.2126, "step": 2544 }, { "epoch": 1.9092273068267067, "grad_norm": 0.415604829788208, "learning_rate": 6.263846755360126e-07, "loss": 0.1739, "step": 2545 }, { "epoch": 1.9099774943735934, "grad_norm": 0.32881173491477966, "learning_rate": 6.16095220087587e-07, "loss": 0.2376, "step": 2546 }, { "epoch": 1.9107276819204801, "grad_norm": 0.31487998366355896, "learning_rate": 6.05890453172936e-07, "loss": 0.1902, "step": 2547 }, { "epoch": 1.9114778694673669, "grad_norm": 0.27461180090904236, "learning_rate": 5.957703922922386e-07, "loss": 0.2146, "step": 2548 }, { "epoch": 1.9122280570142536, "grad_norm": 0.4415489435195923, "learning_rate": 5.857350548004015e-07, "loss": 0.1932, "step": 2549 }, { "epoch": 1.9129782445611403, "grad_norm": 0.4042346477508545, "learning_rate": 5.757844579070359e-07, "loss": 0.1758, "step": 2550 }, { "epoch": 1.913728432108027, "grad_norm": 0.31001120805740356, "learning_rate": 5.65918618676442e-07, "loss": 0.2273, "step": 2551 }, { "epoch": 1.9144786196549137, "grad_norm": 0.3728652000427246, "learning_rate": 5.561375540275581e-07, "loss": 0.1765, "step": 2552 }, { "epoch": 1.9152288072018004, "grad_norm": 0.3924350440502167, "learning_rate": 5.464412807339558e-07, "loss": 0.1829, "step": 2553 }, { "epoch": 1.9159789947486872, "grad_norm": 0.2844933867454529, "learning_rate": 5.368298154237727e-07, "loss": 0.1963, "step": 2554 }, { "epoch": 1.9167291822955739, "grad_norm": 0.41288819909095764, "learning_rate": 5.273031745797352e-07, "loss": 0.2607, "step": 2555 }, { "epoch": 1.9174793698424606, "grad_norm": 0.48739874362945557, "learning_rate": 5.17861374539097e-07, "loss": 0.2345, "step": 2556 }, { "epoch": 1.9182295573893473, "grad_norm": 0.44389382004737854, "learning_rate": 5.085044314936116e-07, "loss": 0.2819, "step": 2557 }, { "epoch": 1.918979744936234, "grad_norm": 0.420648455619812, "learning_rate": 4.992323614895156e-07, "loss": 0.1863, "step": 2558 }, { "epoch": 1.9197299324831207, "grad_norm": 0.4259224832057953, "learning_rate": 4.900451804274898e-07, "loss": 0.2151, "step": 2559 }, { "epoch": 1.9204801200300075, "grad_norm": 0.3573651611804962, "learning_rate": 4.809429040626535e-07, "loss": 0.2079, "step": 2560 }, { "epoch": 1.9212303075768942, "grad_norm": 0.27918025851249695, "learning_rate": 4.719255480045148e-07, "loss": 0.1653, "step": 2561 }, { "epoch": 1.921980495123781, "grad_norm": 0.32250452041625977, "learning_rate": 4.6299312771694304e-07, "loss": 0.2353, "step": 2562 }, { "epoch": 1.9227306826706676, "grad_norm": 0.42545247077941895, "learning_rate": 4.5414565851816806e-07, "loss": 0.3189, "step": 2563 }, { "epoch": 1.9234808702175545, "grad_norm": 0.289092481136322, "learning_rate": 4.453831555807253e-07, "loss": 0.1723, "step": 2564 }, { "epoch": 1.924231057764441, "grad_norm": 0.2989669442176819, "learning_rate": 4.36705633931439e-07, "loss": 0.2184, "step": 2565 }, { "epoch": 1.924981245311328, "grad_norm": 0.39064231514930725, "learning_rate": 4.281131084514167e-07, "loss": 0.2689, "step": 2566 }, { "epoch": 1.9257314328582145, "grad_norm": 0.4141589403152466, "learning_rate": 4.196055938759824e-07, "loss": 0.2323, "step": 2567 }, { "epoch": 1.9264816204051014, "grad_norm": 0.4279109537601471, "learning_rate": 4.111831047946879e-07, "loss": 0.2245, "step": 2568 }, { "epoch": 1.927231807951988, "grad_norm": 0.39582133293151855, "learning_rate": 4.0284565565127384e-07, "loss": 0.2116, "step": 2569 }, { "epoch": 1.9279819954988748, "grad_norm": 0.29659610986709595, "learning_rate": 3.9459326074364756e-07, "loss": 0.2371, "step": 2570 }, { "epoch": 1.9287321830457613, "grad_norm": 0.2979874610900879, "learning_rate": 3.8642593422384965e-07, "loss": 0.235, "step": 2571 }, { "epoch": 1.9294823705926483, "grad_norm": 0.33482685685157776, "learning_rate": 3.7834369009804303e-07, "loss": 0.157, "step": 2572 }, { "epoch": 1.9302325581395348, "grad_norm": 0.3652019798755646, "learning_rate": 3.703465422264796e-07, "loss": 0.2704, "step": 2573 }, { "epoch": 1.9309827456864217, "grad_norm": 0.3898124694824219, "learning_rate": 3.624345043234778e-07, "loss": 0.2475, "step": 2574 }, { "epoch": 1.9317329332333082, "grad_norm": 0.400102823972702, "learning_rate": 3.5460758995741194e-07, "loss": 0.1879, "step": 2575 }, { "epoch": 1.9324831207801951, "grad_norm": 0.47440633177757263, "learning_rate": 3.468658125506563e-07, "loss": 0.1587, "step": 2576 }, { "epoch": 1.9332333083270816, "grad_norm": 0.33691057562828064, "learning_rate": 3.3920918537960754e-07, "loss": 0.2047, "step": 2577 }, { "epoch": 1.9339834958739686, "grad_norm": 0.28248870372772217, "learning_rate": 3.3163772157462357e-07, "loss": 0.1946, "step": 2578 }, { "epoch": 1.934733683420855, "grad_norm": 0.40552717447280884, "learning_rate": 3.241514341200236e-07, "loss": 0.3037, "step": 2579 }, { "epoch": 1.935483870967742, "grad_norm": 0.2468455284833908, "learning_rate": 3.1675033585404355e-07, "loss": 0.1657, "step": 2580 }, { "epoch": 1.9362340585146287, "grad_norm": 0.3883063793182373, "learning_rate": 3.0943443946884755e-07, "loss": 0.2476, "step": 2581 }, { "epoch": 1.9369842460615154, "grad_norm": 0.3027293384075165, "learning_rate": 3.0220375751047194e-07, "loss": 0.2329, "step": 2582 }, { "epoch": 1.9377344336084021, "grad_norm": 0.29942700266838074, "learning_rate": 2.950583023788256e-07, "loss": 0.1971, "step": 2583 }, { "epoch": 1.9384846211552889, "grad_norm": 0.3624298572540283, "learning_rate": 2.879980863276621e-07, "loss": 0.1165, "step": 2584 }, { "epoch": 1.9392348087021756, "grad_norm": 0.30916711688041687, "learning_rate": 2.8102312146455755e-07, "loss": 0.2322, "step": 2585 }, { "epoch": 1.9399849962490623, "grad_norm": 0.3332037329673767, "learning_rate": 2.7413341975088824e-07, "loss": 0.2438, "step": 2586 }, { "epoch": 1.940735183795949, "grad_norm": 0.3534415066242218, "learning_rate": 2.6732899300180857e-07, "loss": 0.2137, "step": 2587 }, { "epoch": 1.9414853713428357, "grad_norm": 0.431613028049469, "learning_rate": 2.606098528862566e-07, "loss": 0.2148, "step": 2588 }, { "epoch": 1.9422355588897224, "grad_norm": 0.27164268493652344, "learning_rate": 2.5397601092687627e-07, "loss": 0.1564, "step": 2589 }, { "epoch": 1.9429857464366092, "grad_norm": 0.398176372051239, "learning_rate": 2.474274785000619e-07, "loss": 0.1819, "step": 2590 }, { "epoch": 1.9437359339834959, "grad_norm": 0.25404220819473267, "learning_rate": 2.40964266835908e-07, "loss": 0.2551, "step": 2591 }, { "epoch": 1.9444861215303826, "grad_norm": 0.3045722544193268, "learning_rate": 2.3458638701817636e-07, "loss": 0.1633, "step": 2592 }, { "epoch": 1.9452363090772693, "grad_norm": 0.33041512966156006, "learning_rate": 2.2829384998430681e-07, "loss": 0.2067, "step": 2593 }, { "epoch": 1.945986496624156, "grad_norm": 0.3318134844303131, "learning_rate": 2.2208666652537846e-07, "loss": 0.2854, "step": 2594 }, { "epoch": 1.9467366841710427, "grad_norm": 0.2920056879520416, "learning_rate": 2.1596484728610421e-07, "loss": 0.2909, "step": 2595 }, { "epoch": 1.9474868717179294, "grad_norm": 0.37814030051231384, "learning_rate": 2.099284027647974e-07, "loss": 0.2061, "step": 2596 }, { "epoch": 1.9482370592648162, "grad_norm": 0.46224072575569153, "learning_rate": 2.039773433133718e-07, "loss": 0.2103, "step": 2597 }, { "epoch": 1.9489872468117029, "grad_norm": 0.36993005871772766, "learning_rate": 1.9811167913729723e-07, "loss": 0.2314, "step": 2598 }, { "epoch": 1.9497374343585896, "grad_norm": 0.39403414726257324, "learning_rate": 1.923314202956217e-07, "loss": 0.2238, "step": 2599 }, { "epoch": 1.9504876219054763, "grad_norm": 0.39912149310112, "learning_rate": 1.8663657670091595e-07, "loss": 0.1775, "step": 2600 }, { "epoch": 1.9504876219054763, "eval_loss": 0.25336194038391113, "eval_runtime": 8.9192, "eval_samples_per_second": 6.054, "eval_steps_per_second": 1.57, "step": 2600 }, { "epoch": 1.9512378094523632, "grad_norm": 0.3230401873588562, "learning_rate": 1.810271581192735e-07, "loss": 0.2272, "step": 2601 }, { "epoch": 1.9519879969992497, "grad_norm": 0.423566997051239, "learning_rate": 1.755031741702995e-07, "loss": 0.2285, "step": 2602 }, { "epoch": 1.9527381845461367, "grad_norm": 0.325949102640152, "learning_rate": 1.7006463432707177e-07, "loss": 0.2212, "step": 2603 }, { "epoch": 1.9534883720930232, "grad_norm": 0.2944096028804779, "learning_rate": 1.6471154791616317e-07, "loss": 0.1691, "step": 2604 }, { "epoch": 1.9542385596399101, "grad_norm": 0.35372209548950195, "learning_rate": 1.59443924117586e-07, "loss": 0.1428, "step": 2605 }, { "epoch": 1.9549887471867966, "grad_norm": 0.3731192648410797, "learning_rate": 1.5426177196479207e-07, "loss": 0.2654, "step": 2606 }, { "epoch": 1.9557389347336835, "grad_norm": 0.2923562824726105, "learning_rate": 1.4916510034466702e-07, "loss": 0.2424, "step": 2607 }, { "epoch": 1.95648912228057, "grad_norm": 0.2775515615940094, "learning_rate": 1.441539179974971e-07, "loss": 0.1765, "step": 2608 }, { "epoch": 1.957239309827457, "grad_norm": 0.3438125550746918, "learning_rate": 1.3922823351697479e-07, "loss": 0.1855, "step": 2609 }, { "epoch": 1.9579894973743435, "grad_norm": 0.39503538608551025, "learning_rate": 1.343880553501542e-07, "loss": 0.1887, "step": 2610 }, { "epoch": 1.9587396849212304, "grad_norm": 0.3080960214138031, "learning_rate": 1.2963339179746238e-07, "loss": 0.2115, "step": 2611 }, { "epoch": 1.959489872468117, "grad_norm": 0.334641695022583, "learning_rate": 1.2496425101268804e-07, "loss": 0.1957, "step": 2612 }, { "epoch": 1.9602400600150038, "grad_norm": 0.320646733045578, "learning_rate": 1.2038064100294843e-07, "loss": 0.1742, "step": 2613 }, { "epoch": 1.9609902475618903, "grad_norm": 0.3562566339969635, "learning_rate": 1.158825696286725e-07, "loss": 0.1955, "step": 2614 }, { "epoch": 1.9617404351087773, "grad_norm": 0.27916648983955383, "learning_rate": 1.114700446036232e-07, "loss": 0.1651, "step": 2615 }, { "epoch": 1.9624906226556638, "grad_norm": 0.3019964396953583, "learning_rate": 1.0714307349483089e-07, "loss": 0.2482, "step": 2616 }, { "epoch": 1.9632408102025507, "grad_norm": 0.4363631308078766, "learning_rate": 1.029016637226432e-07, "loss": 0.2941, "step": 2617 }, { "epoch": 1.9639909977494372, "grad_norm": 0.38257119059562683, "learning_rate": 9.874582256064192e-08, "loss": 0.2439, "step": 2618 }, { "epoch": 1.9647411852963241, "grad_norm": 0.4233134090900421, "learning_rate": 9.46755571356983e-08, "loss": 0.1795, "step": 2619 }, { "epoch": 1.9654913728432108, "grad_norm": 0.34872010350227356, "learning_rate": 9.069087442791224e-08, "loss": 0.2427, "step": 2620 }, { "epoch": 1.9662415603900976, "grad_norm": 0.3531149625778198, "learning_rate": 8.679178127062871e-08, "loss": 0.2442, "step": 2621 }, { "epoch": 1.9669917479369843, "grad_norm": 0.3587384819984436, "learning_rate": 8.297828435039346e-08, "loss": 0.2703, "step": 2622 }, { "epoch": 1.967741935483871, "grad_norm": 0.3438369929790497, "learning_rate": 7.925039020699187e-08, "loss": 0.1804, "step": 2623 }, { "epoch": 1.9684921230307577, "grad_norm": 0.36445942521095276, "learning_rate": 7.56081052333879e-08, "loss": 0.3207, "step": 2624 }, { "epoch": 1.9692423105776444, "grad_norm": 0.3107506334781647, "learning_rate": 7.205143567574624e-08, "loss": 0.2204, "step": 2625 }, { "epoch": 1.9699924981245311, "grad_norm": 0.4346839487552643, "learning_rate": 6.858038763340458e-08, "loss": 0.2886, "step": 2626 }, { "epoch": 1.9707426856714179, "grad_norm": 0.2782738506793976, "learning_rate": 6.519496705886252e-08, "loss": 0.2282, "step": 2627 }, { "epoch": 1.9714928732183046, "grad_norm": 0.33514344692230225, "learning_rate": 6.189517975778713e-08, "loss": 0.2305, "step": 2628 }, { "epoch": 1.9722430607651913, "grad_norm": 0.4887050986289978, "learning_rate": 5.8681031388990724e-08, "loss": 0.2867, "step": 2629 }, { "epoch": 1.972993248312078, "grad_norm": 0.3226735591888428, "learning_rate": 5.555252746441975e-08, "loss": 0.2426, "step": 2630 }, { "epoch": 1.9737434358589647, "grad_norm": 0.4334736764431, "learning_rate": 5.25096733491548e-08, "loss": 0.214, "step": 2631 }, { "epoch": 1.9744936234058514, "grad_norm": 0.3300659954547882, "learning_rate": 4.9552474261377326e-08, "loss": 0.1988, "step": 2632 }, { "epoch": 1.9752438109527382, "grad_norm": 0.3303864896297455, "learning_rate": 4.6680935272408465e-08, "loss": 0.1383, "step": 2633 }, { "epoch": 1.9759939984996249, "grad_norm": 0.35514625906944275, "learning_rate": 4.3895061306648e-08, "loss": 0.1952, "step": 2634 }, { "epoch": 1.9767441860465116, "grad_norm": 0.33816173672676086, "learning_rate": 4.119485714159099e-08, "loss": 0.2152, "step": 2635 }, { "epoch": 1.9774943735933983, "grad_norm": 0.26139745116233826, "learning_rate": 3.8580327407827796e-08, "loss": 0.244, "step": 2636 }, { "epoch": 1.978244561140285, "grad_norm": 0.4354928731918335, "learning_rate": 3.605147658901631e-08, "loss": 0.2759, "step": 2637 }, { "epoch": 1.978994748687172, "grad_norm": 0.3507806062698364, "learning_rate": 3.360830902189305e-08, "loss": 0.255, "step": 2638 }, { "epoch": 1.9797449362340584, "grad_norm": 0.30988752841949463, "learning_rate": 3.125082889623987e-08, "loss": 0.2546, "step": 2639 }, { "epoch": 1.9804951237809454, "grad_norm": 0.38186660408973694, "learning_rate": 2.8979040254911717e-08, "loss": 0.2319, "step": 2640 }, { "epoch": 1.9812453113278319, "grad_norm": 0.3184620141983032, "learning_rate": 2.67929469937922e-08, "loss": 0.1759, "step": 2641 }, { "epoch": 1.9819954988747188, "grad_norm": 0.28864148259162903, "learning_rate": 2.4692552861826925e-08, "loss": 0.2175, "step": 2642 }, { "epoch": 1.9827456864216053, "grad_norm": 0.28052279353141785, "learning_rate": 2.2677861460984607e-08, "loss": 0.1611, "step": 2643 }, { "epoch": 1.9834958739684923, "grad_norm": 0.32979339361190796, "learning_rate": 2.074887624625155e-08, "loss": 0.1887, "step": 2644 }, { "epoch": 1.9842460615153787, "grad_norm": 0.34922945499420166, "learning_rate": 1.890560052565937e-08, "loss": 0.1735, "step": 2645 }, { "epoch": 1.9849962490622657, "grad_norm": 0.3599134683609009, "learning_rate": 1.7148037460235078e-08, "loss": 0.2611, "step": 2646 }, { "epoch": 1.9857464366091522, "grad_norm": 0.4279593527317047, "learning_rate": 1.5476190064034334e-08, "loss": 0.2317, "step": 2647 }, { "epoch": 1.9864966241560391, "grad_norm": 0.2561042606830597, "learning_rate": 1.3890061204108185e-08, "loss": 0.2182, "step": 2648 }, { "epoch": 1.9872468117029256, "grad_norm": 0.3358863294124603, "learning_rate": 1.2389653600508588e-08, "loss": 0.232, "step": 2649 }, { "epoch": 1.9879969992498125, "grad_norm": 0.4547403156757355, "learning_rate": 1.0974969826288428e-08, "loss": 0.2175, "step": 2650 }, { "epoch": 1.988747186796699, "grad_norm": 0.38021424412727356, "learning_rate": 9.646012307490405e-09, "loss": 0.2125, "step": 2651 }, { "epoch": 1.989497374343586, "grad_norm": 0.32435983419418335, "learning_rate": 8.402783323147034e-09, "loss": 0.2489, "step": 2652 }, { "epoch": 1.9902475618904725, "grad_norm": 0.3554992973804474, "learning_rate": 7.245285005275104e-09, "loss": 0.2753, "step": 2653 }, { "epoch": 1.9909977494373594, "grad_norm": 0.3057315945625305, "learning_rate": 6.1735193388701155e-09, "loss": 0.195, "step": 2654 }, { "epoch": 1.991747936984246, "grad_norm": 0.4393582046031952, "learning_rate": 5.187488161895182e-09, "loss": 0.2095, "step": 2655 }, { "epoch": 1.9924981245311328, "grad_norm": 0.308921217918396, "learning_rate": 4.28719316531434e-09, "loss": 0.197, "step": 2656 }, { "epoch": 1.9932483120780196, "grad_norm": 0.3172537684440613, "learning_rate": 3.4726358930259328e-09, "loss": 0.184, "step": 2657 }, { "epoch": 1.9939984996249063, "grad_norm": 0.32849419116973877, "learning_rate": 2.743817741929222e-09, "loss": 0.2411, "step": 2658 }, { "epoch": 1.994748687171793, "grad_norm": 0.3948138654232025, "learning_rate": 2.1007399618688807e-09, "loss": 0.2049, "step": 2659 }, { "epoch": 1.9954988747186797, "grad_norm": 0.30734214186668396, "learning_rate": 1.543403655662745e-09, "loss": 0.1667, "step": 2660 }, { "epoch": 1.9962490622655664, "grad_norm": 0.32014867663383484, "learning_rate": 1.0718097790907156e-09, "loss": 0.1761, "step": 2661 }, { "epoch": 1.9969992498124531, "grad_norm": 0.3104285001754761, "learning_rate": 6.859591408836519e-10, "loss": 0.1875, "step": 2662 }, { "epoch": 1.9977494373593399, "grad_norm": 0.3667044937610626, "learning_rate": 3.8585240273447677e-10, "loss": 0.2034, "step": 2663 }, { "epoch": 1.9984996249062266, "grad_norm": 0.23348531126976013, "learning_rate": 1.7149007930927773e-10, "loss": 0.2067, "step": 2664 }, { "epoch": 1.9992498124531133, "grad_norm": 0.3244154155254364, "learning_rate": 4.2872538208449386e-11, "loss": 0.1461, "step": 2665 }, { "epoch": 2.0, "grad_norm": 0.42405906319618225, "learning_rate": 0.0, "loss": 0.1785, "step": 2666 }, { "epoch": 2.0, "step": 2666, "total_flos": 1.3076161478136955e+18, "train_loss": 0.2550497850583446, "train_runtime": 5244.2873, "train_samples_per_second": 2.032, "train_steps_per_second": 0.508 } ], "logging_steps": 1, "max_steps": 2666, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3076161478136955e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }