{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.984025559105431, "eval_steps": 500, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012779552715654952, "grad_norm": 6.592559773224311, "learning_rate": 1.0256410256410257e-06, "loss": 1.0584, "step": 1 }, { "epoch": 0.025559105431309903, "grad_norm": 6.533996250579117, "learning_rate": 2.0512820512820513e-06, "loss": 1.0359, "step": 2 }, { "epoch": 0.038338658146964855, "grad_norm": 6.403554394230576, "learning_rate": 3.0769230769230774e-06, "loss": 1.0339, "step": 3 }, { "epoch": 0.051118210862619806, "grad_norm": 5.983183526137737, "learning_rate": 4.102564102564103e-06, "loss": 1.0326, "step": 4 }, { "epoch": 0.06389776357827476, "grad_norm": 4.49473351652443, "learning_rate": 5.128205128205128e-06, "loss": 0.9829, "step": 5 }, { "epoch": 0.07667731629392971, "grad_norm": 2.7792011342028844, "learning_rate": 6.153846153846155e-06, "loss": 0.9442, "step": 6 }, { "epoch": 0.08945686900958466, "grad_norm": 2.585532313798018, "learning_rate": 7.17948717948718e-06, "loss": 0.9341, "step": 7 }, { "epoch": 0.10223642172523961, "grad_norm": 4.343578316090016, "learning_rate": 8.205128205128205e-06, "loss": 0.9597, "step": 8 }, { "epoch": 0.11501597444089456, "grad_norm": 4.337079142565349, "learning_rate": 9.230769230769232e-06, "loss": 0.9411, "step": 9 }, { "epoch": 0.12779552715654952, "grad_norm": 4.113753051228281, "learning_rate": 1.0256410256410256e-05, "loss": 0.8849, "step": 10 }, { "epoch": 0.14057507987220447, "grad_norm": 3.669068981229734, "learning_rate": 1.1282051282051283e-05, "loss": 0.8685, "step": 11 }, { "epoch": 0.15335463258785942, "grad_norm": 2.510782563487606, "learning_rate": 1.230769230769231e-05, "loss": 0.8425, "step": 12 }, { "epoch": 0.16613418530351437, "grad_norm": 1.7766490492923275, "learning_rate": 1.3333333333333333e-05, "loss": 0.8044, "step": 13 }, { "epoch": 0.17891373801916932, "grad_norm": 1.773850110884697, "learning_rate": 1.435897435897436e-05, "loss": 0.7956, "step": 14 }, { "epoch": 0.19169329073482427, "grad_norm": 1.4449567647432071, "learning_rate": 1.5384615384615387e-05, "loss": 0.7928, "step": 15 }, { "epoch": 0.20447284345047922, "grad_norm": 1.2329425176471562, "learning_rate": 1.641025641025641e-05, "loss": 0.7647, "step": 16 }, { "epoch": 0.21725239616613418, "grad_norm": 1.3214658971437656, "learning_rate": 1.7435897435897438e-05, "loss": 0.7418, "step": 17 }, { "epoch": 0.23003194888178913, "grad_norm": 1.1774696769302788, "learning_rate": 1.8461538461538465e-05, "loss": 0.7388, "step": 18 }, { "epoch": 0.24281150159744408, "grad_norm": 1.1783633623704315, "learning_rate": 1.9487179487179488e-05, "loss": 0.7342, "step": 19 }, { "epoch": 0.25559105431309903, "grad_norm": 0.9918610979106691, "learning_rate": 2.0512820512820512e-05, "loss": 0.7153, "step": 20 }, { "epoch": 0.268370607028754, "grad_norm": 1.0970437777181943, "learning_rate": 2.153846153846154e-05, "loss": 0.7131, "step": 21 }, { "epoch": 0.28115015974440893, "grad_norm": 1.1303616213524492, "learning_rate": 2.2564102564102566e-05, "loss": 0.7218, "step": 22 }, { "epoch": 0.2939297124600639, "grad_norm": 0.9410346603084735, "learning_rate": 2.3589743589743593e-05, "loss": 0.6969, "step": 23 }, { "epoch": 0.30670926517571884, "grad_norm": 1.241236726131634, "learning_rate": 2.461538461538462e-05, "loss": 0.6896, "step": 24 }, { "epoch": 0.3194888178913738, "grad_norm": 1.0125739251188415, "learning_rate": 2.5641025641025646e-05, "loss": 0.6908, "step": 25 }, { "epoch": 0.33226837060702874, "grad_norm": 1.2133669063317278, "learning_rate": 2.6666666666666667e-05, "loss": 0.6939, "step": 26 }, { "epoch": 0.3450479233226837, "grad_norm": 0.7171243870310143, "learning_rate": 2.7692307692307694e-05, "loss": 0.6917, "step": 27 }, { "epoch": 0.35782747603833864, "grad_norm": 0.8954952921929109, "learning_rate": 2.871794871794872e-05, "loss": 0.6728, "step": 28 }, { "epoch": 0.3706070287539936, "grad_norm": 1.2535395812138406, "learning_rate": 2.9743589743589747e-05, "loss": 0.6775, "step": 29 }, { "epoch": 0.38338658146964855, "grad_norm": 0.7099241097596939, "learning_rate": 3.0769230769230774e-05, "loss": 0.6652, "step": 30 }, { "epoch": 0.3961661341853035, "grad_norm": 1.1218258048553897, "learning_rate": 3.1794871794871795e-05, "loss": 0.6837, "step": 31 }, { "epoch": 0.40894568690095845, "grad_norm": 0.798182751910104, "learning_rate": 3.282051282051282e-05, "loss": 0.6619, "step": 32 }, { "epoch": 0.4217252396166134, "grad_norm": 0.9497616621304193, "learning_rate": 3.384615384615385e-05, "loss": 0.6668, "step": 33 }, { "epoch": 0.43450479233226835, "grad_norm": 0.9670159555978985, "learning_rate": 3.4871794871794875e-05, "loss": 0.652, "step": 34 }, { "epoch": 0.4472843450479233, "grad_norm": 0.7631934555113155, "learning_rate": 3.58974358974359e-05, "loss": 0.6807, "step": 35 }, { "epoch": 0.46006389776357826, "grad_norm": 1.138059017679059, "learning_rate": 3.692307692307693e-05, "loss": 0.6571, "step": 36 }, { "epoch": 0.4728434504792332, "grad_norm": 1.2472685281144757, "learning_rate": 3.794871794871795e-05, "loss": 0.6422, "step": 37 }, { "epoch": 0.48562300319488816, "grad_norm": 1.092086824798557, "learning_rate": 3.8974358974358976e-05, "loss": 0.6506, "step": 38 }, { "epoch": 0.4984025559105431, "grad_norm": 1.440587909713568, "learning_rate": 4e-05, "loss": 0.6565, "step": 39 }, { "epoch": 0.5111821086261981, "grad_norm": 1.0137445016603566, "learning_rate": 3.9999198907597046e-05, "loss": 0.6511, "step": 40 }, { "epoch": 0.5239616613418531, "grad_norm": 1.1870385032749615, "learning_rate": 3.9996795694563096e-05, "loss": 0.6595, "step": 41 }, { "epoch": 0.536741214057508, "grad_norm": 1.1579856844560368, "learning_rate": 3.999279055341771e-05, "loss": 0.6603, "step": 42 }, { "epoch": 0.549520766773163, "grad_norm": 1.1761600219305477, "learning_rate": 3.998718380500971e-05, "loss": 0.6348, "step": 43 }, { "epoch": 0.5623003194888179, "grad_norm": 1.0176273767544746, "learning_rate": 3.997997589849145e-05, "loss": 0.6476, "step": 44 }, { "epoch": 0.5750798722044729, "grad_norm": 1.2622234967026367, "learning_rate": 3.9971167411282835e-05, "loss": 0.6378, "step": 45 }, { "epoch": 0.5878594249201278, "grad_norm": 1.0075647259433982, "learning_rate": 3.99607590490251e-05, "loss": 0.6354, "step": 46 }, { "epoch": 0.6006389776357828, "grad_norm": 1.3469437316322044, "learning_rate": 3.9948751645524235e-05, "loss": 0.6383, "step": 47 }, { "epoch": 0.6134185303514377, "grad_norm": 0.7654246385881115, "learning_rate": 3.9935146162684206e-05, "loss": 0.634, "step": 48 }, { "epoch": 0.6261980830670927, "grad_norm": 1.1916229902223678, "learning_rate": 3.9919943690429906e-05, "loss": 0.6288, "step": 49 }, { "epoch": 0.6389776357827476, "grad_norm": 0.7275724716538298, "learning_rate": 3.9903145446619837e-05, "loss": 0.6279, "step": 50 }, { "epoch": 0.6517571884984026, "grad_norm": 0.998923013777855, "learning_rate": 3.9884752776948564e-05, "loss": 0.6214, "step": 51 }, { "epoch": 0.6645367412140575, "grad_norm": 1.088165548146933, "learning_rate": 3.9864767154838864e-05, "loss": 0.6183, "step": 52 }, { "epoch": 0.6773162939297125, "grad_norm": 1.1413981384829293, "learning_rate": 3.9843190181323744e-05, "loss": 0.6471, "step": 53 }, { "epoch": 0.6900958466453674, "grad_norm": 0.7149508933231995, "learning_rate": 3.982002358491817e-05, "loss": 0.6251, "step": 54 }, { "epoch": 0.7028753993610224, "grad_norm": 0.7332580072068926, "learning_rate": 3.979526922148058e-05, "loss": 0.6225, "step": 55 }, { "epoch": 0.7156549520766773, "grad_norm": 0.9613780962451666, "learning_rate": 3.9768929074064206e-05, "loss": 0.6463, "step": 56 }, { "epoch": 0.7284345047923323, "grad_norm": 0.9155040213797432, "learning_rate": 3.9741005252758255e-05, "loss": 0.6225, "step": 57 }, { "epoch": 0.7412140575079872, "grad_norm": 0.7928433142515511, "learning_rate": 3.971149999451886e-05, "loss": 0.6057, "step": 58 }, { "epoch": 0.7539936102236422, "grad_norm": 0.8354988418472067, "learning_rate": 3.9680415662989806e-05, "loss": 0.6191, "step": 59 }, { "epoch": 0.7667731629392971, "grad_norm": 0.9338419841133101, "learning_rate": 3.9647754748313294e-05, "loss": 0.623, "step": 60 }, { "epoch": 0.7795527156549521, "grad_norm": 0.6754571472606996, "learning_rate": 3.96135198669304e-05, "loss": 0.6167, "step": 61 }, { "epoch": 0.792332268370607, "grad_norm": 0.7147996913925468, "learning_rate": 3.957771376137144e-05, "loss": 0.6227, "step": 62 }, { "epoch": 0.805111821086262, "grad_norm": 0.7490356704866034, "learning_rate": 3.954033930003634e-05, "loss": 0.6239, "step": 63 }, { "epoch": 0.8178913738019169, "grad_norm": 0.6684421206373585, "learning_rate": 3.9501399476964806e-05, "loss": 0.6117, "step": 64 }, { "epoch": 0.8306709265175719, "grad_norm": 0.794134899045863, "learning_rate": 3.946089741159648e-05, "loss": 0.6174, "step": 65 }, { "epoch": 0.8434504792332268, "grad_norm": 0.7395607316321484, "learning_rate": 3.9418836348521045e-05, "loss": 0.6155, "step": 66 }, { "epoch": 0.8562300319488818, "grad_norm": 0.6873160919171922, "learning_rate": 3.937521965721831e-05, "loss": 0.6208, "step": 67 }, { "epoch": 0.8690095846645367, "grad_norm": 0.8086968956229956, "learning_rate": 3.933005083178828e-05, "loss": 0.5886, "step": 68 }, { "epoch": 0.8817891373801917, "grad_norm": 0.9021537053131113, "learning_rate": 3.928333349067125e-05, "loss": 0.6167, "step": 69 }, { "epoch": 0.8945686900958466, "grad_norm": 0.6404666118706073, "learning_rate": 3.923507137635792e-05, "loss": 0.6064, "step": 70 }, { "epoch": 0.9073482428115016, "grad_norm": 0.9637927790584386, "learning_rate": 3.9185268355089606e-05, "loss": 0.5933, "step": 71 }, { "epoch": 0.9201277955271565, "grad_norm": 0.7503553773301649, "learning_rate": 3.913392841654851e-05, "loss": 0.6008, "step": 72 }, { "epoch": 0.9329073482428115, "grad_norm": 0.7458304269209006, "learning_rate": 3.9081055673538093e-05, "loss": 0.6211, "step": 73 }, { "epoch": 0.9456869009584664, "grad_norm": 0.8726511062352104, "learning_rate": 3.902665436165364e-05, "loss": 0.6029, "step": 74 }, { "epoch": 0.9584664536741214, "grad_norm": 0.5654076555250331, "learning_rate": 3.897072883894291e-05, "loss": 0.6023, "step": 75 }, { "epoch": 0.9712460063897763, "grad_norm": 0.8246577511362533, "learning_rate": 3.8913283585557054e-05, "loss": 0.6085, "step": 76 }, { "epoch": 0.9840255591054313, "grad_norm": 0.6405797510997362, "learning_rate": 3.885432320339167e-05, "loss": 0.6035, "step": 77 }, { "epoch": 0.9968051118210862, "grad_norm": 0.8191045323137688, "learning_rate": 3.879385241571817e-05, "loss": 0.6017, "step": 78 }, { "epoch": 1.0095846645367412, "grad_norm": 0.8221831211414313, "learning_rate": 3.873187606680543e-05, "loss": 0.5699, "step": 79 }, { "epoch": 1.0223642172523961, "grad_norm": 0.7386109051638938, "learning_rate": 3.866839912153168e-05, "loss": 0.5432, "step": 80 }, { "epoch": 1.035143769968051, "grad_norm": 1.003402326295664, "learning_rate": 3.860342666498677e-05, "loss": 0.5442, "step": 81 }, { "epoch": 1.0479233226837061, "grad_norm": 0.9151698391385112, "learning_rate": 3.853696390206484e-05, "loss": 0.5384, "step": 82 }, { "epoch": 1.060702875399361, "grad_norm": 0.7416182358928667, "learning_rate": 3.846901615704734e-05, "loss": 0.5649, "step": 83 }, { "epoch": 1.073482428115016, "grad_norm": 0.6952203802327555, "learning_rate": 3.839958887317649e-05, "loss": 0.5498, "step": 84 }, { "epoch": 1.0862619808306708, "grad_norm": 0.5542868163153963, "learning_rate": 3.832868761221926e-05, "loss": 0.5283, "step": 85 }, { "epoch": 1.099041533546326, "grad_norm": 0.7223536685987605, "learning_rate": 3.825631805402182e-05, "loss": 0.5429, "step": 86 }, { "epoch": 1.1118210862619808, "grad_norm": 0.6693799206155252, "learning_rate": 3.818248599605448e-05, "loss": 0.5411, "step": 87 }, { "epoch": 1.1246006389776357, "grad_norm": 0.63507352467521, "learning_rate": 3.810719735294731e-05, "loss": 0.5397, "step": 88 }, { "epoch": 1.1373801916932909, "grad_norm": 0.7149087551276322, "learning_rate": 3.8030458156016326e-05, "loss": 0.5453, "step": 89 }, { "epoch": 1.1501597444089458, "grad_norm": 0.6147127063222498, "learning_rate": 3.795227455278029e-05, "loss": 0.538, "step": 90 }, { "epoch": 1.1629392971246006, "grad_norm": 0.6750380523841792, "learning_rate": 3.787265280646825e-05, "loss": 0.5414, "step": 91 }, { "epoch": 1.1757188498402555, "grad_norm": 0.6328930799004385, "learning_rate": 3.7791599295517825e-05, "loss": 0.5325, "step": 92 }, { "epoch": 1.1884984025559104, "grad_norm": 0.5750276261244739, "learning_rate": 3.7709120513064196e-05, "loss": 0.5323, "step": 93 }, { "epoch": 1.2012779552715656, "grad_norm": 0.6807743986140611, "learning_rate": 3.762522306641998e-05, "loss": 0.5405, "step": 94 }, { "epoch": 1.2140575079872205, "grad_norm": 0.6482076388135797, "learning_rate": 3.7539913676545874e-05, "loss": 0.544, "step": 95 }, { "epoch": 1.2268370607028753, "grad_norm": 0.6013194708955101, "learning_rate": 3.745319917751229e-05, "loss": 0.5339, "step": 96 }, { "epoch": 1.2396166134185305, "grad_norm": 0.6837960198032468, "learning_rate": 3.736508651595188e-05, "loss": 0.5367, "step": 97 }, { "epoch": 1.2523961661341854, "grad_norm": 0.6906079799057308, "learning_rate": 3.727558275050301e-05, "loss": 0.5322, "step": 98 }, { "epoch": 1.2651757188498403, "grad_norm": 0.4982857601132116, "learning_rate": 3.718469505124434e-05, "loss": 0.5347, "step": 99 }, { "epoch": 1.2779552715654952, "grad_norm": 0.7096602770473559, "learning_rate": 3.709243069912041e-05, "loss": 0.5487, "step": 100 }, { "epoch": 1.29073482428115, "grad_norm": 0.6889841969127676, "learning_rate": 3.699879708535838e-05, "loss": 0.558, "step": 101 }, { "epoch": 1.3035143769968052, "grad_norm": 0.623099378147889, "learning_rate": 3.69038017108759e-05, "loss": 0.5461, "step": 102 }, { "epoch": 1.31629392971246, "grad_norm": 0.5987316749971819, "learning_rate": 3.680745218568026e-05, "loss": 0.5292, "step": 103 }, { "epoch": 1.329073482428115, "grad_norm": 0.7385186281094401, "learning_rate": 3.6709756228258735e-05, "loss": 0.5596, "step": 104 }, { "epoch": 1.34185303514377, "grad_norm": 0.5040586704906941, "learning_rate": 3.6610721664960236e-05, "loss": 0.5538, "step": 105 }, { "epoch": 1.354632587859425, "grad_norm": 0.5502068181444864, "learning_rate": 3.65103564293684e-05, "loss": 0.5255, "step": 106 }, { "epoch": 1.3674121405750799, "grad_norm": 0.6038940086818471, "learning_rate": 3.640866856166601e-05, "loss": 0.5295, "step": 107 }, { "epoch": 1.3801916932907348, "grad_norm": 0.5294269885203353, "learning_rate": 3.6305666207990886e-05, "loss": 0.5374, "step": 108 }, { "epoch": 1.3929712460063897, "grad_norm": 0.5380829680847359, "learning_rate": 3.6201357619783336e-05, "loss": 0.5323, "step": 109 }, { "epoch": 1.4057507987220448, "grad_norm": 0.598990655418311, "learning_rate": 3.609575115312511e-05, "loss": 0.5312, "step": 110 }, { "epoch": 1.4185303514376997, "grad_norm": 0.6187080893504935, "learning_rate": 3.598885526807003e-05, "loss": 0.5577, "step": 111 }, { "epoch": 1.4313099041533546, "grad_norm": 0.5178475312005663, "learning_rate": 3.5880678527966224e-05, "loss": 0.5459, "step": 112 }, { "epoch": 1.4440894568690097, "grad_norm": 0.5678731177517095, "learning_rate": 3.577122959877017e-05, "loss": 0.5313, "step": 113 }, { "epoch": 1.4568690095846646, "grad_norm": 0.5728769593469301, "learning_rate": 3.566051724835245e-05, "loss": 0.5459, "step": 114 }, { "epoch": 1.4696485623003195, "grad_norm": 0.6236590406029358, "learning_rate": 3.554855034579532e-05, "loss": 0.533, "step": 115 }, { "epoch": 1.4824281150159744, "grad_norm": 0.6395572621499039, "learning_rate": 3.5435337860682304e-05, "loss": 0.5364, "step": 116 }, { "epoch": 1.4952076677316293, "grad_norm": 0.5473857592392815, "learning_rate": 3.532088886237956e-05, "loss": 0.5438, "step": 117 }, { "epoch": 1.5079872204472844, "grad_norm": 0.6836195507522692, "learning_rate": 3.520521251930941e-05, "loss": 0.5404, "step": 118 }, { "epoch": 1.5207667731629393, "grad_norm": 0.5133895745524997, "learning_rate": 3.5088318098215805e-05, "loss": 0.5703, "step": 119 }, { "epoch": 1.5335463258785942, "grad_norm": 0.932045951945919, "learning_rate": 3.497021496342203e-05, "loss": 0.5609, "step": 120 }, { "epoch": 1.5463258785942493, "grad_norm": 0.6666508012468598, "learning_rate": 3.485091257608047e-05, "loss": 0.5405, "step": 121 }, { "epoch": 1.5591054313099042, "grad_norm": 0.5912549145981204, "learning_rate": 3.473042049341474e-05, "loss": 0.5389, "step": 122 }, { "epoch": 1.571884984025559, "grad_norm": 0.5686320425619221, "learning_rate": 3.4608748367954064e-05, "loss": 0.539, "step": 123 }, { "epoch": 1.5846645367412142, "grad_norm": 0.4929986179056893, "learning_rate": 3.4485905946759965e-05, "loss": 0.5391, "step": 124 }, { "epoch": 1.5974440894568689, "grad_norm": 0.5930635255373866, "learning_rate": 3.4361903070645484e-05, "loss": 0.5412, "step": 125 }, { "epoch": 1.610223642172524, "grad_norm": 0.52822029078469, "learning_rate": 3.423674967338681e-05, "loss": 0.5339, "step": 126 }, { "epoch": 1.623003194888179, "grad_norm": 0.5583177055430788, "learning_rate": 3.411045578092754e-05, "loss": 0.5507, "step": 127 }, { "epoch": 1.6357827476038338, "grad_norm": 0.535242050254539, "learning_rate": 3.398303151057543e-05, "loss": 0.5352, "step": 128 }, { "epoch": 1.648562300319489, "grad_norm": 0.5154643751949803, "learning_rate": 3.385448707019199e-05, "loss": 0.542, "step": 129 }, { "epoch": 1.6613418530351438, "grad_norm": 0.5513845267160518, "learning_rate": 3.372483275737468e-05, "loss": 0.5447, "step": 130 }, { "epoch": 1.6741214057507987, "grad_norm": 0.4632157332613763, "learning_rate": 3.359407895863199e-05, "loss": 0.5401, "step": 131 }, { "epoch": 1.6869009584664538, "grad_norm": 0.5364008154819976, "learning_rate": 3.34622361485514e-05, "loss": 0.5502, "step": 132 }, { "epoch": 1.6996805111821085, "grad_norm": 0.5368875171734532, "learning_rate": 3.332931488896029e-05, "loss": 0.526, "step": 133 }, { "epoch": 1.7124600638977636, "grad_norm": 0.5336475888202112, "learning_rate": 3.319532582807977e-05, "loss": 0.5243, "step": 134 }, { "epoch": 1.7252396166134185, "grad_norm": 0.48548930789937556, "learning_rate": 3.30602796996717e-05, "loss": 0.5333, "step": 135 }, { "epoch": 1.7380191693290734, "grad_norm": 0.4399082077195597, "learning_rate": 3.2924187322178865e-05, "loss": 0.5317, "step": 136 }, { "epoch": 1.7507987220447285, "grad_norm": 0.5504323482709709, "learning_rate": 3.278705959785821e-05, "loss": 0.5379, "step": 137 }, { "epoch": 1.7635782747603834, "grad_norm": 0.40584327896715106, "learning_rate": 3.2648907511907544e-05, "loss": 0.5419, "step": 138 }, { "epoch": 1.7763578274760383, "grad_norm": 0.5380474051164952, "learning_rate": 3.250974213158555e-05, "loss": 0.516, "step": 139 }, { "epoch": 1.7891373801916934, "grad_norm": 0.506968824527567, "learning_rate": 3.23695746053251e-05, "loss": 0.5373, "step": 140 }, { "epoch": 1.8019169329073481, "grad_norm": 0.44763696341088166, "learning_rate": 3.222841616184025e-05, "loss": 0.5318, "step": 141 }, { "epoch": 1.8146964856230032, "grad_norm": 0.479794766401982, "learning_rate": 3.208627810922665e-05, "loss": 0.5226, "step": 142 }, { "epoch": 1.8274760383386581, "grad_norm": 0.4676050726980317, "learning_rate": 3.194317183405573e-05, "loss": 0.5382, "step": 143 }, { "epoch": 1.840255591054313, "grad_norm": 0.4644121130733055, "learning_rate": 3.1799108800462466e-05, "loss": 0.5392, "step": 144 }, { "epoch": 1.8530351437699681, "grad_norm": 0.492449471299021, "learning_rate": 3.1654100549227024e-05, "loss": 0.5274, "step": 145 }, { "epoch": 1.865814696485623, "grad_norm": 0.3932772336810787, "learning_rate": 3.1508158696850275e-05, "loss": 0.5358, "step": 146 }, { "epoch": 1.878594249201278, "grad_norm": 0.4964607230949776, "learning_rate": 3.136129493462312e-05, "loss": 0.5415, "step": 147 }, { "epoch": 1.891373801916933, "grad_norm": 0.44969213286367615, "learning_rate": 3.121352102768998e-05, "loss": 0.5167, "step": 148 }, { "epoch": 1.9041533546325877, "grad_norm": 0.4085529551534211, "learning_rate": 3.106484881410628e-05, "loss": 0.5405, "step": 149 }, { "epoch": 1.9169329073482428, "grad_norm": 0.4028888479451454, "learning_rate": 3.091529020389009e-05, "loss": 0.5225, "step": 150 }, { "epoch": 1.9297124600638977, "grad_norm": 0.40240512299145004, "learning_rate": 3.076485717806808e-05, "loss": 0.5268, "step": 151 }, { "epoch": 1.9424920127795526, "grad_norm": 0.42427167845298364, "learning_rate": 3.061356178771564e-05, "loss": 0.522, "step": 152 }, { "epoch": 1.9552715654952078, "grad_norm": 0.3985483640545327, "learning_rate": 3.0461416152991555e-05, "loss": 0.5328, "step": 153 }, { "epoch": 1.9680511182108626, "grad_norm": 0.44616565380137657, "learning_rate": 3.0308432462167045e-05, "loss": 0.544, "step": 154 }, { "epoch": 1.9808306709265175, "grad_norm": 0.3848131483218998, "learning_rate": 3.015462297064936e-05, "loss": 0.5444, "step": 155 }, { "epoch": 1.9936102236421727, "grad_norm": 0.4506794979037802, "learning_rate": 3.0000000000000004e-05, "loss": 0.5312, "step": 156 }, { "epoch": 2.0063897763578273, "grad_norm": 0.46737966145548454, "learning_rate": 2.98445759369477e-05, "loss": 0.4861, "step": 157 }, { "epoch": 2.0191693290734825, "grad_norm": 0.4797404865229001, "learning_rate": 2.9688363232396056e-05, "loss": 0.4469, "step": 158 }, { "epoch": 2.0319488817891376, "grad_norm": 0.9091491802667112, "learning_rate": 2.9531374400426158e-05, "loss": 0.4686, "step": 159 }, { "epoch": 2.0447284345047922, "grad_norm": 0.6362863999522714, "learning_rate": 2.9373622017294075e-05, "loss": 0.4571, "step": 160 }, { "epoch": 2.0575079872204474, "grad_norm": 0.5841242289813745, "learning_rate": 2.9215118720423375e-05, "loss": 0.4624, "step": 161 }, { "epoch": 2.070287539936102, "grad_norm": 0.5660198412301565, "learning_rate": 2.9055877207392752e-05, "loss": 0.4606, "step": 162 }, { "epoch": 2.083067092651757, "grad_norm": 0.6214246268809275, "learning_rate": 2.8895910234918828e-05, "loss": 0.4515, "step": 163 }, { "epoch": 2.0958466453674123, "grad_norm": 0.6031237202639322, "learning_rate": 2.873523061783426e-05, "loss": 0.4727, "step": 164 }, { "epoch": 2.108626198083067, "grad_norm": 0.5459362939896371, "learning_rate": 2.8573851228061084e-05, "loss": 0.4446, "step": 165 }, { "epoch": 2.121405750798722, "grad_norm": 0.6309768044121306, "learning_rate": 2.8411784993579633e-05, "loss": 0.4564, "step": 166 }, { "epoch": 2.134185303514377, "grad_norm": 0.5004805609415554, "learning_rate": 2.8249044897392814e-05, "loss": 0.4427, "step": 167 }, { "epoch": 2.146964856230032, "grad_norm": 0.5597030810750162, "learning_rate": 2.80856439764861e-05, "loss": 0.4547, "step": 168 }, { "epoch": 2.159744408945687, "grad_norm": 0.525309059574843, "learning_rate": 2.792159532078314e-05, "loss": 0.4438, "step": 169 }, { "epoch": 2.1725239616613417, "grad_norm": 0.5150529287331795, "learning_rate": 2.77569120720971e-05, "loss": 0.4395, "step": 170 }, { "epoch": 2.1853035143769968, "grad_norm": 0.49291153438447477, "learning_rate": 2.7591607423077932e-05, "loss": 0.4576, "step": 171 }, { "epoch": 2.198083067092652, "grad_norm": 0.47894233452443435, "learning_rate": 2.7425694616155474e-05, "loss": 0.448, "step": 172 }, { "epoch": 2.2108626198083066, "grad_norm": 0.5324687741348569, "learning_rate": 2.7259186942478656e-05, "loss": 0.4754, "step": 173 }, { "epoch": 2.2236421725239617, "grad_norm": 0.4667746857260792, "learning_rate": 2.7092097740850712e-05, "loss": 0.4535, "step": 174 }, { "epoch": 2.236421725239617, "grad_norm": 0.4546389264946905, "learning_rate": 2.692444039666066e-05, "loss": 0.4568, "step": 175 }, { "epoch": 2.2492012779552715, "grad_norm": 0.6207522955702326, "learning_rate": 2.6756228340810946e-05, "loss": 0.4528, "step": 176 }, { "epoch": 2.2619808306709266, "grad_norm": 0.40205802485861697, "learning_rate": 2.6587475048641596e-05, "loss": 0.4411, "step": 177 }, { "epoch": 2.2747603833865817, "grad_norm": 0.6515647885033293, "learning_rate": 2.6418194038850634e-05, "loss": 0.4633, "step": 178 }, { "epoch": 2.2875399361022364, "grad_norm": 0.3981589507945508, "learning_rate": 2.624839887241115e-05, "loss": 0.4501, "step": 179 }, { "epoch": 2.3003194888178915, "grad_norm": 0.5245299926954298, "learning_rate": 2.607810315148494e-05, "loss": 0.4631, "step": 180 }, { "epoch": 2.313099041533546, "grad_norm": 0.39902221901310364, "learning_rate": 2.5907320518332827e-05, "loss": 0.4512, "step": 181 }, { "epoch": 2.3258785942492013, "grad_norm": 0.46007760708126333, "learning_rate": 2.5736064654221808e-05, "loss": 0.4539, "step": 182 }, { "epoch": 2.3386581469648564, "grad_norm": 0.44935342627821434, "learning_rate": 2.5564349278329056e-05, "loss": 0.4583, "step": 183 }, { "epoch": 2.351437699680511, "grad_norm": 0.48732236655543254, "learning_rate": 2.539218814664288e-05, "loss": 0.4625, "step": 184 }, { "epoch": 2.364217252396166, "grad_norm": 0.41064313854717027, "learning_rate": 2.521959505086075e-05, "loss": 0.455, "step": 185 }, { "epoch": 2.376996805111821, "grad_norm": 0.4238363772894373, "learning_rate": 2.5046583817284437e-05, "loss": 0.4474, "step": 186 }, { "epoch": 2.389776357827476, "grad_norm": 0.37444556355224096, "learning_rate": 2.487316830571244e-05, "loss": 0.4484, "step": 187 }, { "epoch": 2.402555910543131, "grad_norm": 0.37693535450755855, "learning_rate": 2.4699362408329646e-05, "loss": 0.4402, "step": 188 }, { "epoch": 2.415335463258786, "grad_norm": 0.3608251145740366, "learning_rate": 2.4525180048594452e-05, "loss": 0.4424, "step": 189 }, { "epoch": 2.428115015974441, "grad_norm": 0.4233677847393811, "learning_rate": 2.435063518012335e-05, "loss": 0.4623, "step": 190 }, { "epoch": 2.440894568690096, "grad_norm": 0.31455922006156284, "learning_rate": 2.4175741785573177e-05, "loss": 0.4589, "step": 191 }, { "epoch": 2.4536741214057507, "grad_norm": 0.38364627896467995, "learning_rate": 2.4000513875520892e-05, "loss": 0.4501, "step": 192 }, { "epoch": 2.466453674121406, "grad_norm": 0.3431035862182369, "learning_rate": 2.3824965487341247e-05, "loss": 0.4522, "step": 193 }, { "epoch": 2.479233226837061, "grad_norm": 0.40165829454509927, "learning_rate": 2.3649110684082258e-05, "loss": 0.4535, "step": 194 }, { "epoch": 2.4920127795527156, "grad_norm": 0.36203476792786726, "learning_rate": 2.3472963553338614e-05, "loss": 0.4601, "step": 195 }, { "epoch": 2.5047923322683707, "grad_norm": 0.420613009985552, "learning_rate": 2.3296538206123134e-05, "loss": 0.4466, "step": 196 }, { "epoch": 2.5175718849840254, "grad_norm": 0.432857708679188, "learning_rate": 2.311984877573636e-05, "loss": 0.4716, "step": 197 }, { "epoch": 2.5303514376996805, "grad_norm": 0.3553966044000479, "learning_rate": 2.2942909416634326e-05, "loss": 0.4427, "step": 198 }, { "epoch": 2.543130990415335, "grad_norm": 0.3701325458937448, "learning_rate": 2.2765734303294666e-05, "loss": 0.4488, "step": 199 }, { "epoch": 2.5559105431309903, "grad_norm": 0.3718229874116265, "learning_rate": 2.2588337629081107e-05, "loss": 0.4438, "step": 200 }, { "epoch": 2.5686900958466454, "grad_norm": 0.4950117082964623, "learning_rate": 2.2410733605106462e-05, "loss": 0.4718, "step": 201 }, { "epoch": 2.5814696485623, "grad_norm": 0.3452023127805258, "learning_rate": 2.2232936459094158e-05, "loss": 0.4441, "step": 202 }, { "epoch": 2.594249201277955, "grad_norm": 0.49794594667174263, "learning_rate": 2.205496043423849e-05, "loss": 0.4738, "step": 203 }, { "epoch": 2.6070287539936103, "grad_norm": 0.36822312542225943, "learning_rate": 2.1876819788063586e-05, "loss": 0.4346, "step": 204 }, { "epoch": 2.619808306709265, "grad_norm": 0.520296874766587, "learning_rate": 2.16985287912813e-05, "loss": 0.4428, "step": 205 }, { "epoch": 2.63258785942492, "grad_norm": 0.39868539600412517, "learning_rate": 2.1520101726647922e-05, "loss": 0.4532, "step": 206 }, { "epoch": 2.6453674121405752, "grad_norm": 0.386961686713686, "learning_rate": 2.1341552887820048e-05, "loss": 0.4504, "step": 207 }, { "epoch": 2.65814696485623, "grad_norm": 0.4567237436678329, "learning_rate": 2.1162896578209517e-05, "loss": 0.4614, "step": 208 }, { "epoch": 2.670926517571885, "grad_norm": 0.37114888301249416, "learning_rate": 2.0984147109837564e-05, "loss": 0.4678, "step": 209 }, { "epoch": 2.68370607028754, "grad_norm": 0.4503766328517245, "learning_rate": 2.0805318802188307e-05, "loss": 0.4359, "step": 210 }, { "epoch": 2.696485623003195, "grad_norm": 0.38120015554672837, "learning_rate": 2.0626425981061608e-05, "loss": 0.4686, "step": 211 }, { "epoch": 2.70926517571885, "grad_norm": 0.3372670994794854, "learning_rate": 2.0447482977425465e-05, "loss": 0.4494, "step": 212 }, { "epoch": 2.722044728434505, "grad_norm": 0.4119808251995014, "learning_rate": 2.0268504126267952e-05, "loss": 0.4618, "step": 213 }, { "epoch": 2.7348242811501597, "grad_norm": 0.35007361397043135, "learning_rate": 2.008950376544887e-05, "loss": 0.4501, "step": 214 }, { "epoch": 2.747603833865815, "grad_norm": 0.3482131118710567, "learning_rate": 1.9910496234551132e-05, "loss": 0.4712, "step": 215 }, { "epoch": 2.7603833865814695, "grad_norm": 0.36098504969060924, "learning_rate": 1.9731495873732055e-05, "loss": 0.4462, "step": 216 }, { "epoch": 2.7731629392971247, "grad_norm": 0.38966883559748494, "learning_rate": 1.9552517022574542e-05, "loss": 0.4542, "step": 217 }, { "epoch": 2.7859424920127793, "grad_norm": 0.3345477836200502, "learning_rate": 1.93735740189384e-05, "loss": 0.4552, "step": 218 }, { "epoch": 2.7987220447284344, "grad_norm": 0.31804226071179864, "learning_rate": 1.9194681197811703e-05, "loss": 0.4618, "step": 219 }, { "epoch": 2.8115015974440896, "grad_norm": 0.34808447657242425, "learning_rate": 1.901585289016244e-05, "loss": 0.4695, "step": 220 }, { "epoch": 2.8242811501597442, "grad_norm": 0.3092928418426262, "learning_rate": 1.8837103421790486e-05, "loss": 0.4423, "step": 221 }, { "epoch": 2.8370607028753994, "grad_norm": 0.3369975841617078, "learning_rate": 1.8658447112179952e-05, "loss": 0.4422, "step": 222 }, { "epoch": 2.8498402555910545, "grad_norm": 0.34486237489642013, "learning_rate": 1.8479898273352084e-05, "loss": 0.4569, "step": 223 }, { "epoch": 2.862619808306709, "grad_norm": 0.32231034856470464, "learning_rate": 1.83014712087187e-05, "loss": 0.4619, "step": 224 }, { "epoch": 2.8753993610223643, "grad_norm": 0.3511616409676046, "learning_rate": 1.8123180211936417e-05, "loss": 0.4678, "step": 225 }, { "epoch": 2.8881789137380194, "grad_norm": 0.3116094431898499, "learning_rate": 1.794503956576152e-05, "loss": 0.452, "step": 226 }, { "epoch": 2.900958466453674, "grad_norm": 0.39241257348095804, "learning_rate": 1.776706354090585e-05, "loss": 0.4658, "step": 227 }, { "epoch": 2.913738019169329, "grad_norm": 0.3066839862034657, "learning_rate": 1.758926639489354e-05, "loss": 0.4421, "step": 228 }, { "epoch": 2.9265175718849843, "grad_norm": 0.3699344035934213, "learning_rate": 1.7411662370918893e-05, "loss": 0.4466, "step": 229 }, { "epoch": 2.939297124600639, "grad_norm": 0.3516814693478523, "learning_rate": 1.7234265696705344e-05, "loss": 0.4397, "step": 230 }, { "epoch": 2.952076677316294, "grad_norm": 0.3183618241929708, "learning_rate": 1.7057090583365678e-05, "loss": 0.4546, "step": 231 }, { "epoch": 2.9648562300319488, "grad_norm": 0.31722077213199373, "learning_rate": 1.6880151224263646e-05, "loss": 0.4526, "step": 232 }, { "epoch": 2.977635782747604, "grad_norm": 0.32780881299482323, "learning_rate": 1.6703461793876876e-05, "loss": 0.4393, "step": 233 }, { "epoch": 2.9904153354632586, "grad_norm": 0.35253714093245286, "learning_rate": 1.6527036446661396e-05, "loss": 0.4578, "step": 234 }, { "epoch": 3.0031948881789137, "grad_norm": 0.3462190522603488, "learning_rate": 1.635088931591775e-05, "loss": 0.4189, "step": 235 }, { "epoch": 3.015974440894569, "grad_norm": 0.4685540390840366, "learning_rate": 1.6175034512658753e-05, "loss": 0.3669, "step": 236 }, { "epoch": 3.0287539936102235, "grad_norm": 0.3641775630478635, "learning_rate": 1.5999486124479115e-05, "loss": 0.3889, "step": 237 }, { "epoch": 3.0415335463258786, "grad_norm": 0.5985630860170138, "learning_rate": 1.5824258214426833e-05, "loss": 0.4011, "step": 238 }, { "epoch": 3.0543130990415337, "grad_norm": 0.4602500801994949, "learning_rate": 1.5649364819876655e-05, "loss": 0.3673, "step": 239 }, { "epoch": 3.0670926517571884, "grad_norm": 0.45486062689454715, "learning_rate": 1.547481995140556e-05, "loss": 0.3711, "step": 240 }, { "epoch": 3.0798722044728435, "grad_norm": 0.49320311542655876, "learning_rate": 1.5300637591670357e-05, "loss": 0.3895, "step": 241 }, { "epoch": 3.0926517571884986, "grad_norm": 0.4521045649546727, "learning_rate": 1.5126831694287564e-05, "loss": 0.3781, "step": 242 }, { "epoch": 3.1054313099041533, "grad_norm": 0.448948101915693, "learning_rate": 1.4953416182715566e-05, "loss": 0.3944, "step": 243 }, { "epoch": 3.1182108626198084, "grad_norm": 0.4873220132706169, "learning_rate": 1.478040494913926e-05, "loss": 0.3939, "step": 244 }, { "epoch": 3.130990415335463, "grad_norm": 0.44147508802794816, "learning_rate": 1.460781185335713e-05, "loss": 0.3852, "step": 245 }, { "epoch": 3.143769968051118, "grad_norm": 0.43385527104949023, "learning_rate": 1.443565072167095e-05, "loss": 0.3671, "step": 246 }, { "epoch": 3.1565495207667733, "grad_norm": 0.4611144973784387, "learning_rate": 1.4263935345778202e-05, "loss": 0.3783, "step": 247 }, { "epoch": 3.169329073482428, "grad_norm": 0.3864931373624254, "learning_rate": 1.409267948166718e-05, "loss": 0.3954, "step": 248 }, { "epoch": 3.182108626198083, "grad_norm": 0.44744420693971776, "learning_rate": 1.3921896848515064e-05, "loss": 0.3911, "step": 249 }, { "epoch": 3.194888178913738, "grad_norm": 0.37684400568540616, "learning_rate": 1.3751601127588849e-05, "loss": 0.3864, "step": 250 }, { "epoch": 3.207667731629393, "grad_norm": 0.3796431982545521, "learning_rate": 1.3581805961149371e-05, "loss": 0.3896, "step": 251 }, { "epoch": 3.220447284345048, "grad_norm": 0.3769957541789544, "learning_rate": 1.341252495135841e-05, "loss": 0.3662, "step": 252 }, { "epoch": 3.2332268370607027, "grad_norm": 0.3536981536003253, "learning_rate": 1.324377165918906e-05, "loss": 0.3907, "step": 253 }, { "epoch": 3.246006389776358, "grad_norm": 0.3458764916465091, "learning_rate": 1.3075559603339354e-05, "loss": 0.3747, "step": 254 }, { "epoch": 3.258785942492013, "grad_norm": 0.3574613126458738, "learning_rate": 1.2907902259149287e-05, "loss": 0.388, "step": 255 }, { "epoch": 3.2715654952076676, "grad_norm": 0.33311205394265236, "learning_rate": 1.274081305752135e-05, "loss": 0.3653, "step": 256 }, { "epoch": 3.2843450479233227, "grad_norm": 0.3412095766440789, "learning_rate": 1.2574305383844528e-05, "loss": 0.3739, "step": 257 }, { "epoch": 3.297124600638978, "grad_norm": 0.31609779234124125, "learning_rate": 1.2408392576922075e-05, "loss": 0.3629, "step": 258 }, { "epoch": 3.3099041533546325, "grad_norm": 0.31319128171270894, "learning_rate": 1.2243087927902905e-05, "loss": 0.3817, "step": 259 }, { "epoch": 3.3226837060702876, "grad_norm": 0.337488566592411, "learning_rate": 1.2078404679216864e-05, "loss": 0.3495, "step": 260 }, { "epoch": 3.3354632587859427, "grad_norm": 0.3362268003840969, "learning_rate": 1.1914356023513904e-05, "loss": 0.3641, "step": 261 }, { "epoch": 3.3482428115015974, "grad_norm": 0.3251931593384129, "learning_rate": 1.1750955102607193e-05, "loss": 0.3587, "step": 262 }, { "epoch": 3.3610223642172525, "grad_norm": 0.3235766778421065, "learning_rate": 1.1588215006420374e-05, "loss": 0.3833, "step": 263 }, { "epoch": 3.373801916932907, "grad_norm": 0.3220124996724108, "learning_rate": 1.1426148771938915e-05, "loss": 0.3752, "step": 264 }, { "epoch": 3.3865814696485623, "grad_norm": 0.2951619492064331, "learning_rate": 1.1264769382165748e-05, "loss": 0.3647, "step": 265 }, { "epoch": 3.3993610223642174, "grad_norm": 0.3325399185449846, "learning_rate": 1.110408976508118e-05, "loss": 0.3748, "step": 266 }, { "epoch": 3.412140575079872, "grad_norm": 0.29609775154187334, "learning_rate": 1.094412279260726e-05, "loss": 0.3638, "step": 267 }, { "epoch": 3.4249201277955272, "grad_norm": 0.26895084636944605, "learning_rate": 1.0784881279576635e-05, "loss": 0.3846, "step": 268 }, { "epoch": 3.437699680511182, "grad_norm": 0.31292937001998616, "learning_rate": 1.0626377982705929e-05, "loss": 0.386, "step": 269 }, { "epoch": 3.450479233226837, "grad_norm": 0.28772920342792124, "learning_rate": 1.0468625599573842e-05, "loss": 0.3826, "step": 270 }, { "epoch": 3.463258785942492, "grad_norm": 0.29867691108266836, "learning_rate": 1.0311636767603952e-05, "loss": 0.3897, "step": 271 }, { "epoch": 3.476038338658147, "grad_norm": 0.31298465876789283, "learning_rate": 1.0155424063052306e-05, "loss": 0.397, "step": 272 }, { "epoch": 3.488817891373802, "grad_norm": 0.29371168883435345, "learning_rate": 1.0000000000000006e-05, "loss": 0.371, "step": 273 }, { "epoch": 3.501597444089457, "grad_norm": 0.2751312782841487, "learning_rate": 9.84537702935065e-06, "loss": 0.3627, "step": 274 }, { "epoch": 3.5143769968051117, "grad_norm": 0.29005922732820866, "learning_rate": 9.691567537832964e-06, "loss": 0.3752, "step": 275 }, { "epoch": 3.527156549520767, "grad_norm": 0.285596442518692, "learning_rate": 9.538583847008452e-06, "loss": 0.3562, "step": 276 }, { "epoch": 3.539936102236422, "grad_norm": 0.27513605379366374, "learning_rate": 9.386438212284372e-06, "loss": 0.4091, "step": 277 }, { "epoch": 3.5527156549520766, "grad_norm": 0.28110833899161064, "learning_rate": 9.235142821931928e-06, "loss": 0.3836, "step": 278 }, { "epoch": 3.5654952076677318, "grad_norm": 0.29961249529899747, "learning_rate": 9.084709796109907e-06, "loss": 0.3959, "step": 279 }, { "epoch": 3.5782747603833864, "grad_norm": 0.2897927175404746, "learning_rate": 8.93515118589373e-06, "loss": 0.385, "step": 280 }, { "epoch": 3.5910543130990416, "grad_norm": 0.27758412171226393, "learning_rate": 8.786478972310023e-06, "loss": 0.4043, "step": 281 }, { "epoch": 3.6038338658146962, "grad_norm": 0.28013334822925956, "learning_rate": 8.638705065376887e-06, "loss": 0.404, "step": 282 }, { "epoch": 3.6166134185303513, "grad_norm": 0.24987657433551447, "learning_rate": 8.491841303149728e-06, "loss": 0.4024, "step": 283 }, { "epoch": 3.6293929712460065, "grad_norm": 0.2684111365589162, "learning_rate": 8.345899450772975e-06, "loss": 0.3756, "step": 284 }, { "epoch": 3.642172523961661, "grad_norm": 0.27558752417436977, "learning_rate": 8.200891199537549e-06, "loss": 0.3759, "step": 285 }, { "epoch": 3.6549520766773163, "grad_norm": 0.25837395483010384, "learning_rate": 8.056828165944282e-06, "loss": 0.3892, "step": 286 }, { "epoch": 3.6677316293929714, "grad_norm": 0.2603889293142812, "learning_rate": 7.913721890773354e-06, "loss": 0.3755, "step": 287 }, { "epoch": 3.680511182108626, "grad_norm": 0.2815335667527747, "learning_rate": 7.771583838159756e-06, "loss": 0.3813, "step": 288 }, { "epoch": 3.693290734824281, "grad_norm": 0.26661465822337477, "learning_rate": 7.630425394674903e-06, "loss": 0.3971, "step": 289 }, { "epoch": 3.7060702875399363, "grad_norm": 0.2697574065608194, "learning_rate": 7.49025786841445e-06, "loss": 0.412, "step": 290 }, { "epoch": 3.718849840255591, "grad_norm": 0.2832734387419812, "learning_rate": 7.3510924880924575e-06, "loss": 0.3821, "step": 291 }, { "epoch": 3.731629392971246, "grad_norm": 0.27209311160455624, "learning_rate": 7.212940402141808e-06, "loss": 0.3692, "step": 292 }, { "epoch": 3.744408945686901, "grad_norm": 0.2970144837162344, "learning_rate": 7.075812677821145e-06, "loss": 0.378, "step": 293 }, { "epoch": 3.757188498402556, "grad_norm": 0.2585279833642156, "learning_rate": 6.939720300328303e-06, "loss": 0.3974, "step": 294 }, { "epoch": 3.769968051118211, "grad_norm": 0.2618419301777819, "learning_rate": 6.8046741719202385e-06, "loss": 0.3766, "step": 295 }, { "epoch": 3.7827476038338657, "grad_norm": 0.27076657962623973, "learning_rate": 6.67068511103971e-06, "loss": 0.3522, "step": 296 }, { "epoch": 3.7955271565495208, "grad_norm": 0.30173491128425356, "learning_rate": 6.537763851448593e-06, "loss": 0.3841, "step": 297 }, { "epoch": 3.8083067092651754, "grad_norm": 0.268313196160291, "learning_rate": 6.4059210413680175e-06, "loss": 0.3901, "step": 298 }, { "epoch": 3.8210862619808306, "grad_norm": 0.2640300480450196, "learning_rate": 6.275167242625331e-06, "loss": 0.3695, "step": 299 }, { "epoch": 3.8338658146964857, "grad_norm": 0.3014666623894041, "learning_rate": 6.145512929808013e-06, "loss": 0.3967, "step": 300 }, { "epoch": 3.8466453674121404, "grad_norm": 0.28752502042804456, "learning_rate": 6.016968489424572e-06, "loss": 0.3862, "step": 301 }, { "epoch": 3.8594249201277955, "grad_norm": 0.24186414715253984, "learning_rate": 5.889544219072465e-06, "loss": 0.392, "step": 302 }, { "epoch": 3.8722044728434506, "grad_norm": 0.25051017394914027, "learning_rate": 5.7632503266131925e-06, "loss": 0.3914, "step": 303 }, { "epoch": 3.8849840255591053, "grad_norm": 0.2699791212783972, "learning_rate": 5.638096929354522e-06, "loss": 0.3887, "step": 304 }, { "epoch": 3.8977635782747604, "grad_norm": 0.2858548684927234, "learning_rate": 5.514094053240035e-06, "loss": 0.3719, "step": 305 }, { "epoch": 3.9105431309904155, "grad_norm": 0.2652951522437298, "learning_rate": 5.39125163204594e-06, "loss": 0.3927, "step": 306 }, { "epoch": 3.92332268370607, "grad_norm": 0.2467682860470358, "learning_rate": 5.269579506585259e-06, "loss": 0.391, "step": 307 }, { "epoch": 3.9361022364217253, "grad_norm": 0.2828893085043367, "learning_rate": 5.149087423919541e-06, "loss": 0.3908, "step": 308 }, { "epoch": 3.9488817891373804, "grad_norm": 0.2717440146156137, "learning_rate": 5.029785036577976e-06, "loss": 0.3944, "step": 309 }, { "epoch": 3.961661341853035, "grad_norm": 0.25572320238260987, "learning_rate": 4.911681901784198e-06, "loss": 0.3653, "step": 310 }, { "epoch": 3.97444089456869, "grad_norm": 0.2522678532110314, "learning_rate": 4.794787480690597e-06, "loss": 0.3603, "step": 311 }, { "epoch": 3.987220447284345, "grad_norm": 0.2620988944897116, "learning_rate": 4.679111137620442e-06, "loss": 0.3792, "step": 312 }, { "epoch": 4.0, "grad_norm": 0.25092164609980083, "learning_rate": 4.5646621393177e-06, "loss": 0.3784, "step": 313 }, { "epoch": 4.012779552715655, "grad_norm": 0.4284345645389324, "learning_rate": 4.451449654204685e-06, "loss": 0.3544, "step": 314 }, { "epoch": 4.02555910543131, "grad_norm": 0.33050487677235896, "learning_rate": 4.339482751647557e-06, "loss": 0.3385, "step": 315 }, { "epoch": 4.038338658146965, "grad_norm": 0.2539302254984621, "learning_rate": 4.228770401229824e-06, "loss": 0.3548, "step": 316 }, { "epoch": 4.05111821086262, "grad_norm": 0.30210855193785546, "learning_rate": 4.119321472033779e-06, "loss": 0.3442, "step": 317 }, { "epoch": 4.063897763578275, "grad_norm": 0.437426611959635, "learning_rate": 4.011144731929981e-06, "loss": 0.3316, "step": 318 }, { "epoch": 4.07667731629393, "grad_norm": 0.3834924536155654, "learning_rate": 3.904248846874894e-06, "loss": 0.3477, "step": 319 }, { "epoch": 4.0894568690095845, "grad_norm": 0.28425967790184686, "learning_rate": 3.7986423802166705e-06, "loss": 0.3305, "step": 320 }, { "epoch": 4.102236421725239, "grad_norm": 0.28438148650744755, "learning_rate": 3.694333792009115e-06, "loss": 0.3379, "step": 321 }, { "epoch": 4.115015974440895, "grad_norm": 0.3005261887308328, "learning_rate": 3.5913314383339937e-06, "loss": 0.3397, "step": 322 }, { "epoch": 4.127795527156549, "grad_norm": 0.31421247064279384, "learning_rate": 3.4896435706316e-06, "loss": 0.3538, "step": 323 }, { "epoch": 4.140575079872204, "grad_norm": 0.2915742354040909, "learning_rate": 3.3892783350397675e-06, "loss": 0.3396, "step": 324 }, { "epoch": 4.15335463258786, "grad_norm": 0.33400589433205863, "learning_rate": 3.290243771741275e-06, "loss": 0.3413, "step": 325 }, { "epoch": 4.166134185303514, "grad_norm": 0.2807238540611646, "learning_rate": 3.1925478143197418e-06, "loss": 0.3408, "step": 326 }, { "epoch": 4.178913738019169, "grad_norm": 0.28570208458872476, "learning_rate": 3.0961982891241083e-06, "loss": 0.3368, "step": 327 }, { "epoch": 4.1916932907348246, "grad_norm": 0.2826646747314574, "learning_rate": 3.001202914641628e-06, "loss": 0.3354, "step": 328 }, { "epoch": 4.204472843450479, "grad_norm": 0.2825600024949671, "learning_rate": 2.907569300879596e-06, "loss": 0.3674, "step": 329 }, { "epoch": 4.217252396166134, "grad_norm": 0.28082696724891204, "learning_rate": 2.815304948755664e-06, "loss": 0.3281, "step": 330 }, { "epoch": 4.2300319488817895, "grad_norm": 0.27511892458287074, "learning_rate": 2.7244172494969978e-06, "loss": 0.314, "step": 331 }, { "epoch": 4.242811501597444, "grad_norm": 0.25711276536625566, "learning_rate": 2.6349134840481294e-06, "loss": 0.345, "step": 332 }, { "epoch": 4.255591054313099, "grad_norm": 0.252155288439604, "learning_rate": 2.546800822487714e-06, "loss": 0.3401, "step": 333 }, { "epoch": 4.268370607028754, "grad_norm": 0.25219633970213434, "learning_rate": 2.4600863234541338e-06, "loss": 0.3365, "step": 334 }, { "epoch": 4.281150159744409, "grad_norm": 0.2562662877844303, "learning_rate": 2.374776933580025e-06, "loss": 0.3366, "step": 335 }, { "epoch": 4.293929712460064, "grad_norm": 0.2546484833688915, "learning_rate": 2.2908794869358044e-06, "loss": 0.3349, "step": 336 }, { "epoch": 4.306709265175719, "grad_norm": 0.23733496012222385, "learning_rate": 2.2084007044821764e-06, "loss": 0.3404, "step": 337 }, { "epoch": 4.319488817891374, "grad_norm": 0.24011044892161978, "learning_rate": 2.127347193531757e-06, "loss": 0.3368, "step": 338 }, { "epoch": 4.332268370607029, "grad_norm": 0.23829318220252133, "learning_rate": 2.0477254472197237e-06, "loss": 0.3343, "step": 339 }, { "epoch": 4.345047923322683, "grad_norm": 0.2426599236667127, "learning_rate": 1.96954184398368e-06, "loss": 0.3276, "step": 340 }, { "epoch": 4.357827476038339, "grad_norm": 0.2484944861973328, "learning_rate": 1.8928026470526917e-06, "loss": 0.3047, "step": 341 }, { "epoch": 4.3706070287539935, "grad_norm": 0.2424524804170697, "learning_rate": 1.817514003945524e-06, "loss": 0.3552, "step": 342 }, { "epoch": 4.383386581469648, "grad_norm": 0.25069442159331295, "learning_rate": 1.743681945978184e-06, "loss": 0.3612, "step": 343 }, { "epoch": 4.396166134185304, "grad_norm": 0.22918100639116956, "learning_rate": 1.6713123877807413e-06, "loss": 0.3295, "step": 344 }, { "epoch": 4.4089456869009584, "grad_norm": 0.21979161747305265, "learning_rate": 1.6004111268235156e-06, "loss": 0.3414, "step": 345 }, { "epoch": 4.421725239616613, "grad_norm": 0.22536568922183237, "learning_rate": 1.5309838429526714e-06, "loss": 0.3593, "step": 346 }, { "epoch": 4.434504792332269, "grad_norm": 0.24312305004559837, "learning_rate": 1.4630360979351644e-06, "loss": 0.345, "step": 347 }, { "epoch": 4.447284345047923, "grad_norm": 0.2415641768402643, "learning_rate": 1.396573335013236e-06, "loss": 0.3197, "step": 348 }, { "epoch": 4.460063897763578, "grad_norm": 0.23516326374254073, "learning_rate": 1.3316008784683265e-06, "loss": 0.3189, "step": 349 }, { "epoch": 4.472843450479234, "grad_norm": 0.23995897116665635, "learning_rate": 1.2681239331945695e-06, "loss": 0.3308, "step": 350 }, { "epoch": 4.485623003194888, "grad_norm": 0.22716537217034832, "learning_rate": 1.2061475842818337e-06, "loss": 0.336, "step": 351 }, { "epoch": 4.498402555910543, "grad_norm": 0.2251715977404794, "learning_rate": 1.1456767966083393e-06, "loss": 0.323, "step": 352 }, { "epoch": 4.511182108626198, "grad_norm": 0.21217046379560392, "learning_rate": 1.086716414442952e-06, "loss": 0.3511, "step": 353 }, { "epoch": 4.523961661341853, "grad_norm": 0.2275715856942384, "learning_rate": 1.0292711610570904e-06, "loss": 0.334, "step": 354 }, { "epoch": 4.536741214057508, "grad_norm": 0.2269375692935684, "learning_rate": 9.733456383463658e-07, "loss": 0.3414, "step": 355 }, { "epoch": 4.549520766773163, "grad_norm": 0.2509889392145883, "learning_rate": 9.189443264619102e-07, "loss": 0.3324, "step": 356 }, { "epoch": 4.562300319488818, "grad_norm": 0.21468114512551226, "learning_rate": 8.660715834514977e-07, "loss": 0.318, "step": 357 }, { "epoch": 4.575079872204473, "grad_norm": 0.21547192613609828, "learning_rate": 8.147316449103959e-07, "loss": 0.348, "step": 358 }, { "epoch": 4.587859424920127, "grad_norm": 0.24384935031590274, "learning_rate": 7.649286236420806e-07, "loss": 0.3462, "step": 359 }, { "epoch": 4.600638977635783, "grad_norm": 0.23580188628562512, "learning_rate": 7.166665093287539e-07, "loss": 0.3117, "step": 360 }, { "epoch": 4.613418530351438, "grad_norm": 0.22586898999697266, "learning_rate": 6.69949168211721e-07, "loss": 0.3299, "step": 361 }, { "epoch": 4.626198083067092, "grad_norm": 0.22730350579864037, "learning_rate": 6.247803427816945e-07, "loss": 0.3311, "step": 362 }, { "epoch": 4.638977635782748, "grad_norm": 0.22205075710948494, "learning_rate": 5.811636514789598e-07, "loss": 0.3232, "step": 363 }, { "epoch": 4.651757188498403, "grad_norm": 0.2143264733489871, "learning_rate": 5.391025884035239e-07, "loss": 0.3418, "step": 364 }, { "epoch": 4.664536741214057, "grad_norm": 0.21450972921037162, "learning_rate": 4.986005230351954e-07, "loss": 0.3373, "step": 365 }, { "epoch": 4.677316293929713, "grad_norm": 0.23166261244742042, "learning_rate": 4.5966069996365993e-07, "loss": 0.3199, "step": 366 }, { "epoch": 4.6900958466453675, "grad_norm": 0.23057877798097104, "learning_rate": 4.22286238628562e-07, "loss": 0.3369, "step": 367 }, { "epoch": 4.702875399361022, "grad_norm": 0.24857643740680171, "learning_rate": 3.8648013306960664e-07, "loss": 0.3232, "step": 368 }, { "epoch": 4.715654952076678, "grad_norm": 0.22439313195655802, "learning_rate": 3.522452516867048e-07, "loss": 0.3234, "step": 369 }, { "epoch": 4.728434504792332, "grad_norm": 0.22242040680974812, "learning_rate": 3.1958433701019697e-07, "loss": 0.3481, "step": 370 }, { "epoch": 4.741214057507987, "grad_norm": 0.21764482062977858, "learning_rate": 2.8850000548115155e-07, "loss": 0.3364, "step": 371 }, { "epoch": 4.753993610223642, "grad_norm": 0.22981218307092738, "learning_rate": 2.5899474724174313e-07, "loss": 0.3364, "step": 372 }, { "epoch": 4.766773162939297, "grad_norm": 0.21610708752009206, "learning_rate": 2.3107092593579905e-07, "loss": 0.3263, "step": 373 }, { "epoch": 4.779552715654952, "grad_norm": 0.23068560833811716, "learning_rate": 2.0473077851942858e-07, "loss": 0.3228, "step": 374 }, { "epoch": 4.792332268370607, "grad_norm": 0.2255313865391087, "learning_rate": 1.799764150818306e-07, "loss": 0.3245, "step": 375 }, { "epoch": 4.805111821086262, "grad_norm": 0.21026047049313398, "learning_rate": 1.5680981867625566e-07, "loss": 0.3437, "step": 376 }, { "epoch": 4.817891373801917, "grad_norm": 0.20397671520194025, "learning_rate": 1.3523284516113955e-07, "loss": 0.3566, "step": 377 }, { "epoch": 4.830670926517572, "grad_norm": 0.24906277361128246, "learning_rate": 1.1524722305144231e-07, "loss": 0.3433, "step": 378 }, { "epoch": 4.843450479233227, "grad_norm": 0.21828952668874715, "learning_rate": 9.685455338016347e-08, "loss": 0.3329, "step": 379 }, { "epoch": 4.856230031948882, "grad_norm": 0.20690117617183224, "learning_rate": 8.005630957010014e-08, "loss": 0.3489, "step": 380 }, { "epoch": 4.8690095846645365, "grad_norm": 0.21595338341354317, "learning_rate": 6.485383731580142e-08, "loss": 0.3357, "step": 381 }, { "epoch": 4.881789137380192, "grad_norm": 0.2308320687948147, "learning_rate": 5.1248354475768034e-08, "loss": 0.3154, "step": 382 }, { "epoch": 4.894568690095847, "grad_norm": 0.2195533978640801, "learning_rate": 3.924095097489922e-08, "loss": 0.3221, "step": 383 }, { "epoch": 4.907348242811501, "grad_norm": 0.2282256383003407, "learning_rate": 2.8832588717164766e-08, "loss": 0.3369, "step": 384 }, { "epoch": 4.920127795527156, "grad_norm": 0.21269468824348317, "learning_rate": 2.0024101508555604e-08, "loss": 0.3413, "step": 385 }, { "epoch": 4.932907348242812, "grad_norm": 0.22085145490609895, "learning_rate": 1.281619499029274e-08, "loss": 0.3127, "step": 386 }, { "epoch": 4.945686900958466, "grad_norm": 0.22268335937504416, "learning_rate": 7.209446582292501e-09, "loss": 0.3254, "step": 387 }, { "epoch": 4.958466453674122, "grad_norm": 0.2203979320806084, "learning_rate": 3.2043054369057523e-09, "loss": 0.3315, "step": 388 }, { "epoch": 4.9712460063897765, "grad_norm": 0.20913122049040775, "learning_rate": 8.010924029533406e-10, "loss": 0.3561, "step": 389 }, { "epoch": 4.984025559105431, "grad_norm": 0.21749673347074377, "learning_rate": 0.0, "loss": 0.3372, "step": 390 }, { "epoch": 4.984025559105431, "step": 390, "total_flos": 1.322753791359058e+18, "train_loss": 0.48282129680499053, "train_runtime": 9620.7146, "train_samples_per_second": 5.197, "train_steps_per_second": 0.041 } ], "logging_steps": 1, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.322753791359058e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }