{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 10, "global_step": 261, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011494252873563218, "grad_norm": 0.4592016935348511, "learning_rate": 0.0, "loss": 1.0671, "step": 1 }, { "epoch": 0.022988505747126436, "grad_norm": 0.47092190384864807, "learning_rate": 7.4074074074074075e-06, "loss": 1.0786, "step": 2 }, { "epoch": 0.034482758620689655, "grad_norm": 0.4712238609790802, "learning_rate": 1.4814814814814815e-05, "loss": 1.074, "step": 3 }, { "epoch": 0.04597701149425287, "grad_norm": 0.4514904320240021, "learning_rate": 2.2222222222222223e-05, "loss": 1.0617, "step": 4 }, { "epoch": 0.05747126436781609, "grad_norm": 0.4478417932987213, "learning_rate": 2.962962962962963e-05, "loss": 1.0856, "step": 5 }, { "epoch": 0.06896551724137931, "grad_norm": 0.43826085329055786, "learning_rate": 3.7037037037037037e-05, "loss": 1.0464, "step": 6 }, { "epoch": 0.08045977011494253, "grad_norm": 0.42118313908576965, "learning_rate": 4.4444444444444447e-05, "loss": 1.0201, "step": 7 }, { "epoch": 0.09195402298850575, "grad_norm": 0.4139033257961273, "learning_rate": 5.185185185185185e-05, "loss": 0.983, "step": 8 }, { "epoch": 0.10344827586206896, "grad_norm": 0.3824421763420105, "learning_rate": 5.925925925925926e-05, "loss": 0.9627, "step": 9 }, { "epoch": 0.11494252873563218, "grad_norm": 0.33501842617988586, "learning_rate": 6.666666666666667e-05, "loss": 0.9351, "step": 10 }, { "epoch": 0.11494252873563218, "eval_loss": 0.9996559023857117, "eval_runtime": 577.4185, "eval_samples_per_second": 17.142, "eval_steps_per_second": 0.135, "step": 10 }, { "epoch": 0.12643678160919541, "grad_norm": 0.29872575402259827, "learning_rate": 7.407407407407407e-05, "loss": 0.9269, "step": 11 }, { "epoch": 0.13793103448275862, "grad_norm": 0.23556149005889893, "learning_rate": 8.148148148148148e-05, "loss": 0.8617, "step": 12 }, { "epoch": 0.14942528735632185, "grad_norm": 0.1813485026359558, "learning_rate": 8.888888888888889e-05, "loss": 0.8568, "step": 13 }, { "epoch": 0.16091954022988506, "grad_norm": 0.14335764944553375, "learning_rate": 9.62962962962963e-05, "loss": 0.8651, "step": 14 }, { "epoch": 0.1724137931034483, "grad_norm": 0.11154378205537796, "learning_rate": 0.0001037037037037037, "loss": 0.8281, "step": 15 }, { "epoch": 0.1839080459770115, "grad_norm": 0.10094304382801056, "learning_rate": 0.00011111111111111112, "loss": 0.8296, "step": 16 }, { "epoch": 0.19540229885057472, "grad_norm": 0.10502589493989944, "learning_rate": 0.00011851851851851852, "loss": 0.8494, "step": 17 }, { "epoch": 0.20689655172413793, "grad_norm": 0.11436028778553009, "learning_rate": 0.00012592592592592592, "loss": 0.8236, "step": 18 }, { "epoch": 0.21839080459770116, "grad_norm": 0.12607495486736298, "learning_rate": 0.00013333333333333334, "loss": 0.821, "step": 19 }, { "epoch": 0.22988505747126436, "grad_norm": 0.13643302023410797, "learning_rate": 0.00014074074074074076, "loss": 0.8087, "step": 20 }, { "epoch": 0.22988505747126436, "eval_loss": 0.8203959465026855, "eval_runtime": 573.1906, "eval_samples_per_second": 17.268, "eval_steps_per_second": 0.136, "step": 20 }, { "epoch": 0.2413793103448276, "grad_norm": 0.12495700269937515, "learning_rate": 0.00014814814814814815, "loss": 0.8005, "step": 21 }, { "epoch": 0.25287356321839083, "grad_norm": 0.114228755235672, "learning_rate": 0.00015555555555555556, "loss": 0.8099, "step": 22 }, { "epoch": 0.26436781609195403, "grad_norm": 0.10274926573038101, "learning_rate": 0.00016296296296296295, "loss": 0.8084, "step": 23 }, { "epoch": 0.27586206896551724, "grad_norm": 0.09217527508735657, "learning_rate": 0.00017037037037037037, "loss": 0.7786, "step": 24 }, { "epoch": 0.28735632183908044, "grad_norm": 0.07397017627954483, "learning_rate": 0.00017777777777777779, "loss": 0.794, "step": 25 }, { "epoch": 0.2988505747126437, "grad_norm": 0.05840110406279564, "learning_rate": 0.0001851851851851852, "loss": 0.7793, "step": 26 }, { "epoch": 0.3103448275862069, "grad_norm": 0.046803150326013565, "learning_rate": 0.0001925925925925926, "loss": 0.7779, "step": 27 }, { "epoch": 0.3218390804597701, "grad_norm": 0.04357853904366493, "learning_rate": 0.0002, "loss": 0.7871, "step": 28 }, { "epoch": 0.3333333333333333, "grad_norm": 0.04875590279698372, "learning_rate": 0.00019999098778567212, "loss": 0.7645, "step": 29 }, { "epoch": 0.3448275862068966, "grad_norm": 0.049262624233961105, "learning_rate": 0.00019996395276708856, "loss": 0.7724, "step": 30 }, { "epoch": 0.3448275862068966, "eval_loss": 0.7786664366722107, "eval_runtime": 579.7811, "eval_samples_per_second": 17.072, "eval_steps_per_second": 0.135, "step": 30 }, { "epoch": 0.3563218390804598, "grad_norm": 0.050246573984622955, "learning_rate": 0.00019991889981715698, "loss": 0.7634, "step": 31 }, { "epoch": 0.367816091954023, "grad_norm": 0.04701607674360275, "learning_rate": 0.00019985583705641418, "loss": 0.7539, "step": 32 }, { "epoch": 0.3793103448275862, "grad_norm": 0.04324931278824806, "learning_rate": 0.00019977477585156252, "loss": 0.7649, "step": 33 }, { "epoch": 0.39080459770114945, "grad_norm": 0.03931243345141411, "learning_rate": 0.00019967573081342103, "loss": 0.7698, "step": 34 }, { "epoch": 0.40229885057471265, "grad_norm": 0.03256700560450554, "learning_rate": 0.0001995587197942919, "loss": 0.7316, "step": 35 }, { "epoch": 0.41379310344827586, "grad_norm": 0.03164197504520416, "learning_rate": 0.0001994237638847428, "loss": 0.7694, "step": 36 }, { "epoch": 0.42528735632183906, "grad_norm": 0.032558053731918335, "learning_rate": 0.0001992708874098054, "loss": 0.7664, "step": 37 }, { "epoch": 0.4367816091954023, "grad_norm": 0.03510342538356781, "learning_rate": 0.00019910011792459087, "loss": 0.7676, "step": 38 }, { "epoch": 0.4482758620689655, "grad_norm": 0.03961511328816414, "learning_rate": 0.00019891148620932318, "loss": 0.7492, "step": 39 }, { "epoch": 0.45977011494252873, "grad_norm": 0.04138309136033058, "learning_rate": 0.00019870502626379127, "loss": 0.7386, "step": 40 }, { "epoch": 0.45977011494252873, "eval_loss": 0.7544006705284119, "eval_runtime": 573.7182, "eval_samples_per_second": 17.252, "eval_steps_per_second": 0.136, "step": 40 }, { "epoch": 0.47126436781609193, "grad_norm": 0.040192749351263046, "learning_rate": 0.00019848077530122083, "loss": 0.7328, "step": 41 }, { "epoch": 0.4827586206896552, "grad_norm": 0.03378569707274437, "learning_rate": 0.00019823877374156647, "loss": 0.7705, "step": 42 }, { "epoch": 0.4942528735632184, "grad_norm": 0.03214777261018753, "learning_rate": 0.00019797906520422677, "loss": 0.7501, "step": 43 }, { "epoch": 0.5057471264367817, "grad_norm": 0.035696811974048615, "learning_rate": 0.00019770169650018172, "loss": 0.7401, "step": 44 }, { "epoch": 0.5172413793103449, "grad_norm": 0.033035166561603546, "learning_rate": 0.00019740671762355548, "loss": 0.7582, "step": 45 }, { "epoch": 0.5287356321839081, "grad_norm": 0.03163702413439751, "learning_rate": 0.0001970941817426052, "loss": 0.7223, "step": 46 }, { "epoch": 0.5402298850574713, "grad_norm": 0.027231233194470406, "learning_rate": 0.00019676414519013781, "loss": 0.7318, "step": 47 }, { "epoch": 0.5517241379310345, "grad_norm": 0.02804660238325596, "learning_rate": 0.00019641666745335624, "loss": 0.7317, "step": 48 }, { "epoch": 0.5632183908045977, "grad_norm": 0.028242677450180054, "learning_rate": 0.00019605181116313724, "loss": 0.746, "step": 49 }, { "epoch": 0.5747126436781609, "grad_norm": 0.026637552306056023, "learning_rate": 0.00019566964208274254, "loss": 0.7351, "step": 50 }, { "epoch": 0.5747126436781609, "eval_loss": 0.7382059097290039, "eval_runtime": 577.3985, "eval_samples_per_second": 17.142, "eval_steps_per_second": 0.135, "step": 50 }, { "epoch": 0.5862068965517241, "grad_norm": 0.025572704151272774, "learning_rate": 0.00019527022909596536, "loss": 0.7231, "step": 51 }, { "epoch": 0.5977011494252874, "grad_norm": 0.025118378922343254, "learning_rate": 0.00019485364419471454, "loss": 0.734, "step": 52 }, { "epoch": 0.6091954022988506, "grad_norm": 0.024563653394579887, "learning_rate": 0.00019441996246603846, "loss": 0.7154, "step": 53 }, { "epoch": 0.6206896551724138, "grad_norm": 0.02712853066623211, "learning_rate": 0.00019396926207859084, "loss": 0.7042, "step": 54 }, { "epoch": 0.632183908045977, "grad_norm": 0.026732131838798523, "learning_rate": 0.0001935016242685415, "loss": 0.7405, "step": 55 }, { "epoch": 0.6436781609195402, "grad_norm": 0.025981994345784187, "learning_rate": 0.00019301713332493386, "loss": 0.7413, "step": 56 }, { "epoch": 0.6551724137931034, "grad_norm": 0.02603537030518055, "learning_rate": 0.00019251587657449236, "loss": 0.7237, "step": 57 }, { "epoch": 0.6666666666666666, "grad_norm": 0.025980589911341667, "learning_rate": 0.00019199794436588243, "loss": 0.7189, "step": 58 }, { "epoch": 0.6781609195402298, "grad_norm": 0.027198556810617447, "learning_rate": 0.00019146343005342547, "loss": 0.7346, "step": 59 }, { "epoch": 0.6896551724137931, "grad_norm": 0.026881467550992966, "learning_rate": 0.0001909124299802724, "loss": 0.7431, "step": 60 }, { "epoch": 0.6896551724137931, "eval_loss": 0.7254042029380798, "eval_runtime": 579.1419, "eval_samples_per_second": 17.091, "eval_steps_per_second": 0.135, "step": 60 }, { "epoch": 0.7011494252873564, "grad_norm": 0.025361355394124985, "learning_rate": 0.00019034504346103823, "loss": 0.7269, "step": 61 }, { "epoch": 0.7126436781609196, "grad_norm": 0.026448143646121025, "learning_rate": 0.0001897613727639014, "loss": 0.7408, "step": 62 }, { "epoch": 0.7241379310344828, "grad_norm": 0.024644356220960617, "learning_rate": 0.0001891615230921703, "loss": 0.7216, "step": 63 }, { "epoch": 0.735632183908046, "grad_norm": 0.02577868662774563, "learning_rate": 0.000188545602565321, "loss": 0.7356, "step": 64 }, { "epoch": 0.7471264367816092, "grad_norm": 0.026606209576129913, "learning_rate": 0.00018791372219950948, "loss": 0.7292, "step": 65 }, { "epoch": 0.7586206896551724, "grad_norm": 0.02574256807565689, "learning_rate": 0.00018726599588756145, "loss": 0.7161, "step": 66 }, { "epoch": 0.7701149425287356, "grad_norm": 0.02647324465215206, "learning_rate": 0.00018660254037844388, "loss": 0.7261, "step": 67 }, { "epoch": 0.7816091954022989, "grad_norm": 0.02706373669207096, "learning_rate": 0.0001859234752562217, "loss": 0.7267, "step": 68 }, { "epoch": 0.7931034482758621, "grad_norm": 0.026033291593194008, "learning_rate": 0.00018522892291850335, "loss": 0.7193, "step": 69 }, { "epoch": 0.8045977011494253, "grad_norm": 0.026747962459921837, "learning_rate": 0.0001845190085543795, "loss": 0.7183, "step": 70 }, { "epoch": 0.8045977011494253, "eval_loss": 0.7151289582252502, "eval_runtime": 574.3831, "eval_samples_per_second": 17.232, "eval_steps_per_second": 0.136, "step": 70 }, { "epoch": 0.8160919540229885, "grad_norm": 0.02563324011862278, "learning_rate": 0.00018379386012185814, "loss": 0.7031, "step": 71 }, { "epoch": 0.8275862068965517, "grad_norm": 0.027459682896733284, "learning_rate": 0.00018305360832480117, "loss": 0.7205, "step": 72 }, { "epoch": 0.8390804597701149, "grad_norm": 0.027526043355464935, "learning_rate": 0.00018229838658936564, "loss": 0.7307, "step": 73 }, { "epoch": 0.8505747126436781, "grad_norm": 0.026806244626641273, "learning_rate": 0.00018152833103995443, "loss": 0.7103, "step": 74 }, { "epoch": 0.8620689655172413, "grad_norm": 0.02682345174252987, "learning_rate": 0.0001807435804746807, "loss": 0.7115, "step": 75 }, { "epoch": 0.8735632183908046, "grad_norm": 0.027964213863015175, "learning_rate": 0.00017994427634035015, "loss": 0.6942, "step": 76 }, { "epoch": 0.8850574712643678, "grad_norm": 0.02669668011367321, "learning_rate": 0.0001791305627069662, "loss": 0.6856, "step": 77 }, { "epoch": 0.896551724137931, "grad_norm": 0.028264308348298073, "learning_rate": 0.00017830258624176225, "loss": 0.7409, "step": 78 }, { "epoch": 0.9080459770114943, "grad_norm": 0.02897650934755802, "learning_rate": 0.00017746049618276545, "loss": 0.7354, "step": 79 }, { "epoch": 0.9195402298850575, "grad_norm": 0.028627781197428703, "learning_rate": 0.0001766044443118978, "loss": 0.711, "step": 80 }, { "epoch": 0.9195402298850575, "eval_loss": 0.706480085849762, "eval_runtime": 578.3631, "eval_samples_per_second": 17.114, "eval_steps_per_second": 0.135, "step": 80 }, { "epoch": 0.9310344827586207, "grad_norm": 0.02842467464506626, "learning_rate": 0.00017573458492761801, "loss": 0.7161, "step": 81 }, { "epoch": 0.9425287356321839, "grad_norm": 0.02772713452577591, "learning_rate": 0.00017485107481711012, "loss": 0.7062, "step": 82 }, { "epoch": 0.9540229885057471, "grad_norm": 0.028318161144852638, "learning_rate": 0.00017395407322802372, "loss": 0.7136, "step": 83 }, { "epoch": 0.9655172413793104, "grad_norm": 0.027827098965644836, "learning_rate": 0.00017304374183977033, "loss": 0.675, "step": 84 }, { "epoch": 0.9770114942528736, "grad_norm": 0.029168730601668358, "learning_rate": 0.00017212024473438147, "loss": 0.7069, "step": 85 }, { "epoch": 0.9885057471264368, "grad_norm": 0.028814272955060005, "learning_rate": 0.00017118374836693406, "loss": 0.7224, "step": 86 }, { "epoch": 1.0, "grad_norm": 0.028051795437932014, "learning_rate": 0.00017023442153554777, "loss": 0.7199, "step": 87 }, { "epoch": 1.0114942528735633, "grad_norm": 0.031848177313804626, "learning_rate": 0.00016927243535095997, "loss": 0.6927, "step": 88 }, { "epoch": 1.0229885057471264, "grad_norm": 0.029947880655527115, "learning_rate": 0.00016829796320568416, "loss": 0.6876, "step": 89 }, { "epoch": 1.0344827586206897, "grad_norm": 0.029751958325505257, "learning_rate": 0.00016731118074275704, "loss": 0.6909, "step": 90 }, { "epoch": 1.0344827586206897, "eval_loss": 0.6995226740837097, "eval_runtime": 572.7322, "eval_samples_per_second": 17.282, "eval_steps_per_second": 0.136, "step": 90 }, { "epoch": 1.0459770114942528, "grad_norm": 0.02856331877410412, "learning_rate": 0.00016631226582407952, "loss": 0.6968, "step": 91 }, { "epoch": 1.0574712643678161, "grad_norm": 0.029149049893021584, "learning_rate": 0.0001653013984983585, "loss": 0.6863, "step": 92 }, { "epoch": 1.0689655172413792, "grad_norm": 0.03049040585756302, "learning_rate": 0.00016427876096865394, "loss": 0.7024, "step": 93 }, { "epoch": 1.0804597701149425, "grad_norm": 0.030065830796957016, "learning_rate": 0.00016324453755953773, "loss": 0.6943, "step": 94 }, { "epoch": 1.0919540229885056, "grad_norm": 0.03011215664446354, "learning_rate": 0.0001621989146838704, "loss": 0.669, "step": 95 }, { "epoch": 1.103448275862069, "grad_norm": 0.03143610432744026, "learning_rate": 0.00016114208080920123, "loss": 0.687, "step": 96 }, { "epoch": 1.1149425287356323, "grad_norm": 0.030098220333456993, "learning_rate": 0.0001600742264237979, "loss": 0.6912, "step": 97 }, { "epoch": 1.1264367816091954, "grad_norm": 0.030134093016386032, "learning_rate": 0.00015899554400231232, "loss": 0.6944, "step": 98 }, { "epoch": 1.1379310344827587, "grad_norm": 0.02956290729343891, "learning_rate": 0.0001579062279710879, "loss": 0.6569, "step": 99 }, { "epoch": 1.1494252873563218, "grad_norm": 0.03142748400568962, "learning_rate": 0.00015680647467311557, "loss": 0.6893, "step": 100 }, { "epoch": 1.1494252873563218, "eval_loss": 0.6938582062721252, "eval_runtime": 573.2391, "eval_samples_per_second": 17.267, "eval_steps_per_second": 0.136, "step": 100 }, { "epoch": 1.160919540229885, "grad_norm": 0.03098444640636444, "learning_rate": 0.00015569648233264394, "loss": 0.6919, "step": 101 }, { "epoch": 1.1724137931034484, "grad_norm": 0.03099571168422699, "learning_rate": 0.00015457645101945046, "loss": 0.6851, "step": 102 }, { "epoch": 1.1839080459770115, "grad_norm": 0.031869690865278244, "learning_rate": 0.0001534465826127801, "loss": 0.6826, "step": 103 }, { "epoch": 1.1954022988505748, "grad_norm": 0.031252000480890274, "learning_rate": 0.00015230708076495775, "loss": 0.6548, "step": 104 }, { "epoch": 1.206896551724138, "grad_norm": 0.033634793013334274, "learning_rate": 0.00015115815086468102, "loss": 0.6736, "step": 105 }, { "epoch": 1.2183908045977012, "grad_norm": 0.033019013702869415, "learning_rate": 0.00015000000000000001, "loss": 0.6823, "step": 106 }, { "epoch": 1.2298850574712643, "grad_norm": 0.03335465118288994, "learning_rate": 0.00014883283692099112, "loss": 0.6698, "step": 107 }, { "epoch": 1.2413793103448276, "grad_norm": 0.032002657651901245, "learning_rate": 0.0001476568720021308, "loss": 0.7054, "step": 108 }, { "epoch": 1.2528735632183907, "grad_norm": 0.03326962888240814, "learning_rate": 0.00014647231720437686, "loss": 0.6761, "step": 109 }, { "epoch": 1.264367816091954, "grad_norm": 0.03417377918958664, "learning_rate": 0.00014527938603696376, "loss": 0.6796, "step": 110 }, { "epoch": 1.264367816091954, "eval_loss": 0.6874342560768127, "eval_runtime": 579.4438, "eval_samples_per_second": 17.082, "eval_steps_per_second": 0.135, "step": 110 }, { "epoch": 1.2758620689655173, "grad_norm": 0.033368416130542755, "learning_rate": 0.00014407829351891857, "loss": 0.671, "step": 111 }, { "epoch": 1.2873563218390804, "grad_norm": 0.03217374533414841, "learning_rate": 0.00014286925614030542, "loss": 0.6834, "step": 112 }, { "epoch": 1.2988505747126438, "grad_norm": 0.03479033708572388, "learning_rate": 0.00014165249182320402, "loss": 0.6676, "step": 113 }, { "epoch": 1.3103448275862069, "grad_norm": 0.032671645283699036, "learning_rate": 0.0001404282198824305, "loss": 0.6605, "step": 114 }, { "epoch": 1.3218390804597702, "grad_norm": 0.033882081508636475, "learning_rate": 0.00013919666098600753, "loss": 0.6805, "step": 115 }, { "epoch": 1.3333333333333333, "grad_norm": 0.03380405157804489, "learning_rate": 0.00013795803711538966, "loss": 0.6844, "step": 116 }, { "epoch": 1.3448275862068966, "grad_norm": 0.0338243767619133, "learning_rate": 0.00013671257152545277, "loss": 0.6736, "step": 117 }, { "epoch": 1.3563218390804597, "grad_norm": 0.03448382019996643, "learning_rate": 0.00013546048870425356, "loss": 0.6715, "step": 118 }, { "epoch": 1.367816091954023, "grad_norm": 0.03275403752923012, "learning_rate": 0.00013420201433256689, "loss": 0.6501, "step": 119 }, { "epoch": 1.3793103448275863, "grad_norm": 0.03414244204759598, "learning_rate": 0.00013293737524320797, "loss": 0.65, "step": 120 }, { "epoch": 1.3793103448275863, "eval_loss": 0.681229293346405, "eval_runtime": 574.0613, "eval_samples_per_second": 17.242, "eval_steps_per_second": 0.136, "step": 120 }, { "epoch": 1.3908045977011494, "grad_norm": 0.03279387205839157, "learning_rate": 0.00013166679938014726, "loss": 0.6613, "step": 121 }, { "epoch": 1.4022988505747127, "grad_norm": 0.03316691517829895, "learning_rate": 0.0001303905157574247, "loss": 0.6707, "step": 122 }, { "epoch": 1.4137931034482758, "grad_norm": 0.034711454063653946, "learning_rate": 0.00012910875441787128, "loss": 0.6616, "step": 123 }, { "epoch": 1.4252873563218391, "grad_norm": 0.03387514501810074, "learning_rate": 0.0001278217463916453, "loss": 0.6846, "step": 124 }, { "epoch": 1.4367816091954024, "grad_norm": 0.033937133848667145, "learning_rate": 0.0001265297236545901, "loss": 0.6949, "step": 125 }, { "epoch": 1.4482758620689655, "grad_norm": 0.035054486244916916, "learning_rate": 0.00012523291908642217, "loss": 0.6587, "step": 126 }, { "epoch": 1.4597701149425286, "grad_norm": 0.03456171974539757, "learning_rate": 0.0001239315664287558, "loss": 0.6564, "step": 127 }, { "epoch": 1.471264367816092, "grad_norm": 0.0358516089618206, "learning_rate": 0.00012262590024297225, "loss": 0.6588, "step": 128 }, { "epoch": 1.4827586206896552, "grad_norm": 0.035148266702890396, "learning_rate": 0.0001213161558679416, "loss": 0.6799, "step": 129 }, { "epoch": 1.4942528735632183, "grad_norm": 0.03356264904141426, "learning_rate": 0.00012000256937760445, "loss": 0.6615, "step": 130 }, { "epoch": 1.4942528735632183, "eval_loss": 0.6774752140045166, "eval_runtime": 574.7453, "eval_samples_per_second": 17.222, "eval_steps_per_second": 0.136, "step": 130 }, { "epoch": 1.5057471264367817, "grad_norm": 0.03628064692020416, "learning_rate": 0.00011868537753842051, "loss": 0.6726, "step": 131 }, { "epoch": 1.5172413793103448, "grad_norm": 0.0354154035449028, "learning_rate": 0.00011736481776669306, "loss": 0.6517, "step": 132 }, { "epoch": 1.528735632183908, "grad_norm": 0.03343416005373001, "learning_rate": 0.00011604112808577603, "loss": 0.6418, "step": 133 }, { "epoch": 1.5402298850574714, "grad_norm": 0.037100158631801605, "learning_rate": 0.00011471454708317162, "loss": 0.6773, "step": 134 }, { "epoch": 1.5517241379310345, "grad_norm": 0.035807542502880096, "learning_rate": 0.00011338531386752618, "loss": 0.6815, "step": 135 }, { "epoch": 1.5632183908045976, "grad_norm": 0.036445409059524536, "learning_rate": 0.0001120536680255323, "loss": 0.6762, "step": 136 }, { "epoch": 1.5747126436781609, "grad_norm": 0.03504711762070656, "learning_rate": 0.00011071984957874479, "loss": 0.6568, "step": 137 }, { "epoch": 1.5862068965517242, "grad_norm": 0.03781822323799133, "learning_rate": 0.00010938409894031794, "loss": 0.6382, "step": 138 }, { "epoch": 1.5977011494252875, "grad_norm": 0.03475857526063919, "learning_rate": 0.00010804665687167262, "loss": 0.6805, "step": 139 }, { "epoch": 1.6091954022988506, "grad_norm": 0.0346900075674057, "learning_rate": 0.00010670776443910024, "loss": 0.6555, "step": 140 }, { "epoch": 1.6091954022988506, "eval_loss": 0.6739147305488586, "eval_runtime": 568.6275, "eval_samples_per_second": 17.407, "eval_steps_per_second": 0.137, "step": 140 }, { "epoch": 1.6206896551724137, "grad_norm": 0.035065166652202606, "learning_rate": 0.00010536766297031215, "loss": 0.6679, "step": 141 }, { "epoch": 1.632183908045977, "grad_norm": 0.03727475553750992, "learning_rate": 0.00010402659401094152, "loss": 0.6664, "step": 142 }, { "epoch": 1.6436781609195403, "grad_norm": 0.03498733788728714, "learning_rate": 0.00010268479928100614, "loss": 0.636, "step": 143 }, { "epoch": 1.6551724137931034, "grad_norm": 0.03479687497019768, "learning_rate": 0.00010134252063133975, "loss": 0.6727, "step": 144 }, { "epoch": 1.6666666666666665, "grad_norm": 0.0365520715713501, "learning_rate": 0.0001, "loss": 0.6674, "step": 145 }, { "epoch": 1.6781609195402298, "grad_norm": 0.037167515605688095, "learning_rate": 9.865747936866027e-05, "loss": 0.6816, "step": 146 }, { "epoch": 1.6896551724137931, "grad_norm": 0.034952931106090546, "learning_rate": 9.73152007189939e-05, "loss": 0.6658, "step": 147 }, { "epoch": 1.7011494252873565, "grad_norm": 0.03583049774169922, "learning_rate": 9.597340598905852e-05, "loss": 0.6753, "step": 148 }, { "epoch": 1.7126436781609196, "grad_norm": 0.03493364527821541, "learning_rate": 9.463233702968783e-05, "loss": 0.6806, "step": 149 }, { "epoch": 1.7241379310344827, "grad_norm": 0.0362800695002079, "learning_rate": 9.329223556089975e-05, "loss": 0.6522, "step": 150 }, { "epoch": 1.7241379310344827, "eval_loss": 0.6713330745697021, "eval_runtime": 567.6983, "eval_samples_per_second": 17.435, "eval_steps_per_second": 0.137, "step": 150 }, { "epoch": 1.735632183908046, "grad_norm": 0.036763522773981094, "learning_rate": 9.195334312832742e-05, "loss": 0.6606, "step": 151 }, { "epoch": 1.7471264367816093, "grad_norm": 0.035170070827007294, "learning_rate": 9.061590105968208e-05, "loss": 0.6671, "step": 152 }, { "epoch": 1.7586206896551724, "grad_norm": 0.036646973341703415, "learning_rate": 8.928015042125523e-05, "loss": 0.6459, "step": 153 }, { "epoch": 1.7701149425287355, "grad_norm": 0.03579595685005188, "learning_rate": 8.79463319744677e-05, "loss": 0.6684, "step": 154 }, { "epoch": 1.7816091954022988, "grad_norm": 0.03661736845970154, "learning_rate": 8.661468613247387e-05, "loss": 0.6593, "step": 155 }, { "epoch": 1.793103448275862, "grad_norm": 0.03629940375685692, "learning_rate": 8.528545291682838e-05, "loss": 0.6674, "step": 156 }, { "epoch": 1.8045977011494254, "grad_norm": 0.03455725312232971, "learning_rate": 8.395887191422397e-05, "loss": 0.6555, "step": 157 }, { "epoch": 1.8160919540229885, "grad_norm": 0.037991754710674286, "learning_rate": 8.263518223330697e-05, "loss": 0.6591, "step": 158 }, { "epoch": 1.8275862068965516, "grad_norm": 0.03966759145259857, "learning_rate": 8.131462246157953e-05, "loss": 0.6464, "step": 159 }, { "epoch": 1.839080459770115, "grad_norm": 0.03660488501191139, "learning_rate": 7.999743062239557e-05, "loss": 0.6545, "step": 160 }, { "epoch": 1.839080459770115, "eval_loss": 0.6687291264533997, "eval_runtime": 572.4645, "eval_samples_per_second": 17.29, "eval_steps_per_second": 0.136, "step": 160 }, { "epoch": 1.8505747126436782, "grad_norm": 0.036139026284217834, "learning_rate": 7.868384413205842e-05, "loss": 0.6612, "step": 161 }, { "epoch": 1.8620689655172413, "grad_norm": 0.03675035014748573, "learning_rate": 7.73740997570278e-05, "loss": 0.6566, "step": 162 }, { "epoch": 1.8735632183908046, "grad_norm": 0.03498603403568268, "learning_rate": 7.606843357124426e-05, "loss": 0.6498, "step": 163 }, { "epoch": 1.8850574712643677, "grad_norm": 0.03681657835841179, "learning_rate": 7.476708091357782e-05, "loss": 0.6582, "step": 164 }, { "epoch": 1.896551724137931, "grad_norm": 0.034847062081098557, "learning_rate": 7.347027634540993e-05, "loss": 0.6421, "step": 165 }, { "epoch": 1.9080459770114944, "grad_norm": 0.036876097321510315, "learning_rate": 7.217825360835473e-05, "loss": 0.6553, "step": 166 }, { "epoch": 1.9195402298850575, "grad_norm": 0.036966923624277115, "learning_rate": 7.089124558212871e-05, "loss": 0.6587, "step": 167 }, { "epoch": 1.9310344827586206, "grad_norm": 0.03830999508500099, "learning_rate": 6.960948424257532e-05, "loss": 0.6646, "step": 168 }, { "epoch": 1.9425287356321839, "grad_norm": 0.0375300832092762, "learning_rate": 6.833320061985277e-05, "loss": 0.6669, "step": 169 }, { "epoch": 1.9540229885057472, "grad_norm": 0.03690570220351219, "learning_rate": 6.706262475679205e-05, "loss": 0.648, "step": 170 }, { "epoch": 1.9540229885057472, "eval_loss": 0.6668276786804199, "eval_runtime": 573.28, "eval_samples_per_second": 17.266, "eval_steps_per_second": 0.136, "step": 170 }, { "epoch": 1.9655172413793105, "grad_norm": 0.03528446704149246, "learning_rate": 6.579798566743314e-05, "loss": 0.6329, "step": 171 }, { "epoch": 1.9770114942528736, "grad_norm": 0.03669628128409386, "learning_rate": 6.453951129574644e-05, "loss": 0.6645, "step": 172 }, { "epoch": 1.9885057471264367, "grad_norm": 0.0366462841629982, "learning_rate": 6.328742847454724e-05, "loss": 0.6529, "step": 173 }, { "epoch": 2.0, "grad_norm": 0.03732901066541672, "learning_rate": 6.204196288461037e-05, "loss": 0.6619, "step": 174 }, { "epoch": 2.0114942528735633, "grad_norm": 0.03835076466202736, "learning_rate": 6.080333901399251e-05, "loss": 0.6569, "step": 175 }, { "epoch": 2.0229885057471266, "grad_norm": 0.0394546240568161, "learning_rate": 5.957178011756952e-05, "loss": 0.6406, "step": 176 }, { "epoch": 2.0344827586206895, "grad_norm": 0.037555113434791565, "learning_rate": 5.834750817679606e-05, "loss": 0.6171, "step": 177 }, { "epoch": 2.045977011494253, "grad_norm": 0.036582354456186295, "learning_rate": 5.713074385969457e-05, "loss": 0.6391, "step": 178 }, { "epoch": 2.057471264367816, "grad_norm": 0.036117665469646454, "learning_rate": 5.59217064810814e-05, "loss": 0.6474, "step": 179 }, { "epoch": 2.0689655172413794, "grad_norm": 0.037194106727838516, "learning_rate": 5.472061396303629e-05, "loss": 0.6285, "step": 180 }, { "epoch": 2.0689655172413794, "eval_loss": 0.6662968993186951, "eval_runtime": 571.1587, "eval_samples_per_second": 17.33, "eval_steps_per_second": 0.137, "step": 180 }, { "epoch": 2.0804597701149423, "grad_norm": 0.0378338024020195, "learning_rate": 5.3527682795623146e-05, "loss": 0.6484, "step": 181 }, { "epoch": 2.0919540229885056, "grad_norm": 0.04033705219626427, "learning_rate": 5.234312799786921e-05, "loss": 0.6571, "step": 182 }, { "epoch": 2.103448275862069, "grad_norm": 0.03822549805045128, "learning_rate": 5.116716307900893e-05, "loss": 0.6489, "step": 183 }, { "epoch": 2.1149425287356323, "grad_norm": 0.0367862842977047, "learning_rate": 5.000000000000002e-05, "loss": 0.651, "step": 184 }, { "epoch": 2.1264367816091956, "grad_norm": 0.03618647903203964, "learning_rate": 4.884184913531902e-05, "loss": 0.6395, "step": 185 }, { "epoch": 2.1379310344827585, "grad_norm": 0.036637451499700546, "learning_rate": 4.7692919235042255e-05, "loss": 0.6348, "step": 186 }, { "epoch": 2.1494252873563218, "grad_norm": 0.03728080913424492, "learning_rate": 4.6553417387219886e-05, "loss": 0.6213, "step": 187 }, { "epoch": 2.160919540229885, "grad_norm": 0.037123698741197586, "learning_rate": 4.542354898054953e-05, "loss": 0.6391, "step": 188 }, { "epoch": 2.1724137931034484, "grad_norm": 0.03694508224725723, "learning_rate": 4.430351766735609e-05, "loss": 0.6576, "step": 189 }, { "epoch": 2.1839080459770113, "grad_norm": 0.036252494901418686, "learning_rate": 4.3193525326884435e-05, "loss": 0.6652, "step": 190 }, { "epoch": 2.1839080459770113, "eval_loss": 0.6655236482620239, "eval_runtime": 576.431, "eval_samples_per_second": 17.171, "eval_steps_per_second": 0.135, "step": 190 }, { "epoch": 2.1954022988505746, "grad_norm": 0.038629233837127686, "learning_rate": 4.209377202891212e-05, "loss": 0.6373, "step": 191 }, { "epoch": 2.206896551724138, "grad_norm": 0.0367644838988781, "learning_rate": 4.100445599768774e-05, "loss": 0.6414, "step": 192 }, { "epoch": 2.218390804597701, "grad_norm": 0.037642017006874084, "learning_rate": 3.99257735762021e-05, "loss": 0.6596, "step": 193 }, { "epoch": 2.2298850574712645, "grad_norm": 0.03667798638343811, "learning_rate": 3.885791919079878e-05, "loss": 0.6049, "step": 194 }, { "epoch": 2.2413793103448274, "grad_norm": 0.0369441919028759, "learning_rate": 3.7801085316129615e-05, "loss": 0.6302, "step": 195 }, { "epoch": 2.2528735632183907, "grad_norm": 0.03800995275378227, "learning_rate": 3.675546244046228e-05, "loss": 0.653, "step": 196 }, { "epoch": 2.264367816091954, "grad_norm": 0.03765873983502388, "learning_rate": 3.5721239031346066e-05, "loss": 0.6467, "step": 197 }, { "epoch": 2.2758620689655173, "grad_norm": 0.03664061799645424, "learning_rate": 3.469860150164152e-05, "loss": 0.6383, "step": 198 }, { "epoch": 2.2873563218390807, "grad_norm": 0.03744902089238167, "learning_rate": 3.36877341759205e-05, "loss": 0.629, "step": 199 }, { "epoch": 2.2988505747126435, "grad_norm": 0.036028388887643814, "learning_rate": 3.268881925724297e-05, "loss": 0.6307, "step": 200 }, { "epoch": 2.2988505747126435, "eval_loss": 0.6646614074707031, "eval_runtime": 573.3596, "eval_samples_per_second": 17.263, "eval_steps_per_second": 0.136, "step": 200 }, { "epoch": 2.310344827586207, "grad_norm": 0.037410344928503036, "learning_rate": 3.170203679431584e-05, "loss": 0.6327, "step": 201 }, { "epoch": 2.32183908045977, "grad_norm": 0.036954365670681, "learning_rate": 3.072756464904006e-05, "loss": 0.6359, "step": 202 }, { "epoch": 2.3333333333333335, "grad_norm": 0.03856337442994118, "learning_rate": 2.976557846445225e-05, "loss": 0.6485, "step": 203 }, { "epoch": 2.344827586206897, "grad_norm": 0.03655731678009033, "learning_rate": 2.881625163306596e-05, "loss": 0.6402, "step": 204 }, { "epoch": 2.3563218390804597, "grad_norm": 0.036423757672309875, "learning_rate": 2.7879755265618555e-05, "loss": 0.6327, "step": 205 }, { "epoch": 2.367816091954023, "grad_norm": 0.038285087794065475, "learning_rate": 2.6956258160229695e-05, "loss": 0.622, "step": 206 }, { "epoch": 2.3793103448275863, "grad_norm": 0.03574738651514053, "learning_rate": 2.6045926771976303e-05, "loss": 0.6387, "step": 207 }, { "epoch": 2.3908045977011496, "grad_norm": 0.03681504353880882, "learning_rate": 2.514892518288988e-05, "loss": 0.6238, "step": 208 }, { "epoch": 2.4022988505747125, "grad_norm": 0.03897320479154587, "learning_rate": 2.4265415072382016e-05, "loss": 0.6294, "step": 209 }, { "epoch": 2.413793103448276, "grad_norm": 0.038258329033851624, "learning_rate": 2.339555568810221e-05, "loss": 0.6383, "step": 210 }, { "epoch": 2.413793103448276, "eval_loss": 0.664122462272644, "eval_runtime": 571.8284, "eval_samples_per_second": 17.309, "eval_steps_per_second": 0.136, "step": 210 }, { "epoch": 2.425287356321839, "grad_norm": 0.03809202462434769, "learning_rate": 2.2539503817234553e-05, "loss": 0.651, "step": 211 }, { "epoch": 2.4367816091954024, "grad_norm": 0.0384032279253006, "learning_rate": 2.1697413758237784e-05, "loss": 0.634, "step": 212 }, { "epoch": 2.4482758620689653, "grad_norm": 0.037987615913152695, "learning_rate": 2.0869437293033835e-05, "loss": 0.626, "step": 213 }, { "epoch": 2.4597701149425286, "grad_norm": 0.03702289238572121, "learning_rate": 2.0055723659649904e-05, "loss": 0.6321, "step": 214 }, { "epoch": 2.471264367816092, "grad_norm": 0.0384170301258564, "learning_rate": 1.9256419525319313e-05, "loss": 0.644, "step": 215 }, { "epoch": 2.4827586206896552, "grad_norm": 0.03760548681020737, "learning_rate": 1.8471668960045574e-05, "loss": 0.6242, "step": 216 }, { "epoch": 2.4942528735632186, "grad_norm": 0.037029929459095, "learning_rate": 1.7701613410634365e-05, "loss": 0.6525, "step": 217 }, { "epoch": 2.5057471264367814, "grad_norm": 0.037944190204143524, "learning_rate": 1.6946391675198836e-05, "loss": 0.6026, "step": 218 }, { "epoch": 2.5172413793103448, "grad_norm": 0.03844432905316353, "learning_rate": 1.620613987814189e-05, "loss": 0.6285, "step": 219 }, { "epoch": 2.528735632183908, "grad_norm": 0.037583839148283005, "learning_rate": 1.5480991445620542e-05, "loss": 0.6394, "step": 220 }, { "epoch": 2.528735632183908, "eval_loss": 0.6636479496955872, "eval_runtime": 573.1498, "eval_samples_per_second": 17.269, "eval_steps_per_second": 0.136, "step": 220 }, { "epoch": 2.5402298850574714, "grad_norm": 0.03711175173521042, "learning_rate": 1.4771077081496654e-05, "loss": 0.6399, "step": 221 }, { "epoch": 2.5517241379310347, "grad_norm": 0.038552869111299515, "learning_rate": 1.4076524743778319e-05, "loss": 0.6201, "step": 222 }, { "epoch": 2.5632183908045976, "grad_norm": 0.036754295229911804, "learning_rate": 1.339745962155613e-05, "loss": 0.618, "step": 223 }, { "epoch": 2.574712643678161, "grad_norm": 0.03787172585725784, "learning_rate": 1.2734004112438568e-05, "loss": 0.6531, "step": 224 }, { "epoch": 2.586206896551724, "grad_norm": 0.037051599472761154, "learning_rate": 1.2086277800490554e-05, "loss": 0.6364, "step": 225 }, { "epoch": 2.5977011494252875, "grad_norm": 0.037667158991098404, "learning_rate": 1.1454397434679021e-05, "loss": 0.6222, "step": 226 }, { "epoch": 2.609195402298851, "grad_norm": 0.037177689373493195, "learning_rate": 1.083847690782972e-05, "loss": 0.6453, "step": 227 }, { "epoch": 2.6206896551724137, "grad_norm": 0.03695126622915268, "learning_rate": 1.0238627236098619e-05, "loss": 0.6613, "step": 228 }, { "epoch": 2.632183908045977, "grad_norm": 0.03670082986354828, "learning_rate": 9.65495653896179e-06, "loss": 0.6172, "step": 229 }, { "epoch": 2.6436781609195403, "grad_norm": 0.03760894760489464, "learning_rate": 9.08757001972762e-06, "loss": 0.632, "step": 230 }, { "epoch": 2.6436781609195403, "eval_loss": 0.6632330417633057, "eval_runtime": 570.3685, "eval_samples_per_second": 17.354, "eval_steps_per_second": 0.137, "step": 230 }, { "epoch": 2.655172413793103, "grad_norm": 0.03777291253209114, "learning_rate": 8.536569946574546e-06, "loss": 0.6496, "step": 231 }, { "epoch": 2.6666666666666665, "grad_norm": 0.03962570056319237, "learning_rate": 8.002055634117578e-06, "loss": 0.6426, "step": 232 }, { "epoch": 2.67816091954023, "grad_norm": 0.03831165283918381, "learning_rate": 7.4841234255076495e-06, "loss": 0.6352, "step": 233 }, { "epoch": 2.689655172413793, "grad_norm": 0.036861978471279144, "learning_rate": 6.9828666750661795e-06, "loss": 0.6339, "step": 234 }, { "epoch": 2.7011494252873565, "grad_norm": 0.03647362440824509, "learning_rate": 6.498375731458528e-06, "loss": 0.6415, "step": 235 }, { "epoch": 2.7126436781609193, "grad_norm": 0.03739969804883003, "learning_rate": 6.030737921409169e-06, "loss": 0.6339, "step": 236 }, { "epoch": 2.7241379310344827, "grad_norm": 0.038087863475084305, "learning_rate": 5.580037533961546e-06, "loss": 0.6152, "step": 237 }, { "epoch": 2.735632183908046, "grad_norm": 0.03658927232027054, "learning_rate": 5.146355805285452e-06, "loss": 0.6227, "step": 238 }, { "epoch": 2.7471264367816093, "grad_norm": 0.03749408572912216, "learning_rate": 4.729770904034647e-06, "loss": 0.6368, "step": 239 }, { "epoch": 2.7586206896551726, "grad_norm": 0.03648856654763222, "learning_rate": 4.3303579172574885e-06, "loss": 0.6416, "step": 240 }, { "epoch": 2.7586206896551726, "eval_loss": 0.6631439328193665, "eval_runtime": 571.9746, "eval_samples_per_second": 17.305, "eval_steps_per_second": 0.136, "step": 240 }, { "epoch": 2.7701149425287355, "grad_norm": 0.03831519931554794, "learning_rate": 3.948188836862776e-06, "loss": 0.6332, "step": 241 }, { "epoch": 2.781609195402299, "grad_norm": 0.037160731852054596, "learning_rate": 3.5833325466437694e-06, "loss": 0.6429, "step": 242 }, { "epoch": 2.793103448275862, "grad_norm": 0.03674718737602234, "learning_rate": 3.2358548098621932e-06, "loss": 0.6149, "step": 243 }, { "epoch": 2.8045977011494254, "grad_norm": 0.03740057721734047, "learning_rate": 2.905818257394799e-06, "loss": 0.6468, "step": 244 }, { "epoch": 2.8160919540229887, "grad_norm": 0.03783690929412842, "learning_rate": 2.5932823764445392e-06, "loss": 0.6319, "step": 245 }, { "epoch": 2.8275862068965516, "grad_norm": 0.0375899076461792, "learning_rate": 2.2983034998182997e-06, "loss": 0.6093, "step": 246 }, { "epoch": 2.839080459770115, "grad_norm": 0.03821846470236778, "learning_rate": 2.0209347957732328e-06, "loss": 0.6433, "step": 247 }, { "epoch": 2.8505747126436782, "grad_norm": 0.038088761270046234, "learning_rate": 1.7612262584335237e-06, "loss": 0.6335, "step": 248 }, { "epoch": 2.862068965517241, "grad_norm": 0.03848041221499443, "learning_rate": 1.5192246987791981e-06, "loss": 0.6285, "step": 249 }, { "epoch": 2.873563218390805, "grad_norm": 0.037073150277137756, "learning_rate": 1.2949737362087156e-06, "loss": 0.6228, "step": 250 }, { "epoch": 2.873563218390805, "eval_loss": 0.6631085276603699, "eval_runtime": 567.4402, "eval_samples_per_second": 17.443, "eval_steps_per_second": 0.137, "step": 250 }, { "epoch": 2.8850574712643677, "grad_norm": 0.03836345672607422, "learning_rate": 1.0885137906768372e-06, "loss": 0.6338, "step": 251 }, { "epoch": 2.896551724137931, "grad_norm": 0.037081461399793625, "learning_rate": 8.998820754091531e-07, "loss": 0.6277, "step": 252 }, { "epoch": 2.9080459770114944, "grad_norm": 0.03707776963710785, "learning_rate": 7.291125901946027e-07, "loss": 0.6186, "step": 253 }, { "epoch": 2.9195402298850572, "grad_norm": 0.03680020570755005, "learning_rate": 5.762361152572115e-07, "loss": 0.6428, "step": 254 }, { "epoch": 2.9310344827586206, "grad_norm": 0.03804980218410492, "learning_rate": 4.412802057081278e-07, "loss": 0.6311, "step": 255 }, { "epoch": 2.942528735632184, "grad_norm": 0.03696369379758835, "learning_rate": 3.2426918657900704e-07, "loss": 0.6268, "step": 256 }, { "epoch": 2.954022988505747, "grad_norm": 0.03842842951416969, "learning_rate": 2.2522414843748618e-07, "loss": 0.6289, "step": 257 }, { "epoch": 2.9655172413793105, "grad_norm": 0.037010688334703445, "learning_rate": 1.4416294358582384e-07, "loss": 0.6176, "step": 258 }, { "epoch": 2.9770114942528734, "grad_norm": 0.03919333964586258, "learning_rate": 8.110018284304133e-08, "loss": 0.6537, "step": 259 }, { "epoch": 2.9885057471264367, "grad_norm": 0.03734417259693146, "learning_rate": 3.60472329114625e-08, "loss": 0.6316, "step": 260 }, { "epoch": 2.9885057471264367, "eval_loss": 0.6630376577377319, "eval_runtime": 564.5729, "eval_samples_per_second": 17.532, "eval_steps_per_second": 0.138, "step": 260 }, { "epoch": 3.0, "grad_norm": 0.03720829635858536, "learning_rate": 9.012214327897006e-09, "loss": 0.6399, "step": 261 }, { "epoch": 3.0, "step": 261, "total_flos": 4.884710203742539e+19, "train_loss": 0.696434152765749, "train_runtime": 65574.4997, "train_samples_per_second": 4.075, "train_steps_per_second": 0.004 } ], "logging_steps": 1.0, "max_steps": 261, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.884710203742539e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }