| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 280, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0035714285714285713, | |
| "grad_norm": 1.5478915956248045, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 1.0625, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.007142857142857143, | |
| "grad_norm": 1.2398301518605641, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.9553, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.010714285714285714, | |
| "grad_norm": 1.4076387212655408, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.9978, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.014285714285714285, | |
| "grad_norm": 1.1985949378391438, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.9609, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.017857142857142856, | |
| "grad_norm": 1.0848575200944988, | |
| "learning_rate": 1.1111111111111113e-05, | |
| "loss": 0.8853, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.02142857142857143, | |
| "grad_norm": 1.092467471543203, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.8464, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 1.0846562083072122, | |
| "learning_rate": 1.555555555555556e-05, | |
| "loss": 0.874, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 1.3053964029480514, | |
| "learning_rate": 1.7777777777777777e-05, | |
| "loss": 0.7936, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.03214285714285714, | |
| "grad_norm": 1.1269639729113965, | |
| "learning_rate": 2e-05, | |
| "loss": 0.759, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.03571428571428571, | |
| "grad_norm": 1.0980236614653736, | |
| "learning_rate": 1.9999328066483867e-05, | |
| "loss": 0.6741, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.039285714285714285, | |
| "grad_norm": 1.0582210049961431, | |
| "learning_rate": 1.9997312356234385e-05, | |
| "loss": 0.6477, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.04285714285714286, | |
| "grad_norm": 1.0424068026905615, | |
| "learning_rate": 1.999395314013622e-05, | |
| "loss": 0.5562, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.04642857142857143, | |
| "grad_norm": 0.8719364890870912, | |
| "learning_rate": 1.998925086962334e-05, | |
| "loss": 0.5305, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.9328118755011154, | |
| "learning_rate": 1.998320617661839e-05, | |
| "loss": 0.4832, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.05357142857142857, | |
| "grad_norm": 0.8896750885959509, | |
| "learning_rate": 1.997581987344772e-05, | |
| "loss": 0.4814, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.9183801340039138, | |
| "learning_rate": 1.9967092952732266e-05, | |
| "loss": 0.454, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.060714285714285714, | |
| "grad_norm": 0.9241597122472689, | |
| "learning_rate": 1.9957026587254136e-05, | |
| "loss": 0.435, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.06428571428571428, | |
| "grad_norm": 0.7126203238381043, | |
| "learning_rate": 1.9945622129799e-05, | |
| "loss": 0.3851, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.06785714285714285, | |
| "grad_norm": 0.7693946450410322, | |
| "learning_rate": 1.9932881112974298e-05, | |
| "loss": 0.4033, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.705561753707911, | |
| "learning_rate": 1.9918805249003272e-05, | |
| "loss": 0.3177, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.6571310295289079, | |
| "learning_rate": 1.9903396429494882e-05, | |
| "loss": 0.3059, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.07857142857142857, | |
| "grad_norm": 0.6584713658862125, | |
| "learning_rate": 1.9886656725189575e-05, | |
| "loss": 0.2787, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.08214285714285714, | |
| "grad_norm": 0.648454577238481, | |
| "learning_rate": 1.9868588385681035e-05, | |
| "loss": 0.2697, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.5254043510761319, | |
| "learning_rate": 1.9849193839113833e-05, | |
| "loss": 0.2226, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.08928571428571429, | |
| "grad_norm": 0.5976203976124158, | |
| "learning_rate": 1.9828475691857148e-05, | |
| "loss": 0.2385, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.09285714285714286, | |
| "grad_norm": 0.6244066856806556, | |
| "learning_rate": 1.9806436728154484e-05, | |
| "loss": 0.2964, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.09642857142857143, | |
| "grad_norm": 0.5460606128839673, | |
| "learning_rate": 1.9783079909749516e-05, | |
| "loss": 0.2288, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.5409068847524089, | |
| "learning_rate": 1.975840837548807e-05, | |
| "loss": 0.2217, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.10357142857142858, | |
| "grad_norm": 0.6639598979647965, | |
| "learning_rate": 1.9732425440896298e-05, | |
| "loss": 0.2921, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.10714285714285714, | |
| "grad_norm": 0.5344411293579373, | |
| "learning_rate": 1.9705134597735113e-05, | |
| "loss": 0.2367, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.11071428571428571, | |
| "grad_norm": 0.4141908954409791, | |
| "learning_rate": 1.9676539513530967e-05, | |
| "loss": 0.1812, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.49622835670272675, | |
| "learning_rate": 1.9646644031082948e-05, | |
| "loss": 0.1948, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.11785714285714285, | |
| "grad_norm": 0.5789972585167017, | |
| "learning_rate": 1.9615452167946383e-05, | |
| "loss": 0.2479, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.12142857142857143, | |
| "grad_norm": 0.5251274398921059, | |
| "learning_rate": 1.958296811589293e-05, | |
| "loss": 0.2377, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.5521223727466371, | |
| "learning_rate": 1.954919624034725e-05, | |
| "loss": 0.2499, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.12857142857142856, | |
| "grad_norm": 0.5108768018324138, | |
| "learning_rate": 1.951414107980036e-05, | |
| "loss": 0.1877, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.13214285714285715, | |
| "grad_norm": 0.5178181270876622, | |
| "learning_rate": 1.9477807345199717e-05, | |
| "loss": 0.1935, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.1357142857142857, | |
| "grad_norm": 0.47933474414651667, | |
| "learning_rate": 1.9440199919316125e-05, | |
| "loss": 0.1634, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.1392857142857143, | |
| "grad_norm": 0.5896140031438339, | |
| "learning_rate": 1.9401323856087573e-05, | |
| "loss": 0.2086, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.6499970611340742, | |
| "learning_rate": 1.936118437994003e-05, | |
| "loss": 0.2311, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.14642857142857144, | |
| "grad_norm": 0.5503129704249362, | |
| "learning_rate": 1.9319786885085366e-05, | |
| "loss": 0.2033, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.5337877909960611, | |
| "learning_rate": 1.927713693479643e-05, | |
| "loss": 0.1645, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.15357142857142858, | |
| "grad_norm": 0.4975309449048533, | |
| "learning_rate": 1.923324026065944e-05, | |
| "loss": 0.1892, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.15714285714285714, | |
| "grad_norm": 0.4344636172747461, | |
| "learning_rate": 1.918810276180372e-05, | |
| "loss": 0.139, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.16071428571428573, | |
| "grad_norm": 0.5162539386723686, | |
| "learning_rate": 1.9141730504108923e-05, | |
| "loss": 0.1752, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.16428571428571428, | |
| "grad_norm": 0.49245919165379515, | |
| "learning_rate": 1.9094129719389886e-05, | |
| "loss": 0.1968, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.16785714285714284, | |
| "grad_norm": 0.5235680089426945, | |
| "learning_rate": 1.904530680455914e-05, | |
| "loss": 0.2077, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.5517523916740913, | |
| "learning_rate": 1.8995268320767254e-05, | |
| "loss": 0.187, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.4519171907644501, | |
| "learning_rate": 1.894402099252109e-05, | |
| "loss": 0.1827, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 0.6027324398622755, | |
| "learning_rate": 1.889157170678015e-05, | |
| "loss": 0.2084, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.18214285714285713, | |
| "grad_norm": 0.41117638299222875, | |
| "learning_rate": 1.8837927512031022e-05, | |
| "loss": 0.1404, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.18571428571428572, | |
| "grad_norm": 0.5026435619169383, | |
| "learning_rate": 1.8783095617340193e-05, | |
| "loss": 0.1759, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.18928571428571428, | |
| "grad_norm": 0.34112043740605, | |
| "learning_rate": 1.872708339138522e-05, | |
| "loss": 0.1177, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.19285714285714287, | |
| "grad_norm": 0.3492186764282914, | |
| "learning_rate": 1.866989836146449e-05, | |
| "loss": 0.1209, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.19642857142857142, | |
| "grad_norm": 0.44324599063346565, | |
| "learning_rate": 1.861154821248565e-05, | |
| "loss": 0.1446, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.4871710493545156, | |
| "learning_rate": 1.8552040785932846e-05, | |
| "loss": 0.149, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.20357142857142857, | |
| "grad_norm": 0.4464914558136398, | |
| "learning_rate": 1.849138407881296e-05, | |
| "loss": 0.183, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.20714285714285716, | |
| "grad_norm": 0.4537236556618241, | |
| "learning_rate": 1.8429586242580884e-05, | |
| "loss": 0.1592, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.21071428571428572, | |
| "grad_norm": 0.3735622979036519, | |
| "learning_rate": 1.8366655582044096e-05, | |
| "loss": 0.0911, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.5501752604946842, | |
| "learning_rate": 1.83026005542466e-05, | |
| "loss": 0.1912, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.21785714285714286, | |
| "grad_norm": 0.3534827565812762, | |
| "learning_rate": 1.8237429767332407e-05, | |
| "loss": 0.1352, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.22142857142857142, | |
| "grad_norm": 0.3631713691054233, | |
| "learning_rate": 1.8171151979388715e-05, | |
| "loss": 0.1125, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.4581951469644329, | |
| "learning_rate": 1.8103776097268942e-05, | |
| "loss": 0.1831, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.29589456760327454, | |
| "learning_rate": 1.803531117539577e-05, | |
| "loss": 0.1142, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.23214285714285715, | |
| "grad_norm": 0.321656217445022, | |
| "learning_rate": 1.7965766414544328e-05, | |
| "loss": 0.1071, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2357142857142857, | |
| "grad_norm": 0.3696948128162081, | |
| "learning_rate": 1.7895151160605758e-05, | |
| "loss": 0.1645, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.2392857142857143, | |
| "grad_norm": 0.40287093773295857, | |
| "learning_rate": 1.782347490333123e-05, | |
| "loss": 0.153, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.24285714285714285, | |
| "grad_norm": 0.402551556493297, | |
| "learning_rate": 1.775074727505667e-05, | |
| "loss": 0.1558, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.24642857142857144, | |
| "grad_norm": 0.36576671510472414, | |
| "learning_rate": 1.7676978049408262e-05, | |
| "loss": 0.1319, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.3910560569712591, | |
| "learning_rate": 1.7602177139989046e-05, | |
| "loss": 0.1379, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.25357142857142856, | |
| "grad_norm": 0.464138257243692, | |
| "learning_rate": 1.7526354599046637e-05, | |
| "loss": 0.1772, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.36872626423764915, | |
| "learning_rate": 1.7449520616122344e-05, | |
| "loss": 0.1453, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.26071428571428573, | |
| "grad_norm": 0.3762147287106459, | |
| "learning_rate": 1.7371685516681825e-05, | |
| "loss": 0.1238, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.2642857142857143, | |
| "grad_norm": 0.44730344643149733, | |
| "learning_rate": 1.7292859760727493e-05, | |
| "loss": 0.0905, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.26785714285714285, | |
| "grad_norm": 0.31847467886438163, | |
| "learning_rate": 1.721305394139282e-05, | |
| "loss": 0.0962, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.2714285714285714, | |
| "grad_norm": 0.3172886015603318, | |
| "learning_rate": 1.7132278783518756e-05, | |
| "loss": 0.1095, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.39833942342886014, | |
| "learning_rate": 1.7050545142212483e-05, | |
| "loss": 0.146, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.2785714285714286, | |
| "grad_norm": 0.36256098601291475, | |
| "learning_rate": 1.696786400138859e-05, | |
| "loss": 0.1474, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.28214285714285714, | |
| "grad_norm": 0.5144037485713393, | |
| "learning_rate": 1.6884246472293018e-05, | |
| "loss": 0.1792, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.35433987050415405, | |
| "learning_rate": 1.679970379200983e-05, | |
| "loss": 0.1368, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2892857142857143, | |
| "grad_norm": 0.3775803207062038, | |
| "learning_rate": 1.6714247321951106e-05, | |
| "loss": 0.1502, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.29285714285714287, | |
| "grad_norm": 0.39879890621433334, | |
| "learning_rate": 1.6627888546330136e-05, | |
| "loss": 0.1627, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.29642857142857143, | |
| "grad_norm": 0.38927568667509344, | |
| "learning_rate": 1.654063907061807e-05, | |
| "loss": 0.1336, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.39077723911534096, | |
| "learning_rate": 1.64525106199843e-05, | |
| "loss": 0.1379, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.30357142857142855, | |
| "grad_norm": 0.3300014890302305, | |
| "learning_rate": 1.6363515037720774e-05, | |
| "loss": 0.1401, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.30714285714285716, | |
| "grad_norm": 0.3264032400951695, | |
| "learning_rate": 1.6273664283650393e-05, | |
| "loss": 0.1077, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.3107142857142857, | |
| "grad_norm": 0.28134669681829544, | |
| "learning_rate": 1.6182970432519772e-05, | |
| "loss": 0.1383, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.39912508407687053, | |
| "learning_rate": 1.609144567237658e-05, | |
| "loss": 0.134, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.31785714285714284, | |
| "grad_norm": 0.404668324497237, | |
| "learning_rate": 1.5999102302931585e-05, | |
| "loss": 0.1396, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.32142857142857145, | |
| "grad_norm": 0.5263360190906506, | |
| "learning_rate": 1.5905952733905777e-05, | |
| "loss": 0.1802, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.4080859243283811, | |
| "learning_rate": 1.5812009483362643e-05, | |
| "loss": 0.1738, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.32857142857142857, | |
| "grad_norm": 0.44589294219260617, | |
| "learning_rate": 1.5717285176025913e-05, | |
| "loss": 0.1395, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.33214285714285713, | |
| "grad_norm": 0.48805347667875654, | |
| "learning_rate": 1.5621792541582968e-05, | |
| "loss": 0.1794, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.3357142857142857, | |
| "grad_norm": 0.31713728426594406, | |
| "learning_rate": 1.552554441297413e-05, | |
| "loss": 0.1287, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.3392857142857143, | |
| "grad_norm": 0.3997312908073911, | |
| "learning_rate": 1.5428553724668103e-05, | |
| "loss": 0.1776, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.32512972197402723, | |
| "learning_rate": 1.533083351092372e-05, | |
| "loss": 0.0987, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.3464285714285714, | |
| "grad_norm": 0.25650097315752696, | |
| "learning_rate": 1.5232396904038352e-05, | |
| "loss": 0.0963, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.3890103860914231, | |
| "learning_rate": 1.5133257132583074e-05, | |
| "loss": 0.1899, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.3535714285714286, | |
| "grad_norm": 0.39434451760834605, | |
| "learning_rate": 1.503342751962493e-05, | |
| "loss": 0.1739, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.4338599110132305, | |
| "learning_rate": 1.4932921480936491e-05, | |
| "loss": 0.1273, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3607142857142857, | |
| "grad_norm": 0.41242604208869493, | |
| "learning_rate": 1.4831752523192949e-05, | |
| "loss": 0.1548, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.36428571428571427, | |
| "grad_norm": 0.3426565228545722, | |
| "learning_rate": 1.4729934242157005e-05, | |
| "loss": 0.1639, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.3678571428571429, | |
| "grad_norm": 0.4260238384505358, | |
| "learning_rate": 1.4627480320851775e-05, | |
| "loss": 0.1769, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.33738414849240705, | |
| "learning_rate": 1.4524404527721977e-05, | |
| "loss": 0.1184, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.377141868162913, | |
| "learning_rate": 1.4420720714783635e-05, | |
| "loss": 0.1557, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.37857142857142856, | |
| "grad_norm": 0.401113514602946, | |
| "learning_rate": 1.4316442815762543e-05, | |
| "loss": 0.1646, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.3821428571428571, | |
| "grad_norm": 0.3177971639559263, | |
| "learning_rate": 1.4211584844221771e-05, | |
| "loss": 0.1075, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.38571428571428573, | |
| "grad_norm": 0.34835754468144986, | |
| "learning_rate": 1.4106160891678422e-05, | |
| "loss": 0.1335, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.3892857142857143, | |
| "grad_norm": 0.30828103647375366, | |
| "learning_rate": 1.4000185125709919e-05, | |
| "loss": 0.117, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.39285714285714285, | |
| "grad_norm": 0.32849435123561, | |
| "learning_rate": 1.3893671788050073e-05, | |
| "loss": 0.1591, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3964285714285714, | |
| "grad_norm": 0.2709136842488289, | |
| "learning_rate": 1.3786635192675184e-05, | |
| "loss": 0.1201, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.5662239240435536, | |
| "learning_rate": 1.3679089723880427e-05, | |
| "loss": 0.1811, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.4035714285714286, | |
| "grad_norm": 0.3239753858152088, | |
| "learning_rate": 1.35710498343468e-05, | |
| "loss": 0.0956, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.40714285714285714, | |
| "grad_norm": 0.4577914787081394, | |
| "learning_rate": 1.3462530043198874e-05, | |
| "loss": 0.1918, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.4107142857142857, | |
| "grad_norm": 0.28893883723817343, | |
| "learning_rate": 1.3353544934053618e-05, | |
| "loss": 0.1137, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.4142857142857143, | |
| "grad_norm": 0.3379009547335347, | |
| "learning_rate": 1.324410915306055e-05, | |
| "loss": 0.1249, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.41785714285714287, | |
| "grad_norm": 0.3197292040926634, | |
| "learning_rate": 1.3134237406933493e-05, | |
| "loss": 0.1008, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.42142857142857143, | |
| "grad_norm": 0.3392432971976464, | |
| "learning_rate": 1.3023944460974183e-05, | |
| "loss": 0.121, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.35558554159597894, | |
| "learning_rate": 1.2913245137088024e-05, | |
| "loss": 0.1471, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.22493336145934734, | |
| "learning_rate": 1.2802154311792196e-05, | |
| "loss": 0.1002, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.43214285714285716, | |
| "grad_norm": 0.4223106827111893, | |
| "learning_rate": 1.2690686914216475e-05, | |
| "loss": 0.1566, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.4357142857142857, | |
| "grad_norm": 0.2813621781150817, | |
| "learning_rate": 1.2578857924096935e-05, | |
| "loss": 0.1098, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.4392857142857143, | |
| "grad_norm": 0.29947686930576417, | |
| "learning_rate": 1.2466682369762883e-05, | |
| "loss": 0.1129, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.44285714285714284, | |
| "grad_norm": 0.38423327250889927, | |
| "learning_rate": 1.2354175326117252e-05, | |
| "loss": 0.1674, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.44642857142857145, | |
| "grad_norm": 0.41759411587377016, | |
| "learning_rate": 1.2241351912610726e-05, | |
| "loss": 0.1823, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.3287906320045648, | |
| "learning_rate": 1.212822729120989e-05, | |
| "loss": 0.1394, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.45357142857142857, | |
| "grad_norm": 0.309937820975526, | |
| "learning_rate": 1.2014816664359671e-05, | |
| "loss": 0.1252, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.3300093835113784, | |
| "learning_rate": 1.190113527294032e-05, | |
| "loss": 0.1357, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.4607142857142857, | |
| "grad_norm": 0.3257793959279068, | |
| "learning_rate": 1.178719839421925e-05, | |
| "loss": 0.1226, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.4642857142857143, | |
| "grad_norm": 0.33056389711570644, | |
| "learning_rate": 1.1673021339797967e-05, | |
| "loss": 0.1529, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.46785714285714286, | |
| "grad_norm": 0.35534347520612053, | |
| "learning_rate": 1.15586194535544e-05, | |
| "loss": 0.1064, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.4714285714285714, | |
| "grad_norm": 0.22871074436318123, | |
| "learning_rate": 1.1444008109580884e-05, | |
| "loss": 0.0955, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 0.3824987433529803, | |
| "learning_rate": 1.1329202710118088e-05, | |
| "loss": 0.1612, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.4785714285714286, | |
| "grad_norm": 0.3343360434084649, | |
| "learning_rate": 1.1214218683485159e-05, | |
| "loss": 0.1187, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.48214285714285715, | |
| "grad_norm": 0.34564099117172165, | |
| "learning_rate": 1.1099071482006361e-05, | |
| "loss": 0.1485, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.3776437292793847, | |
| "learning_rate": 1.0983776579934483e-05, | |
| "loss": 0.1381, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.48928571428571427, | |
| "grad_norm": 0.5196891777136812, | |
| "learning_rate": 1.0868349471371316e-05, | |
| "loss": 0.2327, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.4928571428571429, | |
| "grad_norm": 0.28991752853377784, | |
| "learning_rate": 1.0752805668185442e-05, | |
| "loss": 0.1068, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.49642857142857144, | |
| "grad_norm": 0.3610469175255664, | |
| "learning_rate": 1.0637160697927651e-05, | |
| "loss": 0.1483, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.34237987234400713, | |
| "learning_rate": 1.0521430101744238e-05, | |
| "loss": 0.1327, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5035714285714286, | |
| "grad_norm": 0.3532653120531725, | |
| "learning_rate": 1.040562943228849e-05, | |
| "loss": 0.1531, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.5071428571428571, | |
| "grad_norm": 0.2490959381575519, | |
| "learning_rate": 1.0289774251630602e-05, | |
| "loss": 0.0943, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.5107142857142857, | |
| "grad_norm": 0.3419815025999081, | |
| "learning_rate": 1.0173880129166358e-05, | |
| "loss": 0.1199, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.27015420390524447, | |
| "learning_rate": 1.0057962639524799e-05, | |
| "loss": 0.097, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.5178571428571429, | |
| "grad_norm": 0.28947245421874257, | |
| "learning_rate": 9.942037360475205e-06, | |
| "loss": 0.0992, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.5214285714285715, | |
| "grad_norm": 0.3883015225071508, | |
| "learning_rate": 9.826119870833644e-06, | |
| "loss": 0.1522, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.8300529149522834, | |
| "learning_rate": 9.710225748369402e-06, | |
| "loss": 0.1396, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.5285714285714286, | |
| "grad_norm": 0.202016854911906, | |
| "learning_rate": 9.594370567711512e-06, | |
| "loss": 0.0854, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.5321428571428571, | |
| "grad_norm": 0.36978259812488184, | |
| "learning_rate": 9.478569898255765e-06, | |
| "loss": 0.1704, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 0.23665827589903268, | |
| "learning_rate": 9.362839302072354e-06, | |
| "loss": 0.0948, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5392857142857143, | |
| "grad_norm": 0.35039135014557515, | |
| "learning_rate": 9.247194331814561e-06, | |
| "loss": 0.1648, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.32816521513063157, | |
| "learning_rate": 9.131650528628688e-06, | |
| "loss": 0.1032, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.5464285714285714, | |
| "grad_norm": 0.32264421993745784, | |
| "learning_rate": 9.016223420065519e-06, | |
| "loss": 0.1248, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.5185778078772257, | |
| "learning_rate": 8.900928517993644e-06, | |
| "loss": 0.2034, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.5535714285714286, | |
| "grad_norm": 0.33725063318309306, | |
| "learning_rate": 8.785781316514841e-06, | |
| "loss": 0.1401, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.5571428571428572, | |
| "grad_norm": 0.34767124633241814, | |
| "learning_rate": 8.670797289881915e-06, | |
| "loss": 0.1564, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.5607142857142857, | |
| "grad_norm": 0.3055450803151196, | |
| "learning_rate": 8.555991890419116e-06, | |
| "loss": 0.1124, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.5642857142857143, | |
| "grad_norm": 0.25388977773118193, | |
| "learning_rate": 8.441380546445603e-06, | |
| "loss": 0.1081, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.5678571428571428, | |
| "grad_norm": 0.4455538403435565, | |
| "learning_rate": 8.326978660202034e-06, | |
| "loss": 0.1676, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.30228248526761664, | |
| "learning_rate": 8.212801605780754e-06, | |
| "loss": 0.1271, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 0.2544828612445528, | |
| "learning_rate": 8.098864727059685e-06, | |
| "loss": 0.1093, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.5785714285714286, | |
| "grad_norm": 0.36913988346664617, | |
| "learning_rate": 7.985183335640332e-06, | |
| "loss": 0.1516, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.5821428571428572, | |
| "grad_norm": 0.25130653502490885, | |
| "learning_rate": 7.871772708790114e-06, | |
| "loss": 0.102, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.5857142857142857, | |
| "grad_norm": 0.3482017405768576, | |
| "learning_rate": 7.758648087389277e-06, | |
| "loss": 0.1439, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.5892857142857143, | |
| "grad_norm": 0.4080868021076819, | |
| "learning_rate": 7.64582467388275e-06, | |
| "loss": 0.1594, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.5928571428571429, | |
| "grad_norm": 0.22416436168548187, | |
| "learning_rate": 7.533317630237117e-06, | |
| "loss": 0.0958, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.5964285714285714, | |
| "grad_norm": 0.3548896053394149, | |
| "learning_rate": 7.421142075903067e-06, | |
| "loss": 0.1805, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.3153835017876163, | |
| "learning_rate": 7.3093130857835245e-06, | |
| "loss": 0.1576, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.6035714285714285, | |
| "grad_norm": 0.37437814298012134, | |
| "learning_rate": 7.197845688207805e-06, | |
| "loss": 0.1845, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.6071428571428571, | |
| "grad_norm": 0.2707210792364402, | |
| "learning_rate": 7.086754862911982e-06, | |
| "loss": 0.1074, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6107142857142858, | |
| "grad_norm": 0.38773698909477633, | |
| "learning_rate": 6.976055539025819e-06, | |
| "loss": 0.1693, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.6142857142857143, | |
| "grad_norm": 0.270791960065858, | |
| "learning_rate": 6.865762593066514e-06, | |
| "loss": 0.0962, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.6178571428571429, | |
| "grad_norm": 0.33404187296938787, | |
| "learning_rate": 6.755890846939454e-06, | |
| "loss": 0.1502, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.6214285714285714, | |
| "grad_norm": 0.3169317085587855, | |
| "learning_rate": 6.646455065946386e-06, | |
| "loss": 0.1535, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.34202948851755516, | |
| "learning_rate": 6.537469956801128e-06, | |
| "loss": 0.1336, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.6285714285714286, | |
| "grad_norm": 0.22142635555607934, | |
| "learning_rate": 6.428950165653204e-06, | |
| "loss": 0.089, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.6321428571428571, | |
| "grad_norm": 0.39547548326244164, | |
| "learning_rate": 6.320910276119576e-06, | |
| "loss": 0.1622, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.6357142857142857, | |
| "grad_norm": 0.34202811717449055, | |
| "learning_rate": 6.213364807324817e-06, | |
| "loss": 0.1329, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.6392857142857142, | |
| "grad_norm": 0.24984802040484924, | |
| "learning_rate": 6.106328211949928e-06, | |
| "loss": 0.0978, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 0.2691895257848114, | |
| "learning_rate": 5.999814874290084e-06, | |
| "loss": 0.1076, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.6464285714285715, | |
| "grad_norm": 0.35670010783973816, | |
| "learning_rate": 5.893839108321584e-06, | |
| "loss": 0.1409, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.3063688782891636, | |
| "learning_rate": 5.7884151557782305e-06, | |
| "loss": 0.1111, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.6535714285714286, | |
| "grad_norm": 0.4478859628471062, | |
| "learning_rate": 5.68355718423746e-06, | |
| "loss": 0.1582, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.6571428571428571, | |
| "grad_norm": 0.4079013493850928, | |
| "learning_rate": 5.579279285216369e-06, | |
| "loss": 0.1832, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.6607142857142857, | |
| "grad_norm": 0.36684648196887143, | |
| "learning_rate": 5.4755954722780236e-06, | |
| "loss": 0.1443, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.6642857142857143, | |
| "grad_norm": 0.4666764433183556, | |
| "learning_rate": 5.372519679148227e-06, | |
| "loss": 0.1924, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.6678571428571428, | |
| "grad_norm": 0.3679225305422777, | |
| "learning_rate": 5.270065757843e-06, | |
| "loss": 0.1637, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.6714285714285714, | |
| "grad_norm": 0.5074893835770381, | |
| "learning_rate": 5.168247476807054e-06, | |
| "loss": 0.1859, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.675, | |
| "grad_norm": 0.2996145714991962, | |
| "learning_rate": 5.067078519063514e-06, | |
| "loss": 0.1343, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.6785714285714286, | |
| "grad_norm": 0.3516548476388983, | |
| "learning_rate": 4.966572480375076e-06, | |
| "loss": 0.1559, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6821428571428572, | |
| "grad_norm": 0.356885235057829, | |
| "learning_rate": 4.86674286741693e-06, | |
| "loss": 0.1475, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.6857142857142857, | |
| "grad_norm": 0.5734359096126976, | |
| "learning_rate": 4.767603095961652e-06, | |
| "loss": 0.1914, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.6892857142857143, | |
| "grad_norm": 0.40445955460343774, | |
| "learning_rate": 4.669166489076283e-06, | |
| "loss": 0.1766, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6928571428571428, | |
| "grad_norm": 0.2800652614843086, | |
| "learning_rate": 4.571446275331903e-06, | |
| "loss": 0.1188, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.6964285714285714, | |
| "grad_norm": 0.3757570796942492, | |
| "learning_rate": 4.47445558702587e-06, | |
| "loss": 0.1831, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.32608548991744285, | |
| "learning_rate": 4.378207458417035e-06, | |
| "loss": 0.1422, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.7035714285714286, | |
| "grad_norm": 0.38378600912734245, | |
| "learning_rate": 4.282714823974088e-06, | |
| "loss": 0.1512, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.7071428571428572, | |
| "grad_norm": 0.35671347574479906, | |
| "learning_rate": 4.187990516637361e-06, | |
| "loss": 0.1261, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.7107142857142857, | |
| "grad_norm": 0.30915599350731704, | |
| "learning_rate": 4.094047266094225e-06, | |
| "loss": 0.1115, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.20400134484541857, | |
| "learning_rate": 4.000897697068418e-06, | |
| "loss": 0.0755, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.7178571428571429, | |
| "grad_norm": 0.3783702453179056, | |
| "learning_rate": 3.908554327623425e-06, | |
| "loss": 0.1462, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.7214285714285714, | |
| "grad_norm": 0.39604207952130577, | |
| "learning_rate": 3.817029567480228e-06, | |
| "loss": 0.1749, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.725, | |
| "grad_norm": 0.39751469906997566, | |
| "learning_rate": 3.7263357163496118e-06, | |
| "loss": 0.1626, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.7285714285714285, | |
| "grad_norm": 0.2879746919882601, | |
| "learning_rate": 3.6364849622792262e-06, | |
| "loss": 0.1236, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.7321428571428571, | |
| "grad_norm": 0.3656185312372868, | |
| "learning_rate": 3.5474893800157005e-06, | |
| "loss": 0.1309, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.7357142857142858, | |
| "grad_norm": 0.27352675259784487, | |
| "learning_rate": 3.459360929381931e-06, | |
| "loss": 0.0926, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.7392857142857143, | |
| "grad_norm": 0.41951631132226436, | |
| "learning_rate": 3.372111453669864e-06, | |
| "loss": 0.1485, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.7428571428571429, | |
| "grad_norm": 0.31261953552147415, | |
| "learning_rate": 3.2857526780488925e-06, | |
| "loss": 0.151, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.7464285714285714, | |
| "grad_norm": 0.3142527429717002, | |
| "learning_rate": 3.2002962079901743e-06, | |
| "loss": 0.1286, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.41272553233415865, | |
| "learning_rate": 3.115753527706986e-06, | |
| "loss": 0.1484, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.7535714285714286, | |
| "grad_norm": 0.2491459925114637, | |
| "learning_rate": 3.0321359986114096e-06, | |
| "loss": 0.0886, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.7571428571428571, | |
| "grad_norm": 0.383107500116008, | |
| "learning_rate": 2.9494548577875195e-06, | |
| "loss": 0.1349, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.7607142857142857, | |
| "grad_norm": 0.3719900753257283, | |
| "learning_rate": 2.8677212164812464e-06, | |
| "loss": 0.1285, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.7642857142857142, | |
| "grad_norm": 0.36185954441920426, | |
| "learning_rate": 2.786946058607187e-06, | |
| "loss": 0.1728, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.7678571428571429, | |
| "grad_norm": 0.4357674701225695, | |
| "learning_rate": 2.70714023927251e-06, | |
| "loss": 0.1713, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.7714285714285715, | |
| "grad_norm": 0.32556051670602676, | |
| "learning_rate": 2.628314483318178e-06, | |
| "loss": 0.1506, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.775, | |
| "grad_norm": 0.3447117803793854, | |
| "learning_rate": 2.5504793838776585e-06, | |
| "loss": 0.1213, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.7785714285714286, | |
| "grad_norm": 0.4342987664730663, | |
| "learning_rate": 2.473645400953366e-06, | |
| "loss": 0.153, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.7821428571428571, | |
| "grad_norm": 0.3920281781124031, | |
| "learning_rate": 2.3978228600109564e-06, | |
| "loss": 0.1474, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 0.37630174801216276, | |
| "learning_rate": 2.323021950591743e-06, | |
| "loss": 0.1493, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7892857142857143, | |
| "grad_norm": 0.39066945140138887, | |
| "learning_rate": 2.249252724943336e-06, | |
| "loss": 0.1576, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.7928571428571428, | |
| "grad_norm": 0.3262711059595468, | |
| "learning_rate": 2.176525096668769e-06, | |
| "loss": 0.1414, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7964285714285714, | |
| "grad_norm": 0.33865138896969227, | |
| "learning_rate": 2.1048488393942455e-06, | |
| "loss": 0.1611, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.3379236752698617, | |
| "learning_rate": 2.0342335854556738e-06, | |
| "loss": 0.1344, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.8035714285714286, | |
| "grad_norm": 0.3109450803199544, | |
| "learning_rate": 1.964688824604234e-06, | |
| "loss": 0.1457, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.8071428571428572, | |
| "grad_norm": 0.2875185632580354, | |
| "learning_rate": 1.896223902731058e-06, | |
| "loss": 0.0977, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.8107142857142857, | |
| "grad_norm": 0.2347911682820601, | |
| "learning_rate": 1.8288480206112879e-06, | |
| "loss": 0.0868, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.8142857142857143, | |
| "grad_norm": 0.4121777863518364, | |
| "learning_rate": 1.7625702326675952e-06, | |
| "loss": 0.1538, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.8178571428571428, | |
| "grad_norm": 0.2912601864031465, | |
| "learning_rate": 1.6973994457534026e-06, | |
| "loss": 0.1274, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.8214285714285714, | |
| "grad_norm": 0.3815620088872596, | |
| "learning_rate": 1.6333444179559078e-06, | |
| "loss": 0.1042, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.825, | |
| "grad_norm": 0.47161502108864845, | |
| "learning_rate": 1.5704137574191202e-06, | |
| "loss": 0.1571, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.8285714285714286, | |
| "grad_norm": 0.4662370295285976, | |
| "learning_rate": 1.5086159211870445e-06, | |
| "loss": 0.1984, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.8321428571428572, | |
| "grad_norm": 0.4383207536709599, | |
| "learning_rate": 1.447959214067155e-06, | |
| "loss": 0.1796, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.8357142857142857, | |
| "grad_norm": 0.31912149691231545, | |
| "learning_rate": 1.3884517875143544e-06, | |
| "loss": 0.1585, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.8392857142857143, | |
| "grad_norm": 0.40168679552274317, | |
| "learning_rate": 1.3301016385355093e-06, | |
| "loss": 0.1865, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.8428571428571429, | |
| "grad_norm": 0.2366119414746658, | |
| "learning_rate": 1.2729166086147803e-06, | |
| "loss": 0.1045, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.8464285714285714, | |
| "grad_norm": 0.34244833431583455, | |
| "learning_rate": 1.216904382659806e-06, | |
| "loss": 0.1337, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.4468357126703566, | |
| "learning_rate": 1.1620724879689793e-06, | |
| "loss": 0.1814, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.8535714285714285, | |
| "grad_norm": 0.2949748414048653, | |
| "learning_rate": 1.1084282932198543e-06, | |
| "loss": 0.1573, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.3478075924736447, | |
| "learning_rate": 1.0559790074789134e-06, | |
| "loss": 0.1445, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.8607142857142858, | |
| "grad_norm": 0.28328715407169286, | |
| "learning_rate": 1.00473167923275e-06, | |
| "loss": 0.1063, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.8642857142857143, | |
| "grad_norm": 0.38920844855326037, | |
| "learning_rate": 9.546931954408622e-07, | |
| "loss": 0.1703, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.8678571428571429, | |
| "grad_norm": 0.5996209968463109, | |
| "learning_rate": 9.058702806101172e-07, | |
| "loss": 0.1834, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.8714285714285714, | |
| "grad_norm": 0.27975031106611575, | |
| "learning_rate": 8.582694958910809e-07, | |
| "loss": 0.1356, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 0.7648856362732316, | |
| "learning_rate": 8.118972381962853e-07, | |
| "loss": 0.2202, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.8785714285714286, | |
| "grad_norm": 0.3691758708862301, | |
| "learning_rate": 7.667597393405602e-07, | |
| "loss": 0.1237, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.8821428571428571, | |
| "grad_norm": 0.3178006502647586, | |
| "learning_rate": 7.228630652035717e-07, | |
| "loss": 0.1178, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.8857142857142857, | |
| "grad_norm": 0.384435688223958, | |
| "learning_rate": 6.802131149146374e-07, | |
| "loss": 0.1524, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.8892857142857142, | |
| "grad_norm": 0.3068368906735643, | |
| "learning_rate": 6.388156200599726e-07, | |
| "loss": 0.1247, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 0.2915652541414803, | |
| "learning_rate": 5.986761439124289e-07, | |
| "loss": 0.1264, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8964285714285715, | |
| "grad_norm": 0.4962653014147133, | |
| "learning_rate": 5.598000806838766e-07, | |
| "loss": 0.2289, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.3105670095756884, | |
| "learning_rate": 5.221926548002876e-07, | |
| "loss": 0.1692, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.9035714285714286, | |
| "grad_norm": 0.2599346187591853, | |
| "learning_rate": 4.858589201996433e-07, | |
| "loss": 0.1099, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.9071428571428571, | |
| "grad_norm": 0.37266496166460017, | |
| "learning_rate": 4.5080375965275256e-07, | |
| "loss": 0.155, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.9107142857142857, | |
| "grad_norm": 0.3523382656079408, | |
| "learning_rate": 4.1703188410707087e-07, | |
| "loss": 0.1568, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 0.3139689914742574, | |
| "learning_rate": 3.845478320536178e-07, | |
| "loss": 0.1442, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.9178571428571428, | |
| "grad_norm": 0.3886010972906401, | |
| "learning_rate": 3.5335596891705406e-07, | |
| "loss": 0.2008, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.9214285714285714, | |
| "grad_norm": 0.3016440986889294, | |
| "learning_rate": 3.2346048646903494e-07, | |
| "loss": 0.1743, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.925, | |
| "grad_norm": 0.3677684465566086, | |
| "learning_rate": 2.9486540226488556e-07, | |
| "loss": 0.1359, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 0.4386983440480599, | |
| "learning_rate": 2.6757455910370486e-07, | |
| "loss": 0.1479, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.9321428571428572, | |
| "grad_norm": 0.21979803308995374, | |
| "learning_rate": 2.41591624511931e-07, | |
| "loss": 0.0829, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.9357142857142857, | |
| "grad_norm": 0.33381820914249016, | |
| "learning_rate": 2.169200902504842e-07, | |
| "loss": 0.1453, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.9392857142857143, | |
| "grad_norm": 0.3320040694348736, | |
| "learning_rate": 1.9356327184551716e-07, | |
| "loss": 0.1123, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.9428571428571428, | |
| "grad_norm": 0.3412668141144355, | |
| "learning_rate": 1.7152430814285303e-07, | |
| "loss": 0.13, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.9464285714285714, | |
| "grad_norm": 0.3696237830621812, | |
| "learning_rate": 1.5080616088616884e-07, | |
| "loss": 0.1907, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.30494316246947967, | |
| "learning_rate": 1.3141161431896809e-07, | |
| "loss": 0.1555, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.9535714285714286, | |
| "grad_norm": 0.3189572020801009, | |
| "learning_rate": 1.1334327481042573e-07, | |
| "loss": 0.1219, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.9571428571428572, | |
| "grad_norm": 0.33361495127745605, | |
| "learning_rate": 9.660357050512158e-08, | |
| "loss": 0.1431, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.9607142857142857, | |
| "grad_norm": 0.48450511535248025, | |
| "learning_rate": 8.119475099673035e-08, | |
| "loss": 0.1928, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.9642857142857143, | |
| "grad_norm": 0.2906209996194275, | |
| "learning_rate": 6.711888702570556e-08, | |
| "loss": 0.1028, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.9678571428571429, | |
| "grad_norm": 0.3401517685437452, | |
| "learning_rate": 5.437787020100116e-08, | |
| "loss": 0.1484, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.9714285714285714, | |
| "grad_norm": 0.4092102841602329, | |
| "learning_rate": 4.297341274586475e-08, | |
| "loss": 0.1955, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.975, | |
| "grad_norm": 0.4364496819852462, | |
| "learning_rate": 3.290704726773619e-08, | |
| "loss": 0.1753, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.9785714285714285, | |
| "grad_norm": 0.3842525728886007, | |
| "learning_rate": 2.4180126552284523e-08, | |
| "loss": 0.1609, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.9821428571428571, | |
| "grad_norm": 0.43086325504903594, | |
| "learning_rate": 1.6793823381614506e-08, | |
| "loss": 0.1804, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.9857142857142858, | |
| "grad_norm": 0.3825806313174142, | |
| "learning_rate": 1.0749130376659366e-08, | |
| "loss": 0.1329, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.9892857142857143, | |
| "grad_norm": 0.3932695276283175, | |
| "learning_rate": 6.046859863781951e-09, | |
| "loss": 0.1756, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.9928571428571429, | |
| "grad_norm": 0.4324987338144805, | |
| "learning_rate": 2.6876437656153663e-09, | |
| "loss": 0.1774, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.9964285714285714, | |
| "grad_norm": 0.2871247824045713, | |
| "learning_rate": 6.719335161364804e-10, | |
| "loss": 0.1208, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.3615603880208619, | |
| "learning_rate": 0.0, | |
| "loss": 0.1365, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 280, | |
| "total_flos": 610370176024576.0, | |
| "train_loss": 0.18859978249030454, | |
| "train_runtime": 2395.0124, | |
| "train_samples_per_second": 14.959, | |
| "train_steps_per_second": 0.117 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 280, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 610370176024576.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |