diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12727 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 18123, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016553550736633008, + "grad_norm": 7.984700679779053, + "learning_rate": 4.964147821290679e-08, + "loss": 1.0634, + "step": 10 + }, + { + "epoch": 0.0033107101473266016, + "grad_norm": 6.1845703125, + "learning_rate": 1.0479867622724768e-07, + "loss": 1.1529, + "step": 20 + }, + { + "epoch": 0.004966065220989902, + "grad_norm": 7.543320655822754, + "learning_rate": 1.5995587424158854e-07, + "loss": 1.2636, + "step": 30 + }, + { + "epoch": 0.006621420294653203, + "grad_norm": 6.893922328948975, + "learning_rate": 2.1511307225592942e-07, + "loss": 1.2297, + "step": 40 + }, + { + "epoch": 0.008276775368316504, + "grad_norm": 7.185118675231934, + "learning_rate": 2.702702702702703e-07, + "loss": 1.1841, + "step": 50 + }, + { + "epoch": 0.009932130441979804, + "grad_norm": 8.388279914855957, + "learning_rate": 3.2542746828461117e-07, + "loss": 1.1145, + "step": 60 + }, + { + "epoch": 0.011587485515643106, + "grad_norm": 6.8379292488098145, + "learning_rate": 3.805846662989521e-07, + "loss": 1.1385, + "step": 70 + }, + { + "epoch": 0.013242840589306406, + "grad_norm": 8.465346336364746, + "learning_rate": 4.357418643132929e-07, + "loss": 1.1756, + "step": 80 + }, + { + "epoch": 0.014898195662969706, + "grad_norm": 6.7175493240356445, + "learning_rate": 4.908990623276337e-07, + "loss": 1.029, + "step": 90 + }, + { + "epoch": 0.01655355073663301, + "grad_norm": 4.6793212890625, + "learning_rate": 5.460562603419746e-07, + "loss": 1.1213, + "step": 100 + }, + { + "epoch": 0.01820890581029631, + "grad_norm": 4.11850643157959, + "learning_rate": 6.012134583563156e-07, + "loss": 1.0167, + "step": 110 + }, + { + "epoch": 0.01986426088395961, + "grad_norm": 6.64889669418335, + "learning_rate": 6.563706563706564e-07, + "loss": 1.0059, + "step": 120 + }, + { + "epoch": 0.02151961595762291, + "grad_norm": 3.417647123336792, + "learning_rate": 7.115278543849972e-07, + "loss": 1.0292, + "step": 130 + }, + { + "epoch": 0.023174971031286212, + "grad_norm": 3.204617738723755, + "learning_rate": 7.666850523993381e-07, + "loss": 0.9241, + "step": 140 + }, + { + "epoch": 0.02483032610494951, + "grad_norm": 5.174026966094971, + "learning_rate": 8.21842250413679e-07, + "loss": 0.9826, + "step": 150 + }, + { + "epoch": 0.026485681178612813, + "grad_norm": 5.526364803314209, + "learning_rate": 8.769994484280199e-07, + "loss": 0.9245, + "step": 160 + }, + { + "epoch": 0.028141036252276114, + "grad_norm": 5.8991570472717285, + "learning_rate": 9.321566464423607e-07, + "loss": 0.9536, + "step": 170 + }, + { + "epoch": 0.029796391325939413, + "grad_norm": 4.1310248374938965, + "learning_rate": 9.873138444567016e-07, + "loss": 0.9059, + "step": 180 + }, + { + "epoch": 0.031451746399602715, + "grad_norm": 4.935130596160889, + "learning_rate": 1.0424710424710426e-06, + "loss": 0.9174, + "step": 190 + }, + { + "epoch": 0.03310710147326602, + "grad_norm": 3.474487066268921, + "learning_rate": 1.0976282404853834e-06, + "loss": 0.9133, + "step": 200 + }, + { + "epoch": 0.03476245654692932, + "grad_norm": 10.657366752624512, + "learning_rate": 1.1527854384997244e-06, + "loss": 0.7568, + "step": 210 + }, + { + "epoch": 0.03641781162059262, + "grad_norm": 3.7891452312469482, + "learning_rate": 1.2079426365140651e-06, + "loss": 0.8258, + "step": 220 + }, + { + "epoch": 0.038073166694255915, + "grad_norm": 5.292945861816406, + "learning_rate": 1.263099834528406e-06, + "loss": 0.784, + "step": 230 + }, + { + "epoch": 0.03972852176791922, + "grad_norm": 4.9213690757751465, + "learning_rate": 1.3182570325427469e-06, + "loss": 0.7955, + "step": 240 + }, + { + "epoch": 0.04138387684158252, + "grad_norm": 4.905694484710693, + "learning_rate": 1.3734142305570876e-06, + "loss": 0.877, + "step": 250 + }, + { + "epoch": 0.04303923191524582, + "grad_norm": 3.1677496433258057, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.8612, + "step": 260 + }, + { + "epoch": 0.04469458698890912, + "grad_norm": 3.0269603729248047, + "learning_rate": 1.4837286265857694e-06, + "loss": 0.821, + "step": 270 + }, + { + "epoch": 0.046349942062572425, + "grad_norm": 3.246804714202881, + "learning_rate": 1.5388858246001104e-06, + "loss": 0.8345, + "step": 280 + }, + { + "epoch": 0.04800529713623572, + "grad_norm": 3.3569605350494385, + "learning_rate": 1.5940430226144514e-06, + "loss": 0.7517, + "step": 290 + }, + { + "epoch": 0.04966065220989902, + "grad_norm": 3.597111701965332, + "learning_rate": 1.6492002206287921e-06, + "loss": 0.8384, + "step": 300 + }, + { + "epoch": 0.05131600728356232, + "grad_norm": 3.895134925842285, + "learning_rate": 1.7043574186431331e-06, + "loss": 0.8339, + "step": 310 + }, + { + "epoch": 0.052971362357225625, + "grad_norm": 4.427375316619873, + "learning_rate": 1.7595146166574739e-06, + "loss": 0.784, + "step": 320 + }, + { + "epoch": 0.05462671743088893, + "grad_norm": 3.222888708114624, + "learning_rate": 1.8146718146718149e-06, + "loss": 0.7882, + "step": 330 + }, + { + "epoch": 0.05628207250455223, + "grad_norm": 3.1282763481140137, + "learning_rate": 1.8698290126861556e-06, + "loss": 0.7937, + "step": 340 + }, + { + "epoch": 0.05793742757821553, + "grad_norm": 4.77147102355957, + "learning_rate": 1.9249862107004966e-06, + "loss": 0.7097, + "step": 350 + }, + { + "epoch": 0.059592782651878826, + "grad_norm": 4.3493571281433105, + "learning_rate": 1.9801434087148376e-06, + "loss": 0.6705, + "step": 360 + }, + { + "epoch": 0.06124813772554213, + "grad_norm": 3.3635661602020264, + "learning_rate": 2.035300606729178e-06, + "loss": 0.8396, + "step": 370 + }, + { + "epoch": 0.06290349279920543, + "grad_norm": 4.846203804016113, + "learning_rate": 2.090457804743519e-06, + "loss": 0.7266, + "step": 380 + }, + { + "epoch": 0.06455884787286872, + "grad_norm": 4.919671058654785, + "learning_rate": 2.14561500275786e-06, + "loss": 0.6616, + "step": 390 + }, + { + "epoch": 0.06621420294653203, + "grad_norm": 3.959589958190918, + "learning_rate": 2.200772200772201e-06, + "loss": 0.7379, + "step": 400 + }, + { + "epoch": 0.06786955802019533, + "grad_norm": 4.111237049102783, + "learning_rate": 2.255929398786542e-06, + "loss": 0.7201, + "step": 410 + }, + { + "epoch": 0.06952491309385864, + "grad_norm": 4.0857343673706055, + "learning_rate": 2.3110865968008826e-06, + "loss": 0.7806, + "step": 420 + }, + { + "epoch": 0.07118026816752193, + "grad_norm": 2.696610689163208, + "learning_rate": 2.3662437948152236e-06, + "loss": 0.7176, + "step": 430 + }, + { + "epoch": 0.07283562324118524, + "grad_norm": 3.1696786880493164, + "learning_rate": 2.4214009928295646e-06, + "loss": 0.7062, + "step": 440 + }, + { + "epoch": 0.07449097831484854, + "grad_norm": 4.226884365081787, + "learning_rate": 2.4765581908439056e-06, + "loss": 0.7227, + "step": 450 + }, + { + "epoch": 0.07614633338851183, + "grad_norm": 2.852306842803955, + "learning_rate": 2.531715388858246e-06, + "loss": 0.7354, + "step": 460 + }, + { + "epoch": 0.07780168846217514, + "grad_norm": 3.600233316421509, + "learning_rate": 2.5868725868725867e-06, + "loss": 0.7057, + "step": 470 + }, + { + "epoch": 0.07945704353583843, + "grad_norm": 6.20136833190918, + "learning_rate": 2.642029784886928e-06, + "loss": 0.713, + "step": 480 + }, + { + "epoch": 0.08111239860950174, + "grad_norm": 5.960463047027588, + "learning_rate": 2.6971869829012687e-06, + "loss": 0.7262, + "step": 490 + }, + { + "epoch": 0.08276775368316504, + "grad_norm": 2.7353596687316895, + "learning_rate": 2.7523441809156096e-06, + "loss": 0.7249, + "step": 500 + }, + { + "epoch": 0.08442310875682833, + "grad_norm": 2.9528822898864746, + "learning_rate": 2.8075013789299506e-06, + "loss": 0.7158, + "step": 510 + }, + { + "epoch": 0.08607846383049164, + "grad_norm": 3.323730230331421, + "learning_rate": 2.8626585769442916e-06, + "loss": 0.704, + "step": 520 + }, + { + "epoch": 0.08773381890415494, + "grad_norm": 3.2658655643463135, + "learning_rate": 2.917815774958632e-06, + "loss": 0.7194, + "step": 530 + }, + { + "epoch": 0.08938917397781825, + "grad_norm": 3.035658121109009, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.6506, + "step": 540 + }, + { + "epoch": 0.09104452905148154, + "grad_norm": 3.595717668533325, + "learning_rate": 3.028130170987314e-06, + "loss": 0.7421, + "step": 550 + }, + { + "epoch": 0.09269988412514485, + "grad_norm": 4.5476202964782715, + "learning_rate": 3.083287369001655e-06, + "loss": 0.704, + "step": 560 + }, + { + "epoch": 0.09435523919880814, + "grad_norm": 3.4544177055358887, + "learning_rate": 3.1384445670159957e-06, + "loss": 0.6521, + "step": 570 + }, + { + "epoch": 0.09601059427247144, + "grad_norm": 3.8915786743164062, + "learning_rate": 3.193601765030337e-06, + "loss": 0.6756, + "step": 580 + }, + { + "epoch": 0.09766594934613475, + "grad_norm": 4.146472930908203, + "learning_rate": 3.2487589630446776e-06, + "loss": 0.7626, + "step": 590 + }, + { + "epoch": 0.09932130441979804, + "grad_norm": 3.272199869155884, + "learning_rate": 3.3039161610590186e-06, + "loss": 0.7363, + "step": 600 + }, + { + "epoch": 0.10097665949346135, + "grad_norm": 2.5908915996551514, + "learning_rate": 3.359073359073359e-06, + "loss": 0.649, + "step": 610 + }, + { + "epoch": 0.10263201456712465, + "grad_norm": 3.507157325744629, + "learning_rate": 3.4142305570877006e-06, + "loss": 0.7292, + "step": 620 + }, + { + "epoch": 0.10428736964078796, + "grad_norm": 3.2821595668792725, + "learning_rate": 3.469387755102041e-06, + "loss": 0.7024, + "step": 630 + }, + { + "epoch": 0.10594272471445125, + "grad_norm": 3.930805206298828, + "learning_rate": 3.524544953116382e-06, + "loss": 0.7038, + "step": 640 + }, + { + "epoch": 0.10759807978811455, + "grad_norm": 2.831280469894409, + "learning_rate": 3.5797021511307227e-06, + "loss": 0.6795, + "step": 650 + }, + { + "epoch": 0.10925343486177785, + "grad_norm": 2.831242799758911, + "learning_rate": 3.634859349145064e-06, + "loss": 0.6852, + "step": 660 + }, + { + "epoch": 0.11090878993544115, + "grad_norm": 2.9769375324249268, + "learning_rate": 3.6900165471594046e-06, + "loss": 0.6547, + "step": 670 + }, + { + "epoch": 0.11256414500910446, + "grad_norm": 2.967987060546875, + "learning_rate": 3.745173745173745e-06, + "loss": 0.7238, + "step": 680 + }, + { + "epoch": 0.11421950008276775, + "grad_norm": 3.29638671875, + "learning_rate": 3.800330943188086e-06, + "loss": 0.6848, + "step": 690 + }, + { + "epoch": 0.11587485515643106, + "grad_norm": 4.40482759475708, + "learning_rate": 3.855488141202427e-06, + "loss": 0.673, + "step": 700 + }, + { + "epoch": 0.11753021023009436, + "grad_norm": 3.206632375717163, + "learning_rate": 3.910645339216768e-06, + "loss": 0.6148, + "step": 710 + }, + { + "epoch": 0.11918556530375765, + "grad_norm": 3.0711138248443604, + "learning_rate": 3.965802537231109e-06, + "loss": 0.6717, + "step": 720 + }, + { + "epoch": 0.12084092037742096, + "grad_norm": 3.105243444442749, + "learning_rate": 4.02095973524545e-06, + "loss": 0.5857, + "step": 730 + }, + { + "epoch": 0.12249627545108425, + "grad_norm": 4.079644680023193, + "learning_rate": 4.07611693325979e-06, + "loss": 0.6482, + "step": 740 + }, + { + "epoch": 0.12415163052474756, + "grad_norm": 3.890226364135742, + "learning_rate": 4.131274131274132e-06, + "loss": 0.5871, + "step": 750 + }, + { + "epoch": 0.12580698559841086, + "grad_norm": 2.5968997478485107, + "learning_rate": 4.186431329288472e-06, + "loss": 0.6882, + "step": 760 + }, + { + "epoch": 0.12746234067207415, + "grad_norm": 2.7869668006896973, + "learning_rate": 4.241588527302813e-06, + "loss": 0.6667, + "step": 770 + }, + { + "epoch": 0.12911769574573745, + "grad_norm": 3.2500760555267334, + "learning_rate": 4.296745725317154e-06, + "loss": 0.6381, + "step": 780 + }, + { + "epoch": 0.13077305081940077, + "grad_norm": 3.1198246479034424, + "learning_rate": 4.351902923331495e-06, + "loss": 0.6226, + "step": 790 + }, + { + "epoch": 0.13242840589306407, + "grad_norm": 3.8074584007263184, + "learning_rate": 4.407060121345836e-06, + "loss": 0.6221, + "step": 800 + }, + { + "epoch": 0.13408376096672736, + "grad_norm": 2.311058521270752, + "learning_rate": 4.462217319360177e-06, + "loss": 0.6789, + "step": 810 + }, + { + "epoch": 0.13573911604039066, + "grad_norm": 2.830004930496216, + "learning_rate": 4.517374517374517e-06, + "loss": 0.6512, + "step": 820 + }, + { + "epoch": 0.13739447111405395, + "grad_norm": 3.7663865089416504, + "learning_rate": 4.572531715388859e-06, + "loss": 0.7057, + "step": 830 + }, + { + "epoch": 0.13904982618771727, + "grad_norm": 2.8656136989593506, + "learning_rate": 4.627688913403199e-06, + "loss": 0.6411, + "step": 840 + }, + { + "epoch": 0.14070518126138057, + "grad_norm": 3.1035988330841064, + "learning_rate": 4.68284611141754e-06, + "loss": 0.6221, + "step": 850 + }, + { + "epoch": 0.14236053633504386, + "grad_norm": 3.384016990661621, + "learning_rate": 4.738003309431881e-06, + "loss": 0.6321, + "step": 860 + }, + { + "epoch": 0.14401589140870716, + "grad_norm": 3.3458902835845947, + "learning_rate": 4.793160507446222e-06, + "loss": 0.6163, + "step": 870 + }, + { + "epoch": 0.14567124648237048, + "grad_norm": 3.6326417922973633, + "learning_rate": 4.848317705460563e-06, + "loss": 0.6228, + "step": 880 + }, + { + "epoch": 0.14732660155603378, + "grad_norm": 3.407176971435547, + "learning_rate": 4.903474903474904e-06, + "loss": 0.675, + "step": 890 + }, + { + "epoch": 0.14898195662969707, + "grad_norm": 2.779155731201172, + "learning_rate": 4.958632101489245e-06, + "loss": 0.6416, + "step": 900 + }, + { + "epoch": 0.15063731170336037, + "grad_norm": 3.1110782623291016, + "learning_rate": 5.013789299503585e-06, + "loss": 0.5804, + "step": 910 + }, + { + "epoch": 0.15229266677702366, + "grad_norm": 3.0117592811584473, + "learning_rate": 5.068946497517927e-06, + "loss": 0.7233, + "step": 920 + }, + { + "epoch": 0.15394802185068698, + "grad_norm": 3.690901756286621, + "learning_rate": 5.124103695532267e-06, + "loss": 0.6569, + "step": 930 + }, + { + "epoch": 0.15560337692435028, + "grad_norm": 3.155195713043213, + "learning_rate": 5.179260893546608e-06, + "loss": 0.6537, + "step": 940 + }, + { + "epoch": 0.15725873199801357, + "grad_norm": 3.386868715286255, + "learning_rate": 5.234418091560949e-06, + "loss": 0.6306, + "step": 950 + }, + { + "epoch": 0.15891408707167687, + "grad_norm": 4.336483001708984, + "learning_rate": 5.28957528957529e-06, + "loss": 0.6967, + "step": 960 + }, + { + "epoch": 0.16056944214534016, + "grad_norm": 3.386204481124878, + "learning_rate": 5.344732487589631e-06, + "loss": 0.6151, + "step": 970 + }, + { + "epoch": 0.16222479721900349, + "grad_norm": 3.8770434856414795, + "learning_rate": 5.399889685603972e-06, + "loss": 0.6667, + "step": 980 + }, + { + "epoch": 0.16388015229266678, + "grad_norm": 3.62699294090271, + "learning_rate": 5.455046883618312e-06, + "loss": 0.6323, + "step": 990 + }, + { + "epoch": 0.16553550736633008, + "grad_norm": 2.923297882080078, + "learning_rate": 5.510204081632653e-06, + "loss": 0.6294, + "step": 1000 + }, + { + "epoch": 0.16719086243999337, + "grad_norm": 3.1828577518463135, + "learning_rate": 5.565361279646995e-06, + "loss": 0.6644, + "step": 1010 + }, + { + "epoch": 0.16884621751365667, + "grad_norm": 2.8371975421905518, + "learning_rate": 5.620518477661335e-06, + "loss": 0.6294, + "step": 1020 + }, + { + "epoch": 0.17050157258732, + "grad_norm": 2.7191975116729736, + "learning_rate": 5.675675675675676e-06, + "loss": 0.7093, + "step": 1030 + }, + { + "epoch": 0.17215692766098328, + "grad_norm": 2.4802534580230713, + "learning_rate": 5.730832873690016e-06, + "loss": 0.5608, + "step": 1040 + }, + { + "epoch": 0.17381228273464658, + "grad_norm": 5.264163017272949, + "learning_rate": 5.785990071704358e-06, + "loss": 0.6454, + "step": 1050 + }, + { + "epoch": 0.17546763780830987, + "grad_norm": 2.8554723262786865, + "learning_rate": 5.841147269718699e-06, + "loss": 0.6278, + "step": 1060 + }, + { + "epoch": 0.1771229928819732, + "grad_norm": 3.11765718460083, + "learning_rate": 5.896304467733039e-06, + "loss": 0.6203, + "step": 1070 + }, + { + "epoch": 0.1787783479556365, + "grad_norm": 2.88675594329834, + "learning_rate": 5.95146166574738e-06, + "loss": 0.6984, + "step": 1080 + }, + { + "epoch": 0.18043370302929979, + "grad_norm": 2.93239688873291, + "learning_rate": 6.006618863761722e-06, + "loss": 0.6054, + "step": 1090 + }, + { + "epoch": 0.18208905810296308, + "grad_norm": 3.4710001945495605, + "learning_rate": 6.061776061776062e-06, + "loss": 0.6106, + "step": 1100 + }, + { + "epoch": 0.18374441317662638, + "grad_norm": 2.3426156044006348, + "learning_rate": 6.116933259790403e-06, + "loss": 0.6256, + "step": 1110 + }, + { + "epoch": 0.1853997682502897, + "grad_norm": 5.0461602210998535, + "learning_rate": 6.172090457804743e-06, + "loss": 0.6798, + "step": 1120 + }, + { + "epoch": 0.187055123323953, + "grad_norm": 2.646273374557495, + "learning_rate": 6.227247655819085e-06, + "loss": 0.5688, + "step": 1130 + }, + { + "epoch": 0.1887104783976163, + "grad_norm": 3.764864444732666, + "learning_rate": 6.282404853833426e-06, + "loss": 0.5802, + "step": 1140 + }, + { + "epoch": 0.19036583347127958, + "grad_norm": 2.7304487228393555, + "learning_rate": 6.337562051847766e-06, + "loss": 0.6034, + "step": 1150 + }, + { + "epoch": 0.19202118854494288, + "grad_norm": 2.973935127258301, + "learning_rate": 6.392719249862107e-06, + "loss": 0.6278, + "step": 1160 + }, + { + "epoch": 0.1936765436186062, + "grad_norm": 2.750199794769287, + "learning_rate": 6.447876447876449e-06, + "loss": 0.6995, + "step": 1170 + }, + { + "epoch": 0.1953318986922695, + "grad_norm": 2.852870225906372, + "learning_rate": 6.503033645890789e-06, + "loss": 0.6436, + "step": 1180 + }, + { + "epoch": 0.1969872537659328, + "grad_norm": 3.1429359912872314, + "learning_rate": 6.55819084390513e-06, + "loss": 0.6216, + "step": 1190 + }, + { + "epoch": 0.19864260883959609, + "grad_norm": 2.8767406940460205, + "learning_rate": 6.61334804191947e-06, + "loss": 0.649, + "step": 1200 + }, + { + "epoch": 0.20029796391325938, + "grad_norm": 4.3968281745910645, + "learning_rate": 6.668505239933812e-06, + "loss": 0.626, + "step": 1210 + }, + { + "epoch": 0.2019533189869227, + "grad_norm": 2.600132465362549, + "learning_rate": 6.723662437948153e-06, + "loss": 0.6757, + "step": 1220 + }, + { + "epoch": 0.203608674060586, + "grad_norm": 3.5382864475250244, + "learning_rate": 6.778819635962493e-06, + "loss": 0.6118, + "step": 1230 + }, + { + "epoch": 0.2052640291342493, + "grad_norm": 2.8111588954925537, + "learning_rate": 6.833976833976834e-06, + "loss": 0.58, + "step": 1240 + }, + { + "epoch": 0.2069193842079126, + "grad_norm": 3.0125858783721924, + "learning_rate": 6.889134031991176e-06, + "loss": 0.6049, + "step": 1250 + }, + { + "epoch": 0.2085747392815759, + "grad_norm": 3.686331272125244, + "learning_rate": 6.944291230005516e-06, + "loss": 0.5898, + "step": 1260 + }, + { + "epoch": 0.2102300943552392, + "grad_norm": 4.72364616394043, + "learning_rate": 6.999448428019857e-06, + "loss": 0.5901, + "step": 1270 + }, + { + "epoch": 0.2118854494289025, + "grad_norm": 2.531151294708252, + "learning_rate": 7.054605626034198e-06, + "loss": 0.6588, + "step": 1280 + }, + { + "epoch": 0.2135408045025658, + "grad_norm": 3.2532570362091064, + "learning_rate": 7.109762824048539e-06, + "loss": 0.5635, + "step": 1290 + }, + { + "epoch": 0.2151961595762291, + "grad_norm": 3.614664316177368, + "learning_rate": 7.16492002206288e-06, + "loss": 0.5676, + "step": 1300 + }, + { + "epoch": 0.2168515146498924, + "grad_norm": 4.736164569854736, + "learning_rate": 7.22007722007722e-06, + "loss": 0.6029, + "step": 1310 + }, + { + "epoch": 0.2185068697235557, + "grad_norm": 3.391282320022583, + "learning_rate": 7.275234418091561e-06, + "loss": 0.6341, + "step": 1320 + }, + { + "epoch": 0.220162224797219, + "grad_norm": 3.4282443523406982, + "learning_rate": 7.330391616105902e-06, + "loss": 0.6301, + "step": 1330 + }, + { + "epoch": 0.2218175798708823, + "grad_norm": 2.516094923019409, + "learning_rate": 7.385548814120243e-06, + "loss": 0.616, + "step": 1340 + }, + { + "epoch": 0.2234729349445456, + "grad_norm": 2.89660906791687, + "learning_rate": 7.440706012134584e-06, + "loss": 0.6081, + "step": 1350 + }, + { + "epoch": 0.22512829001820892, + "grad_norm": 2.6692073345184326, + "learning_rate": 7.495863210148925e-06, + "loss": 0.5554, + "step": 1360 + }, + { + "epoch": 0.2267836450918722, + "grad_norm": 2.7826545238494873, + "learning_rate": 7.551020408163265e-06, + "loss": 0.6513, + "step": 1370 + }, + { + "epoch": 0.2284390001655355, + "grad_norm": 4.580424785614014, + "learning_rate": 7.606177606177607e-06, + "loss": 0.6532, + "step": 1380 + }, + { + "epoch": 0.2300943552391988, + "grad_norm": 2.9366021156311035, + "learning_rate": 7.661334804191947e-06, + "loss": 0.6284, + "step": 1390 + }, + { + "epoch": 0.23174971031286212, + "grad_norm": 2.872647762298584, + "learning_rate": 7.716492002206288e-06, + "loss": 0.65, + "step": 1400 + }, + { + "epoch": 0.23340506538652542, + "grad_norm": 2.3038113117218018, + "learning_rate": 7.77164920022063e-06, + "loss": 0.6274, + "step": 1410 + }, + { + "epoch": 0.2350604204601887, + "grad_norm": 2.807044744491577, + "learning_rate": 7.82680639823497e-06, + "loss": 0.5892, + "step": 1420 + }, + { + "epoch": 0.236715775533852, + "grad_norm": 2.3904168605804443, + "learning_rate": 7.881963596249311e-06, + "loss": 0.6505, + "step": 1430 + }, + { + "epoch": 0.2383711306075153, + "grad_norm": 2.934279441833496, + "learning_rate": 7.937120794263652e-06, + "loss": 0.5834, + "step": 1440 + }, + { + "epoch": 0.24002648568117863, + "grad_norm": 2.8970916271209717, + "learning_rate": 7.992277992277993e-06, + "loss": 0.6136, + "step": 1450 + }, + { + "epoch": 0.24168184075484192, + "grad_norm": 2.5647435188293457, + "learning_rate": 8.047435190292334e-06, + "loss": 0.6831, + "step": 1460 + }, + { + "epoch": 0.24333719582850522, + "grad_norm": 3.708890438079834, + "learning_rate": 8.102592388306675e-06, + "loss": 0.6018, + "step": 1470 + }, + { + "epoch": 0.2449925509021685, + "grad_norm": 3.2741858959198, + "learning_rate": 8.157749586321016e-06, + "loss": 0.5826, + "step": 1480 + }, + { + "epoch": 0.2466479059758318, + "grad_norm": 2.7993056774139404, + "learning_rate": 8.212906784335355e-06, + "loss": 0.6209, + "step": 1490 + }, + { + "epoch": 0.24830326104949513, + "grad_norm": 2.5464768409729004, + "learning_rate": 8.268063982349698e-06, + "loss": 0.6195, + "step": 1500 + }, + { + "epoch": 0.24995861612315842, + "grad_norm": 2.438197612762451, + "learning_rate": 8.323221180364039e-06, + "loss": 0.607, + "step": 1510 + }, + { + "epoch": 0.2516139711968217, + "grad_norm": 3.0494797229766846, + "learning_rate": 8.378378378378378e-06, + "loss": 0.6154, + "step": 1520 + }, + { + "epoch": 0.25326932627048504, + "grad_norm": 3.6615447998046875, + "learning_rate": 8.43353557639272e-06, + "loss": 0.6569, + "step": 1530 + }, + { + "epoch": 0.2549246813441483, + "grad_norm": 4.182124614715576, + "learning_rate": 8.488692774407062e-06, + "loss": 0.6069, + "step": 1540 + }, + { + "epoch": 0.25658003641781163, + "grad_norm": 3.966855525970459, + "learning_rate": 8.543849972421401e-06, + "loss": 0.6061, + "step": 1550 + }, + { + "epoch": 0.2582353914914749, + "grad_norm": 3.1076576709747314, + "learning_rate": 8.599007170435742e-06, + "loss": 0.5949, + "step": 1560 + }, + { + "epoch": 0.2598907465651382, + "grad_norm": 2.6115076541900635, + "learning_rate": 8.654164368450083e-06, + "loss": 0.5933, + "step": 1570 + }, + { + "epoch": 0.26154610163880154, + "grad_norm": 2.357980489730835, + "learning_rate": 8.709321566464424e-06, + "loss": 0.6573, + "step": 1580 + }, + { + "epoch": 0.2632014567124648, + "grad_norm": 2.8000941276550293, + "learning_rate": 8.764478764478765e-06, + "loss": 0.6048, + "step": 1590 + }, + { + "epoch": 0.26485681178612813, + "grad_norm": 2.700944185256958, + "learning_rate": 8.819635962493106e-06, + "loss": 0.6463, + "step": 1600 + }, + { + "epoch": 0.2665121668597914, + "grad_norm": 2.6585984230041504, + "learning_rate": 8.874793160507447e-06, + "loss": 0.6004, + "step": 1610 + }, + { + "epoch": 0.2681675219334547, + "grad_norm": 2.9849841594696045, + "learning_rate": 8.929950358521788e-06, + "loss": 0.5947, + "step": 1620 + }, + { + "epoch": 0.26982287700711804, + "grad_norm": 2.6921303272247314, + "learning_rate": 8.985107556536129e-06, + "loss": 0.6284, + "step": 1630 + }, + { + "epoch": 0.2714782320807813, + "grad_norm": 3.0850579738616943, + "learning_rate": 9.04026475455047e-06, + "loss": 0.5238, + "step": 1640 + }, + { + "epoch": 0.27313358715444463, + "grad_norm": 3.351339340209961, + "learning_rate": 9.09542195256481e-06, + "loss": 0.635, + "step": 1650 + }, + { + "epoch": 0.2747889422281079, + "grad_norm": 2.2097935676574707, + "learning_rate": 9.15057915057915e-06, + "loss": 0.6214, + "step": 1660 + }, + { + "epoch": 0.2764442973017712, + "grad_norm": 3.295842170715332, + "learning_rate": 9.205736348593493e-06, + "loss": 0.6673, + "step": 1670 + }, + { + "epoch": 0.27809965237543455, + "grad_norm": 3.469433069229126, + "learning_rate": 9.260893546607832e-06, + "loss": 0.6393, + "step": 1680 + }, + { + "epoch": 0.2797550074490978, + "grad_norm": 2.8949124813079834, + "learning_rate": 9.316050744622173e-06, + "loss": 0.6166, + "step": 1690 + }, + { + "epoch": 0.28141036252276114, + "grad_norm": 3.850156545639038, + "learning_rate": 9.371207942636514e-06, + "loss": 0.5904, + "step": 1700 + }, + { + "epoch": 0.28306571759642446, + "grad_norm": 2.344604730606079, + "learning_rate": 9.426365140650855e-06, + "loss": 0.6567, + "step": 1710 + }, + { + "epoch": 0.2847210726700877, + "grad_norm": 2.964304208755493, + "learning_rate": 9.481522338665196e-06, + "loss": 0.6023, + "step": 1720 + }, + { + "epoch": 0.28637642774375105, + "grad_norm": 3.1953861713409424, + "learning_rate": 9.536679536679537e-06, + "loss": 0.6116, + "step": 1730 + }, + { + "epoch": 0.2880317828174143, + "grad_norm": 2.340214967727661, + "learning_rate": 9.591836734693878e-06, + "loss": 0.5774, + "step": 1740 + }, + { + "epoch": 0.28968713789107764, + "grad_norm": 2.7652084827423096, + "learning_rate": 9.64699393270822e-06, + "loss": 0.6332, + "step": 1750 + }, + { + "epoch": 0.29134249296474096, + "grad_norm": 2.380476713180542, + "learning_rate": 9.70215113072256e-06, + "loss": 0.5401, + "step": 1760 + }, + { + "epoch": 0.29299784803840423, + "grad_norm": 2.936915397644043, + "learning_rate": 9.757308328736901e-06, + "loss": 0.6103, + "step": 1770 + }, + { + "epoch": 0.29465320311206755, + "grad_norm": 2.580601215362549, + "learning_rate": 9.812465526751242e-06, + "loss": 0.6312, + "step": 1780 + }, + { + "epoch": 0.2963085581857308, + "grad_norm": 2.9639976024627686, + "learning_rate": 9.867622724765583e-06, + "loss": 0.5844, + "step": 1790 + }, + { + "epoch": 0.29796391325939414, + "grad_norm": 2.4568283557891846, + "learning_rate": 9.922779922779924e-06, + "loss": 0.6088, + "step": 1800 + }, + { + "epoch": 0.29961926833305746, + "grad_norm": 2.4476640224456787, + "learning_rate": 9.977937120794265e-06, + "loss": 0.5739, + "step": 1810 + }, + { + "epoch": 0.30127462340672073, + "grad_norm": 2.506065845489502, + "learning_rate": 9.999996660862644e-06, + "loss": 0.6742, + "step": 1820 + }, + { + "epoch": 0.30292997848038405, + "grad_norm": 2.4358372688293457, + "learning_rate": 9.999976255039388e-06, + "loss": 0.6403, + "step": 1830 + }, + { + "epoch": 0.3045853335540473, + "grad_norm": 3.3008499145507812, + "learning_rate": 9.999937298544802e-06, + "loss": 0.6091, + "step": 1840 + }, + { + "epoch": 0.30624068862771064, + "grad_norm": 3.557581901550293, + "learning_rate": 9.99987979152342e-06, + "loss": 0.6313, + "step": 1850 + }, + { + "epoch": 0.30789604370137397, + "grad_norm": 2.6936228275299072, + "learning_rate": 9.999803734188604e-06, + "loss": 0.6611, + "step": 1860 + }, + { + "epoch": 0.30955139877503723, + "grad_norm": 3.0474772453308105, + "learning_rate": 9.999709126822536e-06, + "loss": 0.5883, + "step": 1870 + }, + { + "epoch": 0.31120675384870056, + "grad_norm": 2.5758018493652344, + "learning_rate": 9.999595969776225e-06, + "loss": 0.5915, + "step": 1880 + }, + { + "epoch": 0.3128621089223638, + "grad_norm": 3.6697728633880615, + "learning_rate": 9.999464263469503e-06, + "loss": 0.5736, + "step": 1890 + }, + { + "epoch": 0.31451746399602715, + "grad_norm": 2.458699941635132, + "learning_rate": 9.999314008391014e-06, + "loss": 0.6448, + "step": 1900 + }, + { + "epoch": 0.31617281906969047, + "grad_norm": 2.6207146644592285, + "learning_rate": 9.999145205098234e-06, + "loss": 0.57, + "step": 1910 + }, + { + "epoch": 0.31782817414335374, + "grad_norm": 1.6553720235824585, + "learning_rate": 9.998957854217444e-06, + "loss": 0.5705, + "step": 1920 + }, + { + "epoch": 0.31948352921701706, + "grad_norm": 2.402009963989258, + "learning_rate": 9.998751956443748e-06, + "loss": 0.6143, + "step": 1930 + }, + { + "epoch": 0.3211388842906803, + "grad_norm": 2.5485851764678955, + "learning_rate": 9.998527512541056e-06, + "loss": 0.5782, + "step": 1940 + }, + { + "epoch": 0.32279423936434365, + "grad_norm": 2.4615025520324707, + "learning_rate": 9.998284523342088e-06, + "loss": 0.6105, + "step": 1950 + }, + { + "epoch": 0.32444959443800697, + "grad_norm": 2.0781445503234863, + "learning_rate": 9.998022989748371e-06, + "loss": 0.5435, + "step": 1960 + }, + { + "epoch": 0.32610494951167024, + "grad_norm": 2.8392488956451416, + "learning_rate": 9.997742912730238e-06, + "loss": 0.6502, + "step": 1970 + }, + { + "epoch": 0.32776030458533356, + "grad_norm": 3.554027557373047, + "learning_rate": 9.997444293326812e-06, + "loss": 0.6015, + "step": 1980 + }, + { + "epoch": 0.32941565965899683, + "grad_norm": 2.370697259902954, + "learning_rate": 9.99712713264602e-06, + "loss": 0.6348, + "step": 1990 + }, + { + "epoch": 0.33107101473266015, + "grad_norm": 2.2110366821289062, + "learning_rate": 9.996791431864577e-06, + "loss": 0.5641, + "step": 2000 + }, + { + "epoch": 0.3327263698063235, + "grad_norm": 2.8644583225250244, + "learning_rate": 9.996437192227979e-06, + "loss": 0.5921, + "step": 2010 + }, + { + "epoch": 0.33438172487998674, + "grad_norm": 1.8451977968215942, + "learning_rate": 9.996064415050515e-06, + "loss": 0.6194, + "step": 2020 + }, + { + "epoch": 0.33603707995365006, + "grad_norm": 2.718644857406616, + "learning_rate": 9.99567310171524e-06, + "loss": 0.5981, + "step": 2030 + }, + { + "epoch": 0.33769243502731333, + "grad_norm": 2.4472815990448, + "learning_rate": 9.995263253673989e-06, + "loss": 0.6079, + "step": 2040 + }, + { + "epoch": 0.33934779010097665, + "grad_norm": 2.6010594367980957, + "learning_rate": 9.994834872447357e-06, + "loss": 0.608, + "step": 2050 + }, + { + "epoch": 0.34100314517464, + "grad_norm": 3.013094425201416, + "learning_rate": 9.994387959624707e-06, + "loss": 0.5139, + "step": 2060 + }, + { + "epoch": 0.34265850024830324, + "grad_norm": 2.3363730907440186, + "learning_rate": 9.993922516864154e-06, + "loss": 0.5739, + "step": 2070 + }, + { + "epoch": 0.34431385532196657, + "grad_norm": 2.283562421798706, + "learning_rate": 9.993438545892557e-06, + "loss": 0.5872, + "step": 2080 + }, + { + "epoch": 0.3459692103956299, + "grad_norm": 2.672161817550659, + "learning_rate": 9.992936048505525e-06, + "loss": 0.592, + "step": 2090 + }, + { + "epoch": 0.34762456546929316, + "grad_norm": 2.5038950443267822, + "learning_rate": 9.9924150265674e-06, + "loss": 0.5921, + "step": 2100 + }, + { + "epoch": 0.3492799205429565, + "grad_norm": 2.3302109241485596, + "learning_rate": 9.991875482011251e-06, + "loss": 0.6143, + "step": 2110 + }, + { + "epoch": 0.35093527561661975, + "grad_norm": 3.3396990299224854, + "learning_rate": 9.991317416838871e-06, + "loss": 0.5817, + "step": 2120 + }, + { + "epoch": 0.35259063069028307, + "grad_norm": 2.076221466064453, + "learning_rate": 9.990740833120765e-06, + "loss": 0.5941, + "step": 2130 + }, + { + "epoch": 0.3542459857639464, + "grad_norm": 2.247899293899536, + "learning_rate": 9.990145732996149e-06, + "loss": 0.6015, + "step": 2140 + }, + { + "epoch": 0.35590134083760966, + "grad_norm": 2.4962527751922607, + "learning_rate": 9.989532118672933e-06, + "loss": 0.6052, + "step": 2150 + }, + { + "epoch": 0.357556695911273, + "grad_norm": 2.5895111560821533, + "learning_rate": 9.988899992427719e-06, + "loss": 0.6154, + "step": 2160 + }, + { + "epoch": 0.35921205098493625, + "grad_norm": 2.0279507637023926, + "learning_rate": 9.988249356605793e-06, + "loss": 0.5683, + "step": 2170 + }, + { + "epoch": 0.36086740605859957, + "grad_norm": 2.5845603942871094, + "learning_rate": 9.98758021362111e-06, + "loss": 0.6027, + "step": 2180 + }, + { + "epoch": 0.3625227611322629, + "grad_norm": 2.4495720863342285, + "learning_rate": 9.986892565956296e-06, + "loss": 0.6118, + "step": 2190 + }, + { + "epoch": 0.36417811620592616, + "grad_norm": 2.7284724712371826, + "learning_rate": 9.986186416162624e-06, + "loss": 0.6262, + "step": 2200 + }, + { + "epoch": 0.3658334712795895, + "grad_norm": 3.050304412841797, + "learning_rate": 9.985461766860021e-06, + "loss": 0.5922, + "step": 2210 + }, + { + "epoch": 0.36748882635325275, + "grad_norm": 2.5775296688079834, + "learning_rate": 9.984718620737044e-06, + "loss": 0.5898, + "step": 2220 + }, + { + "epoch": 0.3691441814269161, + "grad_norm": 2.2817842960357666, + "learning_rate": 9.983956980550877e-06, + "loss": 0.5528, + "step": 2230 + }, + { + "epoch": 0.3707995365005794, + "grad_norm": 2.512723445892334, + "learning_rate": 9.983176849127323e-06, + "loss": 0.6015, + "step": 2240 + }, + { + "epoch": 0.37245489157424266, + "grad_norm": 3.1209611892700195, + "learning_rate": 9.982378229360785e-06, + "loss": 0.5624, + "step": 2250 + }, + { + "epoch": 0.374110246647906, + "grad_norm": 2.4052345752716064, + "learning_rate": 9.981561124214267e-06, + "loss": 0.5279, + "step": 2260 + }, + { + "epoch": 0.37576560172156925, + "grad_norm": 3.7475814819335938, + "learning_rate": 9.980725536719352e-06, + "loss": 0.6319, + "step": 2270 + }, + { + "epoch": 0.3774209567952326, + "grad_norm": 2.4055349826812744, + "learning_rate": 9.979871469976197e-06, + "loss": 0.5653, + "step": 2280 + }, + { + "epoch": 0.3790763118688959, + "grad_norm": 2.1154866218566895, + "learning_rate": 9.978998927153516e-06, + "loss": 0.6503, + "step": 2290 + }, + { + "epoch": 0.38073166694255917, + "grad_norm": 2.19960618019104, + "learning_rate": 9.978107911488581e-06, + "loss": 0.5311, + "step": 2300 + }, + { + "epoch": 0.3823870220162225, + "grad_norm": 2.6673924922943115, + "learning_rate": 9.97719842628719e-06, + "loss": 0.59, + "step": 2310 + }, + { + "epoch": 0.38404237708988576, + "grad_norm": 2.573767900466919, + "learning_rate": 9.976270474923675e-06, + "loss": 0.559, + "step": 2320 + }, + { + "epoch": 0.3856977321635491, + "grad_norm": 2.6069254875183105, + "learning_rate": 9.975324060840874e-06, + "loss": 0.6021, + "step": 2330 + }, + { + "epoch": 0.3873530872372124, + "grad_norm": 2.925135374069214, + "learning_rate": 9.97435918755013e-06, + "loss": 0.5488, + "step": 2340 + }, + { + "epoch": 0.38900844231087567, + "grad_norm": 2.631096601486206, + "learning_rate": 9.973375858631266e-06, + "loss": 0.5977, + "step": 2350 + }, + { + "epoch": 0.390663797384539, + "grad_norm": 2.7871811389923096, + "learning_rate": 9.972374077732585e-06, + "loss": 0.6449, + "step": 2360 + }, + { + "epoch": 0.39231915245820226, + "grad_norm": 1.6462632417678833, + "learning_rate": 9.971353848570845e-06, + "loss": 0.5528, + "step": 2370 + }, + { + "epoch": 0.3939745075318656, + "grad_norm": 3.6753463745117188, + "learning_rate": 9.97031517493125e-06, + "loss": 0.562, + "step": 2380 + }, + { + "epoch": 0.3956298626055289, + "grad_norm": 2.3143789768218994, + "learning_rate": 9.969258060667435e-06, + "loss": 0.59, + "step": 2390 + }, + { + "epoch": 0.39728521767919217, + "grad_norm": 1.9230400323867798, + "learning_rate": 9.96818250970146e-06, + "loss": 0.5805, + "step": 2400 + }, + { + "epoch": 0.3989405727528555, + "grad_norm": 2.318477153778076, + "learning_rate": 9.96708852602378e-06, + "loss": 0.6163, + "step": 2410 + }, + { + "epoch": 0.40059592782651876, + "grad_norm": 2.983262300491333, + "learning_rate": 9.965976113693238e-06, + "loss": 0.5384, + "step": 2420 + }, + { + "epoch": 0.4022512829001821, + "grad_norm": 2.5981662273406982, + "learning_rate": 9.964845276837057e-06, + "loss": 0.5324, + "step": 2430 + }, + { + "epoch": 0.4039066379738454, + "grad_norm": 2.5291125774383545, + "learning_rate": 9.96369601965081e-06, + "loss": 0.5857, + "step": 2440 + }, + { + "epoch": 0.4055619930475087, + "grad_norm": 2.275705575942993, + "learning_rate": 9.962528346398418e-06, + "loss": 0.5985, + "step": 2450 + }, + { + "epoch": 0.407217348121172, + "grad_norm": 3.31367826461792, + "learning_rate": 9.961342261412125e-06, + "loss": 0.5318, + "step": 2460 + }, + { + "epoch": 0.4088727031948353, + "grad_norm": 2.098184585571289, + "learning_rate": 9.960137769092487e-06, + "loss": 0.5643, + "step": 2470 + }, + { + "epoch": 0.4105280582684986, + "grad_norm": 2.2636635303497314, + "learning_rate": 9.958914873908353e-06, + "loss": 0.5759, + "step": 2480 + }, + { + "epoch": 0.4121834133421619, + "grad_norm": 2.2176895141601562, + "learning_rate": 9.95767358039685e-06, + "loss": 0.5145, + "step": 2490 + }, + { + "epoch": 0.4138387684158252, + "grad_norm": 2.798469305038452, + "learning_rate": 9.956413893163365e-06, + "loss": 0.6119, + "step": 2500 + }, + { + "epoch": 0.4154941234894885, + "grad_norm": 2.1799635887145996, + "learning_rate": 9.95513581688153e-06, + "loss": 0.5912, + "step": 2510 + }, + { + "epoch": 0.4171494785631518, + "grad_norm": 3.491344690322876, + "learning_rate": 9.953839356293202e-06, + "loss": 0.5705, + "step": 2520 + }, + { + "epoch": 0.4188048336368151, + "grad_norm": 3.2916722297668457, + "learning_rate": 9.952524516208447e-06, + "loss": 0.5886, + "step": 2530 + }, + { + "epoch": 0.4204601887104784, + "grad_norm": 2.2452287673950195, + "learning_rate": 9.951191301505519e-06, + "loss": 0.4922, + "step": 2540 + }, + { + "epoch": 0.4221155437841417, + "grad_norm": 2.5860681533813477, + "learning_rate": 9.949839717130849e-06, + "loss": 0.5827, + "step": 2550 + }, + { + "epoch": 0.423770898857805, + "grad_norm": 2.4785280227661133, + "learning_rate": 9.94846976809902e-06, + "loss": 0.5541, + "step": 2560 + }, + { + "epoch": 0.4254262539314683, + "grad_norm": 1.9815374612808228, + "learning_rate": 9.947081459492751e-06, + "loss": 0.5863, + "step": 2570 + }, + { + "epoch": 0.4270816090051316, + "grad_norm": 2.4965639114379883, + "learning_rate": 9.945674796462879e-06, + "loss": 0.5708, + "step": 2580 + }, + { + "epoch": 0.4287369640787949, + "grad_norm": 3.2701520919799805, + "learning_rate": 9.944249784228335e-06, + "loss": 0.5784, + "step": 2590 + }, + { + "epoch": 0.4303923191524582, + "grad_norm": 2.430939197540283, + "learning_rate": 9.942806428076132e-06, + "loss": 0.5823, + "step": 2600 + }, + { + "epoch": 0.4320476742261215, + "grad_norm": 2.519144296646118, + "learning_rate": 9.941344733361344e-06, + "loss": 0.5767, + "step": 2610 + }, + { + "epoch": 0.4337030292997848, + "grad_norm": 2.906513214111328, + "learning_rate": 9.939864705507073e-06, + "loss": 0.548, + "step": 2620 + }, + { + "epoch": 0.4353583843734481, + "grad_norm": 2.306670665740967, + "learning_rate": 9.938366350004454e-06, + "loss": 0.5714, + "step": 2630 + }, + { + "epoch": 0.4370137394471114, + "grad_norm": 2.199439525604248, + "learning_rate": 9.93684967241261e-06, + "loss": 0.5586, + "step": 2640 + }, + { + "epoch": 0.4386690945207747, + "grad_norm": 2.4085533618927, + "learning_rate": 9.935314678358644e-06, + "loss": 0.5811, + "step": 2650 + }, + { + "epoch": 0.440324449594438, + "grad_norm": 2.153773307800293, + "learning_rate": 9.933761373537621e-06, + "loss": 0.5879, + "step": 2660 + }, + { + "epoch": 0.44197980466810133, + "grad_norm": 2.2745254039764404, + "learning_rate": 9.932189763712537e-06, + "loss": 0.5682, + "step": 2670 + }, + { + "epoch": 0.4436351597417646, + "grad_norm": 2.499537706375122, + "learning_rate": 9.9305998547143e-06, + "loss": 0.5803, + "step": 2680 + }, + { + "epoch": 0.4452905148154279, + "grad_norm": 2.2464380264282227, + "learning_rate": 9.928991652441717e-06, + "loss": 0.5679, + "step": 2690 + }, + { + "epoch": 0.4469458698890912, + "grad_norm": 2.874124526977539, + "learning_rate": 9.92736516286146e-06, + "loss": 0.5925, + "step": 2700 + }, + { + "epoch": 0.4486012249627545, + "grad_norm": 2.706362247467041, + "learning_rate": 9.925720392008056e-06, + "loss": 0.5674, + "step": 2710 + }, + { + "epoch": 0.45025658003641783, + "grad_norm": 2.4833574295043945, + "learning_rate": 9.924057345983851e-06, + "loss": 0.5762, + "step": 2720 + }, + { + "epoch": 0.4519119351100811, + "grad_norm": 1.9858320951461792, + "learning_rate": 9.922376030959e-06, + "loss": 0.5669, + "step": 2730 + }, + { + "epoch": 0.4535672901837444, + "grad_norm": 2.3540823459625244, + "learning_rate": 9.920676453171438e-06, + "loss": 0.5652, + "step": 2740 + }, + { + "epoch": 0.4552226452574077, + "grad_norm": 2.1306300163269043, + "learning_rate": 9.918958618926855e-06, + "loss": 0.5218, + "step": 2750 + }, + { + "epoch": 0.456878000331071, + "grad_norm": 2.7033262252807617, + "learning_rate": 9.917222534598679e-06, + "loss": 0.5731, + "step": 2760 + }, + { + "epoch": 0.45853335540473433, + "grad_norm": 1.9918255805969238, + "learning_rate": 9.915468206628046e-06, + "loss": 0.5535, + "step": 2770 + }, + { + "epoch": 0.4601887104783976, + "grad_norm": 2.858841896057129, + "learning_rate": 9.913695641523777e-06, + "loss": 0.601, + "step": 2780 + }, + { + "epoch": 0.4618440655520609, + "grad_norm": 2.050906181335449, + "learning_rate": 9.91190484586236e-06, + "loss": 0.5605, + "step": 2790 + }, + { + "epoch": 0.46349942062572425, + "grad_norm": 2.51007342338562, + "learning_rate": 9.910095826287918e-06, + "loss": 0.6002, + "step": 2800 + }, + { + "epoch": 0.4651547756993875, + "grad_norm": 2.3648529052734375, + "learning_rate": 9.908268589512187e-06, + "loss": 0.6065, + "step": 2810 + }, + { + "epoch": 0.46681013077305084, + "grad_norm": 2.8302104473114014, + "learning_rate": 9.906423142314497e-06, + "loss": 0.6004, + "step": 2820 + }, + { + "epoch": 0.4684654858467141, + "grad_norm": 2.7113876342773438, + "learning_rate": 9.904559491541735e-06, + "loss": 0.5727, + "step": 2830 + }, + { + "epoch": 0.4701208409203774, + "grad_norm": 1.8766558170318604, + "learning_rate": 9.902677644108327e-06, + "loss": 0.4911, + "step": 2840 + }, + { + "epoch": 0.47177619599404075, + "grad_norm": 2.576292037963867, + "learning_rate": 9.900777606996213e-06, + "loss": 0.5602, + "step": 2850 + }, + { + "epoch": 0.473431551067704, + "grad_norm": 2.50095796585083, + "learning_rate": 9.898859387254823e-06, + "loss": 0.5659, + "step": 2860 + }, + { + "epoch": 0.47508690614136734, + "grad_norm": 2.3355982303619385, + "learning_rate": 9.89692299200104e-06, + "loss": 0.58, + "step": 2870 + }, + { + "epoch": 0.4767422612150306, + "grad_norm": 2.1642611026763916, + "learning_rate": 9.894968428419187e-06, + "loss": 0.563, + "step": 2880 + }, + { + "epoch": 0.4783976162886939, + "grad_norm": 2.645699977874756, + "learning_rate": 9.892995703760988e-06, + "loss": 0.5911, + "step": 2890 + }, + { + "epoch": 0.48005297136235725, + "grad_norm": 3.1719911098480225, + "learning_rate": 9.891004825345555e-06, + "loss": 0.5638, + "step": 2900 + }, + { + "epoch": 0.4817083264360205, + "grad_norm": 2.728663444519043, + "learning_rate": 9.888995800559347e-06, + "loss": 0.5808, + "step": 2910 + }, + { + "epoch": 0.48336368150968384, + "grad_norm": 2.2633676528930664, + "learning_rate": 9.886968636856153e-06, + "loss": 0.4939, + "step": 2920 + }, + { + "epoch": 0.4850190365833471, + "grad_norm": 1.8135114908218384, + "learning_rate": 9.884923341757056e-06, + "loss": 0.5163, + "step": 2930 + }, + { + "epoch": 0.48667439165701043, + "grad_norm": 2.147280216217041, + "learning_rate": 9.882859922850412e-06, + "loss": 0.5427, + "step": 2940 + }, + { + "epoch": 0.48832974673067375, + "grad_norm": 2.248060464859009, + "learning_rate": 9.88077838779182e-06, + "loss": 0.5357, + "step": 2950 + }, + { + "epoch": 0.489985101804337, + "grad_norm": 2.9170844554901123, + "learning_rate": 9.87867874430409e-06, + "loss": 0.5257, + "step": 2960 + }, + { + "epoch": 0.49164045687800034, + "grad_norm": 2.0134947299957275, + "learning_rate": 9.87656100017722e-06, + "loss": 0.559, + "step": 2970 + }, + { + "epoch": 0.4932958119516636, + "grad_norm": 2.3166182041168213, + "learning_rate": 9.87442516326836e-06, + "loss": 0.5788, + "step": 2980 + }, + { + "epoch": 0.49495116702532693, + "grad_norm": 2.9038329124450684, + "learning_rate": 9.87227124150179e-06, + "loss": 0.6224, + "step": 2990 + }, + { + "epoch": 0.49660652209899026, + "grad_norm": 1.8098182678222656, + "learning_rate": 9.870099242868887e-06, + "loss": 0.5759, + "step": 3000 + }, + { + "epoch": 0.4982618771726535, + "grad_norm": 2.722839593887329, + "learning_rate": 9.867909175428096e-06, + "loss": 0.4706, + "step": 3010 + }, + { + "epoch": 0.49991723224631684, + "grad_norm": 2.420764446258545, + "learning_rate": 9.8657010473049e-06, + "loss": 0.577, + "step": 3020 + }, + { + "epoch": 0.5015725873199801, + "grad_norm": 2.254530906677246, + "learning_rate": 9.86347486669179e-06, + "loss": 0.5189, + "step": 3030 + }, + { + "epoch": 0.5032279423936434, + "grad_norm": 2.428337335586548, + "learning_rate": 9.861230641848233e-06, + "loss": 0.6204, + "step": 3040 + }, + { + "epoch": 0.5048832974673068, + "grad_norm": 2.543137550354004, + "learning_rate": 9.858968381100646e-06, + "loss": 0.5617, + "step": 3050 + }, + { + "epoch": 0.5065386525409701, + "grad_norm": 2.627063751220703, + "learning_rate": 9.856688092842357e-06, + "loss": 0.5348, + "step": 3060 + }, + { + "epoch": 0.5081940076146333, + "grad_norm": 2.149599075317383, + "learning_rate": 9.854389785533585e-06, + "loss": 0.5694, + "step": 3070 + }, + { + "epoch": 0.5098493626882966, + "grad_norm": 2.343366861343384, + "learning_rate": 9.852073467701398e-06, + "loss": 0.5729, + "step": 3080 + }, + { + "epoch": 0.5115047177619599, + "grad_norm": 2.3842127323150635, + "learning_rate": 9.849739147939685e-06, + "loss": 0.5948, + "step": 3090 + }, + { + "epoch": 0.5131600728356233, + "grad_norm": 2.269623041152954, + "learning_rate": 9.84738683490913e-06, + "loss": 0.5486, + "step": 3100 + }, + { + "epoch": 0.5148154279092866, + "grad_norm": 2.9736173152923584, + "learning_rate": 9.845016537337168e-06, + "loss": 0.5615, + "step": 3110 + }, + { + "epoch": 0.5164707829829498, + "grad_norm": 2.21465802192688, + "learning_rate": 9.842628264017969e-06, + "loss": 0.5914, + "step": 3120 + }, + { + "epoch": 0.5181261380566131, + "grad_norm": 2.490063428878784, + "learning_rate": 9.840222023812383e-06, + "loss": 0.5833, + "step": 3130 + }, + { + "epoch": 0.5197814931302764, + "grad_norm": 2.3841171264648438, + "learning_rate": 9.83779782564793e-06, + "loss": 0.577, + "step": 3140 + }, + { + "epoch": 0.5214368482039398, + "grad_norm": 1.9457083940505981, + "learning_rate": 9.835355678518754e-06, + "loss": 0.5872, + "step": 3150 + }, + { + "epoch": 0.5230922032776031, + "grad_norm": 2.5294015407562256, + "learning_rate": 9.83289559148559e-06, + "loss": 0.504, + "step": 3160 + }, + { + "epoch": 0.5247475583512663, + "grad_norm": 3.2622299194335938, + "learning_rate": 9.830417573675737e-06, + "loss": 0.5809, + "step": 3170 + }, + { + "epoch": 0.5264029134249296, + "grad_norm": 2.026658773422241, + "learning_rate": 9.827921634283015e-06, + "loss": 0.5413, + "step": 3180 + }, + { + "epoch": 0.5280582684985929, + "grad_norm": 2.9788129329681396, + "learning_rate": 9.825407782567738e-06, + "loss": 0.5578, + "step": 3190 + }, + { + "epoch": 0.5297136235722563, + "grad_norm": 2.1878185272216797, + "learning_rate": 9.822876027856679e-06, + "loss": 0.5203, + "step": 3200 + }, + { + "epoch": 0.5313689786459196, + "grad_norm": 2.129364013671875, + "learning_rate": 9.820326379543032e-06, + "loss": 0.5794, + "step": 3210 + }, + { + "epoch": 0.5330243337195828, + "grad_norm": 2.1982228755950928, + "learning_rate": 9.817758847086381e-06, + "loss": 0.5827, + "step": 3220 + }, + { + "epoch": 0.5346796887932461, + "grad_norm": 2.233294725418091, + "learning_rate": 9.815173440012657e-06, + "loss": 0.5868, + "step": 3230 + }, + { + "epoch": 0.5363350438669094, + "grad_norm": 2.100982666015625, + "learning_rate": 9.81257016791412e-06, + "loss": 0.5339, + "step": 3240 + }, + { + "epoch": 0.5379903989405728, + "grad_norm": 2.1711225509643555, + "learning_rate": 9.809949040449298e-06, + "loss": 0.5841, + "step": 3250 + }, + { + "epoch": 0.5396457540142361, + "grad_norm": 2.9437408447265625, + "learning_rate": 9.807310067342976e-06, + "loss": 0.5521, + "step": 3260 + }, + { + "epoch": 0.5413011090878993, + "grad_norm": 4.610905170440674, + "learning_rate": 9.804653258386145e-06, + "loss": 0.5947, + "step": 3270 + }, + { + "epoch": 0.5429564641615626, + "grad_norm": 2.275070905685425, + "learning_rate": 9.801978623435967e-06, + "loss": 0.5725, + "step": 3280 + }, + { + "epoch": 0.544611819235226, + "grad_norm": 2.3280909061431885, + "learning_rate": 9.799286172415746e-06, + "loss": 0.5722, + "step": 3290 + }, + { + "epoch": 0.5462671743088893, + "grad_norm": 2.2407679557800293, + "learning_rate": 9.796575915314884e-06, + "loss": 0.5641, + "step": 3300 + }, + { + "epoch": 0.5479225293825526, + "grad_norm": 2.3367135524749756, + "learning_rate": 9.793847862188848e-06, + "loss": 0.5626, + "step": 3310 + }, + { + "epoch": 0.5495778844562158, + "grad_norm": 1.977503776550293, + "learning_rate": 9.791102023159125e-06, + "loss": 0.5515, + "step": 3320 + }, + { + "epoch": 0.5512332395298791, + "grad_norm": 2.1004018783569336, + "learning_rate": 9.7883384084132e-06, + "loss": 0.5421, + "step": 3330 + }, + { + "epoch": 0.5528885946035424, + "grad_norm": 2.568941116333008, + "learning_rate": 9.7855570282045e-06, + "loss": 0.5618, + "step": 3340 + }, + { + "epoch": 0.5545439496772058, + "grad_norm": 2.1921818256378174, + "learning_rate": 9.782757892852367e-06, + "loss": 0.5418, + "step": 3350 + }, + { + "epoch": 0.5561993047508691, + "grad_norm": 2.2598955631256104, + "learning_rate": 9.779941012742025e-06, + "loss": 0.5643, + "step": 3360 + }, + { + "epoch": 0.5578546598245324, + "grad_norm": 2.0559098720550537, + "learning_rate": 9.77710639832452e-06, + "loss": 0.6003, + "step": 3370 + }, + { + "epoch": 0.5595100148981956, + "grad_norm": 2.366570234298706, + "learning_rate": 9.774254060116703e-06, + "loss": 0.5208, + "step": 3380 + }, + { + "epoch": 0.561165369971859, + "grad_norm": 2.092085123062134, + "learning_rate": 9.771384008701185e-06, + "loss": 0.5987, + "step": 3390 + }, + { + "epoch": 0.5628207250455223, + "grad_norm": 2.402676820755005, + "learning_rate": 9.768496254726293e-06, + "loss": 0.5836, + "step": 3400 + }, + { + "epoch": 0.5644760801191856, + "grad_norm": 2.451547861099243, + "learning_rate": 9.765590808906029e-06, + "loss": 0.5874, + "step": 3410 + }, + { + "epoch": 0.5661314351928489, + "grad_norm": 1.9011714458465576, + "learning_rate": 9.76266768202004e-06, + "loss": 0.495, + "step": 3420 + }, + { + "epoch": 0.5677867902665121, + "grad_norm": 2.3859992027282715, + "learning_rate": 9.759726884913572e-06, + "loss": 0.5934, + "step": 3430 + }, + { + "epoch": 0.5694421453401755, + "grad_norm": 2.100632905960083, + "learning_rate": 9.756768428497427e-06, + "loss": 0.5215, + "step": 3440 + }, + { + "epoch": 0.5710975004138388, + "grad_norm": 2.366837501525879, + "learning_rate": 9.753792323747928e-06, + "loss": 0.4966, + "step": 3450 + }, + { + "epoch": 0.5727528554875021, + "grad_norm": 2.0204997062683105, + "learning_rate": 9.75079858170688e-06, + "loss": 0.5586, + "step": 3460 + }, + { + "epoch": 0.5744082105611654, + "grad_norm": 2.957526683807373, + "learning_rate": 9.747787213481511e-06, + "loss": 0.5437, + "step": 3470 + }, + { + "epoch": 0.5760635656348286, + "grad_norm": 2.0730838775634766, + "learning_rate": 9.744758230244465e-06, + "loss": 0.5447, + "step": 3480 + }, + { + "epoch": 0.577718920708492, + "grad_norm": 2.0039265155792236, + "learning_rate": 9.741711643233724e-06, + "loss": 0.5071, + "step": 3490 + }, + { + "epoch": 0.5793742757821553, + "grad_norm": 1.7638771533966064, + "learning_rate": 9.73864746375259e-06, + "loss": 0.5415, + "step": 3500 + }, + { + "epoch": 0.5810296308558186, + "grad_norm": 2.048380136489868, + "learning_rate": 9.735565703169634e-06, + "loss": 0.5371, + "step": 3510 + }, + { + "epoch": 0.5826849859294819, + "grad_norm": 2.077610492706299, + "learning_rate": 9.732466372918656e-06, + "loss": 0.5548, + "step": 3520 + }, + { + "epoch": 0.5843403410031451, + "grad_norm": 2.1865971088409424, + "learning_rate": 9.729349484498642e-06, + "loss": 0.5595, + "step": 3530 + }, + { + "epoch": 0.5859956960768085, + "grad_norm": 2.1196391582489014, + "learning_rate": 9.726215049473722e-06, + "loss": 0.5333, + "step": 3540 + }, + { + "epoch": 0.5876510511504718, + "grad_norm": 2.613009452819824, + "learning_rate": 9.723063079473124e-06, + "loss": 0.5031, + "step": 3550 + }, + { + "epoch": 0.5893064062241351, + "grad_norm": 2.1916675567626953, + "learning_rate": 9.719893586191137e-06, + "loss": 0.4728, + "step": 3560 + }, + { + "epoch": 0.5909617612977984, + "grad_norm": 1.9465389251708984, + "learning_rate": 9.716706581387065e-06, + "loss": 0.5766, + "step": 3570 + }, + { + "epoch": 0.5926171163714616, + "grad_norm": 2.581099271774292, + "learning_rate": 9.713502076885174e-06, + "loss": 0.5806, + "step": 3580 + }, + { + "epoch": 0.594272471445125, + "grad_norm": 2.1889805793762207, + "learning_rate": 9.710280084574667e-06, + "loss": 0.5075, + "step": 3590 + }, + { + "epoch": 0.5959278265187883, + "grad_norm": 1.5537147521972656, + "learning_rate": 9.707040616409623e-06, + "loss": 0.5388, + "step": 3600 + }, + { + "epoch": 0.5975831815924516, + "grad_norm": 2.479175329208374, + "learning_rate": 9.703783684408961e-06, + "loss": 0.5215, + "step": 3610 + }, + { + "epoch": 0.5992385366661149, + "grad_norm": 2.344923257827759, + "learning_rate": 9.700509300656395e-06, + "loss": 0.5267, + "step": 3620 + }, + { + "epoch": 0.6008938917397781, + "grad_norm": 3.7248928546905518, + "learning_rate": 9.697217477300385e-06, + "loss": 0.5766, + "step": 3630 + }, + { + "epoch": 0.6025492468134415, + "grad_norm": 2.325535297393799, + "learning_rate": 9.693908226554094e-06, + "loss": 0.5377, + "step": 3640 + }, + { + "epoch": 0.6042046018871048, + "grad_norm": 2.5930399894714355, + "learning_rate": 9.690581560695346e-06, + "loss": 0.5351, + "step": 3650 + }, + { + "epoch": 0.6058599569607681, + "grad_norm": 2.1282172203063965, + "learning_rate": 9.68723749206658e-06, + "loss": 0.5148, + "step": 3660 + }, + { + "epoch": 0.6075153120344314, + "grad_norm": 2.241332530975342, + "learning_rate": 9.683876033074793e-06, + "loss": 0.5813, + "step": 3670 + }, + { + "epoch": 0.6091706671080946, + "grad_norm": 1.9913647174835205, + "learning_rate": 9.680497196191511e-06, + "loss": 0.5826, + "step": 3680 + }, + { + "epoch": 0.610826022181758, + "grad_norm": 2.073176622390747, + "learning_rate": 9.677100993952732e-06, + "loss": 0.5039, + "step": 3690 + }, + { + "epoch": 0.6124813772554213, + "grad_norm": 2.6423120498657227, + "learning_rate": 9.673687438958883e-06, + "loss": 0.4948, + "step": 3700 + }, + { + "epoch": 0.6141367323290846, + "grad_norm": 2.061516761779785, + "learning_rate": 9.670256543874769e-06, + "loss": 0.5831, + "step": 3710 + }, + { + "epoch": 0.6157920874027479, + "grad_norm": 2.189225196838379, + "learning_rate": 9.666808321429534e-06, + "loss": 0.5646, + "step": 3720 + }, + { + "epoch": 0.6174474424764111, + "grad_norm": 2.219355344772339, + "learning_rate": 9.663342784416609e-06, + "loss": 0.553, + "step": 3730 + }, + { + "epoch": 0.6191027975500745, + "grad_norm": 2.61342453956604, + "learning_rate": 9.659859945693658e-06, + "loss": 0.5279, + "step": 3740 + }, + { + "epoch": 0.6207581526237378, + "grad_norm": 3.1807034015655518, + "learning_rate": 9.656359818182543e-06, + "loss": 0.5833, + "step": 3750 + }, + { + "epoch": 0.6224135076974011, + "grad_norm": 2.533095121383667, + "learning_rate": 9.65284241486927e-06, + "loss": 0.5927, + "step": 3760 + }, + { + "epoch": 0.6240688627710644, + "grad_norm": 2.798417329788208, + "learning_rate": 9.649307748803939e-06, + "loss": 0.5413, + "step": 3770 + }, + { + "epoch": 0.6257242178447276, + "grad_norm": 2.3580000400543213, + "learning_rate": 9.645755833100699e-06, + "loss": 0.553, + "step": 3780 + }, + { + "epoch": 0.627379572918391, + "grad_norm": 1.9809789657592773, + "learning_rate": 9.642186680937695e-06, + "loss": 0.4882, + "step": 3790 + }, + { + "epoch": 0.6290349279920543, + "grad_norm": 2.5609359741210938, + "learning_rate": 9.638600305557025e-06, + "loss": 0.5164, + "step": 3800 + }, + { + "epoch": 0.6306902830657176, + "grad_norm": 2.4713363647460938, + "learning_rate": 9.634996720264684e-06, + "loss": 0.535, + "step": 3810 + }, + { + "epoch": 0.6323456381393809, + "grad_norm": 2.4818222522735596, + "learning_rate": 9.631375938430525e-06, + "loss": 0.5605, + "step": 3820 + }, + { + "epoch": 0.6340009932130442, + "grad_norm": 2.3091225624084473, + "learning_rate": 9.627737973488194e-06, + "loss": 0.5705, + "step": 3830 + }, + { + "epoch": 0.6356563482867075, + "grad_norm": 2.624854803085327, + "learning_rate": 9.624082838935096e-06, + "loss": 0.4703, + "step": 3840 + }, + { + "epoch": 0.6373117033603708, + "grad_norm": 1.9231913089752197, + "learning_rate": 9.620410548332336e-06, + "loss": 0.4887, + "step": 3850 + }, + { + "epoch": 0.6389670584340341, + "grad_norm": 2.4588680267333984, + "learning_rate": 9.616721115304669e-06, + "loss": 0.4479, + "step": 3860 + }, + { + "epoch": 0.6406224135076974, + "grad_norm": 2.4490933418273926, + "learning_rate": 9.61301455354045e-06, + "loss": 0.5036, + "step": 3870 + }, + { + "epoch": 0.6422777685813607, + "grad_norm": 2.085653305053711, + "learning_rate": 9.609290876791589e-06, + "loss": 0.5332, + "step": 3880 + }, + { + "epoch": 0.643933123655024, + "grad_norm": 2.1635732650756836, + "learning_rate": 9.60555009887349e-06, + "loss": 0.5819, + "step": 3890 + }, + { + "epoch": 0.6455884787286873, + "grad_norm": 1.8872817754745483, + "learning_rate": 9.601792233665007e-06, + "loss": 0.5124, + "step": 3900 + }, + { + "epoch": 0.6472438338023506, + "grad_norm": 2.5642919540405273, + "learning_rate": 9.598017295108395e-06, + "loss": 0.5272, + "step": 3910 + }, + { + "epoch": 0.6488991888760139, + "grad_norm": 2.362342357635498, + "learning_rate": 9.594225297209245e-06, + "loss": 0.512, + "step": 3920 + }, + { + "epoch": 0.6505545439496772, + "grad_norm": 2.04681134223938, + "learning_rate": 9.590416254036447e-06, + "loss": 0.5667, + "step": 3930 + }, + { + "epoch": 0.6522098990233405, + "grad_norm": 2.4778685569763184, + "learning_rate": 9.586590179722131e-06, + "loss": 0.5527, + "step": 3940 + }, + { + "epoch": 0.6538652540970038, + "grad_norm": 2.4283761978149414, + "learning_rate": 9.58274708846161e-06, + "loss": 0.5567, + "step": 3950 + }, + { + "epoch": 0.6555206091706671, + "grad_norm": 2.188366413116455, + "learning_rate": 9.57888699451334e-06, + "loss": 0.5653, + "step": 3960 + }, + { + "epoch": 0.6571759642443304, + "grad_norm": 1.9464526176452637, + "learning_rate": 9.575009912198853e-06, + "loss": 0.5482, + "step": 3970 + }, + { + "epoch": 0.6588313193179937, + "grad_norm": 2.024707555770874, + "learning_rate": 9.571115855902715e-06, + "loss": 0.5105, + "step": 3980 + }, + { + "epoch": 0.660486674391657, + "grad_norm": 2.27622127532959, + "learning_rate": 9.567204840072466e-06, + "loss": 0.5506, + "step": 3990 + }, + { + "epoch": 0.6621420294653203, + "grad_norm": 1.9838156700134277, + "learning_rate": 9.563276879218568e-06, + "loss": 0.5619, + "step": 4000 + }, + { + "epoch": 0.6637973845389836, + "grad_norm": 2.1720376014709473, + "learning_rate": 9.559331987914354e-06, + "loss": 0.549, + "step": 4010 + }, + { + "epoch": 0.665452739612647, + "grad_norm": 1.8095346689224243, + "learning_rate": 9.555370180795967e-06, + "loss": 0.5652, + "step": 4020 + }, + { + "epoch": 0.6671080946863102, + "grad_norm": 2.1045024394989014, + "learning_rate": 9.551391472562316e-06, + "loss": 0.5002, + "step": 4030 + }, + { + "epoch": 0.6687634497599735, + "grad_norm": 2.3684701919555664, + "learning_rate": 9.547395877975012e-06, + "loss": 0.5148, + "step": 4040 + }, + { + "epoch": 0.6704188048336368, + "grad_norm": 2.073254108428955, + "learning_rate": 9.543383411858318e-06, + "loss": 0.5044, + "step": 4050 + }, + { + "epoch": 0.6720741599073001, + "grad_norm": 2.361924171447754, + "learning_rate": 9.539354089099092e-06, + "loss": 0.5516, + "step": 4060 + }, + { + "epoch": 0.6737295149809635, + "grad_norm": 2.4293785095214844, + "learning_rate": 9.535307924646735e-06, + "loss": 0.5263, + "step": 4070 + }, + { + "epoch": 0.6753848700546267, + "grad_norm": 1.8119914531707764, + "learning_rate": 9.531244933513129e-06, + "loss": 0.5128, + "step": 4080 + }, + { + "epoch": 0.67704022512829, + "grad_norm": 1.9259525537490845, + "learning_rate": 9.52716513077259e-06, + "loss": 0.4829, + "step": 4090 + }, + { + "epoch": 0.6786955802019533, + "grad_norm": 2.1588046550750732, + "learning_rate": 9.523068531561805e-06, + "loss": 0.5347, + "step": 4100 + }, + { + "epoch": 0.6803509352756166, + "grad_norm": 2.8328020572662354, + "learning_rate": 9.518955151079781e-06, + "loss": 0.5013, + "step": 4110 + }, + { + "epoch": 0.68200629034928, + "grad_norm": 2.9788923263549805, + "learning_rate": 9.514825004587784e-06, + "loss": 0.515, + "step": 4120 + }, + { + "epoch": 0.6836616454229433, + "grad_norm": 2.3657615184783936, + "learning_rate": 9.510678107409282e-06, + "loss": 0.5444, + "step": 4130 + }, + { + "epoch": 0.6853170004966065, + "grad_norm": 2.1405718326568604, + "learning_rate": 9.506514474929896e-06, + "loss": 0.4501, + "step": 4140 + }, + { + "epoch": 0.6869723555702698, + "grad_norm": 1.9537779092788696, + "learning_rate": 9.502334122597335e-06, + "loss": 0.4819, + "step": 4150 + }, + { + "epoch": 0.6886277106439331, + "grad_norm": 2.405439615249634, + "learning_rate": 9.49813706592134e-06, + "loss": 0.4746, + "step": 4160 + }, + { + "epoch": 0.6902830657175965, + "grad_norm": 2.6599020957946777, + "learning_rate": 9.493923320473628e-06, + "loss": 0.5271, + "step": 4170 + }, + { + "epoch": 0.6919384207912598, + "grad_norm": 2.166412830352783, + "learning_rate": 9.489692901887837e-06, + "loss": 0.5563, + "step": 4180 + }, + { + "epoch": 0.693593775864923, + "grad_norm": 1.994268774986267, + "learning_rate": 9.48544582585946e-06, + "loss": 0.5235, + "step": 4190 + }, + { + "epoch": 0.6952491309385863, + "grad_norm": 3.00091814994812, + "learning_rate": 9.481182108145798e-06, + "loss": 0.4902, + "step": 4200 + }, + { + "epoch": 0.6969044860122496, + "grad_norm": 2.903369426727295, + "learning_rate": 9.476901764565887e-06, + "loss": 0.5175, + "step": 4210 + }, + { + "epoch": 0.698559841085913, + "grad_norm": 2.3588013648986816, + "learning_rate": 9.472604811000453e-06, + "loss": 0.5236, + "step": 4220 + }, + { + "epoch": 0.7002151961595763, + "grad_norm": 2.4532833099365234, + "learning_rate": 9.468291263391847e-06, + "loss": 0.4895, + "step": 4230 + }, + { + "epoch": 0.7018705512332395, + "grad_norm": 2.1600773334503174, + "learning_rate": 9.463961137743986e-06, + "loss": 0.5251, + "step": 4240 + }, + { + "epoch": 0.7035259063069028, + "grad_norm": 1.243881344795227, + "learning_rate": 9.459614450122293e-06, + "loss": 0.5456, + "step": 4250 + }, + { + "epoch": 0.7051812613805661, + "grad_norm": 2.715055465698242, + "learning_rate": 9.45525121665364e-06, + "loss": 0.5019, + "step": 4260 + }, + { + "epoch": 0.7068366164542295, + "grad_norm": 2.0910329818725586, + "learning_rate": 9.450871453526285e-06, + "loss": 0.5024, + "step": 4270 + }, + { + "epoch": 0.7084919715278928, + "grad_norm": 2.3491621017456055, + "learning_rate": 9.446475176989816e-06, + "loss": 0.5481, + "step": 4280 + }, + { + "epoch": 0.710147326601556, + "grad_norm": 2.1992132663726807, + "learning_rate": 9.442062403355085e-06, + "loss": 0.5166, + "step": 4290 + }, + { + "epoch": 0.7118026816752193, + "grad_norm": 2.599609375, + "learning_rate": 9.437633148994154e-06, + "loss": 0.5459, + "step": 4300 + }, + { + "epoch": 0.7134580367488826, + "grad_norm": 1.8802043199539185, + "learning_rate": 9.433187430340228e-06, + "loss": 0.557, + "step": 4310 + }, + { + "epoch": 0.715113391822546, + "grad_norm": 1.949325680732727, + "learning_rate": 9.428725263887599e-06, + "loss": 0.5385, + "step": 4320 + }, + { + "epoch": 0.7167687468962093, + "grad_norm": 2.2506093978881836, + "learning_rate": 9.424246666191582e-06, + "loss": 0.5123, + "step": 4330 + }, + { + "epoch": 0.7184241019698725, + "grad_norm": 2.1846063137054443, + "learning_rate": 9.419751653868456e-06, + "loss": 0.5491, + "step": 4340 + }, + { + "epoch": 0.7200794570435358, + "grad_norm": 2.2196507453918457, + "learning_rate": 9.415240243595397e-06, + "loss": 0.4818, + "step": 4350 + }, + { + "epoch": 0.7217348121171991, + "grad_norm": 1.7722293138504028, + "learning_rate": 9.410712452110425e-06, + "loss": 0.5207, + "step": 4360 + }, + { + "epoch": 0.7233901671908625, + "grad_norm": 1.8259626626968384, + "learning_rate": 9.406168296212332e-06, + "loss": 0.5789, + "step": 4370 + }, + { + "epoch": 0.7250455222645258, + "grad_norm": 3.340482711791992, + "learning_rate": 9.401607792760628e-06, + "loss": 0.5335, + "step": 4380 + }, + { + "epoch": 0.726700877338189, + "grad_norm": 2.3359949588775635, + "learning_rate": 9.397030958675473e-06, + "loss": 0.5716, + "step": 4390 + }, + { + "epoch": 0.7283562324118523, + "grad_norm": 2.8024532794952393, + "learning_rate": 9.392437810937615e-06, + "loss": 0.5089, + "step": 4400 + }, + { + "epoch": 0.7300115874855156, + "grad_norm": 2.381237506866455, + "learning_rate": 9.387828366588333e-06, + "loss": 0.5497, + "step": 4410 + }, + { + "epoch": 0.731666942559179, + "grad_norm": 2.007678270339966, + "learning_rate": 9.383202642729363e-06, + "loss": 0.4973, + "step": 4420 + }, + { + "epoch": 0.7333222976328423, + "grad_norm": 2.0614326000213623, + "learning_rate": 9.378560656522845e-06, + "loss": 0.5366, + "step": 4430 + }, + { + "epoch": 0.7349776527065055, + "grad_norm": 2.1177313327789307, + "learning_rate": 9.37390242519125e-06, + "loss": 0.5676, + "step": 4440 + }, + { + "epoch": 0.7366330077801688, + "grad_norm": 1.7760319709777832, + "learning_rate": 9.369227966017326e-06, + "loss": 0.4562, + "step": 4450 + }, + { + "epoch": 0.7382883628538321, + "grad_norm": 2.2758963108062744, + "learning_rate": 9.364537296344029e-06, + "loss": 0.5742, + "step": 4460 + }, + { + "epoch": 0.7399437179274955, + "grad_norm": 1.834656834602356, + "learning_rate": 9.359830433574451e-06, + "loss": 0.5405, + "step": 4470 + }, + { + "epoch": 0.7415990730011588, + "grad_norm": 1.730483889579773, + "learning_rate": 9.35510739517177e-06, + "loss": 0.5217, + "step": 4480 + }, + { + "epoch": 0.743254428074822, + "grad_norm": 2.00947642326355, + "learning_rate": 9.350368198659174e-06, + "loss": 0.5132, + "step": 4490 + }, + { + "epoch": 0.7449097831484853, + "grad_norm": 2.0409324169158936, + "learning_rate": 9.345612861619805e-06, + "loss": 0.5914, + "step": 4500 + }, + { + "epoch": 0.7465651382221486, + "grad_norm": 2.4126083850860596, + "learning_rate": 9.340841401696686e-06, + "loss": 0.5179, + "step": 4510 + }, + { + "epoch": 0.748220493295812, + "grad_norm": 1.9900450706481934, + "learning_rate": 9.336053836592653e-06, + "loss": 0.4732, + "step": 4520 + }, + { + "epoch": 0.7498758483694753, + "grad_norm": 2.0462656021118164, + "learning_rate": 9.331250184070307e-06, + "loss": 0.4873, + "step": 4530 + }, + { + "epoch": 0.7515312034431385, + "grad_norm": 2.1668014526367188, + "learning_rate": 9.326430461951922e-06, + "loss": 0.5338, + "step": 4540 + }, + { + "epoch": 0.7531865585168018, + "grad_norm": 2.973837375640869, + "learning_rate": 9.3215946881194e-06, + "loss": 0.5087, + "step": 4550 + }, + { + "epoch": 0.7548419135904652, + "grad_norm": 1.9659732580184937, + "learning_rate": 9.3167428805142e-06, + "loss": 0.5518, + "step": 4560 + }, + { + "epoch": 0.7564972686641285, + "grad_norm": 1.702977180480957, + "learning_rate": 9.311875057137263e-06, + "loss": 0.5058, + "step": 4570 + }, + { + "epoch": 0.7581526237377918, + "grad_norm": 1.8664321899414062, + "learning_rate": 9.306991236048953e-06, + "loss": 0.4974, + "step": 4580 + }, + { + "epoch": 0.759807978811455, + "grad_norm": 1.9654330015182495, + "learning_rate": 9.302091435368988e-06, + "loss": 0.5579, + "step": 4590 + }, + { + "epoch": 0.7614633338851183, + "grad_norm": 2.2180731296539307, + "learning_rate": 9.297175673276372e-06, + "loss": 0.5048, + "step": 4600 + }, + { + "epoch": 0.7631186889587817, + "grad_norm": 2.6190831661224365, + "learning_rate": 9.292243968009332e-06, + "loss": 0.5507, + "step": 4610 + }, + { + "epoch": 0.764774044032445, + "grad_norm": 2.6507656574249268, + "learning_rate": 9.28729633786524e-06, + "loss": 0.5322, + "step": 4620 + }, + { + "epoch": 0.7664293991061083, + "grad_norm": 2.1486687660217285, + "learning_rate": 9.282332801200557e-06, + "loss": 0.5358, + "step": 4630 + }, + { + "epoch": 0.7680847541797715, + "grad_norm": 2.3483917713165283, + "learning_rate": 9.277353376430758e-06, + "loss": 0.5372, + "step": 4640 + }, + { + "epoch": 0.7697401092534348, + "grad_norm": 1.9544059038162231, + "learning_rate": 9.272358082030263e-06, + "loss": 0.5375, + "step": 4650 + }, + { + "epoch": 0.7713954643270982, + "grad_norm": 2.1422975063323975, + "learning_rate": 9.267346936532377e-06, + "loss": 0.5314, + "step": 4660 + }, + { + "epoch": 0.7730508194007615, + "grad_norm": 2.089794874191284, + "learning_rate": 9.26231995852921e-06, + "loss": 0.5177, + "step": 4670 + }, + { + "epoch": 0.7747061744744248, + "grad_norm": 2.1427338123321533, + "learning_rate": 9.25727716667161e-06, + "loss": 0.5485, + "step": 4680 + }, + { + "epoch": 0.776361529548088, + "grad_norm": 1.6319947242736816, + "learning_rate": 9.252218579669105e-06, + "loss": 0.5343, + "step": 4690 + }, + { + "epoch": 0.7780168846217513, + "grad_norm": 2.030916452407837, + "learning_rate": 9.247144216289821e-06, + "loss": 0.5499, + "step": 4700 + }, + { + "epoch": 0.7796722396954147, + "grad_norm": 2.0921289920806885, + "learning_rate": 9.242054095360413e-06, + "loss": 0.4607, + "step": 4710 + }, + { + "epoch": 0.781327594769078, + "grad_norm": 1.9406028985977173, + "learning_rate": 9.236948235766004e-06, + "loss": 0.5819, + "step": 4720 + }, + { + "epoch": 0.7829829498427413, + "grad_norm": 2.238834857940674, + "learning_rate": 9.231826656450112e-06, + "loss": 0.4634, + "step": 4730 + }, + { + "epoch": 0.7846383049164045, + "grad_norm": 2.383314609527588, + "learning_rate": 9.226689376414571e-06, + "loss": 0.5393, + "step": 4740 + }, + { + "epoch": 0.7862936599900678, + "grad_norm": 2.1563191413879395, + "learning_rate": 9.221536414719472e-06, + "loss": 0.5372, + "step": 4750 + }, + { + "epoch": 0.7879490150637312, + "grad_norm": 2.5362823009490967, + "learning_rate": 9.216367790483085e-06, + "loss": 0.5739, + "step": 4760 + }, + { + "epoch": 0.7896043701373945, + "grad_norm": 2.164787769317627, + "learning_rate": 9.211183522881788e-06, + "loss": 0.5437, + "step": 4770 + }, + { + "epoch": 0.7912597252110578, + "grad_norm": 1.8392080068588257, + "learning_rate": 9.205983631150005e-06, + "loss": 0.5152, + "step": 4780 + }, + { + "epoch": 0.792915080284721, + "grad_norm": 2.01461124420166, + "learning_rate": 9.200768134580124e-06, + "loss": 0.5199, + "step": 4790 + }, + { + "epoch": 0.7945704353583843, + "grad_norm": 1.9222743511199951, + "learning_rate": 9.195537052522428e-06, + "loss": 0.5363, + "step": 4800 + }, + { + "epoch": 0.7962257904320477, + "grad_norm": 2.3261756896972656, + "learning_rate": 9.190290404385025e-06, + "loss": 0.5161, + "step": 4810 + }, + { + "epoch": 0.797881145505711, + "grad_norm": 1.8926310539245605, + "learning_rate": 9.18502820963378e-06, + "loss": 0.5086, + "step": 4820 + }, + { + "epoch": 0.7995365005793743, + "grad_norm": 2.2464699745178223, + "learning_rate": 9.179750487792232e-06, + "loss": 0.5446, + "step": 4830 + }, + { + "epoch": 0.8011918556530375, + "grad_norm": 2.444520950317383, + "learning_rate": 9.17445725844153e-06, + "loss": 0.5071, + "step": 4840 + }, + { + "epoch": 0.8028472107267008, + "grad_norm": 2.0580263137817383, + "learning_rate": 9.169148541220361e-06, + "loss": 0.5257, + "step": 4850 + }, + { + "epoch": 0.8045025658003642, + "grad_norm": 3.0564193725585938, + "learning_rate": 9.163824355824871e-06, + "loss": 0.5054, + "step": 4860 + }, + { + "epoch": 0.8061579208740275, + "grad_norm": 2.9118645191192627, + "learning_rate": 9.158484722008596e-06, + "loss": 0.5212, + "step": 4870 + }, + { + "epoch": 0.8078132759476908, + "grad_norm": 2.128197193145752, + "learning_rate": 9.15312965958239e-06, + "loss": 0.4898, + "step": 4880 + }, + { + "epoch": 0.8094686310213541, + "grad_norm": 2.0659148693084717, + "learning_rate": 9.147759188414348e-06, + "loss": 0.5409, + "step": 4890 + }, + { + "epoch": 0.8111239860950173, + "grad_norm": 2.312744140625, + "learning_rate": 9.142373328429733e-06, + "loss": 0.5638, + "step": 4900 + }, + { + "epoch": 0.8127793411686807, + "grad_norm": 2.416036367416382, + "learning_rate": 9.136972099610901e-06, + "loss": 0.5098, + "step": 4910 + }, + { + "epoch": 0.814434696242344, + "grad_norm": 2.6420984268188477, + "learning_rate": 9.131555521997236e-06, + "loss": 0.5312, + "step": 4920 + }, + { + "epoch": 0.8160900513160073, + "grad_norm": 2.3493874073028564, + "learning_rate": 9.126123615685061e-06, + "loss": 0.5222, + "step": 4930 + }, + { + "epoch": 0.8177454063896706, + "grad_norm": 2.002620220184326, + "learning_rate": 9.120676400827575e-06, + "loss": 0.5188, + "step": 4940 + }, + { + "epoch": 0.8194007614633338, + "grad_norm": 1.8725508451461792, + "learning_rate": 9.11521389763477e-06, + "loss": 0.5371, + "step": 4950 + }, + { + "epoch": 0.8210561165369972, + "grad_norm": 2.774479389190674, + "learning_rate": 9.109736126373364e-06, + "loss": 0.4711, + "step": 4960 + }, + { + "epoch": 0.8227114716106605, + "grad_norm": 1.985054850578308, + "learning_rate": 9.10424310736672e-06, + "loss": 0.5487, + "step": 4970 + }, + { + "epoch": 0.8243668266843238, + "grad_norm": 2.2153406143188477, + "learning_rate": 9.098734860994774e-06, + "loss": 0.5096, + "step": 4980 + }, + { + "epoch": 0.8260221817579871, + "grad_norm": 2.1796064376831055, + "learning_rate": 9.093211407693954e-06, + "loss": 0.5158, + "step": 4990 + }, + { + "epoch": 0.8276775368316504, + "grad_norm": 2.429814338684082, + "learning_rate": 9.087672767957114e-06, + "loss": 0.4919, + "step": 5000 + }, + { + "epoch": 0.8293328919053137, + "grad_norm": 2.0959694385528564, + "learning_rate": 9.082118962333445e-06, + "loss": 0.4874, + "step": 5010 + }, + { + "epoch": 0.830988246978977, + "grad_norm": 2.562439203262329, + "learning_rate": 9.076550011428415e-06, + "loss": 0.4963, + "step": 5020 + }, + { + "epoch": 0.8326436020526403, + "grad_norm": 2.4915542602539062, + "learning_rate": 9.070965935903672e-06, + "loss": 0.5219, + "step": 5030 + }, + { + "epoch": 0.8342989571263036, + "grad_norm": 2.3988428115844727, + "learning_rate": 9.065366756476987e-06, + "loss": 0.5804, + "step": 5040 + }, + { + "epoch": 0.8359543121999669, + "grad_norm": 2.0617735385894775, + "learning_rate": 9.059752493922165e-06, + "loss": 0.5585, + "step": 5050 + }, + { + "epoch": 0.8376096672736302, + "grad_norm": 2.474543571472168, + "learning_rate": 9.054123169068974e-06, + "loss": 0.5238, + "step": 5060 + }, + { + "epoch": 0.8392650223472935, + "grad_norm": 1.58362877368927, + "learning_rate": 9.048478802803062e-06, + "loss": 0.5623, + "step": 5070 + }, + { + "epoch": 0.8409203774209568, + "grad_norm": 1.7996039390563965, + "learning_rate": 9.042819416065888e-06, + "loss": 0.5107, + "step": 5080 + }, + { + "epoch": 0.8425757324946201, + "grad_norm": 2.7114357948303223, + "learning_rate": 9.037145029854637e-06, + "loss": 0.4495, + "step": 5090 + }, + { + "epoch": 0.8442310875682834, + "grad_norm": 2.9728622436523438, + "learning_rate": 9.03145566522214e-06, + "loss": 0.5361, + "step": 5100 + }, + { + "epoch": 0.8458864426419467, + "grad_norm": 2.092686891555786, + "learning_rate": 9.025751343276806e-06, + "loss": 0.4656, + "step": 5110 + }, + { + "epoch": 0.84754179771561, + "grad_norm": 2.272784471511841, + "learning_rate": 9.020032085182535e-06, + "loss": 0.546, + "step": 5120 + }, + { + "epoch": 0.8491971527892733, + "grad_norm": 2.0549049377441406, + "learning_rate": 9.014297912158645e-06, + "loss": 0.5034, + "step": 5130 + }, + { + "epoch": 0.8508525078629366, + "grad_norm": 1.5659208297729492, + "learning_rate": 9.008548845479787e-06, + "loss": 0.57, + "step": 5140 + }, + { + "epoch": 0.8525078629365999, + "grad_norm": 1.5937261581420898, + "learning_rate": 9.002784906475872e-06, + "loss": 0.4394, + "step": 5150 + }, + { + "epoch": 0.8541632180102632, + "grad_norm": 1.9835437536239624, + "learning_rate": 8.99700611653199e-06, + "loss": 0.492, + "step": 5160 + }, + { + "epoch": 0.8558185730839265, + "grad_norm": 2.044243574142456, + "learning_rate": 8.991212497088328e-06, + "loss": 0.5157, + "step": 5170 + }, + { + "epoch": 0.8574739281575898, + "grad_norm": 2.410057544708252, + "learning_rate": 8.985404069640096e-06, + "loss": 0.557, + "step": 5180 + }, + { + "epoch": 0.8591292832312531, + "grad_norm": 1.7656784057617188, + "learning_rate": 8.97958085573744e-06, + "loss": 0.5092, + "step": 5190 + }, + { + "epoch": 0.8607846383049164, + "grad_norm": 2.221282958984375, + "learning_rate": 8.973742876985369e-06, + "loss": 0.5198, + "step": 5200 + }, + { + "epoch": 0.8624399933785797, + "grad_norm": 1.847607135772705, + "learning_rate": 8.967890155043672e-06, + "loss": 0.516, + "step": 5210 + }, + { + "epoch": 0.864095348452243, + "grad_norm": 1.8600164651870728, + "learning_rate": 8.962022711626835e-06, + "loss": 0.4892, + "step": 5220 + }, + { + "epoch": 0.8657507035259063, + "grad_norm": 1.9303275346755981, + "learning_rate": 8.956140568503964e-06, + "loss": 0.5309, + "step": 5230 + }, + { + "epoch": 0.8674060585995697, + "grad_norm": 2.567194938659668, + "learning_rate": 8.950243747498704e-06, + "loss": 0.5249, + "step": 5240 + }, + { + "epoch": 0.8690614136732329, + "grad_norm": 2.3800063133239746, + "learning_rate": 8.944332270489156e-06, + "loss": 0.4972, + "step": 5250 + }, + { + "epoch": 0.8707167687468962, + "grad_norm": 1.9158269166946411, + "learning_rate": 8.938406159407798e-06, + "loss": 0.4903, + "step": 5260 + }, + { + "epoch": 0.8723721238205595, + "grad_norm": 2.1870522499084473, + "learning_rate": 8.932465436241403e-06, + "loss": 0.5044, + "step": 5270 + }, + { + "epoch": 0.8740274788942228, + "grad_norm": 2.8295249938964844, + "learning_rate": 8.926510123030955e-06, + "loss": 0.5025, + "step": 5280 + }, + { + "epoch": 0.8756828339678862, + "grad_norm": 2.0139286518096924, + "learning_rate": 8.92054024187157e-06, + "loss": 0.5451, + "step": 5290 + }, + { + "epoch": 0.8773381890415494, + "grad_norm": 1.9241939783096313, + "learning_rate": 8.914555814912416e-06, + "loss": 0.4705, + "step": 5300 + }, + { + "epoch": 0.8789935441152127, + "grad_norm": 1.9577219486236572, + "learning_rate": 8.908556864356625e-06, + "loss": 0.5262, + "step": 5310 + }, + { + "epoch": 0.880648899188876, + "grad_norm": 2.2654361724853516, + "learning_rate": 8.902543412461214e-06, + "loss": 0.4939, + "step": 5320 + }, + { + "epoch": 0.8823042542625393, + "grad_norm": 1.9551903009414673, + "learning_rate": 8.896515481537004e-06, + "loss": 0.5221, + "step": 5330 + }, + { + "epoch": 0.8839596093362027, + "grad_norm": 2.27404522895813, + "learning_rate": 8.890473093948532e-06, + "loss": 0.5371, + "step": 5340 + }, + { + "epoch": 0.8856149644098659, + "grad_norm": 2.3164634704589844, + "learning_rate": 8.884416272113977e-06, + "loss": 0.5328, + "step": 5350 + }, + { + "epoch": 0.8872703194835292, + "grad_norm": 1.646775484085083, + "learning_rate": 8.878345038505067e-06, + "loss": 0.5262, + "step": 5360 + }, + { + "epoch": 0.8889256745571925, + "grad_norm": 2.1168019771575928, + "learning_rate": 8.872259415646998e-06, + "loss": 0.5132, + "step": 5370 + }, + { + "epoch": 0.8905810296308558, + "grad_norm": 1.5265601873397827, + "learning_rate": 8.86615942611836e-06, + "loss": 0.4874, + "step": 5380 + }, + { + "epoch": 0.8922363847045192, + "grad_norm": 1.885336995124817, + "learning_rate": 8.860045092551034e-06, + "loss": 0.4704, + "step": 5390 + }, + { + "epoch": 0.8938917397781824, + "grad_norm": 2.109318256378174, + "learning_rate": 8.853916437630135e-06, + "loss": 0.4941, + "step": 5400 + }, + { + "epoch": 0.8955470948518457, + "grad_norm": 1.7149527072906494, + "learning_rate": 8.847773484093896e-06, + "loss": 0.5617, + "step": 5410 + }, + { + "epoch": 0.897202449925509, + "grad_norm": 2.4205265045166016, + "learning_rate": 8.84161625473361e-06, + "loss": 0.4991, + "step": 5420 + }, + { + "epoch": 0.8988578049991723, + "grad_norm": 2.258826971054077, + "learning_rate": 8.835444772393534e-06, + "loss": 0.5105, + "step": 5430 + }, + { + "epoch": 0.9005131600728357, + "grad_norm": 2.3182597160339355, + "learning_rate": 8.829259059970805e-06, + "loss": 0.5127, + "step": 5440 + }, + { + "epoch": 0.9021685151464989, + "grad_norm": 1.9707034826278687, + "learning_rate": 8.823059140415355e-06, + "loss": 0.4753, + "step": 5450 + }, + { + "epoch": 0.9038238702201622, + "grad_norm": 1.8292397260665894, + "learning_rate": 8.816845036729827e-06, + "loss": 0.4998, + "step": 5460 + }, + { + "epoch": 0.9054792252938255, + "grad_norm": 1.737146258354187, + "learning_rate": 8.81061677196949e-06, + "loss": 0.4546, + "step": 5470 + }, + { + "epoch": 0.9071345803674888, + "grad_norm": 1.6696151494979858, + "learning_rate": 8.80437436924215e-06, + "loss": 0.5443, + "step": 5480 + }, + { + "epoch": 0.9087899354411522, + "grad_norm": 2.815847635269165, + "learning_rate": 8.798117851708072e-06, + "loss": 0.5335, + "step": 5490 + }, + { + "epoch": 0.9104452905148154, + "grad_norm": 1.8941160440444946, + "learning_rate": 8.791847242579887e-06, + "loss": 0.5121, + "step": 5500 + }, + { + "epoch": 0.9121006455884787, + "grad_norm": 2.581604242324829, + "learning_rate": 8.785562565122505e-06, + "loss": 0.5301, + "step": 5510 + }, + { + "epoch": 0.913756000662142, + "grad_norm": 1.7762013673782349, + "learning_rate": 8.779263842653034e-06, + "loss": 0.5159, + "step": 5520 + }, + { + "epoch": 0.9154113557358053, + "grad_norm": 2.0996243953704834, + "learning_rate": 8.772951098540692e-06, + "loss": 0.5227, + "step": 5530 + }, + { + "epoch": 0.9170667108094687, + "grad_norm": 2.195647716522217, + "learning_rate": 8.76662435620672e-06, + "loss": 0.4409, + "step": 5540 + }, + { + "epoch": 0.9187220658831319, + "grad_norm": 3.163017749786377, + "learning_rate": 8.760283639124289e-06, + "loss": 0.5603, + "step": 5550 + }, + { + "epoch": 0.9203774209567952, + "grad_norm": 2.1786811351776123, + "learning_rate": 8.753928970818426e-06, + "loss": 0.5127, + "step": 5560 + }, + { + "epoch": 0.9220327760304585, + "grad_norm": 1.9021741151809692, + "learning_rate": 8.747560374865913e-06, + "loss": 0.5137, + "step": 5570 + }, + { + "epoch": 0.9236881311041218, + "grad_norm": 2.3319544792175293, + "learning_rate": 8.74117787489521e-06, + "loss": 0.515, + "step": 5580 + }, + { + "epoch": 0.9253434861777852, + "grad_norm": 2.084627389907837, + "learning_rate": 8.734781494586363e-06, + "loss": 0.5457, + "step": 5590 + }, + { + "epoch": 0.9269988412514485, + "grad_norm": 2.1023311614990234, + "learning_rate": 8.72837125767091e-06, + "loss": 0.5222, + "step": 5600 + }, + { + "epoch": 0.9286541963251117, + "grad_norm": 1.880613923072815, + "learning_rate": 8.721947187931807e-06, + "loss": 0.5315, + "step": 5610 + }, + { + "epoch": 0.930309551398775, + "grad_norm": 2.219151735305786, + "learning_rate": 8.715509309203327e-06, + "loss": 0.526, + "step": 5620 + }, + { + "epoch": 0.9319649064724383, + "grad_norm": 2.4242360591888428, + "learning_rate": 8.709057645370977e-06, + "loss": 0.4777, + "step": 5630 + }, + { + "epoch": 0.9336202615461017, + "grad_norm": 1.921246886253357, + "learning_rate": 8.702592220371413e-06, + "loss": 0.523, + "step": 5640 + }, + { + "epoch": 0.935275616619765, + "grad_norm": 2.206521987915039, + "learning_rate": 8.696113058192341e-06, + "loss": 0.5092, + "step": 5650 + }, + { + "epoch": 0.9369309716934282, + "grad_norm": 1.9734609127044678, + "learning_rate": 8.689620182872435e-06, + "loss": 0.5065, + "step": 5660 + }, + { + "epoch": 0.9385863267670915, + "grad_norm": 1.9264705181121826, + "learning_rate": 8.683113618501256e-06, + "loss": 0.5365, + "step": 5670 + }, + { + "epoch": 0.9402416818407549, + "grad_norm": 2.4335641860961914, + "learning_rate": 8.676593389219137e-06, + "loss": 0.4719, + "step": 5680 + }, + { + "epoch": 0.9418970369144182, + "grad_norm": 2.1883111000061035, + "learning_rate": 8.670059519217124e-06, + "loss": 0.4727, + "step": 5690 + }, + { + "epoch": 0.9435523919880815, + "grad_norm": 2.0516345500946045, + "learning_rate": 8.663512032736868e-06, + "loss": 0.5501, + "step": 5700 + }, + { + "epoch": 0.9452077470617447, + "grad_norm": 2.072754144668579, + "learning_rate": 8.656950954070536e-06, + "loss": 0.5249, + "step": 5710 + }, + { + "epoch": 0.946863102135408, + "grad_norm": 1.922422170639038, + "learning_rate": 8.650376307560726e-06, + "loss": 0.5304, + "step": 5720 + }, + { + "epoch": 0.9485184572090714, + "grad_norm": 1.9621427059173584, + "learning_rate": 8.643788117600376e-06, + "loss": 0.5281, + "step": 5730 + }, + { + "epoch": 0.9501738122827347, + "grad_norm": 2.2022109031677246, + "learning_rate": 8.637186408632673e-06, + "loss": 0.4689, + "step": 5740 + }, + { + "epoch": 0.951829167356398, + "grad_norm": 2.15291690826416, + "learning_rate": 8.630571205150957e-06, + "loss": 0.5009, + "step": 5750 + }, + { + "epoch": 0.9534845224300612, + "grad_norm": 2.6725265979766846, + "learning_rate": 8.62394253169864e-06, + "loss": 0.4522, + "step": 5760 + }, + { + "epoch": 0.9551398775037245, + "grad_norm": 2.1421377658843994, + "learning_rate": 8.617300412869105e-06, + "loss": 0.5002, + "step": 5770 + }, + { + "epoch": 0.9567952325773879, + "grad_norm": 1.951465368270874, + "learning_rate": 8.610644873305625e-06, + "loss": 0.4692, + "step": 5780 + }, + { + "epoch": 0.9584505876510512, + "grad_norm": 2.3892083168029785, + "learning_rate": 8.60397593770126e-06, + "loss": 0.485, + "step": 5790 + }, + { + "epoch": 0.9601059427247145, + "grad_norm": 2.2727770805358887, + "learning_rate": 8.597293630798776e-06, + "loss": 0.5378, + "step": 5800 + }, + { + "epoch": 0.9617612977983777, + "grad_norm": 1.7045542001724243, + "learning_rate": 8.590597977390542e-06, + "loss": 0.4663, + "step": 5810 + }, + { + "epoch": 0.963416652872041, + "grad_norm": 2.008007526397705, + "learning_rate": 8.583889002318455e-06, + "loss": 0.5119, + "step": 5820 + }, + { + "epoch": 0.9650720079457044, + "grad_norm": 1.846248745918274, + "learning_rate": 8.57716673047383e-06, + "loss": 0.5105, + "step": 5830 + }, + { + "epoch": 0.9667273630193677, + "grad_norm": 3.5718061923980713, + "learning_rate": 8.570431186797314e-06, + "loss": 0.4927, + "step": 5840 + }, + { + "epoch": 0.968382718093031, + "grad_norm": 2.1415624618530273, + "learning_rate": 8.563682396278799e-06, + "loss": 0.4978, + "step": 5850 + }, + { + "epoch": 0.9700380731666942, + "grad_norm": 2.2250213623046875, + "learning_rate": 8.556920383957322e-06, + "loss": 0.4732, + "step": 5860 + }, + { + "epoch": 0.9716934282403575, + "grad_norm": 2.6127257347106934, + "learning_rate": 8.550145174920977e-06, + "loss": 0.4866, + "step": 5870 + }, + { + "epoch": 0.9733487833140209, + "grad_norm": 2.1522982120513916, + "learning_rate": 8.543356794306818e-06, + "loss": 0.5143, + "step": 5880 + }, + { + "epoch": 0.9750041383876842, + "grad_norm": 2.036477565765381, + "learning_rate": 8.536555267300766e-06, + "loss": 0.5438, + "step": 5890 + }, + { + "epoch": 0.9766594934613475, + "grad_norm": 2.8033342361450195, + "learning_rate": 8.529740619137523e-06, + "loss": 0.514, + "step": 5900 + }, + { + "epoch": 0.9783148485350107, + "grad_norm": 2.0152909755706787, + "learning_rate": 8.522912875100467e-06, + "loss": 0.5071, + "step": 5910 + }, + { + "epoch": 0.979970203608674, + "grad_norm": 2.0931289196014404, + "learning_rate": 8.516072060521566e-06, + "loss": 0.5031, + "step": 5920 + }, + { + "epoch": 0.9816255586823374, + "grad_norm": 1.396856427192688, + "learning_rate": 8.509218200781278e-06, + "loss": 0.5602, + "step": 5930 + }, + { + "epoch": 0.9832809137560007, + "grad_norm": 2.251620054244995, + "learning_rate": 8.502351321308468e-06, + "loss": 0.5185, + "step": 5940 + }, + { + "epoch": 0.984936268829664, + "grad_norm": 2.3058903217315674, + "learning_rate": 8.4954714475803e-06, + "loss": 0.4617, + "step": 5950 + }, + { + "epoch": 0.9865916239033272, + "grad_norm": 2.428722620010376, + "learning_rate": 8.488578605122149e-06, + "loss": 0.5123, + "step": 5960 + }, + { + "epoch": 0.9882469789769905, + "grad_norm": 2.9448606967926025, + "learning_rate": 8.48167281950751e-06, + "loss": 0.4912, + "step": 5970 + }, + { + "epoch": 0.9899023340506539, + "grad_norm": 1.9352291822433472, + "learning_rate": 8.474754116357895e-06, + "loss": 0.4906, + "step": 5980 + }, + { + "epoch": 0.9915576891243172, + "grad_norm": 2.116455078125, + "learning_rate": 8.467822521342744e-06, + "loss": 0.4532, + "step": 5990 + }, + { + "epoch": 0.9932130441979805, + "grad_norm": 1.96929931640625, + "learning_rate": 8.460878060179326e-06, + "loss": 0.4909, + "step": 6000 + }, + { + "epoch": 0.9948683992716437, + "grad_norm": 2.103734254837036, + "learning_rate": 8.45392075863265e-06, + "loss": 0.4986, + "step": 6010 + }, + { + "epoch": 0.996523754345307, + "grad_norm": 1.8520708084106445, + "learning_rate": 8.446950642515359e-06, + "loss": 0.4959, + "step": 6020 + }, + { + "epoch": 0.9981791094189704, + "grad_norm": 1.5398201942443848, + "learning_rate": 8.439967737687642e-06, + "loss": 0.5061, + "step": 6030 + }, + { + "epoch": 0.9998344644926337, + "grad_norm": 2.7384068965911865, + "learning_rate": 8.432972070057137e-06, + "loss": 0.4164, + "step": 6040 + }, + { + "epoch": 1.001489819566297, + "grad_norm": 1.699214220046997, + "learning_rate": 8.425963665578833e-06, + "loss": 0.4717, + "step": 6050 + }, + { + "epoch": 1.0031451746399602, + "grad_norm": 2.7517991065979004, + "learning_rate": 8.418942550254978e-06, + "loss": 0.4783, + "step": 6060 + }, + { + "epoch": 1.0048005297136235, + "grad_norm": 2.293407440185547, + "learning_rate": 8.411908750134973e-06, + "loss": 0.428, + "step": 6070 + }, + { + "epoch": 1.0064558847872869, + "grad_norm": 2.3699285984039307, + "learning_rate": 8.404862291315287e-06, + "loss": 0.4529, + "step": 6080 + }, + { + "epoch": 1.0081112398609502, + "grad_norm": 2.27496337890625, + "learning_rate": 8.39780319993935e-06, + "loss": 0.4486, + "step": 6090 + }, + { + "epoch": 1.0097665949346135, + "grad_norm": 2.0127158164978027, + "learning_rate": 8.390731502197465e-06, + "loss": 0.4529, + "step": 6100 + }, + { + "epoch": 1.0114219500082768, + "grad_norm": 2.6068265438079834, + "learning_rate": 8.383647224326704e-06, + "loss": 0.4397, + "step": 6110 + }, + { + "epoch": 1.0130773050819402, + "grad_norm": 1.9427989721298218, + "learning_rate": 8.376550392610813e-06, + "loss": 0.4236, + "step": 6120 + }, + { + "epoch": 1.0147326601556035, + "grad_norm": 2.082646369934082, + "learning_rate": 8.369441033380119e-06, + "loss": 0.4352, + "step": 6130 + }, + { + "epoch": 1.0163880152292666, + "grad_norm": 2.0163888931274414, + "learning_rate": 8.362319173011421e-06, + "loss": 0.4359, + "step": 6140 + }, + { + "epoch": 1.01804337030293, + "grad_norm": 1.7320841550827026, + "learning_rate": 8.355184837927906e-06, + "loss": 0.4454, + "step": 6150 + }, + { + "epoch": 1.0196987253765932, + "grad_norm": 1.9709644317626953, + "learning_rate": 8.348038054599037e-06, + "loss": 0.4593, + "step": 6160 + }, + { + "epoch": 1.0213540804502566, + "grad_norm": 2.382404327392578, + "learning_rate": 8.340878849540466e-06, + "loss": 0.3941, + "step": 6170 + }, + { + "epoch": 1.0230094355239199, + "grad_norm": 2.566659450531006, + "learning_rate": 8.333707249313933e-06, + "loss": 0.4474, + "step": 6180 + }, + { + "epoch": 1.0246647905975832, + "grad_norm": 1.6790319681167603, + "learning_rate": 8.326523280527165e-06, + "loss": 0.437, + "step": 6190 + }, + { + "epoch": 1.0263201456712465, + "grad_norm": 1.9605789184570312, + "learning_rate": 8.319326969833776e-06, + "loss": 0.4346, + "step": 6200 + }, + { + "epoch": 1.0279755007449098, + "grad_norm": 1.5337015390396118, + "learning_rate": 8.312118343933172e-06, + "loss": 0.4329, + "step": 6210 + }, + { + "epoch": 1.0296308558185732, + "grad_norm": 2.0103812217712402, + "learning_rate": 8.304897429570448e-06, + "loss": 0.4199, + "step": 6220 + }, + { + "epoch": 1.0312862108922365, + "grad_norm": 3.106372594833374, + "learning_rate": 8.297664253536296e-06, + "loss": 0.4429, + "step": 6230 + }, + { + "epoch": 1.0329415659658996, + "grad_norm": 2.3438146114349365, + "learning_rate": 8.290418842666894e-06, + "loss": 0.4588, + "step": 6240 + }, + { + "epoch": 1.034596921039563, + "grad_norm": 1.6596814393997192, + "learning_rate": 8.28316122384382e-06, + "loss": 0.4314, + "step": 6250 + }, + { + "epoch": 1.0362522761132262, + "grad_norm": 2.1829092502593994, + "learning_rate": 8.275891423993943e-06, + "loss": 0.4292, + "step": 6260 + }, + { + "epoch": 1.0379076311868896, + "grad_norm": 1.525658130645752, + "learning_rate": 8.268609470089322e-06, + "loss": 0.4206, + "step": 6270 + }, + { + "epoch": 1.0395629862605529, + "grad_norm": 1.846199870109558, + "learning_rate": 8.261315389147113e-06, + "loss": 0.3781, + "step": 6280 + }, + { + "epoch": 1.0412183413342162, + "grad_norm": 2.2340259552001953, + "learning_rate": 8.254009208229464e-06, + "loss": 0.4107, + "step": 6290 + }, + { + "epoch": 1.0428736964078795, + "grad_norm": 2.0799405574798584, + "learning_rate": 8.246690954443416e-06, + "loss": 0.3953, + "step": 6300 + }, + { + "epoch": 1.0445290514815428, + "grad_norm": 2.073094367980957, + "learning_rate": 8.239360654940803e-06, + "loss": 0.4502, + "step": 6310 + }, + { + "epoch": 1.0461844065552062, + "grad_norm": 2.092500925064087, + "learning_rate": 8.232018336918145e-06, + "loss": 0.4067, + "step": 6320 + }, + { + "epoch": 1.0478397616288695, + "grad_norm": 2.2009036540985107, + "learning_rate": 8.224664027616565e-06, + "loss": 0.4153, + "step": 6330 + }, + { + "epoch": 1.0494951167025326, + "grad_norm": 2.1179239749908447, + "learning_rate": 8.217297754321661e-06, + "loss": 0.3964, + "step": 6340 + }, + { + "epoch": 1.051150471776196, + "grad_norm": 1.9350873231887817, + "learning_rate": 8.209919544363428e-06, + "loss": 0.4776, + "step": 6350 + }, + { + "epoch": 1.0528058268498592, + "grad_norm": 2.0609846115112305, + "learning_rate": 8.202529425116145e-06, + "loss": 0.4193, + "step": 6360 + }, + { + "epoch": 1.0544611819235226, + "grad_norm": 2.755305528640747, + "learning_rate": 8.195127423998279e-06, + "loss": 0.4682, + "step": 6370 + }, + { + "epoch": 1.0561165369971859, + "grad_norm": 1.6468199491500854, + "learning_rate": 8.187713568472375e-06, + "loss": 0.4609, + "step": 6380 + }, + { + "epoch": 1.0577718920708492, + "grad_norm": 2.1090316772460938, + "learning_rate": 8.180287886044967e-06, + "loss": 0.4468, + "step": 6390 + }, + { + "epoch": 1.0594272471445125, + "grad_norm": 3.3360707759857178, + "learning_rate": 8.172850404266462e-06, + "loss": 0.4611, + "step": 6400 + }, + { + "epoch": 1.0610826022181759, + "grad_norm": 1.8891501426696777, + "learning_rate": 8.165401150731045e-06, + "loss": 0.42, + "step": 6410 + }, + { + "epoch": 1.0627379572918392, + "grad_norm": 2.2766528129577637, + "learning_rate": 8.157940153076582e-06, + "loss": 0.3972, + "step": 6420 + }, + { + "epoch": 1.0643933123655025, + "grad_norm": 2.1425061225891113, + "learning_rate": 8.150467438984507e-06, + "loss": 0.4378, + "step": 6430 + }, + { + "epoch": 1.0660486674391656, + "grad_norm": 2.0477399826049805, + "learning_rate": 8.142983036179723e-06, + "loss": 0.4627, + "step": 6440 + }, + { + "epoch": 1.067704022512829, + "grad_norm": 1.6100428104400635, + "learning_rate": 8.135486972430502e-06, + "loss": 0.415, + "step": 6450 + }, + { + "epoch": 1.0693593775864922, + "grad_norm": 2.3228981494903564, + "learning_rate": 8.127979275548376e-06, + "loss": 0.4399, + "step": 6460 + }, + { + "epoch": 1.0710147326601556, + "grad_norm": 2.075024127960205, + "learning_rate": 8.120459973388046e-06, + "loss": 0.4679, + "step": 6470 + }, + { + "epoch": 1.0726700877338189, + "grad_norm": 2.06042218208313, + "learning_rate": 8.112929093847262e-06, + "loss": 0.3853, + "step": 6480 + }, + { + "epoch": 1.0743254428074822, + "grad_norm": 2.618030071258545, + "learning_rate": 8.105386664866732e-06, + "loss": 0.4163, + "step": 6490 + }, + { + "epoch": 1.0759807978811455, + "grad_norm": 2.011997938156128, + "learning_rate": 8.09783271443001e-06, + "loss": 0.4632, + "step": 6500 + }, + { + "epoch": 1.0776361529548089, + "grad_norm": 1.7011245489120483, + "learning_rate": 8.090267270563403e-06, + "loss": 0.422, + "step": 6510 + }, + { + "epoch": 1.0792915080284722, + "grad_norm": 2.1394636631011963, + "learning_rate": 8.082690361335857e-06, + "loss": 0.4856, + "step": 6520 + }, + { + "epoch": 1.0809468631021355, + "grad_norm": 1.878475308418274, + "learning_rate": 8.075102014858854e-06, + "loss": 0.4819, + "step": 6530 + }, + { + "epoch": 1.0826022181757988, + "grad_norm": 2.2121503353118896, + "learning_rate": 8.067502259286313e-06, + "loss": 0.4312, + "step": 6540 + }, + { + "epoch": 1.084257573249462, + "grad_norm": 2.6810719966888428, + "learning_rate": 8.059891122814481e-06, + "loss": 0.4271, + "step": 6550 + }, + { + "epoch": 1.0859129283231252, + "grad_norm": 2.5359325408935547, + "learning_rate": 8.05226863368183e-06, + "loss": 0.4325, + "step": 6560 + }, + { + "epoch": 1.0875682833967886, + "grad_norm": 2.2482380867004395, + "learning_rate": 8.044634820168954e-06, + "loss": 0.4763, + "step": 6570 + }, + { + "epoch": 1.089223638470452, + "grad_norm": 1.8877545595169067, + "learning_rate": 8.036989710598458e-06, + "loss": 0.4688, + "step": 6580 + }, + { + "epoch": 1.0908789935441152, + "grad_norm": 2.3584036827087402, + "learning_rate": 8.029333333334863e-06, + "loss": 0.4688, + "step": 6590 + }, + { + "epoch": 1.0925343486177785, + "grad_norm": 2.0854368209838867, + "learning_rate": 8.02166571678449e-06, + "loss": 0.4252, + "step": 6600 + }, + { + "epoch": 1.0941897036914419, + "grad_norm": 1.9038151502609253, + "learning_rate": 8.01398688939536e-06, + "loss": 0.4127, + "step": 6610 + }, + { + "epoch": 1.0958450587651052, + "grad_norm": 1.8824822902679443, + "learning_rate": 8.00629687965709e-06, + "loss": 0.4672, + "step": 6620 + }, + { + "epoch": 1.0975004138387685, + "grad_norm": 2.430506467819214, + "learning_rate": 7.998595716100783e-06, + "loss": 0.4449, + "step": 6630 + }, + { + "epoch": 1.0991557689124316, + "grad_norm": 2.1402385234832764, + "learning_rate": 7.990883427298927e-06, + "loss": 0.4782, + "step": 6640 + }, + { + "epoch": 1.100811123986095, + "grad_norm": 2.3543331623077393, + "learning_rate": 7.983160041865285e-06, + "loss": 0.4212, + "step": 6650 + }, + { + "epoch": 1.1024664790597583, + "grad_norm": 2.142435312271118, + "learning_rate": 7.975425588454788e-06, + "loss": 0.4939, + "step": 6660 + }, + { + "epoch": 1.1041218341334216, + "grad_norm": 2.1134002208709717, + "learning_rate": 7.967680095763434e-06, + "loss": 0.4463, + "step": 6670 + }, + { + "epoch": 1.105777189207085, + "grad_norm": 2.716151237487793, + "learning_rate": 7.959923592528177e-06, + "loss": 0.4561, + "step": 6680 + }, + { + "epoch": 1.1074325442807482, + "grad_norm": 1.9613409042358398, + "learning_rate": 7.952156107526826e-06, + "loss": 0.4141, + "step": 6690 + }, + { + "epoch": 1.1090878993544115, + "grad_norm": 1.8834747076034546, + "learning_rate": 7.944377669577924e-06, + "loss": 0.4361, + "step": 6700 + }, + { + "epoch": 1.1107432544280749, + "grad_norm": 1.4934910535812378, + "learning_rate": 7.93658830754066e-06, + "loss": 0.4413, + "step": 6710 + }, + { + "epoch": 1.1123986095017382, + "grad_norm": 1.9365501403808594, + "learning_rate": 7.928788050314751e-06, + "loss": 0.4656, + "step": 6720 + }, + { + "epoch": 1.1140539645754015, + "grad_norm": 1.9965449571609497, + "learning_rate": 7.920976926840334e-06, + "loss": 0.4671, + "step": 6730 + }, + { + "epoch": 1.1157093196490648, + "grad_norm": 1.9242488145828247, + "learning_rate": 7.913154966097865e-06, + "loss": 0.4345, + "step": 6740 + }, + { + "epoch": 1.117364674722728, + "grad_norm": 1.4173060655593872, + "learning_rate": 7.905322197108006e-06, + "loss": 0.4674, + "step": 6750 + }, + { + "epoch": 1.1190200297963913, + "grad_norm": 1.9419013261795044, + "learning_rate": 7.897478648931521e-06, + "loss": 0.4157, + "step": 6760 + }, + { + "epoch": 1.1206753848700546, + "grad_norm": 1.910142421722412, + "learning_rate": 7.889624350669162e-06, + "loss": 0.4764, + "step": 6770 + }, + { + "epoch": 1.122330739943718, + "grad_norm": 1.9535759687423706, + "learning_rate": 7.88175933146157e-06, + "loss": 0.4167, + "step": 6780 + }, + { + "epoch": 1.1239860950173812, + "grad_norm": 2.105318784713745, + "learning_rate": 7.873883620489164e-06, + "loss": 0.4169, + "step": 6790 + }, + { + "epoch": 1.1256414500910445, + "grad_norm": 2.383732318878174, + "learning_rate": 7.865997246972023e-06, + "loss": 0.4533, + "step": 6800 + }, + { + "epoch": 1.1272968051647079, + "grad_norm": 1.8368053436279297, + "learning_rate": 7.858100240169792e-06, + "loss": 0.4594, + "step": 6810 + }, + { + "epoch": 1.1289521602383712, + "grad_norm": 1.8236784934997559, + "learning_rate": 7.850192629381568e-06, + "loss": 0.4501, + "step": 6820 + }, + { + "epoch": 1.1306075153120345, + "grad_norm": 1.8940480947494507, + "learning_rate": 7.842274443945785e-06, + "loss": 0.3969, + "step": 6830 + }, + { + "epoch": 1.1322628703856976, + "grad_norm": 2.2269837856292725, + "learning_rate": 7.834345713240114e-06, + "loss": 0.4762, + "step": 6840 + }, + { + "epoch": 1.133918225459361, + "grad_norm": 2.336608409881592, + "learning_rate": 7.826406466681354e-06, + "loss": 0.4974, + "step": 6850 + }, + { + "epoch": 1.1355735805330243, + "grad_norm": 2.0093603134155273, + "learning_rate": 7.81845673372531e-06, + "loss": 0.4577, + "step": 6860 + }, + { + "epoch": 1.1372289356066876, + "grad_norm": 1.781472086906433, + "learning_rate": 7.810496543866704e-06, + "loss": 0.434, + "step": 6870 + }, + { + "epoch": 1.138884290680351, + "grad_norm": 3.0077342987060547, + "learning_rate": 7.802525926639045e-06, + "loss": 0.4651, + "step": 6880 + }, + { + "epoch": 1.1405396457540142, + "grad_norm": 2.0327229499816895, + "learning_rate": 7.794544911614537e-06, + "loss": 0.4405, + "step": 6890 + }, + { + "epoch": 1.1421950008276776, + "grad_norm": 1.7821011543273926, + "learning_rate": 7.786553528403954e-06, + "loss": 0.4293, + "step": 6900 + }, + { + "epoch": 1.1438503559013409, + "grad_norm": 2.4968810081481934, + "learning_rate": 7.778551806656546e-06, + "loss": 0.4273, + "step": 6910 + }, + { + "epoch": 1.1455057109750042, + "grad_norm": 2.041292905807495, + "learning_rate": 7.770539776059914e-06, + "loss": 0.4605, + "step": 6920 + }, + { + "epoch": 1.1471610660486675, + "grad_norm": 2.0464673042297363, + "learning_rate": 7.762517466339905e-06, + "loss": 0.4407, + "step": 6930 + }, + { + "epoch": 1.1488164211223308, + "grad_norm": 1.8867651224136353, + "learning_rate": 7.754484907260513e-06, + "loss": 0.4575, + "step": 6940 + }, + { + "epoch": 1.150471776195994, + "grad_norm": 1.7853606939315796, + "learning_rate": 7.74644212862375e-06, + "loss": 0.4158, + "step": 6950 + }, + { + "epoch": 1.1521271312696573, + "grad_norm": 1.7646677494049072, + "learning_rate": 7.738389160269542e-06, + "loss": 0.4148, + "step": 6960 + }, + { + "epoch": 1.1537824863433206, + "grad_norm": 1.729858636856079, + "learning_rate": 7.73032603207563e-06, + "loss": 0.4153, + "step": 6970 + }, + { + "epoch": 1.155437841416984, + "grad_norm": 2.1216416358947754, + "learning_rate": 7.722252773957442e-06, + "loss": 0.4578, + "step": 6980 + }, + { + "epoch": 1.1570931964906472, + "grad_norm": 2.449660062789917, + "learning_rate": 7.714169415867991e-06, + "loss": 0.4396, + "step": 6990 + }, + { + "epoch": 1.1587485515643106, + "grad_norm": 2.7764759063720703, + "learning_rate": 7.706075987797767e-06, + "loss": 0.4814, + "step": 7000 + }, + { + "epoch": 1.1604039066379739, + "grad_norm": 1.6688860654830933, + "learning_rate": 7.697972519774612e-06, + "loss": 0.3879, + "step": 7010 + }, + { + "epoch": 1.1620592617116372, + "grad_norm": 2.147960662841797, + "learning_rate": 7.689859041863628e-06, + "loss": 0.4821, + "step": 7020 + }, + { + "epoch": 1.1637146167853005, + "grad_norm": 1.482928991317749, + "learning_rate": 7.681735584167048e-06, + "loss": 0.4695, + "step": 7030 + }, + { + "epoch": 1.1653699718589636, + "grad_norm": 3.1044204235076904, + "learning_rate": 7.673602176824134e-06, + "loss": 0.4205, + "step": 7040 + }, + { + "epoch": 1.167025326932627, + "grad_norm": 1.5646145343780518, + "learning_rate": 7.665458850011062e-06, + "loss": 0.4767, + "step": 7050 + }, + { + "epoch": 1.1686806820062903, + "grad_norm": 2.1758921146392822, + "learning_rate": 7.657305633940816e-06, + "loss": 0.4152, + "step": 7060 + }, + { + "epoch": 1.1703360370799536, + "grad_norm": 2.1989285945892334, + "learning_rate": 7.649142558863056e-06, + "loss": 0.4307, + "step": 7070 + }, + { + "epoch": 1.171991392153617, + "grad_norm": 2.195272207260132, + "learning_rate": 7.640969655064042e-06, + "loss": 0.4449, + "step": 7080 + }, + { + "epoch": 1.1736467472272802, + "grad_norm": 2.0945467948913574, + "learning_rate": 7.63278695286648e-06, + "loss": 0.4225, + "step": 7090 + }, + { + "epoch": 1.1753021023009436, + "grad_norm": 1.938080906867981, + "learning_rate": 7.624594482629442e-06, + "loss": 0.4001, + "step": 7100 + }, + { + "epoch": 1.1769574573746069, + "grad_norm": 1.8821436166763306, + "learning_rate": 7.616392274748235e-06, + "loss": 0.3847, + "step": 7110 + }, + { + "epoch": 1.1786128124482702, + "grad_norm": 2.344076633453369, + "learning_rate": 7.608180359654298e-06, + "loss": 0.4675, + "step": 7120 + }, + { + "epoch": 1.1802681675219335, + "grad_norm": 1.977381944656372, + "learning_rate": 7.599958767815081e-06, + "loss": 0.4387, + "step": 7130 + }, + { + "epoch": 1.1819235225955969, + "grad_norm": 2.3042211532592773, + "learning_rate": 7.591727529733941e-06, + "loss": 0.459, + "step": 7140 + }, + { + "epoch": 1.1835788776692602, + "grad_norm": 1.93119478225708, + "learning_rate": 7.583486675950021e-06, + "loss": 0.4625, + "step": 7150 + }, + { + "epoch": 1.1852342327429233, + "grad_norm": 1.6517666578292847, + "learning_rate": 7.575236237038136e-06, + "loss": 0.474, + "step": 7160 + }, + { + "epoch": 1.1868895878165866, + "grad_norm": 1.7987353801727295, + "learning_rate": 7.566976243608673e-06, + "loss": 0.4182, + "step": 7170 + }, + { + "epoch": 1.18854494289025, + "grad_norm": 2.2666845321655273, + "learning_rate": 7.558706726307459e-06, + "loss": 0.43, + "step": 7180 + }, + { + "epoch": 1.1902002979639132, + "grad_norm": 1.540936827659607, + "learning_rate": 7.55042771581566e-06, + "loss": 0.4305, + "step": 7190 + }, + { + "epoch": 1.1918556530375766, + "grad_norm": 1.6909763813018799, + "learning_rate": 7.542139242849664e-06, + "loss": 0.4059, + "step": 7200 + }, + { + "epoch": 1.19351100811124, + "grad_norm": 2.2971370220184326, + "learning_rate": 7.533841338160963e-06, + "loss": 0.4068, + "step": 7210 + }, + { + "epoch": 1.1951663631849032, + "grad_norm": 1.584033727645874, + "learning_rate": 7.525534032536044e-06, + "loss": 0.4416, + "step": 7220 + }, + { + "epoch": 1.1968217182585665, + "grad_norm": 1.8350242376327515, + "learning_rate": 7.517217356796272e-06, + "loss": 0.4115, + "step": 7230 + }, + { + "epoch": 1.1984770733322299, + "grad_norm": 1.7758424282073975, + "learning_rate": 7.508891341797777e-06, + "loss": 0.4243, + "step": 7240 + }, + { + "epoch": 1.200132428405893, + "grad_norm": 1.4368244409561157, + "learning_rate": 7.500556018431342e-06, + "loss": 0.4006, + "step": 7250 + }, + { + "epoch": 1.2017877834795563, + "grad_norm": 2.0176913738250732, + "learning_rate": 7.492211417622278e-06, + "loss": 0.4484, + "step": 7260 + }, + { + "epoch": 1.2034431385532196, + "grad_norm": 2.6586618423461914, + "learning_rate": 7.483857570330326e-06, + "loss": 0.4808, + "step": 7270 + }, + { + "epoch": 1.205098493626883, + "grad_norm": 2.3201072216033936, + "learning_rate": 7.475494507549526e-06, + "loss": 0.4636, + "step": 7280 + }, + { + "epoch": 1.2067538487005462, + "grad_norm": 2.491241693496704, + "learning_rate": 7.4671222603081115e-06, + "loss": 0.454, + "step": 7290 + }, + { + "epoch": 1.2084092037742096, + "grad_norm": 2.3110857009887695, + "learning_rate": 7.458740859668391e-06, + "loss": 0.4583, + "step": 7300 + }, + { + "epoch": 1.210064558847873, + "grad_norm": 2.2038140296936035, + "learning_rate": 7.450350336726635e-06, + "loss": 0.4352, + "step": 7310 + }, + { + "epoch": 1.2117199139215362, + "grad_norm": 1.6321648359298706, + "learning_rate": 7.441950722612957e-06, + "loss": 0.4219, + "step": 7320 + }, + { + "epoch": 1.2133752689951995, + "grad_norm": 1.9630392789840698, + "learning_rate": 7.433542048491201e-06, + "loss": 0.4495, + "step": 7330 + }, + { + "epoch": 1.2150306240688629, + "grad_norm": 2.2168731689453125, + "learning_rate": 7.4251243455588266e-06, + "loss": 0.4646, + "step": 7340 + }, + { + "epoch": 1.2166859791425262, + "grad_norm": 2.295635938644409, + "learning_rate": 7.416697645046789e-06, + "loss": 0.4543, + "step": 7350 + }, + { + "epoch": 1.2183413342161893, + "grad_norm": 2.0987188816070557, + "learning_rate": 7.408261978219426e-06, + "loss": 0.464, + "step": 7360 + }, + { + "epoch": 1.2199966892898526, + "grad_norm": 2.4366235733032227, + "learning_rate": 7.399817376374346e-06, + "loss": 0.4494, + "step": 7370 + }, + { + "epoch": 1.221652044363516, + "grad_norm": 2.054619073867798, + "learning_rate": 7.391363870842299e-06, + "loss": 0.4304, + "step": 7380 + }, + { + "epoch": 1.2233073994371793, + "grad_norm": 1.9227901697158813, + "learning_rate": 7.3829014929870805e-06, + "loss": 0.4561, + "step": 7390 + }, + { + "epoch": 1.2249627545108426, + "grad_norm": 1.8692800998687744, + "learning_rate": 7.374430274205395e-06, + "loss": 0.46, + "step": 7400 + }, + { + "epoch": 1.226618109584506, + "grad_norm": 2.2331535816192627, + "learning_rate": 7.3659502459267516e-06, + "loss": 0.3858, + "step": 7410 + }, + { + "epoch": 1.2282734646581692, + "grad_norm": 2.1220543384552, + "learning_rate": 7.357461439613341e-06, + "loss": 0.4511, + "step": 7420 + }, + { + "epoch": 1.2299288197318325, + "grad_norm": 1.9789310693740845, + "learning_rate": 7.348963886759926e-06, + "loss": 0.4326, + "step": 7430 + }, + { + "epoch": 1.2315841748054959, + "grad_norm": 1.7887349128723145, + "learning_rate": 7.340457618893717e-06, + "loss": 0.4375, + "step": 7440 + }, + { + "epoch": 1.233239529879159, + "grad_norm": 2.1581339836120605, + "learning_rate": 7.331942667574262e-06, + "loss": 0.4178, + "step": 7450 + }, + { + "epoch": 1.2348948849528223, + "grad_norm": 2.1266531944274902, + "learning_rate": 7.323419064393321e-06, + "loss": 0.4299, + "step": 7460 + }, + { + "epoch": 1.2365502400264856, + "grad_norm": 1.832108974456787, + "learning_rate": 7.3148868409747585e-06, + "loss": 0.429, + "step": 7470 + }, + { + "epoch": 1.238205595100149, + "grad_norm": 1.735487937927246, + "learning_rate": 7.306346028974418e-06, + "loss": 0.4511, + "step": 7480 + }, + { + "epoch": 1.2398609501738123, + "grad_norm": 2.0371644496917725, + "learning_rate": 7.297796660080011e-06, + "loss": 0.471, + "step": 7490 + }, + { + "epoch": 1.2415163052474756, + "grad_norm": 2.0136220455169678, + "learning_rate": 7.289238766010992e-06, + "loss": 0.507, + "step": 7500 + }, + { + "epoch": 1.243171660321139, + "grad_norm": 1.9500830173492432, + "learning_rate": 7.280672378518449e-06, + "loss": 0.461, + "step": 7510 + }, + { + "epoch": 1.2448270153948022, + "grad_norm": 2.5668957233428955, + "learning_rate": 7.2720975293849824e-06, + "loss": 0.4572, + "step": 7520 + }, + { + "epoch": 1.2464823704684656, + "grad_norm": 1.870226263999939, + "learning_rate": 7.263514250424582e-06, + "loss": 0.4433, + "step": 7530 + }, + { + "epoch": 1.2481377255421289, + "grad_norm": 1.7757699489593506, + "learning_rate": 7.254922573482518e-06, + "loss": 0.4711, + "step": 7540 + }, + { + "epoch": 1.2497930806157922, + "grad_norm": 2.230046510696411, + "learning_rate": 7.246322530435217e-06, + "loss": 0.418, + "step": 7550 + }, + { + "epoch": 1.2514484356894555, + "grad_norm": 1.9823356866836548, + "learning_rate": 7.237714153190143e-06, + "loss": 0.4646, + "step": 7560 + }, + { + "epoch": 1.2531037907631186, + "grad_norm": 2.5432960987091064, + "learning_rate": 7.229097473685686e-06, + "loss": 0.457, + "step": 7570 + }, + { + "epoch": 1.254759145836782, + "grad_norm": 1.8689072132110596, + "learning_rate": 7.220472523891035e-06, + "loss": 0.4398, + "step": 7580 + }, + { + "epoch": 1.2564145009104453, + "grad_norm": 2.0174477100372314, + "learning_rate": 7.211839335806061e-06, + "loss": 0.4594, + "step": 7590 + }, + { + "epoch": 1.2580698559841086, + "grad_norm": 2.0547597408294678, + "learning_rate": 7.203197941461206e-06, + "loss": 0.3943, + "step": 7600 + }, + { + "epoch": 1.259725211057772, + "grad_norm": 1.7089883089065552, + "learning_rate": 7.194548372917356e-06, + "loss": 0.4635, + "step": 7610 + }, + { + "epoch": 1.2613805661314352, + "grad_norm": 1.6787538528442383, + "learning_rate": 7.185890662265721e-06, + "loss": 0.3988, + "step": 7620 + }, + { + "epoch": 1.2630359212050986, + "grad_norm": 2.023766279220581, + "learning_rate": 7.177224841627724e-06, + "loss": 0.4548, + "step": 7630 + }, + { + "epoch": 1.2646912762787617, + "grad_norm": 1.8672133684158325, + "learning_rate": 7.168550943154877e-06, + "loss": 0.4203, + "step": 7640 + }, + { + "epoch": 1.266346631352425, + "grad_norm": 2.4731225967407227, + "learning_rate": 7.159868999028658e-06, + "loss": 0.4652, + "step": 7650 + }, + { + "epoch": 1.2680019864260883, + "grad_norm": 1.876103401184082, + "learning_rate": 7.151179041460402e-06, + "loss": 0.4101, + "step": 7660 + }, + { + "epoch": 1.2696573414997516, + "grad_norm": 1.7336941957473755, + "learning_rate": 7.142481102691167e-06, + "loss": 0.3927, + "step": 7670 + }, + { + "epoch": 1.271312696573415, + "grad_norm": 1.5859344005584717, + "learning_rate": 7.133775214991632e-06, + "loss": 0.4542, + "step": 7680 + }, + { + "epoch": 1.2729680516470783, + "grad_norm": 1.7735705375671387, + "learning_rate": 7.125061410661959e-06, + "loss": 0.4417, + "step": 7690 + }, + { + "epoch": 1.2746234067207416, + "grad_norm": 2.3329756259918213, + "learning_rate": 7.1163397220316865e-06, + "loss": 0.4277, + "step": 7700 + }, + { + "epoch": 1.276278761794405, + "grad_norm": 2.6831562519073486, + "learning_rate": 7.107610181459603e-06, + "loss": 0.3803, + "step": 7710 + }, + { + "epoch": 1.2779341168680682, + "grad_norm": 1.7790484428405762, + "learning_rate": 7.098872821333633e-06, + "loss": 0.4598, + "step": 7720 + }, + { + "epoch": 1.2795894719417316, + "grad_norm": 1.8431192636489868, + "learning_rate": 7.090127674070707e-06, + "loss": 0.426, + "step": 7730 + }, + { + "epoch": 1.2812448270153949, + "grad_norm": 2.489999532699585, + "learning_rate": 7.081374772116652e-06, + "loss": 0.4192, + "step": 7740 + }, + { + "epoch": 1.2829001820890582, + "grad_norm": 1.8376771211624146, + "learning_rate": 7.07261414794606e-06, + "loss": 0.4184, + "step": 7750 + }, + { + "epoch": 1.2845555371627215, + "grad_norm": 1.8397380113601685, + "learning_rate": 7.063845834062178e-06, + "loss": 0.4131, + "step": 7760 + }, + { + "epoch": 1.2862108922363846, + "grad_norm": 1.7695285081863403, + "learning_rate": 7.055069862996786e-06, + "loss": 0.424, + "step": 7770 + }, + { + "epoch": 1.287866247310048, + "grad_norm": 1.8908298015594482, + "learning_rate": 7.0462862673100675e-06, + "loss": 0.4538, + "step": 7780 + }, + { + "epoch": 1.2895216023837113, + "grad_norm": 2.012000799179077, + "learning_rate": 7.037495079590494e-06, + "loss": 0.4166, + "step": 7790 + }, + { + "epoch": 1.2911769574573746, + "grad_norm": 2.016519784927368, + "learning_rate": 7.028696332454712e-06, + "loss": 0.4505, + "step": 7800 + }, + { + "epoch": 1.292832312531038, + "grad_norm": 1.9093087911605835, + "learning_rate": 7.0198900585474065e-06, + "loss": 0.4142, + "step": 7810 + }, + { + "epoch": 1.2944876676047012, + "grad_norm": 1.9886523485183716, + "learning_rate": 7.01107629054119e-06, + "loss": 0.4588, + "step": 7820 + }, + { + "epoch": 1.2961430226783646, + "grad_norm": 2.125847101211548, + "learning_rate": 7.0022550611364835e-06, + "loss": 0.4451, + "step": 7830 + }, + { + "epoch": 1.2977983777520279, + "grad_norm": 2.2600979804992676, + "learning_rate": 6.993426403061389e-06, + "loss": 0.4324, + "step": 7840 + }, + { + "epoch": 1.299453732825691, + "grad_norm": 2.3956971168518066, + "learning_rate": 6.984590349071564e-06, + "loss": 0.4473, + "step": 7850 + }, + { + "epoch": 1.3011090878993543, + "grad_norm": 2.047929286956787, + "learning_rate": 6.975746931950116e-06, + "loss": 0.4193, + "step": 7860 + }, + { + "epoch": 1.3027644429730176, + "grad_norm": 1.9703757762908936, + "learning_rate": 6.9668961845074615e-06, + "loss": 0.4446, + "step": 7870 + }, + { + "epoch": 1.304419798046681, + "grad_norm": 2.0574028491973877, + "learning_rate": 6.95803813958122e-06, + "loss": 0.4543, + "step": 7880 + }, + { + "epoch": 1.3060751531203443, + "grad_norm": 1.9040168523788452, + "learning_rate": 6.949172830036084e-06, + "loss": 0.3936, + "step": 7890 + }, + { + "epoch": 1.3077305081940076, + "grad_norm": 2.8889362812042236, + "learning_rate": 6.940300288763697e-06, + "loss": 0.4244, + "step": 7900 + }, + { + "epoch": 1.309385863267671, + "grad_norm": 2.0269064903259277, + "learning_rate": 6.931420548682535e-06, + "loss": 0.4036, + "step": 7910 + }, + { + "epoch": 1.3110412183413342, + "grad_norm": 3.0732882022857666, + "learning_rate": 6.9225336427377835e-06, + "loss": 0.459, + "step": 7920 + }, + { + "epoch": 1.3126965734149976, + "grad_norm": 2.3372762203216553, + "learning_rate": 6.9136396039012125e-06, + "loss": 0.4699, + "step": 7930 + }, + { + "epoch": 1.314351928488661, + "grad_norm": 1.708390474319458, + "learning_rate": 6.904738465171058e-06, + "loss": 0.412, + "step": 7940 + }, + { + "epoch": 1.3160072835623242, + "grad_norm": 1.5611287355422974, + "learning_rate": 6.895830259571894e-06, + "loss": 0.4658, + "step": 7950 + }, + { + "epoch": 1.3176626386359875, + "grad_norm": 2.084836721420288, + "learning_rate": 6.886915020154519e-06, + "loss": 0.4428, + "step": 7960 + }, + { + "epoch": 1.3193179937096506, + "grad_norm": 4.934893608093262, + "learning_rate": 6.877992779995825e-06, + "loss": 0.4382, + "step": 7970 + }, + { + "epoch": 1.320973348783314, + "grad_norm": 1.7433263063430786, + "learning_rate": 6.869063572198678e-06, + "loss": 0.4303, + "step": 7980 + }, + { + "epoch": 1.3226287038569773, + "grad_norm": 2.039020538330078, + "learning_rate": 6.860127429891792e-06, + "loss": 0.4334, + "step": 7990 + }, + { + "epoch": 1.3242840589306406, + "grad_norm": 2.0853657722473145, + "learning_rate": 6.851184386229617e-06, + "loss": 0.4279, + "step": 8000 + }, + { + "epoch": 1.325939414004304, + "grad_norm": 2.357513904571533, + "learning_rate": 6.842234474392201e-06, + "loss": 0.4589, + "step": 8010 + }, + { + "epoch": 1.3275947690779673, + "grad_norm": 1.799069881439209, + "learning_rate": 6.833277727585076e-06, + "loss": 0.3904, + "step": 8020 + }, + { + "epoch": 1.3292501241516306, + "grad_norm": 1.7437762022018433, + "learning_rate": 6.8243141790391345e-06, + "loss": 0.4397, + "step": 8030 + }, + { + "epoch": 1.330905479225294, + "grad_norm": 2.2342443466186523, + "learning_rate": 6.8153438620105005e-06, + "loss": 0.4762, + "step": 8040 + }, + { + "epoch": 1.332560834298957, + "grad_norm": 3.045323133468628, + "learning_rate": 6.806366809780415e-06, + "loss": 0.4217, + "step": 8050 + }, + { + "epoch": 1.3342161893726203, + "grad_norm": 1.7698055505752563, + "learning_rate": 6.797383055655105e-06, + "loss": 0.4276, + "step": 8060 + }, + { + "epoch": 1.3358715444462836, + "grad_norm": 1.860013723373413, + "learning_rate": 6.788392632965661e-06, + "loss": 0.3845, + "step": 8070 + }, + { + "epoch": 1.337526899519947, + "grad_norm": 2.04567551612854, + "learning_rate": 6.779395575067919e-06, + "loss": 0.4531, + "step": 8080 + }, + { + "epoch": 1.3391822545936103, + "grad_norm": 1.8434317111968994, + "learning_rate": 6.770391915342329e-06, + "loss": 0.4137, + "step": 8090 + }, + { + "epoch": 1.3408376096672736, + "grad_norm": 2.2670910358428955, + "learning_rate": 6.761381687193836e-06, + "loss": 0.4245, + "step": 8100 + }, + { + "epoch": 1.342492964740937, + "grad_norm": 2.1821208000183105, + "learning_rate": 6.752364924051757e-06, + "loss": 0.4651, + "step": 8110 + }, + { + "epoch": 1.3441483198146003, + "grad_norm": 1.7987016439437866, + "learning_rate": 6.7433416593696485e-06, + "loss": 0.4426, + "step": 8120 + }, + { + "epoch": 1.3458036748882636, + "grad_norm": 2.464167833328247, + "learning_rate": 6.734311926625198e-06, + "loss": 0.4304, + "step": 8130 + }, + { + "epoch": 1.347459029961927, + "grad_norm": 2.0317628383636475, + "learning_rate": 6.725275759320082e-06, + "loss": 0.418, + "step": 8140 + }, + { + "epoch": 1.3491143850355902, + "grad_norm": 1.8057821989059448, + "learning_rate": 6.716233190979855e-06, + "loss": 0.4229, + "step": 8150 + }, + { + "epoch": 1.3507697401092535, + "grad_norm": 2.1078262329101562, + "learning_rate": 6.707184255153818e-06, + "loss": 0.4275, + "step": 8160 + }, + { + "epoch": 1.3524250951829169, + "grad_norm": 1.794978380203247, + "learning_rate": 6.698128985414899e-06, + "loss": 0.4377, + "step": 8170 + }, + { + "epoch": 1.35408045025658, + "grad_norm": 1.7811648845672607, + "learning_rate": 6.689067415359522e-06, + "loss": 0.4426, + "step": 8180 + }, + { + "epoch": 1.3557358053302433, + "grad_norm": 2.394949197769165, + "learning_rate": 6.6799995786074916e-06, + "loss": 0.4604, + "step": 8190 + }, + { + "epoch": 1.3573911604039066, + "grad_norm": 2.0753695964813232, + "learning_rate": 6.6709255088018545e-06, + "loss": 0.4313, + "step": 8200 + }, + { + "epoch": 1.35904651547757, + "grad_norm": 2.000762939453125, + "learning_rate": 6.661845239608792e-06, + "loss": 0.3965, + "step": 8210 + }, + { + "epoch": 1.3607018705512333, + "grad_norm": 2.1908514499664307, + "learning_rate": 6.652758804717479e-06, + "loss": 0.4589, + "step": 8220 + }, + { + "epoch": 1.3623572256248966, + "grad_norm": 1.8629430532455444, + "learning_rate": 6.643666237839973e-06, + "loss": 0.4131, + "step": 8230 + }, + { + "epoch": 1.36401258069856, + "grad_norm": 1.983957290649414, + "learning_rate": 6.6345675727110745e-06, + "loss": 0.4443, + "step": 8240 + }, + { + "epoch": 1.365667935772223, + "grad_norm": 1.5461257696151733, + "learning_rate": 6.625462843088214e-06, + "loss": 0.3976, + "step": 8250 + }, + { + "epoch": 1.3673232908458863, + "grad_norm": 2.0671067237854004, + "learning_rate": 6.616352082751322e-06, + "loss": 0.4613, + "step": 8260 + }, + { + "epoch": 1.3689786459195497, + "grad_norm": 1.9865403175354004, + "learning_rate": 6.607235325502703e-06, + "loss": 0.3965, + "step": 8270 + }, + { + "epoch": 1.370634000993213, + "grad_norm": 1.8592555522918701, + "learning_rate": 6.598112605166909e-06, + "loss": 0.4623, + "step": 8280 + }, + { + "epoch": 1.3722893560668763, + "grad_norm": 2.026170253753662, + "learning_rate": 6.588983955590622e-06, + "loss": 0.4354, + "step": 8290 + }, + { + "epoch": 1.3739447111405396, + "grad_norm": 1.7887972593307495, + "learning_rate": 6.5798494106425155e-06, + "loss": 0.4408, + "step": 8300 + }, + { + "epoch": 1.375600066214203, + "grad_norm": 1.863389253616333, + "learning_rate": 6.570709004213139e-06, + "loss": 0.4266, + "step": 8310 + }, + { + "epoch": 1.3772554212878663, + "grad_norm": 1.7692625522613525, + "learning_rate": 6.56156277021479e-06, + "loss": 0.444, + "step": 8320 + }, + { + "epoch": 1.3789107763615296, + "grad_norm": 2.5655503273010254, + "learning_rate": 6.5524107425813834e-06, + "loss": 0.408, + "step": 8330 + }, + { + "epoch": 1.380566131435193, + "grad_norm": 2.5863282680511475, + "learning_rate": 6.543252955268335e-06, + "loss": 0.4313, + "step": 8340 + }, + { + "epoch": 1.3822214865088562, + "grad_norm": 1.7978219985961914, + "learning_rate": 6.5340894422524246e-06, + "loss": 0.4399, + "step": 8350 + }, + { + "epoch": 1.3838768415825196, + "grad_norm": 1.9100663661956787, + "learning_rate": 6.524920237531678e-06, + "loss": 0.4277, + "step": 8360 + }, + { + "epoch": 1.3855321966561829, + "grad_norm": 1.9516162872314453, + "learning_rate": 6.515745375125236e-06, + "loss": 0.4612, + "step": 8370 + }, + { + "epoch": 1.387187551729846, + "grad_norm": 1.9329428672790527, + "learning_rate": 6.506564889073233e-06, + "loss": 0.4383, + "step": 8380 + }, + { + "epoch": 1.3888429068035093, + "grad_norm": 1.867749810218811, + "learning_rate": 6.497378813436667e-06, + "loss": 0.4156, + "step": 8390 + }, + { + "epoch": 1.3904982618771726, + "grad_norm": 2.531853437423706, + "learning_rate": 6.488187182297272e-06, + "loss": 0.4459, + "step": 8400 + }, + { + "epoch": 1.392153616950836, + "grad_norm": 1.8974616527557373, + "learning_rate": 6.4789900297573985e-06, + "loss": 0.4295, + "step": 8410 + }, + { + "epoch": 1.3938089720244993, + "grad_norm": 2.2630221843719482, + "learning_rate": 6.4697873899398756e-06, + "loss": 0.4561, + "step": 8420 + }, + { + "epoch": 1.3954643270981626, + "grad_norm": 2.2150561809539795, + "learning_rate": 6.460579296987899e-06, + "loss": 0.4594, + "step": 8430 + }, + { + "epoch": 1.397119682171826, + "grad_norm": 2.0618362426757812, + "learning_rate": 6.451365785064887e-06, + "loss": 0.4673, + "step": 8440 + }, + { + "epoch": 1.3987750372454892, + "grad_norm": 1.7820813655853271, + "learning_rate": 6.442146888354373e-06, + "loss": 0.4342, + "step": 8450 + }, + { + "epoch": 1.4004303923191523, + "grad_norm": 1.9812285900115967, + "learning_rate": 6.4329226410598625e-06, + "loss": 0.4662, + "step": 8460 + }, + { + "epoch": 1.4020857473928157, + "grad_norm": 2.486772060394287, + "learning_rate": 6.423693077404713e-06, + "loss": 0.4428, + "step": 8470 + }, + { + "epoch": 1.403741102466479, + "grad_norm": 2.0029282569885254, + "learning_rate": 6.4144582316320085e-06, + "loss": 0.4074, + "step": 8480 + }, + { + "epoch": 1.4053964575401423, + "grad_norm": 1.755991816520691, + "learning_rate": 6.405218138004428e-06, + "loss": 0.4384, + "step": 8490 + }, + { + "epoch": 1.4070518126138056, + "grad_norm": 2.043971300125122, + "learning_rate": 6.395972830804125e-06, + "loss": 0.4143, + "step": 8500 + }, + { + "epoch": 1.408707167687469, + "grad_norm": 2.047865152359009, + "learning_rate": 6.386722344332591e-06, + "loss": 0.3758, + "step": 8510 + }, + { + "epoch": 1.4103625227611323, + "grad_norm": 1.7395375967025757, + "learning_rate": 6.3774667129105374e-06, + "loss": 0.4224, + "step": 8520 + }, + { + "epoch": 1.4120178778347956, + "grad_norm": 1.747536063194275, + "learning_rate": 6.36820597087776e-06, + "loss": 0.4524, + "step": 8530 + }, + { + "epoch": 1.413673232908459, + "grad_norm": 1.9488874673843384, + "learning_rate": 6.358940152593021e-06, + "loss": 0.4208, + "step": 8540 + }, + { + "epoch": 1.4153285879821222, + "grad_norm": 2.2660651206970215, + "learning_rate": 6.349669292433913e-06, + "loss": 0.426, + "step": 8550 + }, + { + "epoch": 1.4169839430557856, + "grad_norm": 2.1231184005737305, + "learning_rate": 6.340393424796735e-06, + "loss": 0.432, + "step": 8560 + }, + { + "epoch": 1.418639298129449, + "grad_norm": 2.2942655086517334, + "learning_rate": 6.331112584096364e-06, + "loss": 0.4558, + "step": 8570 + }, + { + "epoch": 1.420294653203112, + "grad_norm": 2.481250762939453, + "learning_rate": 6.3218268047661316e-06, + "loss": 0.3894, + "step": 8580 + }, + { + "epoch": 1.4219500082767753, + "grad_norm": 1.7492702007293701, + "learning_rate": 6.312536121257685e-06, + "loss": 0.4185, + "step": 8590 + }, + { + "epoch": 1.4236053633504386, + "grad_norm": 1.907914400100708, + "learning_rate": 6.303240568040875e-06, + "loss": 0.4708, + "step": 8600 + }, + { + "epoch": 1.425260718424102, + "grad_norm": 2.4471542835235596, + "learning_rate": 6.293940179603614e-06, + "loss": 0.3845, + "step": 8610 + }, + { + "epoch": 1.4269160734977653, + "grad_norm": 2.4658238887786865, + "learning_rate": 6.284634990451755e-06, + "loss": 0.3813, + "step": 8620 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 1.6748656034469604, + "learning_rate": 6.275325035108966e-06, + "loss": 0.4004, + "step": 8630 + }, + { + "epoch": 1.430226783645092, + "grad_norm": 2.4648280143737793, + "learning_rate": 6.266010348116592e-06, + "loss": 0.4744, + "step": 8640 + }, + { + "epoch": 1.4318821387187552, + "grad_norm": 1.831211805343628, + "learning_rate": 6.256690964033537e-06, + "loss": 0.4707, + "step": 8650 + }, + { + "epoch": 1.4335374937924183, + "grad_norm": 1.9654415845870972, + "learning_rate": 6.247366917436135e-06, + "loss": 0.4265, + "step": 8660 + }, + { + "epoch": 1.4351928488660817, + "grad_norm": 1.9499164819717407, + "learning_rate": 6.238038242918012e-06, + "loss": 0.4302, + "step": 8670 + }, + { + "epoch": 1.436848203939745, + "grad_norm": 1.837054967880249, + "learning_rate": 6.228704975089966e-06, + "loss": 0.4066, + "step": 8680 + }, + { + "epoch": 1.4385035590134083, + "grad_norm": 2.007383108139038, + "learning_rate": 6.2193671485798414e-06, + "loss": 0.4104, + "step": 8690 + }, + { + "epoch": 1.4401589140870716, + "grad_norm": 1.9571911096572876, + "learning_rate": 6.2100247980323925e-06, + "loss": 0.4567, + "step": 8700 + }, + { + "epoch": 1.441814269160735, + "grad_norm": 1.5203781127929688, + "learning_rate": 6.200677958109156e-06, + "loss": 0.4417, + "step": 8710 + }, + { + "epoch": 1.4434696242343983, + "grad_norm": 2.683063268661499, + "learning_rate": 6.191326663488331e-06, + "loss": 0.4542, + "step": 8720 + }, + { + "epoch": 1.4451249793080616, + "grad_norm": 2.251321792602539, + "learning_rate": 6.181970948864637e-06, + "loss": 0.45, + "step": 8730 + }, + { + "epoch": 1.446780334381725, + "grad_norm": 1.943559169769287, + "learning_rate": 6.172610848949201e-06, + "loss": 0.42, + "step": 8740 + }, + { + "epoch": 1.4484356894553883, + "grad_norm": 1.8431415557861328, + "learning_rate": 6.163246398469413e-06, + "loss": 0.4268, + "step": 8750 + }, + { + "epoch": 1.4500910445290516, + "grad_norm": 1.8626545667648315, + "learning_rate": 6.153877632168805e-06, + "loss": 0.4441, + "step": 8760 + }, + { + "epoch": 1.451746399602715, + "grad_norm": 1.4640800952911377, + "learning_rate": 6.144504584806924e-06, + "loss": 0.4061, + "step": 8770 + }, + { + "epoch": 1.4534017546763782, + "grad_norm": 2.071951150894165, + "learning_rate": 6.135127291159201e-06, + "loss": 0.4492, + "step": 8780 + }, + { + "epoch": 1.4550571097500413, + "grad_norm": 2.0795013904571533, + "learning_rate": 6.125745786016818e-06, + "loss": 0.4285, + "step": 8790 + }, + { + "epoch": 1.4567124648237046, + "grad_norm": 1.7705730199813843, + "learning_rate": 6.116360104186586e-06, + "loss": 0.3959, + "step": 8800 + }, + { + "epoch": 1.458367819897368, + "grad_norm": 2.478497266769409, + "learning_rate": 6.106970280490807e-06, + "loss": 0.4817, + "step": 8810 + }, + { + "epoch": 1.4600231749710313, + "grad_norm": 1.7252123355865479, + "learning_rate": 6.097576349767155e-06, + "loss": 0.4097, + "step": 8820 + }, + { + "epoch": 1.4616785300446946, + "grad_norm": 1.9439191818237305, + "learning_rate": 6.08817834686854e-06, + "loss": 0.422, + "step": 8830 + }, + { + "epoch": 1.463333885118358, + "grad_norm": 1.86637282371521, + "learning_rate": 6.07877630666298e-06, + "loss": 0.4097, + "step": 8840 + }, + { + "epoch": 1.4649892401920213, + "grad_norm": 2.0580270290374756, + "learning_rate": 6.069370264033472e-06, + "loss": 0.4683, + "step": 8850 + }, + { + "epoch": 1.4666445952656844, + "grad_norm": 1.9036422967910767, + "learning_rate": 6.059960253877861e-06, + "loss": 0.4154, + "step": 8860 + }, + { + "epoch": 1.4682999503393477, + "grad_norm": 2.2205872535705566, + "learning_rate": 6.050546311108718e-06, + "loss": 0.432, + "step": 8870 + }, + { + "epoch": 1.469955305413011, + "grad_norm": 1.7262464761734009, + "learning_rate": 6.041128470653197e-06, + "loss": 0.4073, + "step": 8880 + }, + { + "epoch": 1.4716106604866743, + "grad_norm": 1.961290955543518, + "learning_rate": 6.0317067674529186e-06, + "loss": 0.4177, + "step": 8890 + }, + { + "epoch": 1.4732660155603376, + "grad_norm": 1.9213963747024536, + "learning_rate": 6.022281236463829e-06, + "loss": 0.4509, + "step": 8900 + }, + { + "epoch": 1.474921370634001, + "grad_norm": 2.405870199203491, + "learning_rate": 6.012851912656084e-06, + "loss": 0.4234, + "step": 8910 + }, + { + "epoch": 1.4765767257076643, + "grad_norm": 1.8768267631530762, + "learning_rate": 6.003418831013908e-06, + "loss": 0.4133, + "step": 8920 + }, + { + "epoch": 1.4782320807813276, + "grad_norm": 1.9255216121673584, + "learning_rate": 5.993982026535461e-06, + "loss": 0.4286, + "step": 8930 + }, + { + "epoch": 1.479887435854991, + "grad_norm": 1.9891632795333862, + "learning_rate": 5.984541534232725e-06, + "loss": 0.4363, + "step": 8940 + }, + { + "epoch": 1.4815427909286543, + "grad_norm": 2.4171807765960693, + "learning_rate": 5.97509738913136e-06, + "loss": 0.4239, + "step": 8950 + }, + { + "epoch": 1.4831981460023176, + "grad_norm": 2.0096046924591064, + "learning_rate": 5.96564962627058e-06, + "loss": 0.4461, + "step": 8960 + }, + { + "epoch": 1.484853501075981, + "grad_norm": 2.0493736267089844, + "learning_rate": 5.956198280703016e-06, + "loss": 0.4759, + "step": 8970 + }, + { + "epoch": 1.4865088561496442, + "grad_norm": 2.407358169555664, + "learning_rate": 5.946743387494598e-06, + "loss": 0.4372, + "step": 8980 + }, + { + "epoch": 1.4881642112233073, + "grad_norm": 2.0386157035827637, + "learning_rate": 5.937284981724416e-06, + "loss": 0.4227, + "step": 8990 + }, + { + "epoch": 1.4898195662969707, + "grad_norm": 1.7787805795669556, + "learning_rate": 5.9278230984845934e-06, + "loss": 0.384, + "step": 9000 + }, + { + "epoch": 1.491474921370634, + "grad_norm": 2.143181085586548, + "learning_rate": 5.9183577728801524e-06, + "loss": 0.4499, + "step": 9010 + }, + { + "epoch": 1.4931302764442973, + "grad_norm": 2.1203973293304443, + "learning_rate": 5.908889040028887e-06, + "loss": 0.4441, + "step": 9020 + }, + { + "epoch": 1.4947856315179606, + "grad_norm": 1.7282978296279907, + "learning_rate": 5.899416935061237e-06, + "loss": 0.3813, + "step": 9030 + }, + { + "epoch": 1.496440986591624, + "grad_norm": 1.6689910888671875, + "learning_rate": 5.889941493120151e-06, + "loss": 0.4233, + "step": 9040 + }, + { + "epoch": 1.4980963416652873, + "grad_norm": 1.9057822227478027, + "learning_rate": 5.880462749360956e-06, + "loss": 0.4192, + "step": 9050 + }, + { + "epoch": 1.4997516967389504, + "grad_norm": 2.1098954677581787, + "learning_rate": 5.8709807389512294e-06, + "loss": 0.431, + "step": 9060 + }, + { + "epoch": 1.5014070518126137, + "grad_norm": 1.6635946035385132, + "learning_rate": 5.861495497070675e-06, + "loss": 0.4421, + "step": 9070 + }, + { + "epoch": 1.503062406886277, + "grad_norm": 1.9180066585540771, + "learning_rate": 5.8520070589109755e-06, + "loss": 0.4599, + "step": 9080 + }, + { + "epoch": 1.5047177619599403, + "grad_norm": 1.7293899059295654, + "learning_rate": 5.842515459675681e-06, + "loss": 0.4043, + "step": 9090 + }, + { + "epoch": 1.5063731170336037, + "grad_norm": 2.8191874027252197, + "learning_rate": 5.833020734580065e-06, + "loss": 0.4037, + "step": 9100 + }, + { + "epoch": 1.508028472107267, + "grad_norm": 1.7623112201690674, + "learning_rate": 5.823522918851e-06, + "loss": 0.4251, + "step": 9110 + }, + { + "epoch": 1.5096838271809303, + "grad_norm": 2.5156078338623047, + "learning_rate": 5.814022047726826e-06, + "loss": 0.4173, + "step": 9120 + }, + { + "epoch": 1.5113391822545936, + "grad_norm": 2.159651279449463, + "learning_rate": 5.804518156457216e-06, + "loss": 0.4365, + "step": 9130 + }, + { + "epoch": 1.512994537328257, + "grad_norm": 1.5260064601898193, + "learning_rate": 5.7950112803030504e-06, + "loss": 0.4707, + "step": 9140 + }, + { + "epoch": 1.5146498924019203, + "grad_norm": 1.9621107578277588, + "learning_rate": 5.785501454536286e-06, + "loss": 0.4374, + "step": 9150 + }, + { + "epoch": 1.5163052474755836, + "grad_norm": 1.5603402853012085, + "learning_rate": 5.775988714439817e-06, + "loss": 0.457, + "step": 9160 + }, + { + "epoch": 1.517960602549247, + "grad_norm": 1.9429376125335693, + "learning_rate": 5.766473095307357e-06, + "loss": 0.4234, + "step": 9170 + }, + { + "epoch": 1.5196159576229102, + "grad_norm": 1.9809801578521729, + "learning_rate": 5.756954632443297e-06, + "loss": 0.4448, + "step": 9180 + }, + { + "epoch": 1.5212713126965736, + "grad_norm": 2.2237918376922607, + "learning_rate": 5.747433361162581e-06, + "loss": 0.3984, + "step": 9190 + }, + { + "epoch": 1.5229266677702367, + "grad_norm": 2.021394729614258, + "learning_rate": 5.737909316790571e-06, + "loss": 0.4425, + "step": 9200 + }, + { + "epoch": 1.5245820228439, + "grad_norm": 2.0034067630767822, + "learning_rate": 5.728382534662917e-06, + "loss": 0.4359, + "step": 9210 + }, + { + "epoch": 1.5262373779175633, + "grad_norm": 1.3382004499435425, + "learning_rate": 5.718853050125429e-06, + "loss": 0.4135, + "step": 9220 + }, + { + "epoch": 1.5278927329912266, + "grad_norm": 2.4722602367401123, + "learning_rate": 5.709320898533942e-06, + "loss": 0.4485, + "step": 9230 + }, + { + "epoch": 1.52954808806489, + "grad_norm": 1.733469843864441, + "learning_rate": 5.699786115254187e-06, + "loss": 0.4223, + "step": 9240 + }, + { + "epoch": 1.531203443138553, + "grad_norm": 2.123274087905884, + "learning_rate": 5.690248735661655e-06, + "loss": 0.4305, + "step": 9250 + }, + { + "epoch": 1.5328587982122164, + "grad_norm": 1.6809306144714355, + "learning_rate": 5.680708795141478e-06, + "loss": 0.4472, + "step": 9260 + }, + { + "epoch": 1.5345141532858797, + "grad_norm": 1.8373470306396484, + "learning_rate": 5.671166329088278e-06, + "loss": 0.4031, + "step": 9270 + }, + { + "epoch": 1.536169508359543, + "grad_norm": 2.4719669818878174, + "learning_rate": 5.661621372906058e-06, + "loss": 0.3956, + "step": 9280 + }, + { + "epoch": 1.5378248634332063, + "grad_norm": 2.367678642272949, + "learning_rate": 5.652073962008054e-06, + "loss": 0.4775, + "step": 9290 + }, + { + "epoch": 1.5394802185068697, + "grad_norm": 2.1993610858917236, + "learning_rate": 5.64252413181661e-06, + "loss": 0.4199, + "step": 9300 + }, + { + "epoch": 1.541135573580533, + "grad_norm": 2.5951197147369385, + "learning_rate": 5.632971917763047e-06, + "loss": 0.4151, + "step": 9310 + }, + { + "epoch": 1.5427909286541963, + "grad_norm": 2.3838794231414795, + "learning_rate": 5.623417355287532e-06, + "loss": 0.4379, + "step": 9320 + }, + { + "epoch": 1.5444462837278596, + "grad_norm": 2.005969285964966, + "learning_rate": 5.61386047983894e-06, + "loss": 0.4107, + "step": 9330 + }, + { + "epoch": 1.546101638801523, + "grad_norm": 2.2176222801208496, + "learning_rate": 5.604301326874729e-06, + "loss": 0.451, + "step": 9340 + }, + { + "epoch": 1.5477569938751863, + "grad_norm": 2.1976559162139893, + "learning_rate": 5.594739931860812e-06, + "loss": 0.4042, + "step": 9350 + }, + { + "epoch": 1.5494123489488496, + "grad_norm": 2.2575743198394775, + "learning_rate": 5.585176330271417e-06, + "loss": 0.4359, + "step": 9360 + }, + { + "epoch": 1.551067704022513, + "grad_norm": 2.771317720413208, + "learning_rate": 5.575610557588955e-06, + "loss": 0.4128, + "step": 9370 + }, + { + "epoch": 1.5527230590961763, + "grad_norm": 1.7214008569717407, + "learning_rate": 5.566042649303899e-06, + "loss": 0.4136, + "step": 9380 + }, + { + "epoch": 1.5543784141698396, + "grad_norm": 2.055534601211548, + "learning_rate": 5.556472640914639e-06, + "loss": 0.4742, + "step": 9390 + }, + { + "epoch": 1.556033769243503, + "grad_norm": 2.1422717571258545, + "learning_rate": 5.5469005679273616e-06, + "loss": 0.4336, + "step": 9400 + }, + { + "epoch": 1.557689124317166, + "grad_norm": 2.063735246658325, + "learning_rate": 5.537326465855911e-06, + "loss": 0.4168, + "step": 9410 + }, + { + "epoch": 1.5593444793908293, + "grad_norm": 2.0593113899230957, + "learning_rate": 5.527750370221661e-06, + "loss": 0.4187, + "step": 9420 + }, + { + "epoch": 1.5609998344644926, + "grad_norm": 2.017012596130371, + "learning_rate": 5.518172316553378e-06, + "loss": 0.448, + "step": 9430 + }, + { + "epoch": 1.562655189538156, + "grad_norm": 1.9413402080535889, + "learning_rate": 5.5085923403871e-06, + "loss": 0.4058, + "step": 9440 + }, + { + "epoch": 1.5643105446118193, + "grad_norm": 1.7295361757278442, + "learning_rate": 5.4990104772659895e-06, + "loss": 0.4131, + "step": 9450 + }, + { + "epoch": 1.5659658996854824, + "grad_norm": 2.06921124458313, + "learning_rate": 5.489426762740218e-06, + "loss": 0.4081, + "step": 9460 + }, + { + "epoch": 1.5676212547591457, + "grad_norm": 1.8260921239852905, + "learning_rate": 5.479841232366818e-06, + "loss": 0.3937, + "step": 9470 + }, + { + "epoch": 1.569276609832809, + "grad_norm": 2.1799001693725586, + "learning_rate": 5.470253921709565e-06, + "loss": 0.3873, + "step": 9480 + }, + { + "epoch": 1.5709319649064724, + "grad_norm": 2.076521158218384, + "learning_rate": 5.46066486633884e-06, + "loss": 0.4254, + "step": 9490 + }, + { + "epoch": 1.5725873199801357, + "grad_norm": 2.1282382011413574, + "learning_rate": 5.451074101831492e-06, + "loss": 0.3999, + "step": 9500 + }, + { + "epoch": 1.574242675053799, + "grad_norm": 2.0794641971588135, + "learning_rate": 5.4414816637707115e-06, + "loss": 0.3946, + "step": 9510 + }, + { + "epoch": 1.5758980301274623, + "grad_norm": 2.05676531791687, + "learning_rate": 5.431887587745906e-06, + "loss": 0.3978, + "step": 9520 + }, + { + "epoch": 1.5775533852011256, + "grad_norm": 2.6630096435546875, + "learning_rate": 5.422291909352554e-06, + "loss": 0.4185, + "step": 9530 + }, + { + "epoch": 1.579208740274789, + "grad_norm": 2.0524003505706787, + "learning_rate": 5.4126946641920766e-06, + "loss": 0.4344, + "step": 9540 + }, + { + "epoch": 1.5808640953484523, + "grad_norm": 2.0351216793060303, + "learning_rate": 5.403095887871712e-06, + "loss": 0.3743, + "step": 9550 + }, + { + "epoch": 1.5825194504221156, + "grad_norm": 1.9727466106414795, + "learning_rate": 5.393495616004382e-06, + "loss": 0.4016, + "step": 9560 + }, + { + "epoch": 1.584174805495779, + "grad_norm": 2.116448402404785, + "learning_rate": 5.383893884208548e-06, + "loss": 0.4173, + "step": 9570 + }, + { + "epoch": 1.5858301605694423, + "grad_norm": 2.624253273010254, + "learning_rate": 5.3742907281080956e-06, + "loss": 0.4528, + "step": 9580 + }, + { + "epoch": 1.5874855156431056, + "grad_norm": 1.7432576417922974, + "learning_rate": 5.364686183332194e-06, + "loss": 0.3978, + "step": 9590 + }, + { + "epoch": 1.589140870716769, + "grad_norm": 2.323892831802368, + "learning_rate": 5.35508028551516e-06, + "loss": 0.4039, + "step": 9600 + }, + { + "epoch": 1.590796225790432, + "grad_norm": 1.9422341585159302, + "learning_rate": 5.345473070296337e-06, + "loss": 0.42, + "step": 9610 + }, + { + "epoch": 1.5924515808640953, + "grad_norm": 1.8938024044036865, + "learning_rate": 5.335864573319951e-06, + "loss": 0.3762, + "step": 9620 + }, + { + "epoch": 1.5941069359377587, + "grad_norm": 2.0808968544006348, + "learning_rate": 5.326254830234984e-06, + "loss": 0.4134, + "step": 9630 + }, + { + "epoch": 1.595762291011422, + "grad_norm": 1.8897898197174072, + "learning_rate": 5.316643876695043e-06, + "loss": 0.4059, + "step": 9640 + }, + { + "epoch": 1.5974176460850853, + "grad_norm": 1.8060656785964966, + "learning_rate": 5.307031748358227e-06, + "loss": 0.4234, + "step": 9650 + }, + { + "epoch": 1.5990730011587484, + "grad_norm": 1.8971941471099854, + "learning_rate": 5.29741848088699e-06, + "loss": 0.4222, + "step": 9660 + }, + { + "epoch": 1.6007283562324117, + "grad_norm": 2.0121896266937256, + "learning_rate": 5.2878041099480145e-06, + "loss": 0.3959, + "step": 9670 + }, + { + "epoch": 1.602383711306075, + "grad_norm": 2.076582908630371, + "learning_rate": 5.278188671212079e-06, + "loss": 0.4592, + "step": 9680 + }, + { + "epoch": 1.6040390663797384, + "grad_norm": 2.6054913997650146, + "learning_rate": 5.2685722003539215e-06, + "loss": 0.4128, + "step": 9690 + }, + { + "epoch": 1.6056944214534017, + "grad_norm": 2.281392812728882, + "learning_rate": 5.258954733052109e-06, + "loss": 0.4024, + "step": 9700 + }, + { + "epoch": 1.607349776527065, + "grad_norm": 2.435349225997925, + "learning_rate": 5.249336304988904e-06, + "loss": 0.4275, + "step": 9710 + }, + { + "epoch": 1.6090051316007283, + "grad_norm": 1.3645190000534058, + "learning_rate": 5.239716951850136e-06, + "loss": 0.3877, + "step": 9720 + }, + { + "epoch": 1.6106604866743917, + "grad_norm": 2.2975997924804688, + "learning_rate": 5.230096709325069e-06, + "loss": 0.4439, + "step": 9730 + }, + { + "epoch": 1.612315841748055, + "grad_norm": 2.102614164352417, + "learning_rate": 5.220475613106261e-06, + "loss": 0.4095, + "step": 9740 + }, + { + "epoch": 1.6139711968217183, + "grad_norm": 1.812879204750061, + "learning_rate": 5.210853698889442e-06, + "loss": 0.3996, + "step": 9750 + }, + { + "epoch": 1.6156265518953816, + "grad_norm": 1.8167248964309692, + "learning_rate": 5.201231002373374e-06, + "loss": 0.3547, + "step": 9760 + }, + { + "epoch": 1.617281906969045, + "grad_norm": 1.8382624387741089, + "learning_rate": 5.191607559259723e-06, + "loss": 0.418, + "step": 9770 + }, + { + "epoch": 1.6189372620427083, + "grad_norm": 1.9886109828948975, + "learning_rate": 5.181983405252925e-06, + "loss": 0.4426, + "step": 9780 + }, + { + "epoch": 1.6205926171163716, + "grad_norm": 2.0963995456695557, + "learning_rate": 5.172358576060052e-06, + "loss": 0.438, + "step": 9790 + }, + { + "epoch": 1.622247972190035, + "grad_norm": 2.365187168121338, + "learning_rate": 5.162733107390684e-06, + "loss": 0.4224, + "step": 9800 + }, + { + "epoch": 1.623903327263698, + "grad_norm": 2.2666516304016113, + "learning_rate": 5.153107034956772e-06, + "loss": 0.4314, + "step": 9810 + }, + { + "epoch": 1.6255586823373613, + "grad_norm": 2.123664379119873, + "learning_rate": 5.143480394472504e-06, + "loss": 0.409, + "step": 9820 + }, + { + "epoch": 1.6272140374110247, + "grad_norm": 2.0788767337799072, + "learning_rate": 5.13385322165418e-06, + "loss": 0.4135, + "step": 9830 + }, + { + "epoch": 1.628869392484688, + "grad_norm": 2.1448142528533936, + "learning_rate": 5.124225552220073e-06, + "loss": 0.3914, + "step": 9840 + }, + { + "epoch": 1.6305247475583513, + "grad_norm": 1.6150662899017334, + "learning_rate": 5.114597421890302e-06, + "loss": 0.3785, + "step": 9850 + }, + { + "epoch": 1.6321801026320144, + "grad_norm": 2.1484649181365967, + "learning_rate": 5.104968866386687e-06, + "loss": 0.4356, + "step": 9860 + }, + { + "epoch": 1.6338354577056777, + "grad_norm": 1.89021635055542, + "learning_rate": 5.095339921432636e-06, + "loss": 0.3862, + "step": 9870 + }, + { + "epoch": 1.635490812779341, + "grad_norm": 2.2360267639160156, + "learning_rate": 5.085710622752994e-06, + "loss": 0.4052, + "step": 9880 + }, + { + "epoch": 1.6371461678530044, + "grad_norm": 1.9827935695648193, + "learning_rate": 5.076081006073925e-06, + "loss": 0.4089, + "step": 9890 + }, + { + "epoch": 1.6388015229266677, + "grad_norm": 2.240103006362915, + "learning_rate": 5.0664511071227676e-06, + "loss": 0.3983, + "step": 9900 + }, + { + "epoch": 1.640456878000331, + "grad_norm": 2.3089191913604736, + "learning_rate": 5.0568209616279095e-06, + "loss": 0.4148, + "step": 9910 + }, + { + "epoch": 1.6421122330739943, + "grad_norm": 2.1113412380218506, + "learning_rate": 5.047190605318652e-06, + "loss": 0.4184, + "step": 9920 + }, + { + "epoch": 1.6437675881476577, + "grad_norm": 1.8843637704849243, + "learning_rate": 5.0375600739250855e-06, + "loss": 0.3846, + "step": 9930 + }, + { + "epoch": 1.645422943221321, + "grad_norm": 2.040696859359741, + "learning_rate": 5.027929403177936e-06, + "loss": 0.3744, + "step": 9940 + }, + { + "epoch": 1.6470782982949843, + "grad_norm": 1.737169861793518, + "learning_rate": 5.01829862880846e-06, + "loss": 0.3958, + "step": 9950 + }, + { + "epoch": 1.6487336533686476, + "grad_norm": 1.7232611179351807, + "learning_rate": 5.008667786548291e-06, + "loss": 0.4123, + "step": 9960 + }, + { + "epoch": 1.650389008442311, + "grad_norm": 2.533484935760498, + "learning_rate": 4.999036912129319e-06, + "loss": 0.4368, + "step": 9970 + }, + { + "epoch": 1.6520443635159743, + "grad_norm": 2.971538782119751, + "learning_rate": 4.989406041283549e-06, + "loss": 0.4178, + "step": 9980 + }, + { + "epoch": 1.6536997185896376, + "grad_norm": 2.1657238006591797, + "learning_rate": 4.9797752097429744e-06, + "loss": 0.4145, + "step": 9990 + }, + { + "epoch": 1.655355073663301, + "grad_norm": 2.1859874725341797, + "learning_rate": 4.970144453239443e-06, + "loss": 0.4099, + "step": 10000 + }, + { + "epoch": 1.657010428736964, + "grad_norm": 1.8412467241287231, + "learning_rate": 4.960513807504523e-06, + "loss": 0.3941, + "step": 10010 + }, + { + "epoch": 1.6586657838106273, + "grad_norm": 1.6499814987182617, + "learning_rate": 4.950883308269378e-06, + "loss": 0.4175, + "step": 10020 + }, + { + "epoch": 1.6603211388842907, + "grad_norm": 2.210827350616455, + "learning_rate": 4.941252991264619e-06, + "loss": 0.4301, + "step": 10030 + }, + { + "epoch": 1.661976493957954, + "grad_norm": 1.6489720344543457, + "learning_rate": 4.931622892220184e-06, + "loss": 0.4519, + "step": 10040 + }, + { + "epoch": 1.6636318490316173, + "grad_norm": 2.0074875354766846, + "learning_rate": 4.921993046865205e-06, + "loss": 0.4016, + "step": 10050 + }, + { + "epoch": 1.6652872041052804, + "grad_norm": 1.9746689796447754, + "learning_rate": 4.9123634909278705e-06, + "loss": 0.4213, + "step": 10060 + }, + { + "epoch": 1.6669425591789437, + "grad_norm": 1.6770844459533691, + "learning_rate": 4.902734260135293e-06, + "loss": 0.3922, + "step": 10070 + }, + { + "epoch": 1.668597914252607, + "grad_norm": 2.5734076499938965, + "learning_rate": 4.893105390213386e-06, + "loss": 0.4437, + "step": 10080 + }, + { + "epoch": 1.6702532693262704, + "grad_norm": 2.2573366165161133, + "learning_rate": 4.883476916886716e-06, + "loss": 0.4324, + "step": 10090 + }, + { + "epoch": 1.6719086243999337, + "grad_norm": 1.611232042312622, + "learning_rate": 4.8738488758783835e-06, + "loss": 0.4148, + "step": 10100 + }, + { + "epoch": 1.673563979473597, + "grad_norm": 1.7315454483032227, + "learning_rate": 4.864221302909882e-06, + "loss": 0.4148, + "step": 10110 + }, + { + "epoch": 1.6752193345472604, + "grad_norm": 1.6441781520843506, + "learning_rate": 4.854594233700969e-06, + "loss": 0.4179, + "step": 10120 + }, + { + "epoch": 1.6768746896209237, + "grad_norm": 1.7164368629455566, + "learning_rate": 4.844967703969532e-06, + "loss": 0.399, + "step": 10130 + }, + { + "epoch": 1.678530044694587, + "grad_norm": 1.6734899282455444, + "learning_rate": 4.835341749431464e-06, + "loss": 0.421, + "step": 10140 + }, + { + "epoch": 1.6801853997682503, + "grad_norm": 1.6350212097167969, + "learning_rate": 4.825716405800513e-06, + "loss": 0.4089, + "step": 10150 + }, + { + "epoch": 1.6818407548419136, + "grad_norm": 2.038209915161133, + "learning_rate": 4.816091708788168e-06, + "loss": 0.4091, + "step": 10160 + }, + { + "epoch": 1.683496109915577, + "grad_norm": 1.6218624114990234, + "learning_rate": 4.806467694103516e-06, + "loss": 0.411, + "step": 10170 + }, + { + "epoch": 1.6851514649892403, + "grad_norm": 1.921259880065918, + "learning_rate": 4.796844397453115e-06, + "loss": 0.4145, + "step": 10180 + }, + { + "epoch": 1.6868068200629036, + "grad_norm": 2.2492053508758545, + "learning_rate": 4.787221854540853e-06, + "loss": 0.4506, + "step": 10190 + }, + { + "epoch": 1.688462175136567, + "grad_norm": 1.9784705638885498, + "learning_rate": 4.7776001010678305e-06, + "loss": 0.4293, + "step": 10200 + }, + { + "epoch": 1.6901175302102303, + "grad_norm": 1.732125997543335, + "learning_rate": 4.767979172732212e-06, + "loss": 0.412, + "step": 10210 + }, + { + "epoch": 1.6917728852838934, + "grad_norm": 1.7823375463485718, + "learning_rate": 4.758359105229103e-06, + "loss": 0.4042, + "step": 10220 + }, + { + "epoch": 1.6934282403575567, + "grad_norm": 1.6599950790405273, + "learning_rate": 4.748739934250416e-06, + "loss": 0.3674, + "step": 10230 + }, + { + "epoch": 1.69508359543122, + "grad_norm": 1.7613921165466309, + "learning_rate": 4.739121695484734e-06, + "loss": 0.406, + "step": 10240 + }, + { + "epoch": 1.6967389505048833, + "grad_norm": 1.9368239641189575, + "learning_rate": 4.7295044246171865e-06, + "loss": 0.4133, + "step": 10250 + }, + { + "epoch": 1.6983943055785466, + "grad_norm": 2.9349164962768555, + "learning_rate": 4.719888157329309e-06, + "loss": 0.4141, + "step": 10260 + }, + { + "epoch": 1.7000496606522097, + "grad_norm": 1.70059335231781, + "learning_rate": 4.710272929298912e-06, + "loss": 0.3912, + "step": 10270 + }, + { + "epoch": 1.701705015725873, + "grad_norm": 1.844007968902588, + "learning_rate": 4.700658776199952e-06, + "loss": 0.4032, + "step": 10280 + }, + { + "epoch": 1.7033603707995364, + "grad_norm": 1.9813511371612549, + "learning_rate": 4.691045733702398e-06, + "loss": 0.4067, + "step": 10290 + }, + { + "epoch": 1.7050157258731997, + "grad_norm": 2.3343167304992676, + "learning_rate": 4.681433837472097e-06, + "loss": 0.4264, + "step": 10300 + }, + { + "epoch": 1.706671080946863, + "grad_norm": 2.3405990600585938, + "learning_rate": 4.671823123170646e-06, + "loss": 0.3879, + "step": 10310 + }, + { + "epoch": 1.7083264360205264, + "grad_norm": 2.8165476322174072, + "learning_rate": 4.662213626455253e-06, + "loss": 0.4121, + "step": 10320 + }, + { + "epoch": 1.7099817910941897, + "grad_norm": 3.3409717082977295, + "learning_rate": 4.65260538297861e-06, + "loss": 0.4053, + "step": 10330 + }, + { + "epoch": 1.711637146167853, + "grad_norm": 2.1133646965026855, + "learning_rate": 4.642998428388761e-06, + "loss": 0.4122, + "step": 10340 + }, + { + "epoch": 1.7132925012415163, + "grad_norm": 2.1685373783111572, + "learning_rate": 4.633392798328966e-06, + "loss": 0.4506, + "step": 10350 + }, + { + "epoch": 1.7149478563151797, + "grad_norm": 2.0110175609588623, + "learning_rate": 4.623788528437571e-06, + "loss": 0.4, + "step": 10360 + }, + { + "epoch": 1.716603211388843, + "grad_norm": 2.60198712348938, + "learning_rate": 4.614185654347877e-06, + "loss": 0.4176, + "step": 10370 + }, + { + "epoch": 1.7182585664625063, + "grad_norm": 1.7562052011489868, + "learning_rate": 4.604584211688004e-06, + "loss": 0.3981, + "step": 10380 + }, + { + "epoch": 1.7199139215361696, + "grad_norm": 2.134413957595825, + "learning_rate": 4.594984236080765e-06, + "loss": 0.4336, + "step": 10390 + }, + { + "epoch": 1.721569276609833, + "grad_norm": 2.218580961227417, + "learning_rate": 4.585385763143526e-06, + "loss": 0.4235, + "step": 10400 + }, + { + "epoch": 1.7232246316834963, + "grad_norm": 1.7758234739303589, + "learning_rate": 4.575788828488078e-06, + "loss": 0.4221, + "step": 10410 + }, + { + "epoch": 1.7248799867571594, + "grad_norm": 1.8942153453826904, + "learning_rate": 4.566193467720506e-06, + "loss": 0.4035, + "step": 10420 + }, + { + "epoch": 1.7265353418308227, + "grad_norm": 1.9919090270996094, + "learning_rate": 4.556599716441062e-06, + "loss": 0.4017, + "step": 10430 + }, + { + "epoch": 1.728190696904486, + "grad_norm": 2.020827293395996, + "learning_rate": 4.547007610244015e-06, + "loss": 0.4353, + "step": 10440 + }, + { + "epoch": 1.7298460519781493, + "grad_norm": 2.077829360961914, + "learning_rate": 4.537417184717536e-06, + "loss": 0.4721, + "step": 10450 + }, + { + "epoch": 1.7315014070518127, + "grad_norm": 3.0025055408477783, + "learning_rate": 4.527828475443562e-06, + "loss": 0.4413, + "step": 10460 + }, + { + "epoch": 1.7331567621254758, + "grad_norm": 2.1447229385375977, + "learning_rate": 4.518241517997657e-06, + "loss": 0.4025, + "step": 10470 + }, + { + "epoch": 1.734812117199139, + "grad_norm": 1.9372047185897827, + "learning_rate": 4.508656347948896e-06, + "loss": 0.455, + "step": 10480 + }, + { + "epoch": 1.7364674722728024, + "grad_norm": 2.317401647567749, + "learning_rate": 4.4990730008597115e-06, + "loss": 0.4008, + "step": 10490 + }, + { + "epoch": 1.7381228273464657, + "grad_norm": 1.678004264831543, + "learning_rate": 4.489491512285776e-06, + "loss": 0.4216, + "step": 10500 + }, + { + "epoch": 1.739778182420129, + "grad_norm": 1.9995408058166504, + "learning_rate": 4.47991191777587e-06, + "loss": 0.3926, + "step": 10510 + }, + { + "epoch": 1.7414335374937924, + "grad_norm": 2.1435697078704834, + "learning_rate": 4.470334252871743e-06, + "loss": 0.398, + "step": 10520 + }, + { + "epoch": 1.7430888925674557, + "grad_norm": 1.8928353786468506, + "learning_rate": 4.4607585531079844e-06, + "loss": 0.4211, + "step": 10530 + }, + { + "epoch": 1.744744247641119, + "grad_norm": 2.309457302093506, + "learning_rate": 4.451184854011898e-06, + "loss": 0.4261, + "step": 10540 + }, + { + "epoch": 1.7463996027147823, + "grad_norm": 1.7776364088058472, + "learning_rate": 4.44161319110336e-06, + "loss": 0.4415, + "step": 10550 + }, + { + "epoch": 1.7480549577884457, + "grad_norm": 1.907957911491394, + "learning_rate": 4.432043599894694e-06, + "loss": 0.4271, + "step": 10560 + }, + { + "epoch": 1.749710312862109, + "grad_norm": 2.8148648738861084, + "learning_rate": 4.422476115890537e-06, + "loss": 0.4588, + "step": 10570 + }, + { + "epoch": 1.7513656679357723, + "grad_norm": 1.931691288948059, + "learning_rate": 4.412910774587708e-06, + "loss": 0.4122, + "step": 10580 + }, + { + "epoch": 1.7530210230094356, + "grad_norm": 2.7117698192596436, + "learning_rate": 4.403347611475073e-06, + "loss": 0.442, + "step": 10590 + }, + { + "epoch": 1.754676378083099, + "grad_norm": 1.596501111984253, + "learning_rate": 4.393786662033424e-06, + "loss": 0.4256, + "step": 10600 + }, + { + "epoch": 1.7563317331567623, + "grad_norm": 2.3910441398620605, + "learning_rate": 4.384227961735334e-06, + "loss": 0.458, + "step": 10610 + }, + { + "epoch": 1.7579870882304254, + "grad_norm": 2.2331271171569824, + "learning_rate": 4.374671546045031e-06, + "loss": 0.3796, + "step": 10620 + }, + { + "epoch": 1.7596424433040887, + "grad_norm": 1.9223552942276, + "learning_rate": 4.365117450418274e-06, + "loss": 0.4167, + "step": 10630 + }, + { + "epoch": 1.761297798377752, + "grad_norm": 2.403165578842163, + "learning_rate": 4.355565710302203e-06, + "loss": 0.4277, + "step": 10640 + }, + { + "epoch": 1.7629531534514153, + "grad_norm": 2.100135564804077, + "learning_rate": 4.346016361135225e-06, + "loss": 0.3965, + "step": 10650 + }, + { + "epoch": 1.7646085085250787, + "grad_norm": 2.3119328022003174, + "learning_rate": 4.3364694383468805e-06, + "loss": 0.462, + "step": 10660 + }, + { + "epoch": 1.7662638635987418, + "grad_norm": 1.6545701026916504, + "learning_rate": 4.3269249773577e-06, + "loss": 0.3747, + "step": 10670 + }, + { + "epoch": 1.767919218672405, + "grad_norm": 2.4669320583343506, + "learning_rate": 4.317383013579084e-06, + "loss": 0.3581, + "step": 10680 + }, + { + "epoch": 1.7695745737460684, + "grad_norm": 1.7150194644927979, + "learning_rate": 4.307843582413166e-06, + "loss": 0.38, + "step": 10690 + }, + { + "epoch": 1.7712299288197317, + "grad_norm": 2.031128168106079, + "learning_rate": 4.298306719252686e-06, + "loss": 0.4136, + "step": 10700 + }, + { + "epoch": 1.772885283893395, + "grad_norm": 2.0807642936706543, + "learning_rate": 4.288772459480854e-06, + "loss": 0.4435, + "step": 10710 + }, + { + "epoch": 1.7745406389670584, + "grad_norm": 2.0820484161376953, + "learning_rate": 4.2792408384712245e-06, + "loss": 0.4107, + "step": 10720 + }, + { + "epoch": 1.7761959940407217, + "grad_norm": 2.2544493675231934, + "learning_rate": 4.269711891587556e-06, + "loss": 0.381, + "step": 10730 + }, + { + "epoch": 1.777851349114385, + "grad_norm": 1.900412917137146, + "learning_rate": 4.260185654183689e-06, + "loss": 0.4134, + "step": 10740 + }, + { + "epoch": 1.7795067041880483, + "grad_norm": 1.8195431232452393, + "learning_rate": 4.250662161603414e-06, + "loss": 0.4334, + "step": 10750 + }, + { + "epoch": 1.7811620592617117, + "grad_norm": 1.9061986207962036, + "learning_rate": 4.24114144918033e-06, + "loss": 0.4173, + "step": 10760 + }, + { + "epoch": 1.782817414335375, + "grad_norm": 2.0322930812835693, + "learning_rate": 4.231623552237731e-06, + "loss": 0.3825, + "step": 10770 + }, + { + "epoch": 1.7844727694090383, + "grad_norm": 1.757241129875183, + "learning_rate": 4.222108506088457e-06, + "loss": 0.3781, + "step": 10780 + }, + { + "epoch": 1.7861281244827016, + "grad_norm": 2.6230099201202393, + "learning_rate": 4.212596346034778e-06, + "loss": 0.4582, + "step": 10790 + }, + { + "epoch": 1.787783479556365, + "grad_norm": 1.8682286739349365, + "learning_rate": 4.20308710736825e-06, + "loss": 0.416, + "step": 10800 + }, + { + "epoch": 1.7894388346300283, + "grad_norm": 1.6394546031951904, + "learning_rate": 4.193580825369591e-06, + "loss": 0.3774, + "step": 10810 + }, + { + "epoch": 1.7910941897036916, + "grad_norm": 1.5209989547729492, + "learning_rate": 4.184077535308554e-06, + "loss": 0.4161, + "step": 10820 + }, + { + "epoch": 1.7927495447773547, + "grad_norm": 2.6634228229522705, + "learning_rate": 4.174577272443788e-06, + "loss": 0.3987, + "step": 10830 + }, + { + "epoch": 1.794404899851018, + "grad_norm": 2.2621631622314453, + "learning_rate": 4.165080072022711e-06, + "loss": 0.4146, + "step": 10840 + }, + { + "epoch": 1.7960602549246814, + "grad_norm": 2.3683760166168213, + "learning_rate": 4.15558596928138e-06, + "loss": 0.4474, + "step": 10850 + }, + { + "epoch": 1.7977156099983447, + "grad_norm": 2.4090867042541504, + "learning_rate": 4.146094999444355e-06, + "loss": 0.3767, + "step": 10860 + }, + { + "epoch": 1.799370965072008, + "grad_norm": 1.345140814781189, + "learning_rate": 4.13660719772458e-06, + "loss": 0.3677, + "step": 10870 + }, + { + "epoch": 1.801026320145671, + "grad_norm": 1.5577553510665894, + "learning_rate": 4.127122599323235e-06, + "loss": 0.4328, + "step": 10880 + }, + { + "epoch": 1.8026816752193344, + "grad_norm": 2.3623404502868652, + "learning_rate": 4.1176412394296265e-06, + "loss": 0.3946, + "step": 10890 + }, + { + "epoch": 1.8043370302929977, + "grad_norm": 1.6678826808929443, + "learning_rate": 4.108163153221036e-06, + "loss": 0.4202, + "step": 10900 + }, + { + "epoch": 1.805992385366661, + "grad_norm": 1.8554919958114624, + "learning_rate": 4.098688375862605e-06, + "loss": 0.4213, + "step": 10910 + }, + { + "epoch": 1.8076477404403244, + "grad_norm": 2.539498805999756, + "learning_rate": 4.089216942507196e-06, + "loss": 0.4458, + "step": 10920 + }, + { + "epoch": 1.8093030955139877, + "grad_norm": 2.0051186084747314, + "learning_rate": 4.0797488882952625e-06, + "loss": 0.4274, + "step": 10930 + }, + { + "epoch": 1.810958450587651, + "grad_norm": 2.1093010902404785, + "learning_rate": 4.070284248354728e-06, + "loss": 0.3565, + "step": 10940 + }, + { + "epoch": 1.8126138056613144, + "grad_norm": 2.032156229019165, + "learning_rate": 4.060823057800842e-06, + "loss": 0.4315, + "step": 10950 + }, + { + "epoch": 1.8142691607349777, + "grad_norm": 1.781123161315918, + "learning_rate": 4.05136535173606e-06, + "loss": 0.4531, + "step": 10960 + }, + { + "epoch": 1.815924515808641, + "grad_norm": 2.380314350128174, + "learning_rate": 4.041911165249905e-06, + "loss": 0.3931, + "step": 10970 + }, + { + "epoch": 1.8175798708823043, + "grad_norm": 1.8698779344558716, + "learning_rate": 4.0324605334188474e-06, + "loss": 0.3803, + "step": 10980 + }, + { + "epoch": 1.8192352259559676, + "grad_norm": 1.6024515628814697, + "learning_rate": 4.0230134913061645e-06, + "loss": 0.394, + "step": 10990 + }, + { + "epoch": 1.820890581029631, + "grad_norm": 2.1540608406066895, + "learning_rate": 4.0135700739618205e-06, + "loss": 0.401, + "step": 11000 + }, + { + "epoch": 1.8225459361032943, + "grad_norm": 2.3779587745666504, + "learning_rate": 4.004130316422327e-06, + "loss": 0.4131, + "step": 11010 + }, + { + "epoch": 1.8242012911769576, + "grad_norm": 1.9901835918426514, + "learning_rate": 3.994694253710617e-06, + "loss": 0.4113, + "step": 11020 + }, + { + "epoch": 1.8258566462506207, + "grad_norm": 1.5083597898483276, + "learning_rate": 3.985261920835917e-06, + "loss": 0.3972, + "step": 11030 + }, + { + "epoch": 1.827512001324284, + "grad_norm": 2.1913838386535645, + "learning_rate": 3.975833352793615e-06, + "loss": 0.4391, + "step": 11040 + }, + { + "epoch": 1.8291673563979474, + "grad_norm": 2.0615127086639404, + "learning_rate": 3.966408584565127e-06, + "loss": 0.4167, + "step": 11050 + }, + { + "epoch": 1.8308227114716107, + "grad_norm": 1.6090701818466187, + "learning_rate": 3.9569876511177805e-06, + "loss": 0.4146, + "step": 11060 + }, + { + "epoch": 1.832478066545274, + "grad_norm": 2.14567494392395, + "learning_rate": 3.947570587404667e-06, + "loss": 0.3594, + "step": 11070 + }, + { + "epoch": 1.834133421618937, + "grad_norm": 1.3794012069702148, + "learning_rate": 3.9381574283645215e-06, + "loss": 0.4172, + "step": 11080 + }, + { + "epoch": 1.8357887766926004, + "grad_norm": 1.5953060388565063, + "learning_rate": 3.928748208921597e-06, + "loss": 0.356, + "step": 11090 + }, + { + "epoch": 1.8374441317662638, + "grad_norm": 2.281615972518921, + "learning_rate": 3.919342963985524e-06, + "loss": 0.3627, + "step": 11100 + }, + { + "epoch": 1.839099486839927, + "grad_norm": 1.8211688995361328, + "learning_rate": 3.909941728451188e-06, + "loss": 0.4197, + "step": 11110 + }, + { + "epoch": 1.8407548419135904, + "grad_norm": 2.0925374031066895, + "learning_rate": 3.900544537198607e-06, + "loss": 0.4497, + "step": 11120 + }, + { + "epoch": 1.8424101969872537, + "grad_norm": 2.1480743885040283, + "learning_rate": 3.891151425092783e-06, + "loss": 0.3789, + "step": 11130 + }, + { + "epoch": 1.844065552060917, + "grad_norm": 1.6905628442764282, + "learning_rate": 3.8817624269835904e-06, + "loss": 0.4086, + "step": 11140 + }, + { + "epoch": 1.8457209071345804, + "grad_norm": 1.8402177095413208, + "learning_rate": 3.872377577705637e-06, + "loss": 0.4026, + "step": 11150 + }, + { + "epoch": 1.8473762622082437, + "grad_norm": 1.9096100330352783, + "learning_rate": 3.862996912078138e-06, + "loss": 0.4017, + "step": 11160 + }, + { + "epoch": 1.849031617281907, + "grad_norm": 2.6157302856445312, + "learning_rate": 3.853620464904792e-06, + "loss": 0.3848, + "step": 11170 + }, + { + "epoch": 1.8506869723555703, + "grad_norm": 3.2500317096710205, + "learning_rate": 3.844248270973639e-06, + "loss": 0.4319, + "step": 11180 + }, + { + "epoch": 1.8523423274292337, + "grad_norm": 2.082878351211548, + "learning_rate": 3.834880365056942e-06, + "loss": 0.4486, + "step": 11190 + }, + { + "epoch": 1.853997682502897, + "grad_norm": 2.111647605895996, + "learning_rate": 3.825516781911056e-06, + "loss": 0.4077, + "step": 11200 + }, + { + "epoch": 1.8556530375765603, + "grad_norm": 1.6086077690124512, + "learning_rate": 3.816157556276295e-06, + "loss": 0.4098, + "step": 11210 + }, + { + "epoch": 1.8573083926502236, + "grad_norm": 2.252664566040039, + "learning_rate": 3.806802722876808e-06, + "loss": 0.4283, + "step": 11220 + }, + { + "epoch": 1.8589637477238867, + "grad_norm": 2.4436564445495605, + "learning_rate": 3.79745231642045e-06, + "loss": 0.4302, + "step": 11230 + }, + { + "epoch": 1.86061910279755, + "grad_norm": 2.0436229705810547, + "learning_rate": 3.7881063715986466e-06, + "loss": 0.4131, + "step": 11240 + }, + { + "epoch": 1.8622744578712134, + "grad_norm": 1.9353731870651245, + "learning_rate": 3.7787649230862746e-06, + "loss": 0.3978, + "step": 11250 + }, + { + "epoch": 1.8639298129448767, + "grad_norm": 1.7772618532180786, + "learning_rate": 3.769428005541525e-06, + "loss": 0.4193, + "step": 11260 + }, + { + "epoch": 1.86558516801854, + "grad_norm": 1.960239052772522, + "learning_rate": 3.76009565360578e-06, + "loss": 0.4456, + "step": 11270 + }, + { + "epoch": 1.8672405230922031, + "grad_norm": 2.3844010829925537, + "learning_rate": 3.7507679019034827e-06, + "loss": 0.4229, + "step": 11280 + }, + { + "epoch": 1.8688958781658664, + "grad_norm": 1.5547751188278198, + "learning_rate": 3.7414447850420116e-06, + "loss": 0.4223, + "step": 11290 + }, + { + "epoch": 1.8705512332395298, + "grad_norm": 2.0283639430999756, + "learning_rate": 3.732126337611544e-06, + "loss": 0.4348, + "step": 11300 + }, + { + "epoch": 1.872206588313193, + "grad_norm": 2.0857224464416504, + "learning_rate": 3.7228125941849347e-06, + "loss": 0.4507, + "step": 11310 + }, + { + "epoch": 1.8738619433868564, + "grad_norm": 1.7453672885894775, + "learning_rate": 3.7135035893175873e-06, + "loss": 0.4225, + "step": 11320 + }, + { + "epoch": 1.8755172984605197, + "grad_norm": 2.237485885620117, + "learning_rate": 3.7041993575473245e-06, + "loss": 0.4036, + "step": 11330 + }, + { + "epoch": 1.877172653534183, + "grad_norm": 1.7037478685379028, + "learning_rate": 3.6948999333942558e-06, + "loss": 0.391, + "step": 11340 + }, + { + "epoch": 1.8788280086078464, + "grad_norm": 2.2421579360961914, + "learning_rate": 3.6856053513606615e-06, + "loss": 0.3896, + "step": 11350 + }, + { + "epoch": 1.8804833636815097, + "grad_norm": 1.633567452430725, + "learning_rate": 3.676315645930851e-06, + "loss": 0.4273, + "step": 11360 + }, + { + "epoch": 1.882138718755173, + "grad_norm": 1.8429170846939087, + "learning_rate": 3.667030851571043e-06, + "loss": 0.3861, + "step": 11370 + }, + { + "epoch": 1.8837940738288363, + "grad_norm": 2.3093669414520264, + "learning_rate": 3.657751002729234e-06, + "loss": 0.3958, + "step": 11380 + }, + { + "epoch": 1.8854494289024997, + "grad_norm": 1.9372445344924927, + "learning_rate": 3.6484761338350703e-06, + "loss": 0.4537, + "step": 11390 + }, + { + "epoch": 1.887104783976163, + "grad_norm": 2.930211305618286, + "learning_rate": 3.6392062792997284e-06, + "loss": 0.3707, + "step": 11400 + }, + { + "epoch": 1.8887601390498263, + "grad_norm": 1.6780815124511719, + "learning_rate": 3.6299414735157767e-06, + "loss": 0.3917, + "step": 11410 + }, + { + "epoch": 1.8904154941234896, + "grad_norm": 2.5266661643981934, + "learning_rate": 3.620681750857049e-06, + "loss": 0.4232, + "step": 11420 + }, + { + "epoch": 1.8920708491971527, + "grad_norm": 1.8783165216445923, + "learning_rate": 3.6114271456785237e-06, + "loss": 0.415, + "step": 11430 + }, + { + "epoch": 1.893726204270816, + "grad_norm": 1.8861968517303467, + "learning_rate": 3.6021776923161927e-06, + "loss": 0.3868, + "step": 11440 + }, + { + "epoch": 1.8953815593444794, + "grad_norm": 1.9357882738113403, + "learning_rate": 3.5929334250869297e-06, + "loss": 0.4461, + "step": 11450 + }, + { + "epoch": 1.8970369144181427, + "grad_norm": 1.3567824363708496, + "learning_rate": 3.5836943782883747e-06, + "loss": 0.3844, + "step": 11460 + }, + { + "epoch": 1.898692269491806, + "grad_norm": 2.8328826427459717, + "learning_rate": 3.5744605861987925e-06, + "loss": 0.4203, + "step": 11470 + }, + { + "epoch": 1.9003476245654691, + "grad_norm": 2.007815361022949, + "learning_rate": 3.565232083076954e-06, + "loss": 0.3869, + "step": 11480 + }, + { + "epoch": 1.9020029796391325, + "grad_norm": 1.5280500650405884, + "learning_rate": 3.556008903162007e-06, + "loss": 0.4193, + "step": 11490 + }, + { + "epoch": 1.9036583347127958, + "grad_norm": 2.2297863960266113, + "learning_rate": 3.546791080673351e-06, + "loss": 0.408, + "step": 11500 + }, + { + "epoch": 1.905313689786459, + "grad_norm": 2.147474527359009, + "learning_rate": 3.537578649810505e-06, + "loss": 0.4071, + "step": 11510 + }, + { + "epoch": 1.9069690448601224, + "grad_norm": 2.1503307819366455, + "learning_rate": 3.52837164475299e-06, + "loss": 0.4037, + "step": 11520 + }, + { + "epoch": 1.9086243999337857, + "grad_norm": 2.1771047115325928, + "learning_rate": 3.519170099660192e-06, + "loss": 0.3948, + "step": 11530 + }, + { + "epoch": 1.910279755007449, + "grad_norm": 1.9087363481521606, + "learning_rate": 3.509974048671241e-06, + "loss": 0.4097, + "step": 11540 + }, + { + "epoch": 1.9119351100811124, + "grad_norm": 1.270500659942627, + "learning_rate": 3.5007835259048818e-06, + "loss": 0.395, + "step": 11550 + }, + { + "epoch": 1.9135904651547757, + "grad_norm": 1.6682640314102173, + "learning_rate": 3.491598565459351e-06, + "loss": 0.3878, + "step": 11560 + }, + { + "epoch": 1.915245820228439, + "grad_norm": 1.9983856678009033, + "learning_rate": 3.482419201412246e-06, + "loss": 0.4232, + "step": 11570 + }, + { + "epoch": 1.9169011753021024, + "grad_norm": 1.9866315126419067, + "learning_rate": 3.473245467820403e-06, + "loss": 0.4339, + "step": 11580 + }, + { + "epoch": 1.9185565303757657, + "grad_norm": 1.7992204427719116, + "learning_rate": 3.4640773987197662e-06, + "loss": 0.4178, + "step": 11590 + }, + { + "epoch": 1.920211885449429, + "grad_norm": 1.685878872871399, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.3865, + "step": 11600 + }, + { + "epoch": 1.9218672405230923, + "grad_norm": 1.3427140712738037, + "learning_rate": 3.4457583900306835e-06, + "loss": 0.4346, + "step": 11610 + }, + { + "epoch": 1.9235225955967556, + "grad_norm": 1.8562428951263428, + "learning_rate": 3.4366075184085403e-06, + "loss": 0.4133, + "step": 11620 + }, + { + "epoch": 1.925177950670419, + "grad_norm": 1.8822894096374512, + "learning_rate": 3.4274624472099626e-06, + "loss": 0.4147, + "step": 11630 + }, + { + "epoch": 1.926833305744082, + "grad_norm": 1.918009638786316, + "learning_rate": 3.4183232103645508e-06, + "loss": 0.392, + "step": 11640 + }, + { + "epoch": 1.9284886608177454, + "grad_norm": 1.8526767492294312, + "learning_rate": 3.409189841780263e-06, + "loss": 0.4166, + "step": 11650 + }, + { + "epoch": 1.9301440158914087, + "grad_norm": 2.163173198699951, + "learning_rate": 3.400062375343283e-06, + "loss": 0.4057, + "step": 11660 + }, + { + "epoch": 1.931799370965072, + "grad_norm": 2.5304856300354004, + "learning_rate": 3.390940844917897e-06, + "loss": 0.3884, + "step": 11670 + }, + { + "epoch": 1.9334547260387354, + "grad_norm": 2.086353302001953, + "learning_rate": 3.3818252843463676e-06, + "loss": 0.3999, + "step": 11680 + }, + { + "epoch": 1.9351100811123985, + "grad_norm": 1.9789005517959595, + "learning_rate": 3.3727157274488113e-06, + "loss": 0.4293, + "step": 11690 + }, + { + "epoch": 1.9367654361860618, + "grad_norm": 1.9313982725143433, + "learning_rate": 3.363612208023068e-06, + "loss": 0.3912, + "step": 11700 + }, + { + "epoch": 1.938420791259725, + "grad_norm": 1.6824219226837158, + "learning_rate": 3.354514759844576e-06, + "loss": 0.3633, + "step": 11710 + }, + { + "epoch": 1.9400761463333884, + "grad_norm": 2.064709424972534, + "learning_rate": 3.345423416666249e-06, + "loss": 0.3762, + "step": 11720 + }, + { + "epoch": 1.9417315014070518, + "grad_norm": 1.9601470232009888, + "learning_rate": 3.336338212218354e-06, + "loss": 0.3626, + "step": 11730 + }, + { + "epoch": 1.943386856480715, + "grad_norm": 1.799816608428955, + "learning_rate": 3.327259180208375e-06, + "loss": 0.3649, + "step": 11740 + }, + { + "epoch": 1.9450422115543784, + "grad_norm": 2.4325437545776367, + "learning_rate": 3.318186354320905e-06, + "loss": 0.3869, + "step": 11750 + }, + { + "epoch": 1.9466975666280417, + "grad_norm": 1.9126996994018555, + "learning_rate": 3.3091197682175023e-06, + "loss": 0.4228, + "step": 11760 + }, + { + "epoch": 1.948352921701705, + "grad_norm": 1.600388526916504, + "learning_rate": 3.300059455536579e-06, + "loss": 0.3965, + "step": 11770 + }, + { + "epoch": 1.9500082767753684, + "grad_norm": 2.1479532718658447, + "learning_rate": 3.291005449893273e-06, + "loss": 0.3816, + "step": 11780 + }, + { + "epoch": 1.9516636318490317, + "grad_norm": 1.8163764476776123, + "learning_rate": 3.281957784879317e-06, + "loss": 0.4336, + "step": 11790 + }, + { + "epoch": 1.953318986922695, + "grad_norm": 2.1194796562194824, + "learning_rate": 3.2729164940629264e-06, + "loss": 0.4344, + "step": 11800 + }, + { + "epoch": 1.9549743419963583, + "grad_norm": 1.6033542156219482, + "learning_rate": 3.2638816109886604e-06, + "loss": 0.3745, + "step": 11810 + }, + { + "epoch": 1.9566296970700217, + "grad_norm": 2.0563807487487793, + "learning_rate": 3.254853169177311e-06, + "loss": 0.408, + "step": 11820 + }, + { + "epoch": 1.958285052143685, + "grad_norm": 2.165522813796997, + "learning_rate": 3.2458312021257656e-06, + "loss": 0.417, + "step": 11830 + }, + { + "epoch": 1.959940407217348, + "grad_norm": 1.5800554752349854, + "learning_rate": 3.2368157433068916e-06, + "loss": 0.4462, + "step": 11840 + }, + { + "epoch": 1.9615957622910114, + "grad_norm": 1.5213532447814941, + "learning_rate": 3.2278068261694106e-06, + "loss": 0.3837, + "step": 11850 + }, + { + "epoch": 1.9632511173646747, + "grad_norm": 1.8690297603607178, + "learning_rate": 3.2188044841377773e-06, + "loss": 0.3819, + "step": 11860 + }, + { + "epoch": 1.964906472438338, + "grad_norm": 1.4503165483474731, + "learning_rate": 3.2098087506120456e-06, + "loss": 0.4349, + "step": 11870 + }, + { + "epoch": 1.9665618275120014, + "grad_norm": 1.9367406368255615, + "learning_rate": 3.2008196589677532e-06, + "loss": 0.465, + "step": 11880 + }, + { + "epoch": 1.9682171825856645, + "grad_norm": 3.2155959606170654, + "learning_rate": 3.1918372425557932e-06, + "loss": 0.3958, + "step": 11890 + }, + { + "epoch": 1.9698725376593278, + "grad_norm": 2.281371593475342, + "learning_rate": 3.1828615347022984e-06, + "loss": 0.4041, + "step": 11900 + }, + { + "epoch": 1.9715278927329911, + "grad_norm": 1.9571949243545532, + "learning_rate": 3.173892568708505e-06, + "loss": 0.3746, + "step": 11910 + }, + { + "epoch": 1.9731832478066544, + "grad_norm": 1.9914932250976562, + "learning_rate": 3.1649303778506425e-06, + "loss": 0.4203, + "step": 11920 + }, + { + "epoch": 1.9748386028803178, + "grad_norm": 1.7412394285202026, + "learning_rate": 3.1559749953797973e-06, + "loss": 0.3784, + "step": 11930 + }, + { + "epoch": 1.976493957953981, + "grad_norm": 1.6468333005905151, + "learning_rate": 3.147026454521801e-06, + "loss": 0.4356, + "step": 11940 + }, + { + "epoch": 1.9781493130276444, + "grad_norm": 1.9956331253051758, + "learning_rate": 3.138084788477098e-06, + "loss": 0.4486, + "step": 11950 + }, + { + "epoch": 1.9798046681013077, + "grad_norm": 2.0449624061584473, + "learning_rate": 3.1291500304206262e-06, + "loss": 0.3309, + "step": 11960 + }, + { + "epoch": 1.981460023174971, + "grad_norm": 2.1764378547668457, + "learning_rate": 3.120222213501697e-06, + "loss": 0.4128, + "step": 11970 + }, + { + "epoch": 1.9831153782486344, + "grad_norm": 1.891964077949524, + "learning_rate": 3.1113013708438653e-06, + "loss": 0.4075, + "step": 11980 + }, + { + "epoch": 1.9847707333222977, + "grad_norm": 2.1923632621765137, + "learning_rate": 3.1023875355448153e-06, + "loss": 0.4233, + "step": 11990 + }, + { + "epoch": 1.986426088395961, + "grad_norm": 2.2308120727539062, + "learning_rate": 3.093480740676228e-06, + "loss": 0.4074, + "step": 12000 + }, + { + "epoch": 1.9880814434696243, + "grad_norm": 2.1118459701538086, + "learning_rate": 3.0845810192836645e-06, + "loss": 0.423, + "step": 12010 + }, + { + "epoch": 1.9897367985432877, + "grad_norm": 1.4029134511947632, + "learning_rate": 3.075688404386442e-06, + "loss": 0.3564, + "step": 12020 + }, + { + "epoch": 1.991392153616951, + "grad_norm": 1.9621480703353882, + "learning_rate": 3.0668029289775163e-06, + "loss": 0.3869, + "step": 12030 + }, + { + "epoch": 1.993047508690614, + "grad_norm": 1.8076422214508057, + "learning_rate": 3.0579246260233486e-06, + "loss": 0.4299, + "step": 12040 + }, + { + "epoch": 1.9947028637642774, + "grad_norm": 2.1234450340270996, + "learning_rate": 3.04905352846379e-06, + "loss": 0.4006, + "step": 12050 + }, + { + "epoch": 1.9963582188379407, + "grad_norm": 1.8213273286819458, + "learning_rate": 3.0401896692119626e-06, + "loss": 0.4036, + "step": 12060 + }, + { + "epoch": 1.998013573911604, + "grad_norm": 2.5499227046966553, + "learning_rate": 3.031333081154129e-06, + "loss": 0.4424, + "step": 12070 + }, + { + "epoch": 1.9996689289852674, + "grad_norm": 2.273587942123413, + "learning_rate": 3.0224837971495736e-06, + "loss": 0.4118, + "step": 12080 + }, + { + "epoch": 2.0013242840589305, + "grad_norm": 1.7035340070724487, + "learning_rate": 3.0136418500304888e-06, + "loss": 0.3556, + "step": 12090 + }, + { + "epoch": 2.002979639132594, + "grad_norm": 1.4942774772644043, + "learning_rate": 3.0048072726018386e-06, + "loss": 0.3315, + "step": 12100 + }, + { + "epoch": 2.004634994206257, + "grad_norm": 1.6278843879699707, + "learning_rate": 2.995980097641248e-06, + "loss": 0.3448, + "step": 12110 + }, + { + "epoch": 2.0062903492799204, + "grad_norm": 2.276960849761963, + "learning_rate": 2.987160357898877e-06, + "loss": 0.3363, + "step": 12120 + }, + { + "epoch": 2.0079457043535838, + "grad_norm": 1.993857502937317, + "learning_rate": 2.978348086097298e-06, + "loss": 0.3246, + "step": 12130 + }, + { + "epoch": 2.009601059427247, + "grad_norm": 1.7416425943374634, + "learning_rate": 2.9695433149313774e-06, + "loss": 0.3325, + "step": 12140 + }, + { + "epoch": 2.0112564145009104, + "grad_norm": 2.1714820861816406, + "learning_rate": 2.960746077068158e-06, + "loss": 0.3331, + "step": 12150 + }, + { + "epoch": 2.0129117695745737, + "grad_norm": 1.8540669679641724, + "learning_rate": 2.951956405146725e-06, + "loss": 0.3549, + "step": 12160 + }, + { + "epoch": 2.014567124648237, + "grad_norm": 1.6556892395019531, + "learning_rate": 2.9431743317780957e-06, + "loss": 0.3138, + "step": 12170 + }, + { + "epoch": 2.0162224797219004, + "grad_norm": 1.8258768320083618, + "learning_rate": 2.934399889545099e-06, + "loss": 0.3415, + "step": 12180 + }, + { + "epoch": 2.0178778347955637, + "grad_norm": 1.6194030046463013, + "learning_rate": 2.9256331110022463e-06, + "loss": 0.3436, + "step": 12190 + }, + { + "epoch": 2.019533189869227, + "grad_norm": 2.233779191970825, + "learning_rate": 2.9168740286756157e-06, + "loss": 0.3653, + "step": 12200 + }, + { + "epoch": 2.0211885449428904, + "grad_norm": 1.9311305284500122, + "learning_rate": 2.9081226750627367e-06, + "loss": 0.3901, + "step": 12210 + }, + { + "epoch": 2.0228439000165537, + "grad_norm": 2.0436785221099854, + "learning_rate": 2.89937908263246e-06, + "loss": 0.343, + "step": 12220 + }, + { + "epoch": 2.024499255090217, + "grad_norm": 1.8099346160888672, + "learning_rate": 2.890643283824837e-06, + "loss": 0.3493, + "step": 12230 + }, + { + "epoch": 2.0261546101638803, + "grad_norm": 2.596013307571411, + "learning_rate": 2.8819153110510147e-06, + "loss": 0.3721, + "step": 12240 + }, + { + "epoch": 2.0278099652375436, + "grad_norm": 2.1353633403778076, + "learning_rate": 2.8731951966930917e-06, + "loss": 0.361, + "step": 12250 + }, + { + "epoch": 2.029465320311207, + "grad_norm": 2.3608481884002686, + "learning_rate": 2.8644829731040214e-06, + "loss": 0.3404, + "step": 12260 + }, + { + "epoch": 2.03112067538487, + "grad_norm": 1.9163801670074463, + "learning_rate": 2.8557786726074755e-06, + "loss": 0.3844, + "step": 12270 + }, + { + "epoch": 2.032776030458533, + "grad_norm": 2.0741047859191895, + "learning_rate": 2.84708232749773e-06, + "loss": 0.3721, + "step": 12280 + }, + { + "epoch": 2.0344313855321965, + "grad_norm": 2.1986711025238037, + "learning_rate": 2.8383939700395456e-06, + "loss": 0.3018, + "step": 12290 + }, + { + "epoch": 2.03608674060586, + "grad_norm": 1.790903925895691, + "learning_rate": 2.8297136324680498e-06, + "loss": 0.3716, + "step": 12300 + }, + { + "epoch": 2.037742095679523, + "grad_norm": 1.8230623006820679, + "learning_rate": 2.8210413469886094e-06, + "loss": 0.3459, + "step": 12310 + }, + { + "epoch": 2.0393974507531865, + "grad_norm": 1.7864844799041748, + "learning_rate": 2.812377145776724e-06, + "loss": 0.3428, + "step": 12320 + }, + { + "epoch": 2.04105280582685, + "grad_norm": 1.8330215215682983, + "learning_rate": 2.8037210609778975e-06, + "loss": 0.3533, + "step": 12330 + }, + { + "epoch": 2.042708160900513, + "grad_norm": 2.0148110389709473, + "learning_rate": 2.795073124707518e-06, + "loss": 0.3093, + "step": 12340 + }, + { + "epoch": 2.0443635159741764, + "grad_norm": 1.9366060495376587, + "learning_rate": 2.786433369050742e-06, + "loss": 0.3289, + "step": 12350 + }, + { + "epoch": 2.0460188710478397, + "grad_norm": 3.5124638080596924, + "learning_rate": 2.777801826062375e-06, + "loss": 0.3476, + "step": 12360 + }, + { + "epoch": 2.047674226121503, + "grad_norm": 2.0207085609436035, + "learning_rate": 2.7691785277667506e-06, + "loss": 0.3595, + "step": 12370 + }, + { + "epoch": 2.0493295811951664, + "grad_norm": 1.430230736732483, + "learning_rate": 2.7605635061576195e-06, + "loss": 0.3462, + "step": 12380 + }, + { + "epoch": 2.0509849362688297, + "grad_norm": 2.4573211669921875, + "learning_rate": 2.7519567931980185e-06, + "loss": 0.3782, + "step": 12390 + }, + { + "epoch": 2.052640291342493, + "grad_norm": 1.84562087059021, + "learning_rate": 2.7433584208201577e-06, + "loss": 0.3536, + "step": 12400 + }, + { + "epoch": 2.0542956464161564, + "grad_norm": 1.9053126573562622, + "learning_rate": 2.734768420925308e-06, + "loss": 0.3187, + "step": 12410 + }, + { + "epoch": 2.0559510014898197, + "grad_norm": 2.1721508502960205, + "learning_rate": 2.726186825383673e-06, + "loss": 0.3245, + "step": 12420 + }, + { + "epoch": 2.057606356563483, + "grad_norm": 2.5854411125183105, + "learning_rate": 2.717613666034272e-06, + "loss": 0.3681, + "step": 12430 + }, + { + "epoch": 2.0592617116371463, + "grad_norm": 1.7167317867279053, + "learning_rate": 2.7090489746848336e-06, + "loss": 0.3286, + "step": 12440 + }, + { + "epoch": 2.0609170667108097, + "grad_norm": 1.8206017017364502, + "learning_rate": 2.7004927831116614e-06, + "loss": 0.3294, + "step": 12450 + }, + { + "epoch": 2.062572421784473, + "grad_norm": 1.7760072946548462, + "learning_rate": 2.691945123059525e-06, + "loss": 0.3465, + "step": 12460 + }, + { + "epoch": 2.064227776858136, + "grad_norm": 2.4801549911499023, + "learning_rate": 2.6834060262415425e-06, + "loss": 0.3392, + "step": 12470 + }, + { + "epoch": 2.065883131931799, + "grad_norm": 2.1817026138305664, + "learning_rate": 2.674875524339057e-06, + "loss": 0.3275, + "step": 12480 + }, + { + "epoch": 2.0675384870054625, + "grad_norm": 1.8844431638717651, + "learning_rate": 2.6663536490015284e-06, + "loss": 0.3257, + "step": 12490 + }, + { + "epoch": 2.069193842079126, + "grad_norm": 2.3225162029266357, + "learning_rate": 2.6578404318464112e-06, + "loss": 0.3575, + "step": 12500 + }, + { + "epoch": 2.070849197152789, + "grad_norm": 1.740742802619934, + "learning_rate": 2.6493359044590317e-06, + "loss": 0.3333, + "step": 12510 + }, + { + "epoch": 2.0725045522264525, + "grad_norm": 2.0152180194854736, + "learning_rate": 2.640840098392478e-06, + "loss": 0.3601, + "step": 12520 + }, + { + "epoch": 2.074159907300116, + "grad_norm": 2.1492371559143066, + "learning_rate": 2.6323530451674815e-06, + "loss": 0.3131, + "step": 12530 + }, + { + "epoch": 2.075815262373779, + "grad_norm": 2.565166473388672, + "learning_rate": 2.623874776272296e-06, + "loss": 0.3644, + "step": 12540 + }, + { + "epoch": 2.0774706174474424, + "grad_norm": 2.024240493774414, + "learning_rate": 2.615405323162592e-06, + "loss": 0.2927, + "step": 12550 + }, + { + "epoch": 2.0791259725211058, + "grad_norm": 1.9475926160812378, + "learning_rate": 2.606944717261323e-06, + "loss": 0.3036, + "step": 12560 + }, + { + "epoch": 2.080781327594769, + "grad_norm": 2.2539281845092773, + "learning_rate": 2.598492989958622e-06, + "loss": 0.3522, + "step": 12570 + }, + { + "epoch": 2.0824366826684324, + "grad_norm": 1.7754216194152832, + "learning_rate": 2.590050172611684e-06, + "loss": 0.3114, + "step": 12580 + }, + { + "epoch": 2.0840920377420957, + "grad_norm": 1.9726289510726929, + "learning_rate": 2.5816162965446416e-06, + "loss": 0.3297, + "step": 12590 + }, + { + "epoch": 2.085747392815759, + "grad_norm": 1.9303443431854248, + "learning_rate": 2.5731913930484543e-06, + "loss": 0.3168, + "step": 12600 + }, + { + "epoch": 2.0874027478894224, + "grad_norm": 3.2593839168548584, + "learning_rate": 2.564775493380798e-06, + "loss": 0.3679, + "step": 12610 + }, + { + "epoch": 2.0890581029630857, + "grad_norm": 2.273982524871826, + "learning_rate": 2.556368628765935e-06, + "loss": 0.3396, + "step": 12620 + }, + { + "epoch": 2.090713458036749, + "grad_norm": 2.084463119506836, + "learning_rate": 2.5479708303946126e-06, + "loss": 0.3432, + "step": 12630 + }, + { + "epoch": 2.0923688131104123, + "grad_norm": 1.9511936902999878, + "learning_rate": 2.539582129423934e-06, + "loss": 0.3304, + "step": 12640 + }, + { + "epoch": 2.0940241681840757, + "grad_norm": 3.004499912261963, + "learning_rate": 2.531202556977259e-06, + "loss": 0.3302, + "step": 12650 + }, + { + "epoch": 2.095679523257739, + "grad_norm": 2.4408180713653564, + "learning_rate": 2.5228321441440705e-06, + "loss": 0.3696, + "step": 12660 + }, + { + "epoch": 2.0973348783314023, + "grad_norm": 1.5785390138626099, + "learning_rate": 2.514470921979874e-06, + "loss": 0.3248, + "step": 12670 + }, + { + "epoch": 2.098990233405065, + "grad_norm": 2.020374059677124, + "learning_rate": 2.5061189215060733e-06, + "loss": 0.3153, + "step": 12680 + }, + { + "epoch": 2.1006455884787285, + "grad_norm": 4.800167560577393, + "learning_rate": 2.497776173709857e-06, + "loss": 0.3579, + "step": 12690 + }, + { + "epoch": 2.102300943552392, + "grad_norm": 2.8129446506500244, + "learning_rate": 2.4894427095440883e-06, + "loss": 0.3541, + "step": 12700 + }, + { + "epoch": 2.103956298626055, + "grad_norm": 2.103689670562744, + "learning_rate": 2.4811185599271813e-06, + "loss": 0.3655, + "step": 12710 + }, + { + "epoch": 2.1056116536997185, + "grad_norm": 2.179983139038086, + "learning_rate": 2.4728037557430012e-06, + "loss": 0.381, + "step": 12720 + }, + { + "epoch": 2.107267008773382, + "grad_norm": 1.8805456161499023, + "learning_rate": 2.464498327840729e-06, + "loss": 0.3306, + "step": 12730 + }, + { + "epoch": 2.108922363847045, + "grad_norm": 1.8576078414916992, + "learning_rate": 2.4562023070347685e-06, + "loss": 0.3927, + "step": 12740 + }, + { + "epoch": 2.1105777189207084, + "grad_norm": 2.150397300720215, + "learning_rate": 2.4479157241046143e-06, + "loss": 0.3269, + "step": 12750 + }, + { + "epoch": 2.1122330739943718, + "grad_norm": 1.8863481283187866, + "learning_rate": 2.4396386097947482e-06, + "loss": 0.3511, + "step": 12760 + }, + { + "epoch": 2.113888429068035, + "grad_norm": 1.7663947343826294, + "learning_rate": 2.4313709948145193e-06, + "loss": 0.3388, + "step": 12770 + }, + { + "epoch": 2.1155437841416984, + "grad_norm": 2.267031669616699, + "learning_rate": 2.423112909838039e-06, + "loss": 0.3502, + "step": 12780 + }, + { + "epoch": 2.1171991392153617, + "grad_norm": 2.0435879230499268, + "learning_rate": 2.4148643855040543e-06, + "loss": 0.3621, + "step": 12790 + }, + { + "epoch": 2.118854494289025, + "grad_norm": 2.001474618911743, + "learning_rate": 2.406625452415844e-06, + "loss": 0.3321, + "step": 12800 + }, + { + "epoch": 2.1205098493626884, + "grad_norm": 1.835279941558838, + "learning_rate": 2.398396141141099e-06, + "loss": 0.369, + "step": 12810 + }, + { + "epoch": 2.1221652044363517, + "grad_norm": 1.7734911441802979, + "learning_rate": 2.390176482211818e-06, + "loss": 0.34, + "step": 12820 + }, + { + "epoch": 2.123820559510015, + "grad_norm": 2.4885096549987793, + "learning_rate": 2.3819665061241798e-06, + "loss": 0.3385, + "step": 12830 + }, + { + "epoch": 2.1254759145836783, + "grad_norm": 1.844212293624878, + "learning_rate": 2.3737662433384474e-06, + "loss": 0.3229, + "step": 12840 + }, + { + "epoch": 2.1271312696573417, + "grad_norm": 1.8578639030456543, + "learning_rate": 2.3655757242788385e-06, + "loss": 0.354, + "step": 12850 + }, + { + "epoch": 2.128786624731005, + "grad_norm": 2.1224701404571533, + "learning_rate": 2.357394979333423e-06, + "loss": 0.3277, + "step": 12860 + }, + { + "epoch": 2.130441979804668, + "grad_norm": 1.8834820985794067, + "learning_rate": 2.349224038854007e-06, + "loss": 0.3472, + "step": 12870 + }, + { + "epoch": 2.132097334878331, + "grad_norm": 1.910531759262085, + "learning_rate": 2.3410629331560197e-06, + "loss": 0.3553, + "step": 12880 + }, + { + "epoch": 2.1337526899519945, + "grad_norm": 2.2018206119537354, + "learning_rate": 2.332911692518406e-06, + "loss": 0.3408, + "step": 12890 + }, + { + "epoch": 2.135408045025658, + "grad_norm": 1.6149979829788208, + "learning_rate": 2.3247703471835028e-06, + "loss": 0.3158, + "step": 12900 + }, + { + "epoch": 2.137063400099321, + "grad_norm": 2.036621570587158, + "learning_rate": 2.3166389273569416e-06, + "loss": 0.3459, + "step": 12910 + }, + { + "epoch": 2.1387187551729845, + "grad_norm": 2.2086009979248047, + "learning_rate": 2.3085174632075234e-06, + "loss": 0.3457, + "step": 12920 + }, + { + "epoch": 2.140374110246648, + "grad_norm": 1.6859116554260254, + "learning_rate": 2.3004059848671133e-06, + "loss": 0.3355, + "step": 12930 + }, + { + "epoch": 2.142029465320311, + "grad_norm": 2.271038770675659, + "learning_rate": 2.2923045224305267e-06, + "loss": 0.3251, + "step": 12940 + }, + { + "epoch": 2.1436848203939745, + "grad_norm": 1.8713353872299194, + "learning_rate": 2.2842131059554228e-06, + "loss": 0.3506, + "step": 12950 + }, + { + "epoch": 2.1453401754676378, + "grad_norm": 2.219752550125122, + "learning_rate": 2.2761317654621853e-06, + "loss": 0.3681, + "step": 12960 + }, + { + "epoch": 2.146995530541301, + "grad_norm": 2.1416406631469727, + "learning_rate": 2.2680605309338103e-06, + "loss": 0.3645, + "step": 12970 + }, + { + "epoch": 2.1486508856149644, + "grad_norm": 1.8972928524017334, + "learning_rate": 2.2599994323158103e-06, + "loss": 0.3625, + "step": 12980 + }, + { + "epoch": 2.1503062406886277, + "grad_norm": 1.7961077690124512, + "learning_rate": 2.2519484995160818e-06, + "loss": 0.3336, + "step": 12990 + }, + { + "epoch": 2.151961595762291, + "grad_norm": 2.188765287399292, + "learning_rate": 2.2439077624048074e-06, + "loss": 0.3458, + "step": 13000 + }, + { + "epoch": 2.1536169508359544, + "grad_norm": 1.96406888961792, + "learning_rate": 2.2358772508143467e-06, + "loss": 0.3536, + "step": 13010 + }, + { + "epoch": 2.1552723059096177, + "grad_norm": 2.556922435760498, + "learning_rate": 2.227856994539115e-06, + "loss": 0.3177, + "step": 13020 + }, + { + "epoch": 2.156927660983281, + "grad_norm": 2.1325490474700928, + "learning_rate": 2.2198470233354817e-06, + "loss": 0.3601, + "step": 13030 + }, + { + "epoch": 2.1585830160569444, + "grad_norm": 2.0026392936706543, + "learning_rate": 2.2118473669216568e-06, + "loss": 0.3097, + "step": 13040 + }, + { + "epoch": 2.1602383711306077, + "grad_norm": 2.2289645671844482, + "learning_rate": 2.203858054977578e-06, + "loss": 0.4102, + "step": 13050 + }, + { + "epoch": 2.161893726204271, + "grad_norm": 1.3882899284362793, + "learning_rate": 2.1958791171448083e-06, + "loss": 0.3555, + "step": 13060 + }, + { + "epoch": 2.1635490812779343, + "grad_norm": 1.9401888847351074, + "learning_rate": 2.1879105830264213e-06, + "loss": 0.3216, + "step": 13070 + }, + { + "epoch": 2.1652044363515976, + "grad_norm": 2.3579916954040527, + "learning_rate": 2.179952482186886e-06, + "loss": 0.3729, + "step": 13080 + }, + { + "epoch": 2.1668597914252605, + "grad_norm": 1.7595404386520386, + "learning_rate": 2.1720048441519665e-06, + "loss": 0.3457, + "step": 13090 + }, + { + "epoch": 2.168515146498924, + "grad_norm": 1.9767661094665527, + "learning_rate": 2.164067698408606e-06, + "loss": 0.3442, + "step": 13100 + }, + { + "epoch": 2.170170501572587, + "grad_norm": 2.1314504146575928, + "learning_rate": 2.1561410744048213e-06, + "loss": 0.3674, + "step": 13110 + }, + { + "epoch": 2.1718258566462505, + "grad_norm": 2.088491439819336, + "learning_rate": 2.148225001549589e-06, + "loss": 0.3049, + "step": 13120 + }, + { + "epoch": 2.173481211719914, + "grad_norm": 1.943539023399353, + "learning_rate": 2.140319509212746e-06, + "loss": 0.3338, + "step": 13130 + }, + { + "epoch": 2.175136566793577, + "grad_norm": 1.8165936470031738, + "learning_rate": 2.1324246267248657e-06, + "loss": 0.343, + "step": 13140 + }, + { + "epoch": 2.1767919218672405, + "grad_norm": 1.5986518859863281, + "learning_rate": 2.124540383377165e-06, + "loss": 0.3654, + "step": 13150 + }, + { + "epoch": 2.178447276940904, + "grad_norm": 2.2084920406341553, + "learning_rate": 2.1166668084213822e-06, + "loss": 0.3188, + "step": 13160 + }, + { + "epoch": 2.180102632014567, + "grad_norm": 1.8947356939315796, + "learning_rate": 2.1088039310696744e-06, + "loss": 0.3347, + "step": 13170 + }, + { + "epoch": 2.1817579870882304, + "grad_norm": 1.6848961114883423, + "learning_rate": 2.1009517804945146e-06, + "loss": 0.3666, + "step": 13180 + }, + { + "epoch": 2.1834133421618938, + "grad_norm": 1.5556492805480957, + "learning_rate": 2.0931103858285725e-06, + "loss": 0.3552, + "step": 13190 + }, + { + "epoch": 2.185068697235557, + "grad_norm": 1.661304235458374, + "learning_rate": 2.0852797761646125e-06, + "loss": 0.3119, + "step": 13200 + }, + { + "epoch": 2.1867240523092204, + "grad_norm": 2.146554470062256, + "learning_rate": 2.0774599805553873e-06, + "loss": 0.3636, + "step": 13210 + }, + { + "epoch": 2.1883794073828837, + "grad_norm": 2.547508478164673, + "learning_rate": 2.069651028013523e-06, + "loss": 0.3619, + "step": 13220 + }, + { + "epoch": 2.190034762456547, + "grad_norm": 1.6659605503082275, + "learning_rate": 2.0618529475114218e-06, + "loss": 0.2849, + "step": 13230 + }, + { + "epoch": 2.1916901175302104, + "grad_norm": 2.031393051147461, + "learning_rate": 2.054065767981149e-06, + "loss": 0.3449, + "step": 13240 + }, + { + "epoch": 2.1933454726038737, + "grad_norm": 1.7782469987869263, + "learning_rate": 2.0462895183143217e-06, + "loss": 0.3301, + "step": 13250 + }, + { + "epoch": 2.195000827677537, + "grad_norm": 2.75504994392395, + "learning_rate": 2.038524227362007e-06, + "loss": 0.3668, + "step": 13260 + }, + { + "epoch": 2.1966561827512003, + "grad_norm": 2.0453379154205322, + "learning_rate": 2.0307699239346136e-06, + "loss": 0.366, + "step": 13270 + }, + { + "epoch": 2.198311537824863, + "grad_norm": 2.0684151649475098, + "learning_rate": 2.023026636801785e-06, + "loss": 0.361, + "step": 13280 + }, + { + "epoch": 2.1999668928985265, + "grad_norm": 2.427762746810913, + "learning_rate": 2.0152943946922904e-06, + "loss": 0.3451, + "step": 13290 + }, + { + "epoch": 2.20162224797219, + "grad_norm": 1.8866089582443237, + "learning_rate": 2.007573226293927e-06, + "loss": 0.3214, + "step": 13300 + }, + { + "epoch": 2.203277603045853, + "grad_norm": 2.268911600112915, + "learning_rate": 1.999863160253398e-06, + "loss": 0.3247, + "step": 13310 + }, + { + "epoch": 2.2049329581195165, + "grad_norm": 2.1680514812469482, + "learning_rate": 1.992164225176223e-06, + "loss": 0.3316, + "step": 13320 + }, + { + "epoch": 2.20658831319318, + "grad_norm": 1.8509600162506104, + "learning_rate": 1.9844764496266195e-06, + "loss": 0.3393, + "step": 13330 + }, + { + "epoch": 2.208243668266843, + "grad_norm": 1.7594846487045288, + "learning_rate": 1.9767998621274014e-06, + "loss": 0.3732, + "step": 13340 + }, + { + "epoch": 2.2098990233405065, + "grad_norm": 2.6831443309783936, + "learning_rate": 1.969134491159873e-06, + "loss": 0.3263, + "step": 13350 + }, + { + "epoch": 2.21155437841417, + "grad_norm": 1.7843753099441528, + "learning_rate": 1.961480365163729e-06, + "loss": 0.3278, + "step": 13360 + }, + { + "epoch": 2.213209733487833, + "grad_norm": 2.025347948074341, + "learning_rate": 1.953837512536936e-06, + "loss": 0.3137, + "step": 13370 + }, + { + "epoch": 2.2148650885614964, + "grad_norm": 2.1478426456451416, + "learning_rate": 1.9462059616356377e-06, + "loss": 0.3597, + "step": 13380 + }, + { + "epoch": 2.2165204436351598, + "grad_norm": 2.198599100112915, + "learning_rate": 1.9385857407740504e-06, + "loss": 0.3527, + "step": 13390 + }, + { + "epoch": 2.218175798708823, + "grad_norm": 2.009014844894409, + "learning_rate": 1.9309768782243473e-06, + "loss": 0.3131, + "step": 13400 + }, + { + "epoch": 2.2198311537824864, + "grad_norm": 2.736865997314453, + "learning_rate": 1.9233794022165674e-06, + "loss": 0.3361, + "step": 13410 + }, + { + "epoch": 2.2214865088561497, + "grad_norm": 1.7309480905532837, + "learning_rate": 1.9157933409384993e-06, + "loss": 0.3225, + "step": 13420 + }, + { + "epoch": 2.223141863929813, + "grad_norm": 2.736997127532959, + "learning_rate": 1.908218722535582e-06, + "loss": 0.3144, + "step": 13430 + }, + { + "epoch": 2.2247972190034764, + "grad_norm": 1.6784467697143555, + "learning_rate": 1.9006555751108001e-06, + "loss": 0.34, + "step": 13440 + }, + { + "epoch": 2.2264525740771397, + "grad_norm": 2.2146949768066406, + "learning_rate": 1.8931039267245803e-06, + "loss": 0.3752, + "step": 13450 + }, + { + "epoch": 2.228107929150803, + "grad_norm": 1.708247423171997, + "learning_rate": 1.8855638053946823e-06, + "loss": 0.3439, + "step": 13460 + }, + { + "epoch": 2.2297632842244663, + "grad_norm": 2.3849120140075684, + "learning_rate": 1.8780352390961042e-06, + "loss": 0.3661, + "step": 13470 + }, + { + "epoch": 2.2314186392981297, + "grad_norm": 2.215123176574707, + "learning_rate": 1.8705182557609714e-06, + "loss": 0.3341, + "step": 13480 + }, + { + "epoch": 2.2330739943717925, + "grad_norm": 3.159491777420044, + "learning_rate": 1.8630128832784323e-06, + "loss": 0.3365, + "step": 13490 + }, + { + "epoch": 2.234729349445456, + "grad_norm": 1.2012838125228882, + "learning_rate": 1.8555191494945586e-06, + "loss": 0.3193, + "step": 13500 + }, + { + "epoch": 2.236384704519119, + "grad_norm": 2.053283452987671, + "learning_rate": 1.8480370822122412e-06, + "loss": 0.3612, + "step": 13510 + }, + { + "epoch": 2.2380400595927825, + "grad_norm": 2.0240302085876465, + "learning_rate": 1.8405667091910845e-06, + "loss": 0.3702, + "step": 13520 + }, + { + "epoch": 2.239695414666446, + "grad_norm": 2.051102638244629, + "learning_rate": 1.83310805814731e-06, + "loss": 0.3525, + "step": 13530 + }, + { + "epoch": 2.241350769740109, + "grad_norm": 2.752277135848999, + "learning_rate": 1.8256611567536442e-06, + "loss": 0.3599, + "step": 13540 + }, + { + "epoch": 2.2430061248137725, + "grad_norm": 1.6095753908157349, + "learning_rate": 1.8182260326392208e-06, + "loss": 0.2926, + "step": 13550 + }, + { + "epoch": 2.244661479887436, + "grad_norm": 2.0322036743164062, + "learning_rate": 1.8108027133894828e-06, + "loss": 0.3475, + "step": 13560 + }, + { + "epoch": 2.246316834961099, + "grad_norm": 1.9392104148864746, + "learning_rate": 1.8033912265460695e-06, + "loss": 0.3489, + "step": 13570 + }, + { + "epoch": 2.2479721900347625, + "grad_norm": 1.6775577068328857, + "learning_rate": 1.7959915996067256e-06, + "loss": 0.3406, + "step": 13580 + }, + { + "epoch": 2.2496275451084258, + "grad_norm": 1.9670125246047974, + "learning_rate": 1.7886038600251888e-06, + "loss": 0.3347, + "step": 13590 + }, + { + "epoch": 2.251282900182089, + "grad_norm": 2.1814675331115723, + "learning_rate": 1.7812280352110956e-06, + "loss": 0.3371, + "step": 13600 + }, + { + "epoch": 2.2529382552557524, + "grad_norm": 1.8385521173477173, + "learning_rate": 1.7738641525298766e-06, + "loss": 0.3531, + "step": 13610 + }, + { + "epoch": 2.2545936103294157, + "grad_norm": 1.961987018585205, + "learning_rate": 1.7665122393026523e-06, + "loss": 0.3291, + "step": 13620 + }, + { + "epoch": 2.256248965403079, + "grad_norm": 2.28271746635437, + "learning_rate": 1.759172322806142e-06, + "loss": 0.3615, + "step": 13630 + }, + { + "epoch": 2.2579043204767424, + "grad_norm": 1.908655047416687, + "learning_rate": 1.7518444302725467e-06, + "loss": 0.3458, + "step": 13640 + }, + { + "epoch": 2.2595596755504057, + "grad_norm": 2.255382776260376, + "learning_rate": 1.7445285888894641e-06, + "loss": 0.3442, + "step": 13650 + }, + { + "epoch": 2.261215030624069, + "grad_norm": 1.4033169746398926, + "learning_rate": 1.7372248257997753e-06, + "loss": 0.3455, + "step": 13660 + }, + { + "epoch": 2.2628703856977324, + "grad_norm": 2.0262038707733154, + "learning_rate": 1.7299331681015508e-06, + "loss": 0.3265, + "step": 13670 + }, + { + "epoch": 2.2645257407713952, + "grad_norm": 2.379423141479492, + "learning_rate": 1.722653642847948e-06, + "loss": 0.3533, + "step": 13680 + }, + { + "epoch": 2.2661810958450586, + "grad_norm": 1.8121516704559326, + "learning_rate": 1.7153862770471096e-06, + "loss": 0.3274, + "step": 13690 + }, + { + "epoch": 2.267836450918722, + "grad_norm": 2.4560039043426514, + "learning_rate": 1.7081310976620696e-06, + "loss": 0.3399, + "step": 13700 + }, + { + "epoch": 2.269491805992385, + "grad_norm": 1.8034098148345947, + "learning_rate": 1.700888131610643e-06, + "loss": 0.3056, + "step": 13710 + }, + { + "epoch": 2.2711471610660485, + "grad_norm": 1.6858197450637817, + "learning_rate": 1.6936574057653366e-06, + "loss": 0.3334, + "step": 13720 + }, + { + "epoch": 2.272802516139712, + "grad_norm": 1.8124359846115112, + "learning_rate": 1.6864389469532393e-06, + "loss": 0.3583, + "step": 13730 + }, + { + "epoch": 2.274457871213375, + "grad_norm": 1.7046393156051636, + "learning_rate": 1.6792327819559313e-06, + "loss": 0.332, + "step": 13740 + }, + { + "epoch": 2.2761132262870385, + "grad_norm": 2.347473382949829, + "learning_rate": 1.672038937509376e-06, + "loss": 0.4029, + "step": 13750 + }, + { + "epoch": 2.277768581360702, + "grad_norm": 1.9951478242874146, + "learning_rate": 1.6648574403038325e-06, + "loss": 0.3345, + "step": 13760 + }, + { + "epoch": 2.279423936434365, + "grad_norm": 2.5333080291748047, + "learning_rate": 1.657688316983746e-06, + "loss": 0.3114, + "step": 13770 + }, + { + "epoch": 2.2810792915080285, + "grad_norm": 1.9064871072769165, + "learning_rate": 1.6505315941476507e-06, + "loss": 0.3244, + "step": 13780 + }, + { + "epoch": 2.282734646581692, + "grad_norm": 1.6432058811187744, + "learning_rate": 1.6433872983480758e-06, + "loss": 0.3372, + "step": 13790 + }, + { + "epoch": 2.284390001655355, + "grad_norm": 1.8521987199783325, + "learning_rate": 1.636255456091444e-06, + "loss": 0.3509, + "step": 13800 + }, + { + "epoch": 2.2860453567290184, + "grad_norm": 1.8956800699234009, + "learning_rate": 1.6291360938379752e-06, + "loss": 0.3816, + "step": 13810 + }, + { + "epoch": 2.2877007118026818, + "grad_norm": 2.190884828567505, + "learning_rate": 1.622029238001584e-06, + "loss": 0.3157, + "step": 13820 + }, + { + "epoch": 2.289356066876345, + "grad_norm": 2.1423823833465576, + "learning_rate": 1.6149349149497833e-06, + "loss": 0.2804, + "step": 13830 + }, + { + "epoch": 2.2910114219500084, + "grad_norm": 2.221036672592163, + "learning_rate": 1.607853151003591e-06, + "loss": 0.3681, + "step": 13840 + }, + { + "epoch": 2.2926667770236717, + "grad_norm": 2.1888656616210938, + "learning_rate": 1.6007839724374253e-06, + "loss": 0.3536, + "step": 13850 + }, + { + "epoch": 2.294322132097335, + "grad_norm": 2.2025184631347656, + "learning_rate": 1.593727405479012e-06, + "loss": 0.3667, + "step": 13860 + }, + { + "epoch": 2.2959774871709984, + "grad_norm": 1.9448472261428833, + "learning_rate": 1.5866834763092885e-06, + "loss": 0.3342, + "step": 13870 + }, + { + "epoch": 2.2976328422446617, + "grad_norm": 2.2824110984802246, + "learning_rate": 1.5796522110623002e-06, + "loss": 0.3581, + "step": 13880 + }, + { + "epoch": 2.299288197318325, + "grad_norm": 2.112283706665039, + "learning_rate": 1.5726336358251104e-06, + "loss": 0.3352, + "step": 13890 + }, + { + "epoch": 2.300943552391988, + "grad_norm": 2.393695592880249, + "learning_rate": 1.5656277766376992e-06, + "loss": 0.3287, + "step": 13900 + }, + { + "epoch": 2.302598907465651, + "grad_norm": 2.341130018234253, + "learning_rate": 1.5586346594928675e-06, + "loss": 0.3384, + "step": 13910 + }, + { + "epoch": 2.3042542625393145, + "grad_norm": 2.0110530853271484, + "learning_rate": 1.5516543103361403e-06, + "loss": 0.3276, + "step": 13920 + }, + { + "epoch": 2.305909617612978, + "grad_norm": 3.0257723331451416, + "learning_rate": 1.544686755065677e-06, + "loss": 0.3544, + "step": 13930 + }, + { + "epoch": 2.307564972686641, + "grad_norm": 1.825762391090393, + "learning_rate": 1.5377320195321642e-06, + "loss": 0.3448, + "step": 13940 + }, + { + "epoch": 2.3092203277603045, + "grad_norm": 1.9047185182571411, + "learning_rate": 1.5307901295387268e-06, + "loss": 0.3239, + "step": 13950 + }, + { + "epoch": 2.310875682833968, + "grad_norm": 2.2745182514190674, + "learning_rate": 1.5238611108408292e-06, + "loss": 0.3447, + "step": 13960 + }, + { + "epoch": 2.312531037907631, + "grad_norm": 2.037247896194458, + "learning_rate": 1.5169449891461867e-06, + "loss": 0.3188, + "step": 13970 + }, + { + "epoch": 2.3141863929812945, + "grad_norm": 2.3445773124694824, + "learning_rate": 1.5100417901146585e-06, + "loss": 0.3335, + "step": 13980 + }, + { + "epoch": 2.315841748054958, + "grad_norm": 2.1629180908203125, + "learning_rate": 1.5031515393581642e-06, + "loss": 0.3525, + "step": 13990 + }, + { + "epoch": 2.317497103128621, + "grad_norm": 2.212402820587158, + "learning_rate": 1.496274262440579e-06, + "loss": 0.3469, + "step": 14000 + }, + { + "epoch": 2.3191524582022844, + "grad_norm": 2.41933012008667, + "learning_rate": 1.4894099848776444e-06, + "loss": 0.3755, + "step": 14010 + }, + { + "epoch": 2.3208078132759478, + "grad_norm": 1.9362802505493164, + "learning_rate": 1.482558732136874e-06, + "loss": 0.3416, + "step": 14020 + }, + { + "epoch": 2.322463168349611, + "grad_norm": 1.8047678470611572, + "learning_rate": 1.4757205296374532e-06, + "loss": 0.3323, + "step": 14030 + }, + { + "epoch": 2.3241185234232744, + "grad_norm": 1.8480398654937744, + "learning_rate": 1.4688954027501545e-06, + "loss": 0.3112, + "step": 14040 + }, + { + "epoch": 2.3257738784969377, + "grad_norm": 2.211897611618042, + "learning_rate": 1.4620833767972365e-06, + "loss": 0.3252, + "step": 14050 + }, + { + "epoch": 2.327429233570601, + "grad_norm": 2.3595709800720215, + "learning_rate": 1.4552844770523477e-06, + "loss": 0.336, + "step": 14060 + }, + { + "epoch": 2.3290845886442644, + "grad_norm": 3.3694913387298584, + "learning_rate": 1.4484987287404407e-06, + "loss": 0.3253, + "step": 14070 + }, + { + "epoch": 2.3307399437179273, + "grad_norm": 1.8503248691558838, + "learning_rate": 1.4417261570376701e-06, + "loss": 0.3293, + "step": 14080 + }, + { + "epoch": 2.3323952987915906, + "grad_norm": 2.116312265396118, + "learning_rate": 1.4349667870713057e-06, + "loss": 0.3472, + "step": 14090 + }, + { + "epoch": 2.334050653865254, + "grad_norm": 2.352787971496582, + "learning_rate": 1.4282206439196395e-06, + "loss": 0.3609, + "step": 14100 + }, + { + "epoch": 2.335706008938917, + "grad_norm": 1.6080447435379028, + "learning_rate": 1.4214877526118853e-06, + "loss": 0.3355, + "step": 14110 + }, + { + "epoch": 2.3373613640125805, + "grad_norm": 2.383387565612793, + "learning_rate": 1.4147681381280909e-06, + "loss": 0.3439, + "step": 14120 + }, + { + "epoch": 2.339016719086244, + "grad_norm": 2.3709840774536133, + "learning_rate": 1.4080618253990502e-06, + "loss": 0.3425, + "step": 14130 + }, + { + "epoch": 2.340672074159907, + "grad_norm": 2.1909902095794678, + "learning_rate": 1.4013688393062003e-06, + "loss": 0.3428, + "step": 14140 + }, + { + "epoch": 2.3423274292335705, + "grad_norm": 1.8654948472976685, + "learning_rate": 1.3946892046815341e-06, + "loss": 0.3237, + "step": 14150 + }, + { + "epoch": 2.343982784307234, + "grad_norm": 1.9633307456970215, + "learning_rate": 1.3880229463075146e-06, + "loss": 0.3227, + "step": 14160 + }, + { + "epoch": 2.345638139380897, + "grad_norm": 2.1636171340942383, + "learning_rate": 1.3813700889169707e-06, + "loss": 0.3633, + "step": 14170 + }, + { + "epoch": 2.3472934944545605, + "grad_norm": 2.097670555114746, + "learning_rate": 1.3747306571930141e-06, + "loss": 0.363, + "step": 14180 + }, + { + "epoch": 2.348948849528224, + "grad_norm": 2.1034417152404785, + "learning_rate": 1.3681046757689448e-06, + "loss": 0.3479, + "step": 14190 + }, + { + "epoch": 2.350604204601887, + "grad_norm": 1.7305580377578735, + "learning_rate": 1.3614921692281586e-06, + "loss": 0.3351, + "step": 14200 + }, + { + "epoch": 2.3522595596755504, + "grad_norm": 2.346315860748291, + "learning_rate": 1.3548931621040607e-06, + "loss": 0.382, + "step": 14210 + }, + { + "epoch": 2.3539149147492138, + "grad_norm": 2.112192392349243, + "learning_rate": 1.3483076788799715e-06, + "loss": 0.3498, + "step": 14220 + }, + { + "epoch": 2.355570269822877, + "grad_norm": 2.4844164848327637, + "learning_rate": 1.3417357439890323e-06, + "loss": 0.359, + "step": 14230 + }, + { + "epoch": 2.3572256248965404, + "grad_norm": 2.170315742492676, + "learning_rate": 1.33517738181412e-06, + "loss": 0.3237, + "step": 14240 + }, + { + "epoch": 2.3588809799702037, + "grad_norm": 1.9840915203094482, + "learning_rate": 1.328632616687754e-06, + "loss": 0.3087, + "step": 14250 + }, + { + "epoch": 2.360536335043867, + "grad_norm": 2.098491907119751, + "learning_rate": 1.3221014728920056e-06, + "loss": 0.3542, + "step": 14260 + }, + { + "epoch": 2.3621916901175304, + "grad_norm": 2.413897752761841, + "learning_rate": 1.3155839746584138e-06, + "loss": 0.3359, + "step": 14270 + }, + { + "epoch": 2.3638470451911937, + "grad_norm": 2.1317267417907715, + "learning_rate": 1.3090801461678848e-06, + "loss": 0.3045, + "step": 14280 + }, + { + "epoch": 2.365502400264857, + "grad_norm": 1.6186710596084595, + "learning_rate": 1.3025900115506086e-06, + "loss": 0.347, + "step": 14290 + }, + { + "epoch": 2.3671577553385204, + "grad_norm": 2.192939519882202, + "learning_rate": 1.2961135948859737e-06, + "loss": 0.4005, + "step": 14300 + }, + { + "epoch": 2.3688131104121832, + "grad_norm": 2.0540459156036377, + "learning_rate": 1.2896509202024682e-06, + "loss": 0.3477, + "step": 14310 + }, + { + "epoch": 2.3704684654858466, + "grad_norm": 2.3481903076171875, + "learning_rate": 1.2832020114775951e-06, + "loss": 0.3254, + "step": 14320 + }, + { + "epoch": 2.37212382055951, + "grad_norm": 1.9363404512405396, + "learning_rate": 1.2767668926377885e-06, + "loss": 0.3361, + "step": 14330 + }, + { + "epoch": 2.373779175633173, + "grad_norm": 2.1417112350463867, + "learning_rate": 1.2703455875583148e-06, + "loss": 0.3502, + "step": 14340 + }, + { + "epoch": 2.3754345307068365, + "grad_norm": 2.0085740089416504, + "learning_rate": 1.263938120063191e-06, + "loss": 0.3358, + "step": 14350 + }, + { + "epoch": 2.3770898857805, + "grad_norm": 1.862365961074829, + "learning_rate": 1.2575445139250936e-06, + "loss": 0.3062, + "step": 14360 + }, + { + "epoch": 2.378745240854163, + "grad_norm": 1.9572957754135132, + "learning_rate": 1.2511647928652754e-06, + "loss": 0.3038, + "step": 14370 + }, + { + "epoch": 2.3804005959278265, + "grad_norm": 2.7064197063446045, + "learning_rate": 1.2447989805534677e-06, + "loss": 0.3101, + "step": 14380 + }, + { + "epoch": 2.38205595100149, + "grad_norm": 2.0222458839416504, + "learning_rate": 1.2384471006078036e-06, + "loss": 0.3894, + "step": 14390 + }, + { + "epoch": 2.383711306075153, + "grad_norm": 1.8793877363204956, + "learning_rate": 1.2321091765947214e-06, + "loss": 0.2965, + "step": 14400 + }, + { + "epoch": 2.3853666611488165, + "grad_norm": 1.7460297346115112, + "learning_rate": 1.2257852320288815e-06, + "loss": 0.3337, + "step": 14410 + }, + { + "epoch": 2.38702201622248, + "grad_norm": 2.319016218185425, + "learning_rate": 1.219475290373079e-06, + "loss": 0.3176, + "step": 14420 + }, + { + "epoch": 2.388677371296143, + "grad_norm": 2.06717848777771, + "learning_rate": 1.2131793750381554e-06, + "loss": 0.3919, + "step": 14430 + }, + { + "epoch": 2.3903327263698064, + "grad_norm": 2.179219961166382, + "learning_rate": 1.2068975093829123e-06, + "loss": 0.3412, + "step": 14440 + }, + { + "epoch": 2.3919880814434697, + "grad_norm": 1.629726767539978, + "learning_rate": 1.2006297167140257e-06, + "loss": 0.307, + "step": 14450 + }, + { + "epoch": 2.393643436517133, + "grad_norm": 2.2001848220825195, + "learning_rate": 1.1943760202859606e-06, + "loss": 0.3739, + "step": 14460 + }, + { + "epoch": 2.3952987915907964, + "grad_norm": 1.8462634086608887, + "learning_rate": 1.188136443300879e-06, + "loss": 0.3466, + "step": 14470 + }, + { + "epoch": 2.3969541466644597, + "grad_norm": 1.7766656875610352, + "learning_rate": 1.1819110089085595e-06, + "loss": 0.3227, + "step": 14480 + }, + { + "epoch": 2.3986095017381226, + "grad_norm": 2.0982725620269775, + "learning_rate": 1.1756997402063069e-06, + "loss": 0.3111, + "step": 14490 + }, + { + "epoch": 2.400264856811786, + "grad_norm": 2.340324878692627, + "learning_rate": 1.1695026602388755e-06, + "loss": 0.3458, + "step": 14500 + }, + { + "epoch": 2.4019202118854492, + "grad_norm": 2.2285056114196777, + "learning_rate": 1.1633197919983707e-06, + "loss": 0.3496, + "step": 14510 + }, + { + "epoch": 2.4035755669591126, + "grad_norm": 1.9656524658203125, + "learning_rate": 1.1571511584241735e-06, + "loss": 0.3217, + "step": 14520 + }, + { + "epoch": 2.405230922032776, + "grad_norm": 2.775507688522339, + "learning_rate": 1.1509967824028496e-06, + "loss": 0.3409, + "step": 14530 + }, + { + "epoch": 2.406886277106439, + "grad_norm": 2.0390307903289795, + "learning_rate": 1.1448566867680715e-06, + "loss": 0.3302, + "step": 14540 + }, + { + "epoch": 2.4085416321801025, + "grad_norm": 2.3175621032714844, + "learning_rate": 1.1387308943005248e-06, + "loss": 0.3494, + "step": 14550 + }, + { + "epoch": 2.410196987253766, + "grad_norm": 2.031985282897949, + "learning_rate": 1.132619427727832e-06, + "loss": 0.3195, + "step": 14560 + }, + { + "epoch": 2.411852342327429, + "grad_norm": 1.8072073459625244, + "learning_rate": 1.1265223097244604e-06, + "loss": 0.3405, + "step": 14570 + }, + { + "epoch": 2.4135076974010925, + "grad_norm": 2.3952126502990723, + "learning_rate": 1.1204395629116445e-06, + "loss": 0.3388, + "step": 14580 + }, + { + "epoch": 2.415163052474756, + "grad_norm": 2.0482804775238037, + "learning_rate": 1.114371209857299e-06, + "loss": 0.337, + "step": 14590 + }, + { + "epoch": 2.416818407548419, + "grad_norm": 2.1736388206481934, + "learning_rate": 1.1083172730759351e-06, + "loss": 0.3629, + "step": 14600 + }, + { + "epoch": 2.4184737626220825, + "grad_norm": 2.3148396015167236, + "learning_rate": 1.1022777750285767e-06, + "loss": 0.3523, + "step": 14610 + }, + { + "epoch": 2.420129117695746, + "grad_norm": 2.455919027328491, + "learning_rate": 1.0962527381226795e-06, + "loss": 0.3413, + "step": 14620 + }, + { + "epoch": 2.421784472769409, + "grad_norm": 2.373774766921997, + "learning_rate": 1.0902421847120482e-06, + "loss": 0.3068, + "step": 14630 + }, + { + "epoch": 2.4234398278430724, + "grad_norm": 2.160219669342041, + "learning_rate": 1.084246137096746e-06, + "loss": 0.345, + "step": 14640 + }, + { + "epoch": 2.4250951829167358, + "grad_norm": 1.6808048486709595, + "learning_rate": 1.0782646175230217e-06, + "loss": 0.3279, + "step": 14650 + }, + { + "epoch": 2.426750537990399, + "grad_norm": 1.472030520439148, + "learning_rate": 1.0722976481832205e-06, + "loss": 0.3158, + "step": 14660 + }, + { + "epoch": 2.4284058930640624, + "grad_norm": 1.8282170295715332, + "learning_rate": 1.0663452512157035e-06, + "loss": 0.323, + "step": 14670 + }, + { + "epoch": 2.4300612481377257, + "grad_norm": 1.9233111143112183, + "learning_rate": 1.0604074487047705e-06, + "loss": 0.3741, + "step": 14680 + }, + { + "epoch": 2.431716603211389, + "grad_norm": 1.9156076908111572, + "learning_rate": 1.0544842626805684e-06, + "loss": 0.3296, + "step": 14690 + }, + { + "epoch": 2.4333719582850524, + "grad_norm": 2.5253238677978516, + "learning_rate": 1.0485757151190195e-06, + "loss": 0.3149, + "step": 14700 + }, + { + "epoch": 2.4350273133587153, + "grad_norm": 1.5516034364700317, + "learning_rate": 1.0426818279417306e-06, + "loss": 0.3297, + "step": 14710 + }, + { + "epoch": 2.4366826684323786, + "grad_norm": 1.9635651111602783, + "learning_rate": 1.036802623015918e-06, + "loss": 0.3331, + "step": 14720 + }, + { + "epoch": 2.438338023506042, + "grad_norm": 1.9763412475585938, + "learning_rate": 1.0309381221543268e-06, + "loss": 0.3308, + "step": 14730 + }, + { + "epoch": 2.439993378579705, + "grad_norm": 1.942713975906372, + "learning_rate": 1.0250883471151451e-06, + "loss": 0.3429, + "step": 14740 + }, + { + "epoch": 2.4416487336533685, + "grad_norm": 2.4745335578918457, + "learning_rate": 1.0192533196019267e-06, + "loss": 0.3287, + "step": 14750 + }, + { + "epoch": 2.443304088727032, + "grad_norm": 2.276266098022461, + "learning_rate": 1.0134330612635101e-06, + "loss": 0.3187, + "step": 14760 + }, + { + "epoch": 2.444959443800695, + "grad_norm": 1.9105377197265625, + "learning_rate": 1.0076275936939357e-06, + "loss": 0.2986, + "step": 14770 + }, + { + "epoch": 2.4466147988743585, + "grad_norm": 2.1453704833984375, + "learning_rate": 1.0018369384323722e-06, + "loss": 0.3588, + "step": 14780 + }, + { + "epoch": 2.448270153948022, + "grad_norm": 2.0307068824768066, + "learning_rate": 9.960611169630308e-07, + "loss": 0.3284, + "step": 14790 + }, + { + "epoch": 2.449925509021685, + "grad_norm": 2.2573883533477783, + "learning_rate": 9.90300150715085e-07, + "loss": 0.3408, + "step": 14800 + }, + { + "epoch": 2.4515808640953485, + "grad_norm": 2.224506378173828, + "learning_rate": 9.845540610625953e-07, + "loss": 0.338, + "step": 14810 + }, + { + "epoch": 2.453236219169012, + "grad_norm": 2.1470425128936768, + "learning_rate": 9.788228693244266e-07, + "loss": 0.3116, + "step": 14820 + }, + { + "epoch": 2.454891574242675, + "grad_norm": 1.8685119152069092, + "learning_rate": 9.731065967641712e-07, + "loss": 0.3046, + "step": 14830 + }, + { + "epoch": 2.4565469293163384, + "grad_norm": 2.515958070755005, + "learning_rate": 9.67405264590066e-07, + "loss": 0.3251, + "step": 14840 + }, + { + "epoch": 2.4582022843900018, + "grad_norm": 2.4196338653564453, + "learning_rate": 9.617188939549232e-07, + "loss": 0.318, + "step": 14850 + }, + { + "epoch": 2.459857639463665, + "grad_norm": 2.0046846866607666, + "learning_rate": 9.560475059560388e-07, + "loss": 0.325, + "step": 14860 + }, + { + "epoch": 2.4615129945373284, + "grad_norm": 2.182124376296997, + "learning_rate": 9.503911216351252e-07, + "loss": 0.3061, + "step": 14870 + }, + { + "epoch": 2.4631683496109917, + "grad_norm": 2.041778087615967, + "learning_rate": 9.447497619782269e-07, + "loss": 0.31, + "step": 14880 + }, + { + "epoch": 2.4648237046846546, + "grad_norm": 2.510472297668457, + "learning_rate": 9.391234479156452e-07, + "loss": 0.3278, + "step": 14890 + }, + { + "epoch": 2.466479059758318, + "grad_norm": 1.6733882427215576, + "learning_rate": 9.335122003218583e-07, + "loss": 0.3061, + "step": 14900 + }, + { + "epoch": 2.4681344148319813, + "grad_norm": 1.9581342935562134, + "learning_rate": 9.279160400154497e-07, + "loss": 0.3268, + "step": 14910 + }, + { + "epoch": 2.4697897699056446, + "grad_norm": 2.368661642074585, + "learning_rate": 9.223349877590226e-07, + "loss": 0.3642, + "step": 14920 + }, + { + "epoch": 2.471445124979308, + "grad_norm": 1.5843191146850586, + "learning_rate": 9.167690642591287e-07, + "loss": 0.3566, + "step": 14930 + }, + { + "epoch": 2.4731004800529712, + "grad_norm": 1.8079990148544312, + "learning_rate": 9.112182901661881e-07, + "loss": 0.3455, + "step": 14940 + }, + { + "epoch": 2.4747558351266346, + "grad_norm": 2.184272289276123, + "learning_rate": 9.056826860744178e-07, + "loss": 0.3407, + "step": 14950 + }, + { + "epoch": 2.476411190200298, + "grad_norm": 1.532046914100647, + "learning_rate": 9.001622725217495e-07, + "loss": 0.31, + "step": 14960 + }, + { + "epoch": 2.478066545273961, + "grad_norm": 2.5433170795440674, + "learning_rate": 8.946570699897566e-07, + "loss": 0.3267, + "step": 14970 + }, + { + "epoch": 2.4797219003476245, + "grad_norm": 2.535883903503418, + "learning_rate": 8.89167098903575e-07, + "loss": 0.3047, + "step": 14980 + }, + { + "epoch": 2.481377255421288, + "grad_norm": 1.7732888460159302, + "learning_rate": 8.836923796318325e-07, + "loss": 0.3173, + "step": 14990 + }, + { + "epoch": 2.483032610494951, + "grad_norm": 2.467151403427124, + "learning_rate": 8.78232932486568e-07, + "loss": 0.3411, + "step": 15000 + }, + { + "epoch": 2.4846879655686145, + "grad_norm": 2.1632003784179688, + "learning_rate": 8.727887777231591e-07, + "loss": 0.3038, + "step": 15010 + }, + { + "epoch": 2.486343320642278, + "grad_norm": 2.1071043014526367, + "learning_rate": 8.673599355402496e-07, + "loss": 0.3386, + "step": 15020 + }, + { + "epoch": 2.487998675715941, + "grad_norm": 1.7851759195327759, + "learning_rate": 8.619464260796651e-07, + "loss": 0.3284, + "step": 15030 + }, + { + "epoch": 2.4896540307896045, + "grad_norm": 2.402301788330078, + "learning_rate": 8.565482694263516e-07, + "loss": 0.3356, + "step": 15040 + }, + { + "epoch": 2.4913093858632678, + "grad_norm": 1.905591368675232, + "learning_rate": 8.51165485608288e-07, + "loss": 0.303, + "step": 15050 + }, + { + "epoch": 2.492964740936931, + "grad_norm": 2.015292167663574, + "learning_rate": 8.4579809459642e-07, + "loss": 0.3312, + "step": 15060 + }, + { + "epoch": 2.4946200960105944, + "grad_norm": 2.36039662361145, + "learning_rate": 8.404461163045829e-07, + "loss": 0.3066, + "step": 15070 + }, + { + "epoch": 2.4962754510842577, + "grad_norm": 2.305978536605835, + "learning_rate": 8.351095705894308e-07, + "loss": 0.3414, + "step": 15080 + }, + { + "epoch": 2.497930806157921, + "grad_norm": 2.039421558380127, + "learning_rate": 8.297884772503578e-07, + "loss": 0.3564, + "step": 15090 + }, + { + "epoch": 2.4995861612315844, + "grad_norm": 2.0474462509155273, + "learning_rate": 8.244828560294272e-07, + "loss": 0.3178, + "step": 15100 + }, + { + "epoch": 2.5012415163052477, + "grad_norm": 2.152980089187622, + "learning_rate": 8.19192726611302e-07, + "loss": 0.3511, + "step": 15110 + }, + { + "epoch": 2.502896871378911, + "grad_norm": 2.019902467727661, + "learning_rate": 8.139181086231651e-07, + "loss": 0.3401, + "step": 15120 + }, + { + "epoch": 2.504552226452574, + "grad_norm": 2.122307777404785, + "learning_rate": 8.086590216346479e-07, + "loss": 0.3577, + "step": 15130 + }, + { + "epoch": 2.5062075815262372, + "grad_norm": 2.2243058681488037, + "learning_rate": 8.034154851577658e-07, + "loss": 0.361, + "step": 15140 + }, + { + "epoch": 2.5078629365999006, + "grad_norm": 2.078754425048828, + "learning_rate": 7.981875186468335e-07, + "loss": 0.3345, + "step": 15150 + }, + { + "epoch": 2.509518291673564, + "grad_norm": 2.2314717769622803, + "learning_rate": 7.929751414984011e-07, + "loss": 0.3215, + "step": 15160 + }, + { + "epoch": 2.511173646747227, + "grad_norm": 2.4789726734161377, + "learning_rate": 7.87778373051179e-07, + "loss": 0.3323, + "step": 15170 + }, + { + "epoch": 2.5128290018208905, + "grad_norm": 1.7479615211486816, + "learning_rate": 7.825972325859671e-07, + "loss": 0.3372, + "step": 15180 + }, + { + "epoch": 2.514484356894554, + "grad_norm": 2.23327374458313, + "learning_rate": 7.774317393255837e-07, + "loss": 0.3448, + "step": 15190 + }, + { + "epoch": 2.516139711968217, + "grad_norm": 1.7538516521453857, + "learning_rate": 7.722819124347952e-07, + "loss": 0.3118, + "step": 15200 + }, + { + "epoch": 2.5177950670418805, + "grad_norm": 2.249854803085327, + "learning_rate": 7.671477710202407e-07, + "loss": 0.3393, + "step": 15210 + }, + { + "epoch": 2.519450422115544, + "grad_norm": 1.923726201057434, + "learning_rate": 7.620293341303636e-07, + "loss": 0.3565, + "step": 15220 + }, + { + "epoch": 2.521105777189207, + "grad_norm": 2.8457915782928467, + "learning_rate": 7.569266207553427e-07, + "loss": 0.3383, + "step": 15230 + }, + { + "epoch": 2.5227611322628705, + "grad_norm": 2.412850856781006, + "learning_rate": 7.518396498270191e-07, + "loss": 0.3066, + "step": 15240 + }, + { + "epoch": 2.524416487336534, + "grad_norm": 2.3531320095062256, + "learning_rate": 7.467684402188291e-07, + "loss": 0.3237, + "step": 15250 + }, + { + "epoch": 2.526071842410197, + "grad_norm": 1.6720978021621704, + "learning_rate": 7.417130107457293e-07, + "loss": 0.3458, + "step": 15260 + }, + { + "epoch": 2.5277271974838604, + "grad_norm": 2.2779157161712646, + "learning_rate": 7.366733801641302e-07, + "loss": 0.3091, + "step": 15270 + }, + { + "epoch": 2.5293825525575233, + "grad_norm": 3.225557804107666, + "learning_rate": 7.316495671718293e-07, + "loss": 0.3412, + "step": 15280 + }, + { + "epoch": 2.5310379076311866, + "grad_norm": 2.389375925064087, + "learning_rate": 7.266415904079338e-07, + "loss": 0.3549, + "step": 15290 + }, + { + "epoch": 2.53269326270485, + "grad_norm": 2.0392401218414307, + "learning_rate": 7.216494684527975e-07, + "loss": 0.3286, + "step": 15300 + }, + { + "epoch": 2.5343486177785133, + "grad_norm": 1.929891586303711, + "learning_rate": 7.166732198279535e-07, + "loss": 0.3015, + "step": 15310 + }, + { + "epoch": 2.5360039728521766, + "grad_norm": 1.5978699922561646, + "learning_rate": 7.11712862996039e-07, + "loss": 0.3161, + "step": 15320 + }, + { + "epoch": 2.53765932792584, + "grad_norm": 2.1672823429107666, + "learning_rate": 7.067684163607308e-07, + "loss": 0.3129, + "step": 15330 + }, + { + "epoch": 2.5393146829995032, + "grad_norm": 1.8573296070098877, + "learning_rate": 7.018398982666757e-07, + "loss": 0.3553, + "step": 15340 + }, + { + "epoch": 2.5409700380731666, + "grad_norm": 1.951036810874939, + "learning_rate": 6.96927326999427e-07, + "loss": 0.3444, + "step": 15350 + }, + { + "epoch": 2.54262539314683, + "grad_norm": 2.200702428817749, + "learning_rate": 6.920307207853683e-07, + "loss": 0.3321, + "step": 15360 + }, + { + "epoch": 2.544280748220493, + "grad_norm": 1.9019625186920166, + "learning_rate": 6.87150097791654e-07, + "loss": 0.3775, + "step": 15370 + }, + { + "epoch": 2.5459361032941565, + "grad_norm": 1.8351606130599976, + "learning_rate": 6.822854761261355e-07, + "loss": 0.3357, + "step": 15380 + }, + { + "epoch": 2.54759145836782, + "grad_norm": 2.076394557952881, + "learning_rate": 6.774368738372988e-07, + "loss": 0.3164, + "step": 15390 + }, + { + "epoch": 2.549246813441483, + "grad_norm": 2.102830171585083, + "learning_rate": 6.726043089141943e-07, + "loss": 0.3377, + "step": 15400 + }, + { + "epoch": 2.5509021685151465, + "grad_norm": 2.1659622192382812, + "learning_rate": 6.677877992863707e-07, + "loss": 0.3157, + "step": 15410 + }, + { + "epoch": 2.55255752358881, + "grad_norm": 2.069488525390625, + "learning_rate": 6.629873628238126e-07, + "loss": 0.3388, + "step": 15420 + }, + { + "epoch": 2.554212878662473, + "grad_norm": 2.4191696643829346, + "learning_rate": 6.582030173368664e-07, + "loss": 0.3583, + "step": 15430 + }, + { + "epoch": 2.5558682337361365, + "grad_norm": 2.1078972816467285, + "learning_rate": 6.534347805761826e-07, + "loss": 0.3732, + "step": 15440 + }, + { + "epoch": 2.5575235888098, + "grad_norm": 1.9522989988327026, + "learning_rate": 6.486826702326426e-07, + "loss": 0.289, + "step": 15450 + }, + { + "epoch": 2.559178943883463, + "grad_norm": 1.9794495105743408, + "learning_rate": 6.439467039372971e-07, + "loss": 0.3604, + "step": 15460 + }, + { + "epoch": 2.5608342989571264, + "grad_norm": 2.4447927474975586, + "learning_rate": 6.392268992613005e-07, + "loss": 0.3306, + "step": 15470 + }, + { + "epoch": 2.5624896540307898, + "grad_norm": 2.0704166889190674, + "learning_rate": 6.345232737158458e-07, + "loss": 0.3003, + "step": 15480 + }, + { + "epoch": 2.564145009104453, + "grad_norm": 2.33072566986084, + "learning_rate": 6.298358447520985e-07, + "loss": 0.3249, + "step": 15490 + }, + { + "epoch": 2.5658003641781164, + "grad_norm": 2.0629100799560547, + "learning_rate": 6.251646297611308e-07, + "loss": 0.3381, + "step": 15500 + }, + { + "epoch": 2.5674557192517797, + "grad_norm": 3.117828607559204, + "learning_rate": 6.205096460738591e-07, + "loss": 0.3267, + "step": 15510 + }, + { + "epoch": 2.569111074325443, + "grad_norm": 1.9236739873886108, + "learning_rate": 6.158709109609828e-07, + "loss": 0.3281, + "step": 15520 + }, + { + "epoch": 2.5707664293991064, + "grad_norm": 2.1610283851623535, + "learning_rate": 6.112484416329107e-07, + "loss": 0.3101, + "step": 15530 + }, + { + "epoch": 2.5724217844727693, + "grad_norm": 1.9747469425201416, + "learning_rate": 6.066422552397083e-07, + "loss": 0.3243, + "step": 15540 + }, + { + "epoch": 2.5740771395464326, + "grad_norm": 2.253056049346924, + "learning_rate": 6.020523688710256e-07, + "loss": 0.3359, + "step": 15550 + }, + { + "epoch": 2.575732494620096, + "grad_norm": 2.041837215423584, + "learning_rate": 5.974787995560389e-07, + "loss": 0.302, + "step": 15560 + }, + { + "epoch": 2.5773878496937592, + "grad_norm": 1.8018887042999268, + "learning_rate": 5.92921564263384e-07, + "loss": 0.36, + "step": 15570 + }, + { + "epoch": 2.5790432047674225, + "grad_norm": 2.423647880554199, + "learning_rate": 5.88380679901096e-07, + "loss": 0.3439, + "step": 15580 + }, + { + "epoch": 2.580698559841086, + "grad_norm": 2.334973096847534, + "learning_rate": 5.838561633165474e-07, + "loss": 0.2951, + "step": 15590 + }, + { + "epoch": 2.582353914914749, + "grad_norm": 1.6279757022857666, + "learning_rate": 5.793480312963789e-07, + "loss": 0.3056, + "step": 15600 + }, + { + "epoch": 2.5840092699884125, + "grad_norm": 1.6940557956695557, + "learning_rate": 5.748563005664482e-07, + "loss": 0.3447, + "step": 15610 + }, + { + "epoch": 2.585664625062076, + "grad_norm": 2.257672071456909, + "learning_rate": 5.703809877917571e-07, + "loss": 0.3755, + "step": 15620 + }, + { + "epoch": 2.587319980135739, + "grad_norm": 1.9907878637313843, + "learning_rate": 5.659221095763955e-07, + "loss": 0.3425, + "step": 15630 + }, + { + "epoch": 2.5889753352094025, + "grad_norm": 1.69442617893219, + "learning_rate": 5.614796824634783e-07, + "loss": 0.332, + "step": 15640 + }, + { + "epoch": 2.590630690283066, + "grad_norm": 1.8623377084732056, + "learning_rate": 5.570537229350864e-07, + "loss": 0.3538, + "step": 15650 + }, + { + "epoch": 2.592286045356729, + "grad_norm": 2.767336845397949, + "learning_rate": 5.526442474122013e-07, + "loss": 0.3634, + "step": 15660 + }, + { + "epoch": 2.5939414004303925, + "grad_norm": 2.1302411556243896, + "learning_rate": 5.482512722546468e-07, + "loss": 0.3413, + "step": 15670 + }, + { + "epoch": 2.5955967555040558, + "grad_norm": 1.9488990306854248, + "learning_rate": 5.438748137610267e-07, + "loss": 0.3044, + "step": 15680 + }, + { + "epoch": 2.5972521105777187, + "grad_norm": 1.6431246995925903, + "learning_rate": 5.395148881686685e-07, + "loss": 0.348, + "step": 15690 + }, + { + "epoch": 2.598907465651382, + "grad_norm": 2.266380548477173, + "learning_rate": 5.351715116535571e-07, + "loss": 0.3123, + "step": 15700 + }, + { + "epoch": 2.6005628207250453, + "grad_norm": 2.4466562271118164, + "learning_rate": 5.308447003302808e-07, + "loss": 0.3468, + "step": 15710 + }, + { + "epoch": 2.6022181757987086, + "grad_norm": 2.20597505569458, + "learning_rate": 5.265344702519654e-07, + "loss": 0.3658, + "step": 15720 + }, + { + "epoch": 2.603873530872372, + "grad_norm": 1.868127703666687, + "learning_rate": 5.2224083741022e-07, + "loss": 0.3256, + "step": 15730 + }, + { + "epoch": 2.6055288859460353, + "grad_norm": 2.128237724304199, + "learning_rate": 5.179638177350737e-07, + "loss": 0.3305, + "step": 15740 + }, + { + "epoch": 2.6071842410196986, + "grad_norm": 2.1606836318969727, + "learning_rate": 5.137034270949182e-07, + "loss": 0.3211, + "step": 15750 + }, + { + "epoch": 2.608839596093362, + "grad_norm": 2.227559804916382, + "learning_rate": 5.094596812964525e-07, + "loss": 0.3505, + "step": 15760 + }, + { + "epoch": 2.6104949511670252, + "grad_norm": 1.9581117630004883, + "learning_rate": 5.052325960846155e-07, + "loss": 0.3398, + "step": 15770 + }, + { + "epoch": 2.6121503062406886, + "grad_norm": 2.039578914642334, + "learning_rate": 5.010221871425375e-07, + "loss": 0.3093, + "step": 15780 + }, + { + "epoch": 2.613805661314352, + "grad_norm": 1.7403851747512817, + "learning_rate": 4.968284700914744e-07, + "loss": 0.3196, + "step": 15790 + }, + { + "epoch": 2.615461016388015, + "grad_norm": 1.892404317855835, + "learning_rate": 4.926514604907534e-07, + "loss": 0.3266, + "step": 15800 + }, + { + "epoch": 2.6171163714616785, + "grad_norm": 2.2020905017852783, + "learning_rate": 4.88491173837713e-07, + "loss": 0.3502, + "step": 15810 + }, + { + "epoch": 2.618771726535342, + "grad_norm": 1.3924425840377808, + "learning_rate": 4.843476255676516e-07, + "loss": 0.3446, + "step": 15820 + }, + { + "epoch": 2.620427081609005, + "grad_norm": 2.562934637069702, + "learning_rate": 4.802208310537604e-07, + "loss": 0.3755, + "step": 15830 + }, + { + "epoch": 2.6220824366826685, + "grad_norm": 1.5193594694137573, + "learning_rate": 4.7611080560707344e-07, + "loss": 0.3136, + "step": 15840 + }, + { + "epoch": 2.623737791756332, + "grad_norm": 2.2128007411956787, + "learning_rate": 4.720175644764119e-07, + "loss": 0.3413, + "step": 15850 + }, + { + "epoch": 2.625393146829995, + "grad_norm": 1.999158263206482, + "learning_rate": 4.6794112284831995e-07, + "loss": 0.3565, + "step": 15860 + }, + { + "epoch": 2.6270485019036585, + "grad_norm": 2.3605151176452637, + "learning_rate": 4.63881495847015e-07, + "loss": 0.317, + "step": 15870 + }, + { + "epoch": 2.628703856977322, + "grad_norm": 2.1883292198181152, + "learning_rate": 4.5983869853433174e-07, + "loss": 0.317, + "step": 15880 + }, + { + "epoch": 2.630359212050985, + "grad_norm": 2.075212240219116, + "learning_rate": 4.558127459096612e-07, + "loss": 0.308, + "step": 15890 + }, + { + "epoch": 2.6320145671246484, + "grad_norm": 1.9252678155899048, + "learning_rate": 4.51803652909899e-07, + "loss": 0.3231, + "step": 15900 + }, + { + "epoch": 2.6336699221983118, + "grad_norm": 2.443986415863037, + "learning_rate": 4.4781143440938923e-07, + "loss": 0.3437, + "step": 15910 + }, + { + "epoch": 2.635325277271975, + "grad_norm": 1.7434769868850708, + "learning_rate": 4.438361052198675e-07, + "loss": 0.3152, + "step": 15920 + }, + { + "epoch": 2.6369806323456384, + "grad_norm": 2.128065347671509, + "learning_rate": 4.3987768009041033e-07, + "loss": 0.3374, + "step": 15930 + }, + { + "epoch": 2.6386359874193013, + "grad_norm": 1.744676113128662, + "learning_rate": 4.3593617370737697e-07, + "loss": 0.31, + "step": 15940 + }, + { + "epoch": 2.6402913424929646, + "grad_norm": 1.8740226030349731, + "learning_rate": 4.3201160069435367e-07, + "loss": 0.3152, + "step": 15950 + }, + { + "epoch": 2.641946697566628, + "grad_norm": 3.100207567214966, + "learning_rate": 4.281039756121025e-07, + "loss": 0.3397, + "step": 15960 + }, + { + "epoch": 2.6436020526402912, + "grad_norm": 1.7563873529434204, + "learning_rate": 4.242133129585063e-07, + "loss": 0.3383, + "step": 15970 + }, + { + "epoch": 2.6452574077139546, + "grad_norm": 2.2582356929779053, + "learning_rate": 4.2033962716851396e-07, + "loss": 0.3247, + "step": 15980 + }, + { + "epoch": 2.646912762787618, + "grad_norm": 2.0863699913024902, + "learning_rate": 4.164829326140873e-07, + "loss": 0.335, + "step": 15990 + }, + { + "epoch": 2.648568117861281, + "grad_norm": 2.127856969833374, + "learning_rate": 4.126432436041511e-07, + "loss": 0.3716, + "step": 16000 + }, + { + "epoch": 2.6502234729349445, + "grad_norm": 1.7168405055999756, + "learning_rate": 4.0882057438453305e-07, + "loss": 0.3074, + "step": 16010 + }, + { + "epoch": 2.651878828008608, + "grad_norm": 2.110229969024658, + "learning_rate": 4.050149391379177e-07, + "loss": 0.3661, + "step": 16020 + }, + { + "epoch": 2.653534183082271, + "grad_norm": 1.760513186454773, + "learning_rate": 4.0122635198378943e-07, + "loss": 0.2615, + "step": 16030 + }, + { + "epoch": 2.6551895381559345, + "grad_norm": 1.8273522853851318, + "learning_rate": 3.974548269783807e-07, + "loss": 0.3055, + "step": 16040 + }, + { + "epoch": 2.656844893229598, + "grad_norm": 2.403547763824463, + "learning_rate": 3.9370037811462424e-07, + "loss": 0.341, + "step": 16050 + }, + { + "epoch": 2.658500248303261, + "grad_norm": 2.5805859565734863, + "learning_rate": 3.899630193220949e-07, + "loss": 0.3347, + "step": 16060 + }, + { + "epoch": 2.6601556033769245, + "grad_norm": 2.449556827545166, + "learning_rate": 3.8624276446696086e-07, + "loss": 0.3419, + "step": 16070 + }, + { + "epoch": 2.661810958450588, + "grad_norm": 1.9614695310592651, + "learning_rate": 3.825396273519322e-07, + "loss": 0.3109, + "step": 16080 + }, + { + "epoch": 2.663466313524251, + "grad_norm": 1.9693225622177124, + "learning_rate": 3.78853621716212e-07, + "loss": 0.3744, + "step": 16090 + }, + { + "epoch": 2.665121668597914, + "grad_norm": 2.334308385848999, + "learning_rate": 3.751847612354387e-07, + "loss": 0.3278, + "step": 16100 + }, + { + "epoch": 2.6667770236715773, + "grad_norm": 1.722469687461853, + "learning_rate": 3.715330595216443e-07, + "loss": 0.3293, + "step": 16110 + }, + { + "epoch": 2.6684323787452406, + "grad_norm": 2.3163626194000244, + "learning_rate": 3.678985301231952e-07, + "loss": 0.3562, + "step": 16120 + }, + { + "epoch": 2.670087733818904, + "grad_norm": 1.810492992401123, + "learning_rate": 3.6428118652474807e-07, + "loss": 0.3117, + "step": 16130 + }, + { + "epoch": 2.6717430888925673, + "grad_norm": 1.643846869468689, + "learning_rate": 3.606810421471973e-07, + "loss": 0.3057, + "step": 16140 + }, + { + "epoch": 2.6733984439662306, + "grad_norm": 2.1098129749298096, + "learning_rate": 3.5709811034762456e-07, + "loss": 0.3583, + "step": 16150 + }, + { + "epoch": 2.675053799039894, + "grad_norm": 1.8769261837005615, + "learning_rate": 3.535324044192506e-07, + "loss": 0.3228, + "step": 16160 + }, + { + "epoch": 2.6767091541135573, + "grad_norm": 2.2012782096862793, + "learning_rate": 3.499839375913872e-07, + "loss": 0.3166, + "step": 16170 + }, + { + "epoch": 2.6783645091872206, + "grad_norm": 2.656381607055664, + "learning_rate": 3.464527230293852e-07, + "loss": 0.3546, + "step": 16180 + }, + { + "epoch": 2.680019864260884, + "grad_norm": 2.1426846981048584, + "learning_rate": 3.429387738345868e-07, + "loss": 0.3262, + "step": 16190 + }, + { + "epoch": 2.6816752193345472, + "grad_norm": 2.4014816284179688, + "learning_rate": 3.3944210304427736e-07, + "loss": 0.3479, + "step": 16200 + }, + { + "epoch": 2.6833305744082105, + "grad_norm": 2.6231253147125244, + "learning_rate": 3.3596272363163594e-07, + "loss": 0.338, + "step": 16210 + }, + { + "epoch": 2.684985929481874, + "grad_norm": 2.0805563926696777, + "learning_rate": 3.325006485056881e-07, + "loss": 0.318, + "step": 16220 + }, + { + "epoch": 2.686641284555537, + "grad_norm": 1.7813467979431152, + "learning_rate": 3.2905589051126065e-07, + "loss": 0.336, + "step": 16230 + }, + { + "epoch": 2.6882966396292005, + "grad_norm": 2.336962938308716, + "learning_rate": 3.2562846242892744e-07, + "loss": 0.3539, + "step": 16240 + }, + { + "epoch": 2.689951994702864, + "grad_norm": 2.1561317443847656, + "learning_rate": 3.2221837697496597e-07, + "loss": 0.329, + "step": 16250 + }, + { + "epoch": 2.691607349776527, + "grad_norm": 2.0386962890625, + "learning_rate": 3.18825646801314e-07, + "loss": 0.3483, + "step": 16260 + }, + { + "epoch": 2.6932627048501905, + "grad_norm": 2.5452029705047607, + "learning_rate": 3.1545028449551375e-07, + "loss": 0.3314, + "step": 16270 + }, + { + "epoch": 2.694918059923854, + "grad_norm": 1.8535023927688599, + "learning_rate": 3.1209230258067324e-07, + "loss": 0.3055, + "step": 16280 + }, + { + "epoch": 2.696573414997517, + "grad_norm": 2.0997235774993896, + "learning_rate": 3.0875171351541497e-07, + "loss": 0.3775, + "step": 16290 + }, + { + "epoch": 2.6982287700711804, + "grad_norm": 2.2901790142059326, + "learning_rate": 3.0542852969383196e-07, + "loss": 0.3575, + "step": 16300 + }, + { + "epoch": 2.6998841251448438, + "grad_norm": 1.9664065837860107, + "learning_rate": 3.021227634454399e-07, + "loss": 0.2803, + "step": 16310 + }, + { + "epoch": 2.701539480218507, + "grad_norm": 2.0761005878448486, + "learning_rate": 2.988344270351351e-07, + "loss": 0.3469, + "step": 16320 + }, + { + "epoch": 2.7031948352921704, + "grad_norm": 1.9783636331558228, + "learning_rate": 2.955635326631434e-07, + "loss": 0.3665, + "step": 16330 + }, + { + "epoch": 2.7048501903658337, + "grad_norm": 2.280743360519409, + "learning_rate": 2.9231009246498077e-07, + "loss": 0.3547, + "step": 16340 + }, + { + "epoch": 2.7065055454394966, + "grad_norm": 3.1975841522216797, + "learning_rate": 2.8907411851140487e-07, + "loss": 0.3531, + "step": 16350 + }, + { + "epoch": 2.70816090051316, + "grad_norm": 2.105062961578369, + "learning_rate": 2.8585562280837033e-07, + "loss": 0.3093, + "step": 16360 + }, + { + "epoch": 2.7098162555868233, + "grad_norm": 2.0571696758270264, + "learning_rate": 2.826546172969852e-07, + "loss": 0.3689, + "step": 16370 + }, + { + "epoch": 2.7114716106604866, + "grad_norm": 2.0750467777252197, + "learning_rate": 2.794711138534656e-07, + "loss": 0.3626, + "step": 16380 + }, + { + "epoch": 2.71312696573415, + "grad_norm": 2.3040149211883545, + "learning_rate": 2.7630512428909183e-07, + "loss": 0.3472, + "step": 16390 + }, + { + "epoch": 2.7147823208078132, + "grad_norm": 2.162661552429199, + "learning_rate": 2.731566603501684e-07, + "loss": 0.305, + "step": 16400 + }, + { + "epoch": 2.7164376758814766, + "grad_norm": 2.7106826305389404, + "learning_rate": 2.7002573371797347e-07, + "loss": 0.3489, + "step": 16410 + }, + { + "epoch": 2.71809303095514, + "grad_norm": 2.0429866313934326, + "learning_rate": 2.6691235600872e-07, + "loss": 0.3004, + "step": 16420 + }, + { + "epoch": 2.719748386028803, + "grad_norm": 2.0341339111328125, + "learning_rate": 2.638165387735131e-07, + "loss": 0.3486, + "step": 16430 + }, + { + "epoch": 2.7214037411024665, + "grad_norm": 2.3097574710845947, + "learning_rate": 2.607382934983044e-07, + "loss": 0.3774, + "step": 16440 + }, + { + "epoch": 2.72305909617613, + "grad_norm": 2.3151650428771973, + "learning_rate": 2.5767763160385095e-07, + "loss": 0.3031, + "step": 16450 + }, + { + "epoch": 2.724714451249793, + "grad_norm": 2.07060170173645, + "learning_rate": 2.5463456444567436e-07, + "loss": 0.3331, + "step": 16460 + }, + { + "epoch": 2.7263698063234565, + "grad_norm": 1.5830529928207397, + "learning_rate": 2.51609103314015e-07, + "loss": 0.3127, + "step": 16470 + }, + { + "epoch": 2.72802516139712, + "grad_norm": 2.311042547225952, + "learning_rate": 2.486012594337939e-07, + "loss": 0.3813, + "step": 16480 + }, + { + "epoch": 2.729680516470783, + "grad_norm": 2.337822437286377, + "learning_rate": 2.4561104396456815e-07, + "loss": 0.3171, + "step": 16490 + }, + { + "epoch": 2.731335871544446, + "grad_norm": 2.3123905658721924, + "learning_rate": 2.426384680004917e-07, + "loss": 0.3243, + "step": 16500 + }, + { + "epoch": 2.7329912266181093, + "grad_norm": 2.2762632369995117, + "learning_rate": 2.3968354257027205e-07, + "loss": 0.3188, + "step": 16510 + }, + { + "epoch": 2.7346465816917727, + "grad_norm": 2.021897315979004, + "learning_rate": 2.3674627863713273e-07, + "loss": 0.3599, + "step": 16520 + }, + { + "epoch": 2.736301936765436, + "grad_norm": 1.663979172706604, + "learning_rate": 2.3382668709876878e-07, + "loss": 0.303, + "step": 16530 + }, + { + "epoch": 2.7379572918390993, + "grad_norm": 2.365117311477661, + "learning_rate": 2.3092477878730757e-07, + "loss": 0.3329, + "step": 16540 + }, + { + "epoch": 2.7396126469127626, + "grad_norm": 1.6717923879623413, + "learning_rate": 2.280405644692696e-07, + "loss": 0.3389, + "step": 16550 + }, + { + "epoch": 2.741268001986426, + "grad_norm": 1.7539387941360474, + "learning_rate": 2.2517405484552778e-07, + "loss": 0.3362, + "step": 16560 + }, + { + "epoch": 2.7429233570600893, + "grad_norm": 2.190721273422241, + "learning_rate": 2.2232526055126992e-07, + "loss": 0.3732, + "step": 16570 + }, + { + "epoch": 2.7445787121337526, + "grad_norm": 1.689123272895813, + "learning_rate": 2.1949419215595346e-07, + "loss": 0.3564, + "step": 16580 + }, + { + "epoch": 2.746234067207416, + "grad_norm": 2.2224695682525635, + "learning_rate": 2.1668086016327415e-07, + "loss": 0.3176, + "step": 16590 + }, + { + "epoch": 2.7478894222810792, + "grad_norm": 2.078242063522339, + "learning_rate": 2.1388527501111977e-07, + "loss": 0.3233, + "step": 16600 + }, + { + "epoch": 2.7495447773547426, + "grad_norm": 1.993311882019043, + "learning_rate": 2.1110744707153574e-07, + "loss": 0.3093, + "step": 16610 + }, + { + "epoch": 2.751200132428406, + "grad_norm": 2.2646372318267822, + "learning_rate": 2.0834738665068576e-07, + "loss": 0.3471, + "step": 16620 + }, + { + "epoch": 2.752855487502069, + "grad_norm": 1.9162275791168213, + "learning_rate": 2.056051039888135e-07, + "loss": 0.2779, + "step": 16630 + }, + { + "epoch": 2.7545108425757325, + "grad_norm": 2.898524045944214, + "learning_rate": 2.0288060926020425e-07, + "loss": 0.3635, + "step": 16640 + }, + { + "epoch": 2.756166197649396, + "grad_norm": 2.449143648147583, + "learning_rate": 2.0017391257314723e-07, + "loss": 0.3147, + "step": 16650 + }, + { + "epoch": 2.757821552723059, + "grad_norm": 2.447744131088257, + "learning_rate": 1.9748502396989722e-07, + "loss": 0.3296, + "step": 16660 + }, + { + "epoch": 2.7594769077967225, + "grad_norm": 1.8371167182922363, + "learning_rate": 1.9481395342664078e-07, + "loss": 0.3355, + "step": 16670 + }, + { + "epoch": 2.761132262870386, + "grad_norm": 1.8588885068893433, + "learning_rate": 1.92160710853454e-07, + "loss": 0.3565, + "step": 16680 + }, + { + "epoch": 2.762787617944049, + "grad_norm": 2.5657360553741455, + "learning_rate": 1.8952530609427145e-07, + "loss": 0.2915, + "step": 16690 + }, + { + "epoch": 2.7644429730177125, + "grad_norm": 2.608978033065796, + "learning_rate": 1.8690774892684395e-07, + "loss": 0.3885, + "step": 16700 + }, + { + "epoch": 2.766098328091376, + "grad_norm": 2.3337411880493164, + "learning_rate": 1.8430804906270638e-07, + "loss": 0.2748, + "step": 16710 + }, + { + "epoch": 2.767753683165039, + "grad_norm": 2.865518808364868, + "learning_rate": 1.8172621614714004e-07, + "loss": 0.369, + "step": 16720 + }, + { + "epoch": 2.7694090382387024, + "grad_norm": 2.097165822982788, + "learning_rate": 1.7916225975913693e-07, + "loss": 0.3264, + "step": 16730 + }, + { + "epoch": 2.7710643933123658, + "grad_norm": 2.0352957248687744, + "learning_rate": 1.766161894113655e-07, + "loss": 0.2894, + "step": 16740 + }, + { + "epoch": 2.7727197483860286, + "grad_norm": 1.8923014402389526, + "learning_rate": 1.7408801455013224e-07, + "loss": 0.2917, + "step": 16750 + }, + { + "epoch": 2.774375103459692, + "grad_norm": 2.412301540374756, + "learning_rate": 1.715777445553507e-07, + "loss": 0.3357, + "step": 16760 + }, + { + "epoch": 2.7760304585333553, + "grad_norm": 2.138453722000122, + "learning_rate": 1.6908538874050417e-07, + "loss": 0.3632, + "step": 16770 + }, + { + "epoch": 2.7776858136070186, + "grad_norm": 2.331045627593994, + "learning_rate": 1.6661095635261082e-07, + "loss": 0.3331, + "step": 16780 + }, + { + "epoch": 2.779341168680682, + "grad_norm": 1.8366329669952393, + "learning_rate": 1.6415445657218975e-07, + "loss": 0.2916, + "step": 16790 + }, + { + "epoch": 2.7809965237543453, + "grad_norm": 1.9621044397354126, + "learning_rate": 1.6171589851322999e-07, + "loss": 0.3636, + "step": 16800 + }, + { + "epoch": 2.7826518788280086, + "grad_norm": 2.012941360473633, + "learning_rate": 1.59295291223151e-07, + "loss": 0.3652, + "step": 16810 + }, + { + "epoch": 2.784307233901672, + "grad_norm": 2.185295343399048, + "learning_rate": 1.568926436827739e-07, + "loss": 0.3466, + "step": 16820 + }, + { + "epoch": 2.785962588975335, + "grad_norm": 2.6037065982818604, + "learning_rate": 1.5450796480628637e-07, + "loss": 0.3798, + "step": 16830 + }, + { + "epoch": 2.7876179440489985, + "grad_norm": 2.552006483078003, + "learning_rate": 1.5214126344120894e-07, + "loss": 0.3012, + "step": 16840 + }, + { + "epoch": 2.789273299122662, + "grad_norm": 1.1729012727737427, + "learning_rate": 1.4979254836836266e-07, + "loss": 0.3451, + "step": 16850 + }, + { + "epoch": 2.790928654196325, + "grad_norm": 2.044630765914917, + "learning_rate": 1.4746182830183809e-07, + "loss": 0.338, + "step": 16860 + }, + { + "epoch": 2.7925840092699885, + "grad_norm": 1.5621954202651978, + "learning_rate": 1.4514911188895976e-07, + "loss": 0.3422, + "step": 16870 + }, + { + "epoch": 2.794239364343652, + "grad_norm": 2.243272066116333, + "learning_rate": 1.4285440771025784e-07, + "loss": 0.3366, + "step": 16880 + }, + { + "epoch": 2.795894719417315, + "grad_norm": 2.275195598602295, + "learning_rate": 1.405777242794326e-07, + "loss": 0.3844, + "step": 16890 + }, + { + "epoch": 2.7975500744909785, + "grad_norm": 2.221266746520996, + "learning_rate": 1.3831907004332512e-07, + "loss": 0.3765, + "step": 16900 + }, + { + "epoch": 2.7992054295646414, + "grad_norm": 2.322007894515991, + "learning_rate": 1.3607845338188595e-07, + "loss": 0.3003, + "step": 16910 + }, + { + "epoch": 2.8008607846383047, + "grad_norm": 1.8471641540527344, + "learning_rate": 1.338558826081443e-07, + "loss": 0.3199, + "step": 16920 + }, + { + "epoch": 2.802516139711968, + "grad_norm": 1.6814085245132446, + "learning_rate": 1.3165136596817395e-07, + "loss": 0.332, + "step": 16930 + }, + { + "epoch": 2.8041714947856313, + "grad_norm": 2.1636056900024414, + "learning_rate": 1.2946491164106677e-07, + "loss": 0.3538, + "step": 16940 + }, + { + "epoch": 2.8058268498592946, + "grad_norm": 2.366356372833252, + "learning_rate": 1.27296527738901e-07, + "loss": 0.3084, + "step": 16950 + }, + { + "epoch": 2.807482204932958, + "grad_norm": 1.5104891061782837, + "learning_rate": 1.2514622230670958e-07, + "loss": 0.3326, + "step": 16960 + }, + { + "epoch": 2.8091375600066213, + "grad_norm": 2.3185157775878906, + "learning_rate": 1.230140033224525e-07, + "loss": 0.3529, + "step": 16970 + }, + { + "epoch": 2.8107929150802846, + "grad_norm": 2.288813352584839, + "learning_rate": 1.2089987869698615e-07, + "loss": 0.35, + "step": 16980 + }, + { + "epoch": 2.812448270153948, + "grad_norm": 1.8733060359954834, + "learning_rate": 1.1880385627403345e-07, + "loss": 0.3267, + "step": 16990 + }, + { + "epoch": 2.8141036252276113, + "grad_norm": 2.303619146347046, + "learning_rate": 1.1672594383015656e-07, + "loss": 0.3233, + "step": 17000 + }, + { + "epoch": 2.8157589803012746, + "grad_norm": 2.3300998210906982, + "learning_rate": 1.146661490747264e-07, + "loss": 0.3066, + "step": 17010 + }, + { + "epoch": 2.817414335374938, + "grad_norm": 2.013810634613037, + "learning_rate": 1.1262447964989376e-07, + "loss": 0.3384, + "step": 17020 + }, + { + "epoch": 2.8190696904486012, + "grad_norm": 2.131016254425049, + "learning_rate": 1.1060094313056268e-07, + "loss": 0.3419, + "step": 17030 + }, + { + "epoch": 2.8207250455222646, + "grad_norm": 2.358128070831299, + "learning_rate": 1.0859554702436104e-07, + "loss": 0.3166, + "step": 17040 + }, + { + "epoch": 2.822380400595928, + "grad_norm": 2.0983312129974365, + "learning_rate": 1.0660829877161327e-07, + "loss": 0.354, + "step": 17050 + }, + { + "epoch": 2.824035755669591, + "grad_norm": 1.8471043109893799, + "learning_rate": 1.0463920574531106e-07, + "loss": 0.3207, + "step": 17060 + }, + { + "epoch": 2.8256911107432545, + "grad_norm": 1.907423973083496, + "learning_rate": 1.0268827525108937e-07, + "loss": 0.3158, + "step": 17070 + }, + { + "epoch": 2.827346465816918, + "grad_norm": 1.6720637083053589, + "learning_rate": 1.0075551452719657e-07, + "loss": 0.3553, + "step": 17080 + }, + { + "epoch": 2.829001820890581, + "grad_norm": 1.6659080982208252, + "learning_rate": 9.884093074446932e-08, + "loss": 0.3111, + "step": 17090 + }, + { + "epoch": 2.8306571759642445, + "grad_norm": 1.5908949375152588, + "learning_rate": 9.694453100630275e-08, + "loss": 0.3329, + "step": 17100 + }, + { + "epoch": 2.832312531037908, + "grad_norm": 1.9934496879577637, + "learning_rate": 9.506632234862922e-08, + "loss": 0.3384, + "step": 17110 + }, + { + "epoch": 2.833967886111571, + "grad_norm": 1.8065050840377808, + "learning_rate": 9.320631173988737e-08, + "loss": 0.3315, + "step": 17120 + }, + { + "epoch": 2.8356232411852345, + "grad_norm": 2.1966464519500732, + "learning_rate": 9.136450608099978e-08, + "loss": 0.345, + "step": 17130 + }, + { + "epoch": 2.837278596258898, + "grad_norm": 1.9803756475448608, + "learning_rate": 8.954091220534366e-08, + "loss": 0.3242, + "step": 17140 + }, + { + "epoch": 2.838933951332561, + "grad_norm": 2.167667865753174, + "learning_rate": 8.773553687873082e-08, + "loss": 0.3268, + "step": 17150 + }, + { + "epoch": 2.840589306406224, + "grad_norm": 2.3544301986694336, + "learning_rate": 8.594838679937712e-08, + "loss": 0.3454, + "step": 17160 + }, + { + "epoch": 2.8422446614798873, + "grad_norm": 1.4057775735855103, + "learning_rate": 8.41794685978814e-08, + "loss": 0.3432, + "step": 17170 + }, + { + "epoch": 2.8439000165535506, + "grad_norm": 1.7976477146148682, + "learning_rate": 8.242878883719829e-08, + "loss": 0.3196, + "step": 17180 + }, + { + "epoch": 2.845555371627214, + "grad_norm": 2.1721675395965576, + "learning_rate": 8.0696354012616e-08, + "loss": 0.3821, + "step": 17190 + }, + { + "epoch": 2.8472107267008773, + "grad_norm": 2.5062599182128906, + "learning_rate": 7.898217055173075e-08, + "loss": 0.3178, + "step": 17200 + }, + { + "epoch": 2.8488660817745406, + "grad_norm": 1.7274291515350342, + "learning_rate": 7.728624481442348e-08, + "loss": 0.3445, + "step": 17210 + }, + { + "epoch": 2.850521436848204, + "grad_norm": 2.225858211517334, + "learning_rate": 7.560858309283658e-08, + "loss": 0.3166, + "step": 17220 + }, + { + "epoch": 2.8521767919218672, + "grad_norm": 1.7840882539749146, + "learning_rate": 7.394919161134884e-08, + "loss": 0.3327, + "step": 17230 + }, + { + "epoch": 2.8538321469955306, + "grad_norm": 2.3786237239837646, + "learning_rate": 7.230807652655603e-08, + "loss": 0.3784, + "step": 17240 + }, + { + "epoch": 2.855487502069194, + "grad_norm": 1.890674352645874, + "learning_rate": 7.068524392724319e-08, + "loss": 0.3401, + "step": 17250 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 2.209174156188965, + "learning_rate": 6.908069983436683e-08, + "loss": 0.3495, + "step": 17260 + }, + { + "epoch": 2.8587982122165205, + "grad_norm": 1.9331399202346802, + "learning_rate": 6.749445020102884e-08, + "loss": 0.3155, + "step": 17270 + }, + { + "epoch": 2.860453567290184, + "grad_norm": 1.9215776920318604, + "learning_rate": 6.5926500912456e-08, + "loss": 0.3355, + "step": 17280 + }, + { + "epoch": 2.862108922363847, + "grad_norm": 2.269278049468994, + "learning_rate": 6.437685778597824e-08, + "loss": 0.3326, + "step": 17290 + }, + { + "epoch": 2.8637642774375105, + "grad_norm": 2.3621773719787598, + "learning_rate": 6.284552657100761e-08, + "loss": 0.3111, + "step": 17300 + }, + { + "epoch": 2.8654196325111734, + "grad_norm": 2.254394769668579, + "learning_rate": 6.133251294901443e-08, + "loss": 0.3558, + "step": 17310 + }, + { + "epoch": 2.8670749875848367, + "grad_norm": 2.3596551418304443, + "learning_rate": 5.983782253350944e-08, + "loss": 0.3323, + "step": 17320 + }, + { + "epoch": 2.8687303426585, + "grad_norm": 2.3255999088287354, + "learning_rate": 5.836146087002226e-08, + "loss": 0.3472, + "step": 17330 + }, + { + "epoch": 2.8703856977321633, + "grad_norm": 2.21858549118042, + "learning_rate": 5.690343343607796e-08, + "loss": 0.3458, + "step": 17340 + }, + { + "epoch": 2.8720410528058267, + "grad_norm": 2.3857204914093018, + "learning_rate": 5.54637456411794e-08, + "loss": 0.3269, + "step": 17350 + }, + { + "epoch": 2.87369640787949, + "grad_norm": 2.049962282180786, + "learning_rate": 5.4042402826787746e-08, + "loss": 0.3314, + "step": 17360 + }, + { + "epoch": 2.8753517629531533, + "grad_norm": 2.2532308101654053, + "learning_rate": 5.2639410266299705e-08, + "loss": 0.3339, + "step": 17370 + }, + { + "epoch": 2.8770071180268166, + "grad_norm": 1.2130613327026367, + "learning_rate": 5.1254773165032023e-08, + "loss": 0.3056, + "step": 17380 + }, + { + "epoch": 2.87866247310048, + "grad_norm": 2.2649145126342773, + "learning_rate": 4.988849666019757e-08, + "loss": 0.3218, + "step": 17390 + }, + { + "epoch": 2.8803178281741433, + "grad_norm": 1.4370695352554321, + "learning_rate": 4.854058582089038e-08, + "loss": 0.3054, + "step": 17400 + }, + { + "epoch": 2.8819731832478066, + "grad_norm": 2.0458033084869385, + "learning_rate": 4.7211045648064004e-08, + "loss": 0.3226, + "step": 17410 + }, + { + "epoch": 2.88362853832147, + "grad_norm": 1.8102025985717773, + "learning_rate": 4.589988107451482e-08, + "loss": 0.3504, + "step": 17420 + }, + { + "epoch": 2.8852838933951332, + "grad_norm": 1.952150583267212, + "learning_rate": 4.4607096964862094e-08, + "loss": 0.3291, + "step": 17430 + }, + { + "epoch": 2.8869392484687966, + "grad_norm": 1.774665355682373, + "learning_rate": 4.333269811553187e-08, + "loss": 0.3339, + "step": 17440 + }, + { + "epoch": 2.88859460354246, + "grad_norm": 2.385225772857666, + "learning_rate": 4.20766892547364e-08, + "loss": 0.3291, + "step": 17450 + }, + { + "epoch": 2.890249958616123, + "grad_norm": 2.228212356567383, + "learning_rate": 4.0839075042460875e-08, + "loss": 0.3173, + "step": 17460 + }, + { + "epoch": 2.8919053136897865, + "grad_norm": 1.6550447940826416, + "learning_rate": 3.9619860070440056e-08, + "loss": 0.334, + "step": 17470 + }, + { + "epoch": 2.89356066876345, + "grad_norm": 2.184173822402954, + "learning_rate": 3.841904886214831e-08, + "loss": 0.3265, + "step": 17480 + }, + { + "epoch": 2.895216023837113, + "grad_norm": 2.3634161949157715, + "learning_rate": 3.723664587277631e-08, + "loss": 0.3615, + "step": 17490 + }, + { + "epoch": 2.8968713789107765, + "grad_norm": 2.1004719734191895, + "learning_rate": 3.60726554892199e-08, + "loss": 0.3142, + "step": 17500 + }, + { + "epoch": 2.89852673398444, + "grad_norm": 1.8737444877624512, + "learning_rate": 3.492708203006012e-08, + "loss": 0.3214, + "step": 17510 + }, + { + "epoch": 2.900182089058103, + "grad_norm": 1.9953153133392334, + "learning_rate": 3.3799929745547685e-08, + "loss": 0.3365, + "step": 17520 + }, + { + "epoch": 2.9018374441317665, + "grad_norm": 1.4854812622070312, + "learning_rate": 3.2691202817589086e-08, + "loss": 0.3054, + "step": 17530 + }, + { + "epoch": 2.90349279920543, + "grad_norm": 2.8899099826812744, + "learning_rate": 3.160090535972993e-08, + "loss": 0.3153, + "step": 17540 + }, + { + "epoch": 2.905148154279093, + "grad_norm": 2.137659788131714, + "learning_rate": 3.052904141713886e-08, + "loss": 0.3037, + "step": 17550 + }, + { + "epoch": 2.9068035093527564, + "grad_norm": 2.0721936225891113, + "learning_rate": 2.9475614966594233e-08, + "loss": 0.3211, + "step": 17560 + }, + { + "epoch": 2.9084588644264193, + "grad_norm": 2.115863561630249, + "learning_rate": 2.844062991646801e-08, + "loss": 0.2805, + "step": 17570 + }, + { + "epoch": 2.9101142195000826, + "grad_norm": 1.711255431175232, + "learning_rate": 2.742409010671243e-08, + "loss": 0.3288, + "step": 17580 + }, + { + "epoch": 2.911769574573746, + "grad_norm": 2.0368499755859375, + "learning_rate": 2.642599930884393e-08, + "loss": 0.3121, + "step": 17590 + }, + { + "epoch": 2.9134249296474093, + "grad_norm": 2.2370572090148926, + "learning_rate": 2.544636122593147e-08, + "loss": 0.3161, + "step": 17600 + }, + { + "epoch": 2.9150802847210726, + "grad_norm": 2.0604302883148193, + "learning_rate": 2.4485179492581e-08, + "loss": 0.3285, + "step": 17610 + }, + { + "epoch": 2.916735639794736, + "grad_norm": 2.018296003341675, + "learning_rate": 2.354245767492269e-08, + "loss": 0.3635, + "step": 17620 + }, + { + "epoch": 2.9183909948683993, + "grad_norm": 2.0412917137145996, + "learning_rate": 2.2618199270597607e-08, + "loss": 0.3264, + "step": 17630 + }, + { + "epoch": 2.9200463499420626, + "grad_norm": 1.991478681564331, + "learning_rate": 2.1712407708744386e-08, + "loss": 0.3597, + "step": 17640 + }, + { + "epoch": 2.921701705015726, + "grad_norm": 1.9765408039093018, + "learning_rate": 2.0825086349988145e-08, + "loss": 0.3418, + "step": 17650 + }, + { + "epoch": 2.9233570600893892, + "grad_norm": 2.1275086402893066, + "learning_rate": 1.995623848642547e-08, + "loss": 0.3345, + "step": 17660 + }, + { + "epoch": 2.9250124151630525, + "grad_norm": 1.7225489616394043, + "learning_rate": 1.9105867341613903e-08, + "loss": 0.3135, + "step": 17670 + }, + { + "epoch": 2.926667770236716, + "grad_norm": 1.9626611471176147, + "learning_rate": 1.8273976070559695e-08, + "loss": 0.3614, + "step": 17680 + }, + { + "epoch": 2.928323125310379, + "grad_norm": 2.476229667663574, + "learning_rate": 1.7460567759705615e-08, + "loss": 0.3594, + "step": 17690 + }, + { + "epoch": 2.9299784803840425, + "grad_norm": 1.9182066917419434, + "learning_rate": 1.6665645426920396e-08, + "loss": 0.3197, + "step": 17700 + }, + { + "epoch": 2.931633835457706, + "grad_norm": 2.101400852203369, + "learning_rate": 1.5889212021485966e-08, + "loss": 0.3522, + "step": 17710 + }, + { + "epoch": 2.9332891905313687, + "grad_norm": 1.9222100973129272, + "learning_rate": 1.5131270424088573e-08, + "loss": 0.3461, + "step": 17720 + }, + { + "epoch": 2.934944545605032, + "grad_norm": 2.66011905670166, + "learning_rate": 1.4391823446807117e-08, + "loss": 0.3557, + "step": 17730 + }, + { + "epoch": 2.9365999006786954, + "grad_norm": 2.2789878845214844, + "learning_rate": 1.3670873833101505e-08, + "loss": 0.3229, + "step": 17740 + }, + { + "epoch": 2.9382552557523587, + "grad_norm": 2.3933591842651367, + "learning_rate": 1.2968424257804313e-08, + "loss": 0.3315, + "step": 17750 + }, + { + "epoch": 2.939910610826022, + "grad_norm": 2.3596224784851074, + "learning_rate": 1.22844773271108e-08, + "loss": 0.3439, + "step": 17760 + }, + { + "epoch": 2.9415659658996853, + "grad_norm": 2.155430316925049, + "learning_rate": 1.161903557856725e-08, + "loss": 0.3253, + "step": 17770 + }, + { + "epoch": 2.9432213209733487, + "grad_norm": 2.187922954559326, + "learning_rate": 1.0972101481063758e-08, + "loss": 0.3758, + "step": 17780 + }, + { + "epoch": 2.944876676047012, + "grad_norm": 2.4473180770874023, + "learning_rate": 1.0343677434824783e-08, + "loss": 0.3408, + "step": 17790 + }, + { + "epoch": 2.9465320311206753, + "grad_norm": 2.050906181335449, + "learning_rate": 9.733765771398063e-09, + "loss": 0.3365, + "step": 17800 + }, + { + "epoch": 2.9481873861943386, + "grad_norm": 2.1611318588256836, + "learning_rate": 9.142368753649045e-09, + "loss": 0.3467, + "step": 17810 + }, + { + "epoch": 2.949842741268002, + "grad_norm": 2.38580322265625, + "learning_rate": 8.56948857575035e-09, + "loss": 0.314, + "step": 17820 + }, + { + "epoch": 2.9514980963416653, + "grad_norm": 1.6547008752822876, + "learning_rate": 8.015127363174558e-09, + "loss": 0.3313, + "step": 17830 + }, + { + "epoch": 2.9531534514153286, + "grad_norm": 2.329861640930176, + "learning_rate": 7.479287172685867e-09, + "loss": 0.3271, + "step": 17840 + }, + { + "epoch": 2.954808806488992, + "grad_norm": 1.9690601825714111, + "learning_rate": 6.96196999233234e-09, + "loss": 0.3351, + "step": 17850 + }, + { + "epoch": 2.9564641615626552, + "grad_norm": 2.030273199081421, + "learning_rate": 6.463177741440341e-09, + "loss": 0.3587, + "step": 17860 + }, + { + "epoch": 2.9581195166363186, + "grad_norm": 2.2005035877227783, + "learning_rate": 5.982912270603991e-09, + "loss": 0.3264, + "step": 17870 + }, + { + "epoch": 2.959774871709982, + "grad_norm": 2.0540130138397217, + "learning_rate": 5.521175361681286e-09, + "loss": 0.3387, + "step": 17880 + }, + { + "epoch": 2.961430226783645, + "grad_norm": 1.6892403364181519, + "learning_rate": 5.07796872778632e-09, + "loss": 0.2986, + "step": 17890 + }, + { + "epoch": 2.9630855818573085, + "grad_norm": 1.895839810371399, + "learning_rate": 4.653294013283183e-09, + "loss": 0.3291, + "step": 17900 + }, + { + "epoch": 2.964740936930972, + "grad_norm": 1.6137304306030273, + "learning_rate": 4.247152793779296e-09, + "loss": 0.3265, + "step": 17910 + }, + { + "epoch": 2.966396292004635, + "grad_norm": 1.9061734676361084, + "learning_rate": 3.859546576120421e-09, + "loss": 0.334, + "step": 17920 + }, + { + "epoch": 2.9680516470782985, + "grad_norm": 2.028872013092041, + "learning_rate": 3.4904767983851006e-09, + "loss": 0.3213, + "step": 17930 + }, + { + "epoch": 2.969707002151962, + "grad_norm": 1.7429839372634888, + "learning_rate": 3.1399448298774503e-09, + "loss": 0.3341, + "step": 17940 + }, + { + "epoch": 2.971362357225625, + "grad_norm": 2.5334107875823975, + "learning_rate": 2.8079519711243786e-09, + "loss": 0.3251, + "step": 17950 + }, + { + "epoch": 2.9730177122992885, + "grad_norm": 2.0650694370269775, + "learning_rate": 2.4944994538700363e-09, + "loss": 0.2856, + "step": 17960 + }, + { + "epoch": 2.9746730673729513, + "grad_norm": 1.9629024267196655, + "learning_rate": 2.1995884410708212e-09, + "loss": 0.2924, + "step": 17970 + }, + { + "epoch": 2.9763284224466147, + "grad_norm": 1.9057166576385498, + "learning_rate": 1.9232200268914923e-09, + "loss": 0.3179, + "step": 17980 + }, + { + "epoch": 2.977983777520278, + "grad_norm": 2.323155164718628, + "learning_rate": 1.6653952367007287e-09, + "loss": 0.3351, + "step": 17990 + }, + { + "epoch": 2.9796391325939413, + "grad_norm": 1.7210339307785034, + "learning_rate": 1.426115027067243e-09, + "loss": 0.3269, + "step": 18000 + }, + { + "epoch": 2.9812944876676046, + "grad_norm": 1.83793044090271, + "learning_rate": 1.2053802857581176e-09, + "loss": 0.347, + "step": 18010 + }, + { + "epoch": 2.982949842741268, + "grad_norm": 1.847522258758545, + "learning_rate": 1.003191831731587e-09, + "loss": 0.2979, + "step": 18020 + }, + { + "epoch": 2.9846051978149313, + "grad_norm": 1.777446985244751, + "learning_rate": 8.195504151381484e-10, + "loss": 0.3473, + "step": 18030 + }, + { + "epoch": 2.9862605528885946, + "grad_norm": 2.9953622817993164, + "learning_rate": 6.544567173150107e-10, + "loss": 0.3001, + "step": 18040 + }, + { + "epoch": 2.987915907962258, + "grad_norm": 1.8776600360870361, + "learning_rate": 5.079113507849842e-10, + "loss": 0.357, + "step": 18050 + }, + { + "epoch": 2.9895712630359212, + "grad_norm": 2.086153268814087, + "learning_rate": 3.7991485925370496e-10, + "loss": 0.3004, + "step": 18060 + }, + { + "epoch": 2.9912266181095846, + "grad_norm": 2.1379270553588867, + "learning_rate": 2.704677176079695e-10, + "loss": 0.3721, + "step": 18070 + }, + { + "epoch": 2.992881973183248, + "grad_norm": 2.7157349586486816, + "learning_rate": 1.7957033191240425e-10, + "loss": 0.3555, + "step": 18080 + }, + { + "epoch": 2.994537328256911, + "grad_norm": 2.5094964504241943, + "learning_rate": 1.0722303941113065e-10, + "loss": 0.3435, + "step": 18090 + }, + { + "epoch": 2.9961926833305745, + "grad_norm": 2.10398268699646, + "learning_rate": 5.3426108522769415e-11, + "loss": 0.3287, + "step": 18100 + }, + { + "epoch": 2.997848038404238, + "grad_norm": 3.0506503582000732, + "learning_rate": 1.8179738842660777e-11, + "loss": 0.3678, + "step": 18110 + }, + { + "epoch": 2.9995033934779007, + "grad_norm": 1.7245928049087524, + "learning_rate": 1.484061139533921e-12, + "loss": 0.3259, + "step": 18120 + }, + { + "epoch": 3.0, + "step": 18123, + "total_flos": 1.2130803771160658e+18, + "train_loss": 0.45045166753521493, + "train_runtime": 7695.2204, + "train_samples_per_second": 37.681, + "train_steps_per_second": 2.355 + } + ], + "logging_steps": 10, + "max_steps": 18123, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2130803771160658e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}