{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 2.9881796836853027,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.0205,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.6945570707321167,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.9355,
      "step": 20
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0729589462280273,
      "learning_rate": 1e-05,
      "loss": 0.8719,
      "step": 30
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.8945010900497437,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.7976,
      "step": 40
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9784910678863525,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.8037,
      "step": 50
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9404758810997009,
      "learning_rate": 2e-05,
      "loss": 0.7779,
      "step": 60
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.9830419421195984,
      "learning_rate": 2.3333333333333336e-05,
      "loss": 0.779,
      "step": 70
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0543450117111206,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.7676,
      "step": 80
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.151178002357483,
      "learning_rate": 3e-05,
      "loss": 0.7662,
      "step": 90
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.1893316507339478,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.7598,
      "step": 100
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.1506898403167725,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.7463,
      "step": 110
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.2127164602279663,
      "learning_rate": 4e-05,
      "loss": 0.749,
      "step": 120
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.0937416553497314,
      "learning_rate": 4.3333333333333334e-05,
      "loss": 0.7379,
      "step": 130
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.172784686088562,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.7427,
      "step": 140
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.1628010272979736,
      "learning_rate": 5e-05,
      "loss": 0.7444,
      "step": 150
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.4532078504562378,
      "learning_rate": 4.999848114735858e-05,
      "loss": 0.7257,
      "step": 160
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.1651058197021484,
      "learning_rate": 4.999392477398737e-05,
      "loss": 0.7348,
      "step": 170
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.134162425994873,
      "learning_rate": 4.9986331433523156e-05,
      "loss": 0.7136,
      "step": 180
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.1000313758850098,
      "learning_rate": 4.997570204861915e-05,
      "loss": 0.7167,
      "step": 190
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.0679141283035278,
      "learning_rate": 4.996203791083291e-05,
      "loss": 0.7172,
      "step": 200
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.1267646551132202,
      "learning_rate": 4.994534068046937e-05,
      "loss": 0.7276,
      "step": 210
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.1281580924987793,
      "learning_rate": 4.992561238637912e-05,
      "loss": 0.7187,
      "step": 220
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.9922609329223633,
      "learning_rate": 4.9902855425711905e-05,
      "loss": 0.7085,
      "step": 230
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.9926589131355286,
      "learning_rate": 4.9877072563625285e-05,
      "loss": 0.7185,
      "step": 240
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.0269533395767212,
      "learning_rate": 4.984826693294874e-05,
      "loss": 0.7208,
      "step": 250
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.9333322644233704,
      "learning_rate": 4.981644203380291e-05,
      "loss": 0.7052,
      "step": 260
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.9695472717285156,
      "learning_rate": 4.978160173317438e-05,
      "loss": 0.695,
      "step": 270
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.0205656290054321,
      "learning_rate": 4.974375026444575e-05,
      "loss": 0.7029,
      "step": 280
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.8118201494216919,
      "learning_rate": 4.970289222688129e-05,
      "loss": 0.6981,
      "step": 290
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.9229862093925476,
      "learning_rate": 4.965903258506806e-05,
      "loss": 0.7108,
      "step": 300
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.9452656507492065,
      "learning_rate": 4.961217666831268e-05,
      "loss": 0.6916,
      "step": 310
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.9605855345726013,
      "learning_rate": 4.956233016999379e-05,
      "loss": 0.696,
      "step": 320
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.8793039917945862,
      "learning_rate": 4.9509499146870236e-05,
      "loss": 0.6876,
      "step": 330
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.8802406787872314,
      "learning_rate": 4.9453690018345144e-05,
      "loss": 0.6996,
      "step": 340
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.0251939296722412,
      "learning_rate": 4.9394909565685894e-05,
      "loss": 0.7063,
      "step": 350
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.8812974691390991,
      "learning_rate": 4.933316493120015e-05,
      "loss": 0.6948,
      "step": 360
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.7881792783737183,
      "learning_rate": 4.9268463617368e-05,
      "loss": 0.6951,
      "step": 370
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8760414123535156,
      "learning_rate": 4.9200813485930375e-05,
      "loss": 0.6906,
      "step": 380
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.8838094472885132,
      "learning_rate": 4.913022275693372e-05,
      "loss": 0.6772,
      "step": 390
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.0329331159591675,
      "learning_rate": 4.905670000773126e-05,
      "loss": 0.6986,
      "step": 400
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.7983132004737854,
      "learning_rate": 4.8980254171940746e-05,
      "loss": 0.6811,
      "step": 410
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.9415234327316284,
      "learning_rate": 4.8900894538358944e-05,
      "loss": 0.6697,
      "step": 420
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.76316237449646,
      "learning_rate": 4.881863074983298e-05,
      "loss": 0.6851,
      "step": 430
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.8890565633773804,
      "learning_rate": 4.8733472802088654e-05,
      "loss": 0.6819,
      "step": 440
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.7978311777114868,
      "learning_rate": 4.864543104251587e-05,
      "loss": 0.6852,
      "step": 450
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.8316269516944885,
      "learning_rate": 4.855451616891136e-05,
      "loss": 0.6718,
      "step": 460
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.877873957157135,
      "learning_rate": 4.8460739228178806e-05,
      "loss": 0.6707,
      "step": 470
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.0596739053726196,
      "learning_rate": 4.8364111614986527e-05,
      "loss": 0.6964,
      "step": 480
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.8643849492073059,
      "learning_rate": 4.8264645070382964e-05,
      "loss": 0.6763,
      "step": 490
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7877243161201477,
      "learning_rate": 4.8162351680370044e-05,
      "loss": 0.6874,
      "step": 500
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.9450274109840393,
      "learning_rate": 4.805724387443462e-05,
      "loss": 0.6818,
      "step": 510
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.7456744313240051,
      "learning_rate": 4.7949334424038176e-05,
      "loss": 0.665,
      "step": 520
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.8591768741607666,
      "learning_rate": 4.783863644106502e-05,
      "loss": 0.6675,
      "step": 530
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.7636451125144958,
      "learning_rate": 4.7725163376229064e-05,
      "loss": 0.6633,
      "step": 540
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.8254076242446899,
      "learning_rate": 4.760892901743944e-05,
      "loss": 0.667,
      "step": 550
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8828392624855042,
      "learning_rate": 4.7489947488125175e-05,
      "loss": 0.6842,
      "step": 560
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.8078437447547913,
      "learning_rate": 4.736823324551909e-05,
      "loss": 0.665,
      "step": 570
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.8083636164665222,
      "learning_rate": 4.7243801078901084e-05,
      "loss": 0.6577,
      "step": 580
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.9294388890266418,
      "learning_rate": 4.711666610780115e-05,
      "loss": 0.668,
      "step": 590
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.8191084265708923,
      "learning_rate": 4.698684378016222e-05,
      "loss": 0.6571,
      "step": 600
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.7333921194076538,
      "learning_rate": 4.685434987046314e-05,
      "loss": 0.6671,
      "step": 610
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.7950130701065063,
      "learning_rate": 4.671920047780186e-05,
      "loss": 0.6549,
      "step": 620
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.8252810835838318,
      "learning_rate": 4.6581412023939354e-05,
      "loss": 0.6517,
      "step": 630
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.7834108471870422,
      "learning_rate": 4.644100125130418e-05,
      "loss": 0.6707,
      "step": 640
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.7022207379341125,
      "learning_rate": 4.629798522095818e-05,
      "loss": 0.6698,
      "step": 650
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8147993087768555,
      "learning_rate": 4.6152381310523387e-05,
      "loss": 0.651,
      "step": 660
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9004583358764648,
      "learning_rate": 4.600420721207053e-05,
      "loss": 0.6694,
      "step": 670
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.7641873955726624,
      "learning_rate": 4.585348092996925e-05,
      "loss": 0.655,
      "step": 680
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.7779438495635986,
      "learning_rate": 4.5700220778700504e-05,
      "loss": 0.6514,
      "step": 690
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.7963545322418213,
      "learning_rate": 4.554444538063113e-05,
      "loss": 0.635,
      "step": 700
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.7627516388893127,
      "learning_rate": 4.538617366375112e-05,
      "loss": 0.6472,
      "step": 710
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.6603427529335022,
      "learning_rate": 4.522542485937369e-05,
      "loss": 0.6416,
      "step": 720
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.7783029079437256,
      "learning_rate": 4.5062218499798526e-05,
      "loss": 0.644,
      "step": 730
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.7225221395492554,
      "learning_rate": 4.4896574415938465e-05,
      "loss": 0.644,
      "step": 740
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.8217127919197083,
      "learning_rate": 4.4728512734909844e-05,
      "loss": 0.647,
      "step": 750
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.9464953541755676,
      "learning_rate": 4.455805387758691e-05,
      "loss": 0.6629,
      "step": 760
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.7954788208007812,
      "learning_rate": 4.438521855612054e-05,
      "loss": 0.6603,
      "step": 770
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.7955674529075623,
      "learning_rate": 4.421002777142148e-05,
      "loss": 0.6616,
      "step": 780
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.7647875547409058,
      "learning_rate": 4.4032502810608614e-05,
      "loss": 0.6463,
      "step": 790
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7553123235702515,
      "learning_rate": 4.385266524442241e-05,
      "loss": 0.6453,
      "step": 800
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.6640244722366333,
      "learning_rate": 4.367053692460385e-05,
      "loss": 0.6269,
      "step": 810
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.7737441062927246,
      "learning_rate": 4.3486139981239304e-05,
      "loss": 0.6546,
      "step": 820
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.6590189337730408,
      "learning_rate": 4.3299496820071546e-05,
      "loss": 0.6414,
      "step": 830
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.7308911681175232,
      "learning_rate": 4.311063011977723e-05,
      "loss": 0.6435,
      "step": 840
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.7682307362556458,
      "learning_rate": 4.2919562829211283e-05,
      "loss": 0.6416,
      "step": 850
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.7915315628051758,
      "learning_rate": 4.2726318164618435e-05,
      "loss": 0.6504,
      "step": 860
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.7202879190444946,
      "learning_rate": 4.2530919606812216e-05,
      "loss": 0.6491,
      "step": 870
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.6950281858444214,
      "learning_rate": 4.233339089832189e-05,
      "loss": 0.6422,
      "step": 880
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.715614914894104,
      "learning_rate": 4.21337560405075e-05,
      "loss": 0.6461,
      "step": 890
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.6588935852050781,
      "learning_rate": 4.193203929064353e-05,
      "loss": 0.6248,
      "step": 900
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.6951584815979004,
      "learning_rate": 4.172826515897146e-05,
      "loss": 0.6415,
      "step": 910
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.7861813306808472,
      "learning_rate": 4.152245840572153e-05,
      "loss": 0.6359,
      "step": 920
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.7668294310569763,
      "learning_rate": 4.131464403810422e-05,
      "loss": 0.6557,
      "step": 930
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.7085067629814148,
      "learning_rate": 4.110484730727161e-05,
      "loss": 0.6417,
      "step": 940
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.7475394010543823,
      "learning_rate": 4.089309370524921e-05,
      "loss": 0.624,
      "step": 950
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.7203657627105713,
      "learning_rate": 4.067940896183843e-05,
      "loss": 0.628,
      "step": 960
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.7134045958518982,
      "learning_rate": 4.046381904149024e-05,
      "loss": 0.6346,
      "step": 970
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.6537560820579529,
      "learning_rate": 4.024635014015023e-05,
      "loss": 0.6259,
      "step": 980
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.7190784215927124,
      "learning_rate": 4.002702868207563e-05,
      "loss": 0.6373,
      "step": 990
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6727094650268555,
      "learning_rate": 3.9805881316624506e-05,
      "loss": 0.6158,
      "step": 1000
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.7174749970436096,
      "learning_rate": 3.9582934915017665e-05,
      "loss": 0.6011,
      "step": 1010
    },
    {
      "epoch": 1.02,
      "grad_norm": 0.6894753575325012,
      "learning_rate": 3.935821656707359e-05,
      "loss": 0.5974,
      "step": 1020
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.6777900457382202,
      "learning_rate": 3.91317535779168e-05,
      "loss": 0.6027,
      "step": 1030
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.680266797542572,
      "learning_rate": 3.890357346466001e-05,
      "loss": 0.5857,
      "step": 1040
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.7262083292007446,
      "learning_rate": 3.867370395306068e-05,
      "loss": 0.5985,
      "step": 1050
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.6810183525085449,
      "learning_rate": 3.844217297415196e-05,
      "loss": 0.5874,
      "step": 1060
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.743713915348053,
      "learning_rate": 3.8209008660848974e-05,
      "loss": 0.5842,
      "step": 1070
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.7603400349617004,
      "learning_rate": 3.797423934453038e-05,
      "loss": 0.596,
      "step": 1080
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.6763606071472168,
      "learning_rate": 3.773789355159587e-05,
      "loss": 0.6043,
      "step": 1090
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.7692492008209229,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.599,
      "step": 1100
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.750664472579956,
      "learning_rate": 3.726058759576271e-05,
      "loss": 0.5878,
      "step": 1110
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.6852485537528992,
      "learning_rate": 3.7019685429456986e-05,
      "loss": 0.5905,
      "step": 1120
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.7305880188941956,
      "learning_rate": 3.6777322772674186e-05,
      "loss": 0.5977,
      "step": 1130
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.6685648560523987,
      "learning_rate": 3.65335290744672e-05,
      "loss": 0.5987,
      "step": 1140
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.730179488658905,
      "learning_rate": 3.628833395777224e-05,
      "loss": 0.5957,
      "step": 1150
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.6852392554283142,
      "learning_rate": 3.604176721580935e-05,
      "loss": 0.5961,
      "step": 1160
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.6334732174873352,
      "learning_rate": 3.579385880846232e-05,
      "loss": 0.5793,
      "step": 1170
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.6806017756462097,
      "learning_rate": 3.5544638858638304e-05,
      "loss": 0.5969,
      "step": 1180
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.6200957894325256,
      "learning_rate": 3.5294137648607625e-05,
      "loss": 0.5973,
      "step": 1190
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.6411502361297607,
      "learning_rate": 3.504238561632424e-05,
      "loss": 0.5812,
      "step": 1200
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.622643232345581,
      "learning_rate": 3.478941335172729e-05,
      "loss": 0.5891,
      "step": 1210
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.6710910797119141,
      "learning_rate": 3.453525159302415e-05,
      "loss": 0.6081,
      "step": 1220
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.6420513987541199,
      "learning_rate": 3.427993122295552e-05,
      "loss": 0.6082,
      "step": 1230
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.6355459690093994,
      "learning_rate": 3.4023483265042874e-05,
      "loss": 0.5871,
      "step": 1240
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.7996178269386292,
      "learning_rate": 3.376593887981887e-05,
      "loss": 0.6023,
      "step": 1250
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.7436390519142151,
      "learning_rate": 3.350732936104108e-05,
      "loss": 0.5881,
      "step": 1260
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.6650518774986267,
      "learning_rate": 3.3247686131889574e-05,
      "loss": 0.5856,
      "step": 1270
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.6648463606834412,
      "learning_rate": 3.29870407411487e-05,
      "loss": 0.5995,
      "step": 1280
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.6547637581825256,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.5896,
      "step": 1290
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.6670102477073669,
      "learning_rate": 3.246287027504237e-05,
      "loss": 0.5846,
      "step": 1300
    },
    {
      "epoch": 1.31,
      "grad_norm": 0.6504084467887878,
      "learning_rate": 3.2199408890692655e-05,
      "loss": 0.5891,
      "step": 1310
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.7213049530982971,
      "learning_rate": 3.1935072719046115e-05,
      "loss": 0.5899,
      "step": 1320
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.6338471174240112,
      "learning_rate": 3.1669893879118156e-05,
      "loss": 0.587,
      "step": 1330
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.6446500420570374,
      "learning_rate": 3.140390459231528e-05,
      "loss": 0.5862,
      "step": 1340
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.626853346824646,
      "learning_rate": 3.1137137178519985e-05,
      "loss": 0.5982,
      "step": 1350
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.6430269479751587,
      "learning_rate": 3.086962405216353e-05,
      "loss": 0.592,
      "step": 1360
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.590183436870575,
      "learning_rate": 3.06013977182874e-05,
      "loss": 0.5848,
      "step": 1370
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.6320422291755676,
      "learning_rate": 3.0332490768593675e-05,
      "loss": 0.5794,
      "step": 1380
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.5953449010848999,
      "learning_rate": 3.0062935877484804e-05,
      "loss": 0.6042,
      "step": 1390
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.5875259041786194,
      "learning_rate": 2.9792765798093465e-05,
      "loss": 0.6037,
      "step": 1400
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.6373878717422485,
      "learning_rate": 2.952201335830275e-05,
      "loss": 0.5921,
      "step": 1410
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.7542224526405334,
      "learning_rate": 2.925071145675733e-05,
      "loss": 0.5948,
      "step": 1420
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.6458872556686401,
      "learning_rate": 2.8978893058865987e-05,
      "loss": 0.5762,
      "step": 1430
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.741593062877655,
      "learning_rate": 2.870659119279605e-05,
      "loss": 0.6207,
      "step": 1440
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.6762745976448059,
      "learning_rate": 2.8433838945460205e-05,
      "loss": 0.5978,
      "step": 1450
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.6076602339744568,
      "learning_rate": 2.8160669458496158e-05,
      "loss": 0.5937,
      "step": 1460
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.556096613407135,
      "learning_rate": 2.788711592423966e-05,
      "loss": 0.589,
      "step": 1470
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.710705578327179,
      "learning_rate": 2.761321158169134e-05,
      "loss": 0.5781,
      "step": 1480
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.6006096005439758,
      "learning_rate": 2.7338989712477945e-05,
      "loss": 0.5905,
      "step": 1490
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.6222957372665405,
      "learning_rate": 2.7064483636808313e-05,
      "loss": 0.5743,
      "step": 1500
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.6598724722862244,
      "learning_rate": 2.678972670942468e-05,
      "loss": 0.586,
      "step": 1510
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.6561234593391418,
      "learning_rate": 2.6514752315549847e-05,
      "loss": 0.5802,
      "step": 1520
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.591433048248291,
      "learning_rate": 2.623959386683056e-05,
      "loss": 0.5828,
      "step": 1530
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.6482975482940674,
      "learning_rate": 2.5964284797277762e-05,
      "loss": 0.5961,
      "step": 1540
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.6356632113456726,
      "learning_rate": 2.5688858559204053e-05,
      "loss": 0.5887,
      "step": 1550
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.6521331071853638,
      "learning_rate": 2.5413348619158967e-05,
      "loss": 0.5943,
      "step": 1560
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.589897096157074,
      "learning_rate": 2.5137788453862515e-05,
      "loss": 0.5925,
      "step": 1570
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.6734787225723267,
      "learning_rate": 2.486221154613749e-05,
      "loss": 0.5725,
      "step": 1580
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.613000214099884,
      "learning_rate": 2.458665138084104e-05,
      "loss": 0.5832,
      "step": 1590
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.6318116188049316,
      "learning_rate": 2.4311141440795953e-05,
      "loss": 0.5779,
      "step": 1600
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.6127640604972839,
      "learning_rate": 2.4035715202722237e-05,
      "loss": 0.575,
      "step": 1610
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.6314959526062012,
      "learning_rate": 2.3760406133169443e-05,
      "loss": 0.5818,
      "step": 1620
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.6150800585746765,
      "learning_rate": 2.3485247684450166e-05,
      "loss": 0.5743,
      "step": 1630
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.5990105867385864,
      "learning_rate": 2.3210273290575333e-05,
      "loss": 0.5764,
      "step": 1640
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.6026667356491089,
      "learning_rate": 2.2935516363191693e-05,
      "loss": 0.5795,
      "step": 1650
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.614840567111969,
      "learning_rate": 2.2661010287522057e-05,
      "loss": 0.5821,
      "step": 1660
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.5916484594345093,
      "learning_rate": 2.238678841830867e-05,
      "loss": 0.5819,
      "step": 1670
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.5516738891601562,
      "learning_rate": 2.2112884075760347e-05,
      "loss": 0.5835,
      "step": 1680
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.582239031791687,
      "learning_rate": 2.1839330541503845e-05,
      "loss": 0.5751,
      "step": 1690
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.5710985660552979,
      "learning_rate": 2.1566161054539798e-05,
      "loss": 0.5777,
      "step": 1700
    },
    {
      "epoch": 1.71,
      "grad_norm": 0.5882836580276489,
      "learning_rate": 2.1293408807203947e-05,
      "loss": 0.5787,
      "step": 1710
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.5639011859893799,
      "learning_rate": 2.1021106941134012e-05,
      "loss": 0.585,
      "step": 1720
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.6191072463989258,
      "learning_rate": 2.074928854324268e-05,
      "loss": 0.5872,
      "step": 1730
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.5924238562583923,
      "learning_rate": 2.047798664169726e-05,
      "loss": 0.5823,
      "step": 1740
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.537702202796936,
      "learning_rate": 2.0207234201906547e-05,
      "loss": 0.5744,
      "step": 1750
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.5480232238769531,
      "learning_rate": 1.9937064122515202e-05,
      "loss": 0.5909,
      "step": 1760
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.5865290760993958,
      "learning_rate": 1.9667509231406334e-05,
      "loss": 0.5691,
      "step": 1770
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.5969544649124146,
      "learning_rate": 1.9398602281712604e-05,
      "loss": 0.5763,
      "step": 1780
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.5609118342399597,
      "learning_rate": 1.913037594783648e-05,
      "loss": 0.5714,
      "step": 1790
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.5902943015098572,
      "learning_rate": 1.8862862821480025e-05,
      "loss": 0.5676,
      "step": 1800
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.6172893047332764,
      "learning_rate": 1.859609540768471e-05,
      "loss": 0.568,
      "step": 1810
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.6336326003074646,
      "learning_rate": 1.8330106120881846e-05,
      "loss": 0.582,
      "step": 1820
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.589616060256958,
      "learning_rate": 1.806492728095389e-05,
      "loss": 0.581,
      "step": 1830
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.5233524441719055,
      "learning_rate": 1.780059110930735e-05,
      "loss": 0.578,
      "step": 1840
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.5605369210243225,
      "learning_rate": 1.7537129724957642e-05,
      "loss": 0.5696,
      "step": 1850
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.5463587641716003,
      "learning_rate": 1.7274575140626318e-05,
      "loss": 0.5717,
      "step": 1860
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.5655060410499573,
      "learning_rate": 1.70129592588513e-05,
      "loss": 0.5698,
      "step": 1870
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.5800077319145203,
      "learning_rate": 1.675231386811043e-05,
      "loss": 0.5844,
      "step": 1880
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.5689355134963989,
      "learning_rate": 1.6492670638958924e-05,
      "loss": 0.5847,
      "step": 1890
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.576605498790741,
      "learning_rate": 1.6234061120181142e-05,
      "loss": 0.5783,
      "step": 1900
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.5839110612869263,
      "learning_rate": 1.5976516734957138e-05,
      "loss": 0.5676,
      "step": 1910
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.5562628507614136,
      "learning_rate": 1.5720068777044476e-05,
      "loss": 0.5684,
      "step": 1920
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.5681043863296509,
      "learning_rate": 1.5464748406975847e-05,
      "loss": 0.5769,
      "step": 1930
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.5721532702445984,
      "learning_rate": 1.521058664827272e-05,
      "loss": 0.5737,
      "step": 1940
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.538774311542511,
      "learning_rate": 1.495761438367577e-05,
      "loss": 0.5585,
      "step": 1950
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.5676820874214172,
      "learning_rate": 1.4705862351392379e-05,
      "loss": 0.5831,
      "step": 1960
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.5609452128410339,
      "learning_rate": 1.44553611413617e-05,
      "loss": 0.5728,
      "step": 1970
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.5539233088493347,
      "learning_rate": 1.4206141191537682e-05,
      "loss": 0.5724,
      "step": 1980
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.5074451565742493,
      "learning_rate": 1.395823278419065e-05,
      "loss": 0.5662,
      "step": 1990
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.5767993927001953,
      "learning_rate": 1.3711666042227772e-05,
      "loss": 0.5646,
      "step": 2000
    },
    {
      "epoch": 2.01,
      "grad_norm": 0.6038601398468018,
      "learning_rate": 1.346647092553281e-05,
      "loss": 0.5408,
      "step": 2010
    },
    {
      "epoch": 2.02,
      "grad_norm": 0.5604304075241089,
      "learning_rate": 1.322267722732582e-05,
      "loss": 0.5433,
      "step": 2020
    },
    {
      "epoch": 2.03,
      "grad_norm": 0.5742123126983643,
      "learning_rate": 1.2980314570543006e-05,
      "loss": 0.5506,
      "step": 2030
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.6020936965942383,
      "learning_rate": 1.2739412404237306e-05,
      "loss": 0.5405,
      "step": 2040
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.5823059678077698,
      "learning_rate": 1.2500000000000006e-05,
      "loss": 0.5317,
      "step": 2050
    },
    {
      "epoch": 2.06,
      "grad_norm": 0.5085962414741516,
      "learning_rate": 1.2262106448404132e-05,
      "loss": 0.5315,
      "step": 2060
    },
    {
      "epoch": 2.07,
      "grad_norm": 0.5282750129699707,
      "learning_rate": 1.202576065546963e-05,
      "loss": 0.5395,
      "step": 2070
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.5278365612030029,
      "learning_rate": 1.1790991339151031e-05,
      "loss": 0.5417,
      "step": 2080
    },
    {
      "epoch": 2.09,
      "grad_norm": 0.556912362575531,
      "learning_rate": 1.1557827025848047e-05,
      "loss": 0.5389,
      "step": 2090
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.586982250213623,
      "learning_rate": 1.1326296046939333e-05,
      "loss": 0.5385,
      "step": 2100
    },
    {
      "epoch": 2.11,
      "grad_norm": 0.5237417221069336,
      "learning_rate": 1.1096426535339985e-05,
      "loss": 0.5379,
      "step": 2110
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.5813203454017639,
      "learning_rate": 1.0868246422083204e-05,
      "loss": 0.5411,
      "step": 2120
    },
    {
      "epoch": 2.13,
      "grad_norm": 0.5908067226409912,
      "learning_rate": 1.064178343292641e-05,
      "loss": 0.5283,
      "step": 2130
    },
    {
      "epoch": 2.14,
      "grad_norm": 0.6426777243614197,
      "learning_rate": 1.0417065084982346e-05,
      "loss": 0.5427,
      "step": 2140
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.5757883787155151,
      "learning_rate": 1.0194118683375503e-05,
      "loss": 0.5306,
      "step": 2150
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.584816575050354,
      "learning_rate": 9.972971317924374e-06,
      "loss": 0.5222,
      "step": 2160
    },
    {
      "epoch": 2.17,
      "grad_norm": 0.6235145330429077,
      "learning_rate": 9.753649859849775e-06,
      "loss": 0.5349,
      "step": 2170
    },
    {
      "epoch": 2.18,
      "grad_norm": 0.5281869173049927,
      "learning_rate": 9.536180958509768e-06,
      "loss": 0.5352,
      "step": 2180
    },
    {
      "epoch": 2.19,
      "grad_norm": 0.5590287446975708,
      "learning_rate": 9.320591038161574e-06,
      "loss": 0.5348,
      "step": 2190
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.5477201342582703,
      "learning_rate": 9.106906294750805e-06,
      "loss": 0.5213,
      "step": 2200
    },
    {
      "epoch": 2.21,
      "grad_norm": 0.5789000988006592,
      "learning_rate": 8.895152692728397e-06,
      "loss": 0.5331,
      "step": 2210
    },
    {
      "epoch": 2.22,
      "grad_norm": 0.5713198781013489,
      "learning_rate": 8.685355961895784e-06,
      "loss": 0.5403,
      "step": 2220
    },
    {
      "epoch": 2.23,
      "grad_norm": 0.5491828322410583,
      "learning_rate": 8.477541594278474e-06,
      "loss": 0.5371,
      "step": 2230
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.576400876045227,
      "learning_rate": 8.271734841028553e-06,
      "loss": 0.5314,
      "step": 2240
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.5434856414794922,
      "learning_rate": 8.067960709356478e-06,
      "loss": 0.547,
      "step": 2250
    },
    {
      "epoch": 2.26,
      "grad_norm": 0.5376303791999817,
      "learning_rate": 7.866243959492509e-06,
      "loss": 0.5326,
      "step": 2260
    },
    {
      "epoch": 2.27,
      "grad_norm": 0.5279624462127686,
      "learning_rate": 7.666609101678121e-06,
      "loss": 0.5297,
      "step": 2270
    },
    {
      "epoch": 2.28,
      "grad_norm": 0.5436615943908691,
      "learning_rate": 7.469080393187786e-06,
      "loss": 0.537,
      "step": 2280
    },
    {
      "epoch": 2.29,
      "grad_norm": 0.4894987642765045,
      "learning_rate": 7.273681835381569e-06,
      "loss": 0.541,
      "step": 2290
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.5661471486091614,
      "learning_rate": 7.080437170788723e-06,
      "loss": 0.534,
      "step": 2300
    },
    {
      "epoch": 2.31,
      "grad_norm": 0.5674607753753662,
      "learning_rate": 6.889369880222776e-06,
      "loss": 0.5305,
      "step": 2310
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.5481733083724976,
      "learning_rate": 6.700503179928458e-06,
      "loss": 0.5337,
      "step": 2320
    },
    {
      "epoch": 2.33,
      "grad_norm": 0.5626810789108276,
      "learning_rate": 6.513860018760698e-06,
      "loss": 0.5391,
      "step": 2330
    },
    {
      "epoch": 2.34,
      "grad_norm": 0.545527458190918,
      "learning_rate": 6.329463075396161e-06,
      "loss": 0.5442,
      "step": 2340
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.5546277761459351,
      "learning_rate": 6.147334755577596e-06,
      "loss": 0.5446,
      "step": 2350
    },
    {
      "epoch": 2.36,
      "grad_norm": 0.5283880829811096,
      "learning_rate": 5.967497189391386e-06,
      "loss": 0.5246,
      "step": 2360
    },
    {
      "epoch": 2.37,
      "grad_norm": 0.5393268465995789,
      "learning_rate": 5.78997222857853e-06,
      "loss": 0.535,
      "step": 2370
    },
    {
      "epoch": 2.38,
      "grad_norm": 0.5287243127822876,
      "learning_rate": 5.614781443879463e-06,
      "loss": 0.5291,
      "step": 2380
    },
    {
      "epoch": 2.39,
      "grad_norm": 0.5424278378486633,
      "learning_rate": 5.441946122413086e-06,
      "loss": 0.5302,
      "step": 2390
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.48928436636924744,
      "learning_rate": 5.271487265090163e-06,
      "loss": 0.5377,
      "step": 2400
    },
    {
      "epoch": 2.41,
      "grad_norm": 0.5223726034164429,
      "learning_rate": 5.103425584061538e-06,
      "loss": 0.5235,
      "step": 2410
    },
    {
      "epoch": 2.42,
      "grad_norm": 0.5279435515403748,
      "learning_rate": 4.937781500201474e-06,
      "loss": 0.524,
      "step": 2420
    },
    {
      "epoch": 2.43,
      "grad_norm": 0.5740906596183777,
      "learning_rate": 4.7745751406263165e-06,
      "loss": 0.5433,
      "step": 2430
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.5629609823226929,
      "learning_rate": 4.613826336248881e-06,
      "loss": 0.5366,
      "step": 2440
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.5071305632591248,
      "learning_rate": 4.4555546193688735e-06,
      "loss": 0.5327,
      "step": 2450
    },
    {
      "epoch": 2.46,
      "grad_norm": 0.5691395998001099,
      "learning_rate": 4.299779221299499e-06,
      "loss": 0.538,
      "step": 2460
    },
    {
      "epoch": 2.47,
      "grad_norm": 0.5553451180458069,
      "learning_rate": 4.146519070030757e-06,
      "loss": 0.5254,
      "step": 2470
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.5273837447166443,
      "learning_rate": 3.995792787929481e-06,
      "loss": 0.5287,
      "step": 2480
    },
    {
      "epoch": 2.49,
      "grad_norm": 0.5408598184585571,
      "learning_rate": 3.847618689476612e-06,
      "loss": 0.5296,
      "step": 2490
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.5135334730148315,
      "learning_rate": 3.7020147790418263e-06,
      "loss": 0.5256,
      "step": 2500
    },
    {
      "epoch": 2.51,
      "grad_norm": 0.5844847559928894,
      "learning_rate": 3.5589987486958243e-06,
      "loss": 0.5379,
      "step": 2510
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.5451521277427673,
      "learning_rate": 3.418587976060653e-06,
      "loss": 0.5415,
      "step": 2520
    },
    {
      "epoch": 2.53,
      "grad_norm": 0.5308763384819031,
      "learning_rate": 3.280799522198144e-06,
      "loss": 0.539,
      "step": 2530
    },
    {
      "epoch": 2.54,
      "grad_norm": 0.5386905074119568,
      "learning_rate": 3.145650129536862e-06,
      "loss": 0.5291,
      "step": 2540
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.5555285215377808,
      "learning_rate": 3.013156219837776e-06,
      "loss": 0.5304,
      "step": 2550
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.5238758325576782,
      "learning_rate": 2.883333892198853e-06,
      "loss": 0.5246,
      "step": 2560
    },
    {
      "epoch": 2.57,
      "grad_norm": 0.48604971170425415,
      "learning_rate": 2.7561989210989235e-06,
      "loss": 0.5283,
      "step": 2570
    },
    {
      "epoch": 2.58,
      "grad_norm": 0.5265457034111023,
      "learning_rate": 2.6317667544809134e-06,
      "loss": 0.5225,
      "step": 2580
    },
    {
      "epoch": 2.59,
      "grad_norm": 0.5299903154373169,
      "learning_rate": 2.510052511874822e-06,
      "loss": 0.5198,
      "step": 2590
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.499087929725647,
      "learning_rate": 2.391070982560564e-06,
      "loss": 0.5245,
      "step": 2600
    },
    {
      "epoch": 2.61,
      "grad_norm": 0.4687907099723816,
      "learning_rate": 2.2748366237709374e-06,
      "loss": 0.5285,
      "step": 2610
    },
    {
      "epoch": 2.62,
      "grad_norm": 0.5042903423309326,
      "learning_rate": 2.1613635589349756e-06,
      "loss": 0.536,
      "step": 2620
    },
    {
      "epoch": 2.63,
      "grad_norm": 0.5207704901695251,
      "learning_rate": 2.0506655759618244e-06,
      "loss": 0.5442,
      "step": 2630
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.5210204720497131,
      "learning_rate": 1.9427561255653816e-06,
      "loss": 0.5322,
      "step": 2640
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.5390477776527405,
      "learning_rate": 1.837648319629956e-06,
      "loss": 0.5281,
      "step": 2650
    },
    {
      "epoch": 2.66,
      "grad_norm": 0.5613718032836914,
      "learning_rate": 1.735354929617042e-06,
      "loss": 0.5423,
      "step": 2660
    },
    {
      "epoch": 2.67,
      "grad_norm": 0.5014131665229797,
      "learning_rate": 1.6358883850134816e-06,
      "loss": 0.5373,
      "step": 2670
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.4976540803909302,
      "learning_rate": 1.5392607718211994e-06,
      "loss": 0.5281,
      "step": 2680
    },
    {
      "epoch": 2.69,
      "grad_norm": 0.5201677083969116,
      "learning_rate": 1.4454838310886425e-06,
      "loss": 0.531,
      "step": 2690
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.4653576612472534,
      "learning_rate": 1.3545689574841342e-06,
      "loss": 0.5251,
      "step": 2700
    },
    {
      "epoch": 2.71,
      "grad_norm": 0.5000585913658142,
      "learning_rate": 1.266527197911352e-06,
      "loss": 0.5253,
      "step": 2710
    },
    {
      "epoch": 2.72,
      "grad_norm": 0.4802665114402771,
      "learning_rate": 1.1813692501670276e-06,
      "loss": 0.5328,
      "step": 2720
    },
    {
      "epoch": 2.73,
      "grad_norm": 0.5173624157905579,
      "learning_rate": 1.0991054616410589e-06,
      "loss": 0.5339,
      "step": 2730
    },
    {
      "epoch": 2.74,
      "grad_norm": 0.5062382221221924,
      "learning_rate": 1.0197458280592542e-06,
      "loss": 0.5368,
      "step": 2740
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.5551048517227173,
      "learning_rate": 9.432999922687396e-07,
      "loss": 0.5222,
      "step": 2750
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.5234004855155945,
      "learning_rate": 8.697772430662859e-07,
      "loss": 0.5183,
      "step": 2760
    },
    {
      "epoch": 2.77,
      "grad_norm": 0.507086455821991,
      "learning_rate": 7.991865140696331e-07,
      "loss": 0.5204,
      "step": 2770
    },
    {
      "epoch": 2.78,
      "grad_norm": 0.5369780659675598,
      "learning_rate": 7.315363826320005e-07,
      "loss": 0.5398,
      "step": 2780
    },
    {
      "epoch": 2.79,
      "grad_norm": 0.5381918549537659,
      "learning_rate": 6.668350687998565e-07,
      "loss": 0.5188,
      "step": 2790
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.5035576820373535,
      "learning_rate": 6.050904343141095e-07,
      "loss": 0.5431,
      "step": 2800
    },
    {
      "epoch": 2.81,
      "grad_norm": 0.45299527049064636,
      "learning_rate": 5.463099816548579e-07,
      "loss": 0.5319,
      "step": 2810
    },
    {
      "epoch": 2.82,
      "grad_norm": 0.5247001647949219,
      "learning_rate": 4.905008531297661e-07,
      "loss": 0.5344,
      "step": 2820
    },
    {
      "epoch": 2.83,
      "grad_norm": 0.4998054802417755,
      "learning_rate": 4.3766983000621266e-07,
      "loss": 0.5266,
      "step": 2830
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.5558290481567383,
      "learning_rate": 3.8782333168732033e-07,
      "loss": 0.5287,
      "step": 2840
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.5186458230018616,
      "learning_rate": 3.4096741493194197e-07,
      "loss": 0.5379,
      "step": 2850
    },
    {
      "epoch": 2.86,
      "grad_norm": 0.499171644449234,
      "learning_rate": 2.9710777311871e-07,
      "loss": 0.5366,
      "step": 2860
    },
    {
      "epoch": 2.87,
      "grad_norm": 0.5045514702796936,
      "learning_rate": 2.5624973555424815e-07,
      "loss": 0.5247,
      "step": 2870
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.507380485534668,
      "learning_rate": 2.1839826682562015e-07,
      "loss": 0.5349,
      "step": 2880
    },
    {
      "epoch": 2.89,
      "grad_norm": 0.5166347622871399,
      "learning_rate": 1.8355796619708987e-07,
      "loss": 0.5346,
      "step": 2890
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.5490320920944214,
      "learning_rate": 1.517330670512629e-07,
      "loss": 0.5285,
      "step": 2900
    },
    {
      "epoch": 2.91,
      "grad_norm": 0.5213032960891724,
      "learning_rate": 1.229274363747146e-07,
      "loss": 0.5318,
      "step": 2910
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.5020200610160828,
      "learning_rate": 9.71445742881022e-08,
      "loss": 0.5431,
      "step": 2920
    },
    {
      "epoch": 2.93,
      "grad_norm": 0.5045921802520752,
      "learning_rate": 7.438761362087987e-08,
      "loss": 0.5287,
      "step": 2930
    },
    {
      "epoch": 2.94,
      "grad_norm": 0.5005814433097839,
      "learning_rate": 5.4659319530636633e-08,
      "loss": 0.5288,
      "step": 2940
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.5101141333580017,
      "learning_rate": 3.796208916709565e-08,
      "loss": 0.5413,
      "step": 2950
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.4954812526702881,
      "learning_rate": 2.429795138085278e-08,
      "loss": 0.5126,
      "step": 2960
    },
    {
      "epoch": 2.97,
      "grad_norm": 0.48134854435920715,
      "learning_rate": 1.3668566476848777e-08,
      "loss": 0.5367,
      "step": 2970
    },
    {
      "epoch": 2.98,
      "grad_norm": 0.534488320350647,
      "learning_rate": 6.075226012636215e-09,
      "loss": 0.5266,
      "step": 2980
    },
    {
      "epoch": 2.99,
      "grad_norm": 0.4778456687927246,
      "learning_rate": 1.5188526414244842e-09,
      "loss": 0.5479,
      "step": 2990
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.5285366773605347,
      "learning_rate": 0.0,
      "loss": 0.5271,
      "step": 3000
    },
    {
      "epoch": 3.0,
      "step": 3000,
      "total_flos": 4.3556612022154035e+18,
      "train_loss": 0.6027277402877808,
      "train_runtime": 90142.7719,
      "train_samples_per_second": 2.13,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.3556612022154035e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}