{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 182, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01098901098901099, "grad_norm": 9.772354466167668, "learning_rate": 0.0, "loss": 1.1231, "num_tokens": 917504.0, "step": 1 }, { "epoch": 0.02197802197802198, "grad_norm": 9.600092839122757, "learning_rate": 6.666666666666667e-06, "loss": 1.1013, "num_tokens": 1835008.0, "step": 2 }, { "epoch": 0.03296703296703297, "grad_norm": 8.212581379803161, "learning_rate": 1.3333333333333333e-05, "loss": 1.049, "num_tokens": 2752512.0, "step": 3 }, { "epoch": 0.04395604395604396, "grad_norm": 8.633085385332315, "learning_rate": 2e-05, "loss": 0.9997, "num_tokens": 3670016.0, "step": 4 }, { "epoch": 0.054945054945054944, "grad_norm": 4.476388592007681, "learning_rate": 2.6666666666666667e-05, "loss": 0.9131, "num_tokens": 4587520.0, "step": 5 }, { "epoch": 0.06593406593406594, "grad_norm": 2.199174050912146, "learning_rate": 3.3333333333333335e-05, "loss": 0.8268, "num_tokens": 5505024.0, "step": 6 }, { "epoch": 0.07692307692307693, "grad_norm": 3.6366718060006793, "learning_rate": 4e-05, "loss": 0.7987, "num_tokens": 6422528.0, "step": 7 }, { "epoch": 0.08791208791208792, "grad_norm": 0.8709538445918814, "learning_rate": 3.999713248705006e-05, "loss": 0.7646, "num_tokens": 7340032.0, "step": 8 }, { "epoch": 0.0989010989010989, "grad_norm": 0.8508772967587946, "learning_rate": 3.9988530861825856e-05, "loss": 0.7187, "num_tokens": 8257536.0, "step": 9 }, { "epoch": 0.10989010989010989, "grad_norm": 0.6250406817887132, "learning_rate": 3.997419786491312e-05, "loss": 0.6833, "num_tokens": 9175040.0, "step": 10 }, { "epoch": 0.12087912087912088, "grad_norm": 0.6761987112883829, "learning_rate": 3.9954138062984565e-05, "loss": 0.6929, "num_tokens": 10092544.0, "step": 11 }, { "epoch": 0.13186813186813187, "grad_norm": 0.5992732864599857, "learning_rate": 3.992835784734483e-05, "loss": 0.6765, "num_tokens": 11010048.0, "step": 12 }, { "epoch": 0.14285714285714285, "grad_norm": 0.6199696915985338, "learning_rate": 3.9896865431894156e-05, "loss": 0.6655, "num_tokens": 11927552.0, "step": 13 }, { "epoch": 0.15384615384615385, "grad_norm": 0.4527410428309599, "learning_rate": 3.985967085051134e-05, "loss": 0.6421, "num_tokens": 12845056.0, "step": 14 }, { "epoch": 0.16483516483516483, "grad_norm": 0.38594598627024723, "learning_rate": 3.981678595385679e-05, "loss": 0.6365, "num_tokens": 13762560.0, "step": 15 }, { "epoch": 0.17582417582417584, "grad_norm": 0.6000184423982345, "learning_rate": 3.97682244055968e-05, "loss": 0.6321, "num_tokens": 14680064.0, "step": 16 }, { "epoch": 0.18681318681318682, "grad_norm": 0.35874323284286763, "learning_rate": 3.971400167805009e-05, "loss": 0.6199, "num_tokens": 15597568.0, "step": 17 }, { "epoch": 0.1978021978021978, "grad_norm": 0.4276517652795243, "learning_rate": 3.965413504725815e-05, "loss": 0.6242, "num_tokens": 16515072.0, "step": 18 }, { "epoch": 0.2087912087912088, "grad_norm": 0.37269377795397257, "learning_rate": 3.958864358748087e-05, "loss": 0.6031, "num_tokens": 17432576.0, "step": 19 }, { "epoch": 0.21978021978021978, "grad_norm": 0.40621358017043085, "learning_rate": 3.951754816511927e-05, "loss": 0.613, "num_tokens": 18350080.0, "step": 20 }, { "epoch": 0.23076923076923078, "grad_norm": 0.3367633776740304, "learning_rate": 3.9440871432067133e-05, "loss": 0.5873, "num_tokens": 19267584.0, "step": 21 }, { "epoch": 0.24175824175824176, "grad_norm": 0.34836410139728774, "learning_rate": 3.935863781849393e-05, "loss": 0.6072, "num_tokens": 20163950.0, "step": 22 }, { "epoch": 0.25274725274725274, "grad_norm": 0.2895774881956595, "learning_rate": 3.927087352506096e-05, "loss": 0.589, "num_tokens": 21081454.0, "step": 23 }, { "epoch": 0.26373626373626374, "grad_norm": 0.30561434058887504, "learning_rate": 3.917760651457355e-05, "loss": 0.5854, "num_tokens": 21998958.0, "step": 24 }, { "epoch": 0.27472527472527475, "grad_norm": 0.31029776548829807, "learning_rate": 3.9078866503071756e-05, "loss": 0.5983, "num_tokens": 22916462.0, "step": 25 }, { "epoch": 0.2857142857142857, "grad_norm": 0.3448080441527171, "learning_rate": 3.897468495036242e-05, "loss": 0.5721, "num_tokens": 23833966.0, "step": 26 }, { "epoch": 0.2967032967032967, "grad_norm": 0.3066308102005833, "learning_rate": 3.8865095049995714e-05, "loss": 0.5816, "num_tokens": 24751470.0, "step": 27 }, { "epoch": 0.3076923076923077, "grad_norm": 0.2784335840846886, "learning_rate": 3.8750131718689246e-05, "loss": 0.5623, "num_tokens": 25668974.0, "step": 28 }, { "epoch": 0.31868131868131866, "grad_norm": 0.2877897717363653, "learning_rate": 3.862983158520316e-05, "loss": 0.5755, "num_tokens": 26586478.0, "step": 29 }, { "epoch": 0.32967032967032966, "grad_norm": 0.2731795869548364, "learning_rate": 3.850423297866976e-05, "loss": 0.5668, "num_tokens": 27503982.0, "step": 30 }, { "epoch": 0.34065934065934067, "grad_norm": 0.2652626315629539, "learning_rate": 3.8373375916381336e-05, "loss": 0.5683, "num_tokens": 28421486.0, "step": 31 }, { "epoch": 0.3516483516483517, "grad_norm": 0.24873904159586221, "learning_rate": 3.8237302091040185e-05, "loss": 0.5686, "num_tokens": 29338990.0, "step": 32 }, { "epoch": 0.3626373626373626, "grad_norm": 0.2632038822718199, "learning_rate": 3.809605485747481e-05, "loss": 0.5712, "num_tokens": 30256494.0, "step": 33 }, { "epoch": 0.37362637362637363, "grad_norm": 0.2722240550626087, "learning_rate": 3.794967921882645e-05, "loss": 0.5598, "num_tokens": 31173998.0, "step": 34 }, { "epoch": 0.38461538461538464, "grad_norm": 0.24879771736887557, "learning_rate": 3.779822181221061e-05, "loss": 0.556, "num_tokens": 32091502.0, "step": 35 }, { "epoch": 0.3956043956043956, "grad_norm": 0.24922465450396175, "learning_rate": 3.7641730893857814e-05, "loss": 0.554, "num_tokens": 33009006.0, "step": 36 }, { "epoch": 0.4065934065934066, "grad_norm": 0.33013988996913113, "learning_rate": 3.7480256323738615e-05, "loss": 0.544, "num_tokens": 33926510.0, "step": 37 }, { "epoch": 0.4175824175824176, "grad_norm": 0.29605139440653716, "learning_rate": 3.731384954967756e-05, "loss": 0.5483, "num_tokens": 34844014.0, "step": 38 }, { "epoch": 0.42857142857142855, "grad_norm": 0.2623007557945402, "learning_rate": 3.7142563590961265e-05, "loss": 0.5567, "num_tokens": 35761518.0, "step": 39 }, { "epoch": 0.43956043956043955, "grad_norm": 0.30367469474267783, "learning_rate": 3.696645302144582e-05, "loss": 0.5498, "num_tokens": 36679022.0, "step": 40 }, { "epoch": 0.45054945054945056, "grad_norm": 0.28265812011201014, "learning_rate": 3.6785573952168854e-05, "loss": 0.5461, "num_tokens": 37596526.0, "step": 41 }, { "epoch": 0.46153846153846156, "grad_norm": 0.26093051165990416, "learning_rate": 3.659998401347187e-05, "loss": 0.5474, "num_tokens": 38514030.0, "step": 42 }, { "epoch": 0.4725274725274725, "grad_norm": 0.25911214710800756, "learning_rate": 3.640974233663849e-05, "loss": 0.5501, "num_tokens": 39431534.0, "step": 43 }, { "epoch": 0.4835164835164835, "grad_norm": 0.26046107218874337, "learning_rate": 3.6214909535054486e-05, "loss": 0.5475, "num_tokens": 40343766.0, "step": 44 }, { "epoch": 0.4945054945054945, "grad_norm": 0.241370938400794, "learning_rate": 3.60155476848956e-05, "loss": 0.5429, "num_tokens": 41261270.0, "step": 45 }, { "epoch": 0.5054945054945055, "grad_norm": 0.2841881073193607, "learning_rate": 3.581172030534926e-05, "loss": 0.5358, "num_tokens": 42178774.0, "step": 46 }, { "epoch": 0.5164835164835165, "grad_norm": 0.2638928127717608, "learning_rate": 3.5603492338376656e-05, "loss": 0.5467, "num_tokens": 43096278.0, "step": 47 }, { "epoch": 0.5274725274725275, "grad_norm": 0.2497753788478401, "learning_rate": 3.5390930128021294e-05, "loss": 0.5411, "num_tokens": 44013782.0, "step": 48 }, { "epoch": 0.5384615384615384, "grad_norm": 0.2500710961601236, "learning_rate": 3.517410139927106e-05, "loss": 0.5464, "num_tokens": 44931286.0, "step": 49 }, { "epoch": 0.5494505494505495, "grad_norm": 0.24763057093079505, "learning_rate": 3.4953075236480134e-05, "loss": 0.5374, "num_tokens": 45848790.0, "step": 50 }, { "epoch": 0.5604395604395604, "grad_norm": 0.2358717735312636, "learning_rate": 3.472792206135786e-05, "loss": 0.5268, "num_tokens": 46766294.0, "step": 51 }, { "epoch": 0.5714285714285714, "grad_norm": 0.22380327887618337, "learning_rate": 3.4498713610531493e-05, "loss": 0.535, "num_tokens": 47683798.0, "step": 52 }, { "epoch": 0.5824175824175825, "grad_norm": 0.22805944066275524, "learning_rate": 3.426552291269005e-05, "loss": 0.5198, "num_tokens": 48601302.0, "step": 53 }, { "epoch": 0.5934065934065934, "grad_norm": 0.27848681523085456, "learning_rate": 3.4028424265316374e-05, "loss": 0.5354, "num_tokens": 49518806.0, "step": 54 }, { "epoch": 0.6043956043956044, "grad_norm": 0.3166567297130911, "learning_rate": 3.3787493211015134e-05, "loss": 0.5319, "num_tokens": 50436310.0, "step": 55 }, { "epoch": 0.6153846153846154, "grad_norm": 0.26340720750585067, "learning_rate": 3.35428065134439e-05, "loss": 0.54, "num_tokens": 51353814.0, "step": 56 }, { "epoch": 0.6263736263736264, "grad_norm": 0.2186831097362782, "learning_rate": 3.329444213285534e-05, "loss": 0.5223, "num_tokens": 52271318.0, "step": 57 }, { "epoch": 0.6373626373626373, "grad_norm": 0.232110037575015, "learning_rate": 3.3042479201258074e-05, "loss": 0.5234, "num_tokens": 53188822.0, "step": 58 }, { "epoch": 0.6483516483516484, "grad_norm": 0.23910951589261223, "learning_rate": 3.278699799720425e-05, "loss": 0.5228, "num_tokens": 54106326.0, "step": 59 }, { "epoch": 0.6593406593406593, "grad_norm": 0.23275421303395694, "learning_rate": 3.2528079920211756e-05, "loss": 0.5316, "num_tokens": 55023830.0, "step": 60 }, { "epoch": 0.6703296703296703, "grad_norm": 0.2413878787658049, "learning_rate": 3.226580746482936e-05, "loss": 0.5329, "num_tokens": 55941334.0, "step": 61 }, { "epoch": 0.6813186813186813, "grad_norm": 0.2754719141614586, "learning_rate": 3.200026419435284e-05, "loss": 0.525, "num_tokens": 56858838.0, "step": 62 }, { "epoch": 0.6923076923076923, "grad_norm": 0.2657997108274376, "learning_rate": 3.1731534714200765e-05, "loss": 0.5214, "num_tokens": 57776342.0, "step": 63 }, { "epoch": 0.7032967032967034, "grad_norm": 0.2853819135228901, "learning_rate": 3.1459704644958036e-05, "loss": 0.5103, "num_tokens": 58693846.0, "step": 64 }, { "epoch": 0.7142857142857143, "grad_norm": 0.24958946177305488, "learning_rate": 3.11848605950962e-05, "loss": 0.5274, "num_tokens": 59611350.0, "step": 65 }, { "epoch": 0.7252747252747253, "grad_norm": 0.2465041243728232, "learning_rate": 3.090709013337882e-05, "loss": 0.5282, "num_tokens": 60512642.0, "step": 66 }, { "epoch": 0.7362637362637363, "grad_norm": 0.28725482896293897, "learning_rate": 3.062648176096103e-05, "loss": 0.5336, "num_tokens": 61430146.0, "step": 67 }, { "epoch": 0.7472527472527473, "grad_norm": 0.23984645905048624, "learning_rate": 3.0343124883191896e-05, "loss": 0.5242, "num_tokens": 62347650.0, "step": 68 }, { "epoch": 0.7582417582417582, "grad_norm": 0.23432046606410395, "learning_rate": 3.0057109781128826e-05, "loss": 0.5281, "num_tokens": 63265154.0, "step": 69 }, { "epoch": 0.7692307692307693, "grad_norm": 0.27804370268544276, "learning_rate": 2.9768527582772808e-05, "loss": 0.5314, "num_tokens": 64182658.0, "step": 70 }, { "epoch": 0.7802197802197802, "grad_norm": 0.24349784505265698, "learning_rate": 2.947747023403396e-05, "loss": 0.5074, "num_tokens": 65100162.0, "step": 71 }, { "epoch": 0.7912087912087912, "grad_norm": 0.24300790833604854, "learning_rate": 2.9184030469436335e-05, "loss": 0.5145, "num_tokens": 66017666.0, "step": 72 }, { "epoch": 0.8021978021978022, "grad_norm": 0.2671185613248455, "learning_rate": 2.8888301782571618e-05, "loss": 0.5198, "num_tokens": 66935170.0, "step": 73 }, { "epoch": 0.8131868131868132, "grad_norm": 0.2932424795270821, "learning_rate": 2.8590378396310836e-05, "loss": 0.5192, "num_tokens": 67852674.0, "step": 74 }, { "epoch": 0.8241758241758241, "grad_norm": 0.2434654915426533, "learning_rate": 2.8290355232783776e-05, "loss": 0.5153, "num_tokens": 68770178.0, "step": 75 }, { "epoch": 0.8351648351648352, "grad_norm": 0.29726799564738154, "learning_rate": 2.7988327883135634e-05, "loss": 0.5129, "num_tokens": 69687682.0, "step": 76 }, { "epoch": 0.8461538461538461, "grad_norm": 0.225836634053875, "learning_rate": 2.7684392577070452e-05, "loss": 0.514, "num_tokens": 70605186.0, "step": 77 }, { "epoch": 0.8571428571428571, "grad_norm": 0.25032550499397527, "learning_rate": 2.7378646152191128e-05, "loss": 0.5214, "num_tokens": 71522690.0, "step": 78 }, { "epoch": 0.8681318681318682, "grad_norm": 0.22512849845434282, "learning_rate": 2.707118602314574e-05, "loss": 0.5114, "num_tokens": 72440194.0, "step": 79 }, { "epoch": 0.8791208791208791, "grad_norm": 0.22903799875558534, "learning_rate": 2.6762110150590027e-05, "loss": 0.5062, "num_tokens": 73357698.0, "step": 80 }, { "epoch": 0.8901098901098901, "grad_norm": 0.26331645247701346, "learning_rate": 2.645151700997588e-05, "loss": 0.5165, "num_tokens": 74275202.0, "step": 81 }, { "epoch": 0.9010989010989011, "grad_norm": 0.21094645192435654, "learning_rate": 2.6139505560175854e-05, "loss": 0.5054, "num_tokens": 75192706.0, "step": 82 }, { "epoch": 0.9120879120879121, "grad_norm": 0.26785720821943976, "learning_rate": 2.582617521195358e-05, "loss": 0.5003, "num_tokens": 76110210.0, "step": 83 }, { "epoch": 0.9230769230769231, "grad_norm": 0.22593987553665204, "learning_rate": 2.5511625796290314e-05, "loss": 0.5128, "num_tokens": 77027714.0, "step": 84 }, { "epoch": 0.9340659340659341, "grad_norm": 0.2498708364531816, "learning_rate": 2.5195957532577464e-05, "loss": 0.5128, "num_tokens": 77945218.0, "step": 85 }, { "epoch": 0.945054945054945, "grad_norm": 0.23721562361070062, "learning_rate": 2.4879270996685487e-05, "loss": 0.5122, "num_tokens": 78862722.0, "step": 86 }, { "epoch": 0.9560439560439561, "grad_norm": 0.21843894493477256, "learning_rate": 2.456166708891914e-05, "loss": 0.5051, "num_tokens": 79780226.0, "step": 87 }, { "epoch": 0.967032967032967, "grad_norm": 0.24168907616551225, "learning_rate": 2.4243247001869348e-05, "loss": 0.5098, "num_tokens": 80680945.0, "step": 88 }, { "epoch": 0.978021978021978, "grad_norm": 0.21025312954441036, "learning_rate": 2.3924112188172038e-05, "loss": 0.4893, "num_tokens": 81598449.0, "step": 89 }, { "epoch": 0.989010989010989, "grad_norm": 0.2556674465888256, "learning_rate": 2.3604364328183968e-05, "loss": 0.5141, "num_tokens": 82515953.0, "step": 90 }, { "epoch": 1.0, "grad_norm": 0.23567112398452283, "learning_rate": 2.3284105297586185e-05, "loss": 0.4946, "num_tokens": 83301494.0, "step": 91 }, { "epoch": 1.010989010989011, "grad_norm": 0.35674531173933427, "learning_rate": 2.2963437134925073e-05, "loss": 0.4522, "num_tokens": 84218998.0, "step": 92 }, { "epoch": 1.021978021978022, "grad_norm": 0.32732971865138927, "learning_rate": 2.2642462009101652e-05, "loss": 0.4408, "num_tokens": 85136502.0, "step": 93 }, { "epoch": 1.032967032967033, "grad_norm": 0.2643264459677597, "learning_rate": 2.232128218681923e-05, "loss": 0.4504, "num_tokens": 86054006.0, "step": 94 }, { "epoch": 1.043956043956044, "grad_norm": 0.2974529043501973, "learning_rate": 2.2000000000000003e-05, "loss": 0.4533, "num_tokens": 86971510.0, "step": 95 }, { "epoch": 1.054945054945055, "grad_norm": 0.31084829686048726, "learning_rate": 2.1678717813180774e-05, "loss": 0.4453, "num_tokens": 87889014.0, "step": 96 }, { "epoch": 1.065934065934066, "grad_norm": 0.27597500734793406, "learning_rate": 2.1357537990898357e-05, "loss": 0.45, "num_tokens": 88806518.0, "step": 97 }, { "epoch": 1.0769230769230769, "grad_norm": 0.31043276611201287, "learning_rate": 2.103656286507493e-05, "loss": 0.4528, "num_tokens": 89724022.0, "step": 98 }, { "epoch": 1.0879120879120878, "grad_norm": 0.2571927303424464, "learning_rate": 2.0715894702413825e-05, "loss": 0.4419, "num_tokens": 90641526.0, "step": 99 }, { "epoch": 1.098901098901099, "grad_norm": 0.31150275031289787, "learning_rate": 2.0395635671816034e-05, "loss": 0.4314, "num_tokens": 91559030.0, "step": 100 }, { "epoch": 1.10989010989011, "grad_norm": 0.24608732221540086, "learning_rate": 2.0075887811827974e-05, "loss": 0.4545, "num_tokens": 92476534.0, "step": 101 }, { "epoch": 1.120879120879121, "grad_norm": 0.28525912376181395, "learning_rate": 1.9756752998130654e-05, "loss": 0.4332, "num_tokens": 93394038.0, "step": 102 }, { "epoch": 1.1318681318681318, "grad_norm": 0.23425555984045235, "learning_rate": 1.9438332911080873e-05, "loss": 0.4407, "num_tokens": 94311542.0, "step": 103 }, { "epoch": 1.1428571428571428, "grad_norm": 0.2770529896138633, "learning_rate": 1.912072900331452e-05, "loss": 0.4435, "num_tokens": 95229046.0, "step": 104 }, { "epoch": 1.1538461538461537, "grad_norm": 0.23844615285702547, "learning_rate": 1.8804042467422548e-05, "loss": 0.4346, "num_tokens": 96146550.0, "step": 105 }, { "epoch": 1.164835164835165, "grad_norm": 0.23621746789263579, "learning_rate": 1.8488374203709692e-05, "loss": 0.4393, "num_tokens": 97064054.0, "step": 106 }, { "epoch": 1.1758241758241759, "grad_norm": 0.22871395947209364, "learning_rate": 1.817382478804642e-05, "loss": 0.441, "num_tokens": 97981558.0, "step": 107 }, { "epoch": 1.1868131868131868, "grad_norm": 0.23662509384292804, "learning_rate": 1.786049443982415e-05, "loss": 0.4352, "num_tokens": 98899062.0, "step": 108 }, { "epoch": 1.1978021978021978, "grad_norm": 0.24289640385135913, "learning_rate": 1.7548482990024124e-05, "loss": 0.4299, "num_tokens": 99816566.0, "step": 109 }, { "epoch": 1.2087912087912087, "grad_norm": 0.2378269125932523, "learning_rate": 1.7237889849409982e-05, "loss": 0.4341, "num_tokens": 100734070.0, "step": 110 }, { "epoch": 1.2197802197802199, "grad_norm": 0.21748020471162874, "learning_rate": 1.6928813976854267e-05, "loss": 0.429, "num_tokens": 101651574.0, "step": 111 }, { "epoch": 1.2307692307692308, "grad_norm": 0.2483873489137389, "learning_rate": 1.6621353847808878e-05, "loss": 0.4346, "num_tokens": 102569078.0, "step": 112 }, { "epoch": 1.2417582417582418, "grad_norm": 0.22656056425221513, "learning_rate": 1.6315607422929557e-05, "loss": 0.4238, "num_tokens": 103478660.0, "step": 113 }, { "epoch": 1.2527472527472527, "grad_norm": 0.24907751206518808, "learning_rate": 1.601167211686437e-05, "loss": 0.448, "num_tokens": 104396164.0, "step": 114 }, { "epoch": 1.2637362637362637, "grad_norm": 0.22511489192054707, "learning_rate": 1.5709644767216233e-05, "loss": 0.4274, "num_tokens": 105313668.0, "step": 115 }, { "epoch": 1.2747252747252746, "grad_norm": 0.25451119470593475, "learning_rate": 1.5409621603689177e-05, "loss": 0.4391, "num_tokens": 106231172.0, "step": 116 }, { "epoch": 1.2857142857142856, "grad_norm": 0.21851038341954235, "learning_rate": 1.5111698217428385e-05, "loss": 0.4368, "num_tokens": 107148676.0, "step": 117 }, { "epoch": 1.2967032967032968, "grad_norm": 0.24869542086528182, "learning_rate": 1.4815969530563666e-05, "loss": 0.4464, "num_tokens": 108066180.0, "step": 118 }, { "epoch": 1.3076923076923077, "grad_norm": 2.2476265832187554, "learning_rate": 1.4522529765966048e-05, "loss": 0.4475, "num_tokens": 108983684.0, "step": 119 }, { "epoch": 1.3186813186813187, "grad_norm": 0.3023135747330207, "learning_rate": 1.4231472417227185e-05, "loss": 0.4366, "num_tokens": 109901188.0, "step": 120 }, { "epoch": 1.3296703296703296, "grad_norm": 0.2216681503897794, "learning_rate": 1.3942890218871177e-05, "loss": 0.4403, "num_tokens": 110818692.0, "step": 121 }, { "epoch": 1.3406593406593408, "grad_norm": 0.2532401911435245, "learning_rate": 1.3656875116808105e-05, "loss": 0.429, "num_tokens": 111736196.0, "step": 122 }, { "epoch": 1.3516483516483517, "grad_norm": 0.2400191704170556, "learning_rate": 1.3373518239038985e-05, "loss": 0.4457, "num_tokens": 112653700.0, "step": 123 }, { "epoch": 1.3626373626373627, "grad_norm": 0.24101371721731865, "learning_rate": 1.3092909866621181e-05, "loss": 0.4335, "num_tokens": 113571204.0, "step": 124 }, { "epoch": 1.3736263736263736, "grad_norm": 0.23058749347423538, "learning_rate": 1.2815139404903811e-05, "loss": 0.4303, "num_tokens": 114488708.0, "step": 125 }, { "epoch": 1.3846153846153846, "grad_norm": 0.22427253426617916, "learning_rate": 1.2540295355041972e-05, "loss": 0.4343, "num_tokens": 115406212.0, "step": 126 }, { "epoch": 1.3956043956043955, "grad_norm": 0.23614824410262367, "learning_rate": 1.226846528579925e-05, "loss": 0.4368, "num_tokens": 116323716.0, "step": 127 }, { "epoch": 1.4065934065934065, "grad_norm": 0.2187687297235707, "learning_rate": 1.1999735805647165e-05, "loss": 0.4323, "num_tokens": 117241220.0, "step": 128 }, { "epoch": 1.4175824175824177, "grad_norm": 0.2457868403421847, "learning_rate": 1.1734192535170646e-05, "loss": 0.4319, "num_tokens": 118158724.0, "step": 129 }, { "epoch": 1.4285714285714286, "grad_norm": 0.221312873267363, "learning_rate": 1.1471920079788248e-05, "loss": 0.4422, "num_tokens": 119076228.0, "step": 130 }, { "epoch": 1.4395604395604396, "grad_norm": 0.2408621876511875, "learning_rate": 1.1213002002795757e-05, "loss": 0.4465, "num_tokens": 119993732.0, "step": 131 }, { "epoch": 1.4505494505494505, "grad_norm": 0.2300963154048304, "learning_rate": 1.0957520798741935e-05, "loss": 0.4505, "num_tokens": 120911236.0, "step": 132 }, { "epoch": 1.4615384615384617, "grad_norm": 0.2406985669097342, "learning_rate": 1.0705557867144662e-05, "loss": 0.4285, "num_tokens": 121828740.0, "step": 133 }, { "epoch": 1.4725274725274726, "grad_norm": 0.23752471909651654, "learning_rate": 1.0457193486556106e-05, "loss": 0.4443, "num_tokens": 122746244.0, "step": 134 }, { "epoch": 1.4835164835164836, "grad_norm": 0.20111074494044714, "learning_rate": 1.021250678898487e-05, "loss": 0.4331, "num_tokens": 123645320.0, "step": 135 }, { "epoch": 1.4945054945054945, "grad_norm": 0.2231276963091454, "learning_rate": 9.971575734683633e-06, "loss": 0.4364, "num_tokens": 124562824.0, "step": 136 }, { "epoch": 1.5054945054945055, "grad_norm": 0.22110276071102014, "learning_rate": 9.73447708730996e-06, "loss": 0.4339, "num_tokens": 125480328.0, "step": 137 }, { "epoch": 1.5164835164835164, "grad_norm": 0.2119262254489098, "learning_rate": 9.501286389468512e-06, "loss": 0.4363, "num_tokens": 126397832.0, "step": 138 }, { "epoch": 1.5274725274725274, "grad_norm": 0.19890961890941636, "learning_rate": 9.272077938642147e-06, "loss": 0.4288, "num_tokens": 127315336.0, "step": 139 }, { "epoch": 1.5384615384615383, "grad_norm": 0.20060082322657913, "learning_rate": 9.04692476351987e-06, "loss": 0.4338, "num_tokens": 128232840.0, "step": 140 }, { "epoch": 1.5494505494505495, "grad_norm": 0.20995608468833954, "learning_rate": 8.825898600728945e-06, "loss": 0.4333, "num_tokens": 129150344.0, "step": 141 }, { "epoch": 1.5604395604395604, "grad_norm": 0.20133499334583052, "learning_rate": 8.609069871978708e-06, "loss": 0.4353, "num_tokens": 130067848.0, "step": 142 }, { "epoch": 1.5714285714285714, "grad_norm": 0.2152514674681236, "learning_rate": 8.396507661623355e-06, "loss": 0.4281, "num_tokens": 130985352.0, "step": 143 }, { "epoch": 1.5824175824175826, "grad_norm": 0.20757467747575342, "learning_rate": 8.18827969465074e-06, "loss": 0.4401, "num_tokens": 131902856.0, "step": 144 }, { "epoch": 1.5934065934065935, "grad_norm": 0.20316604323446175, "learning_rate": 7.984452315104413e-06, "loss": 0.4314, "num_tokens": 132820360.0, "step": 145 }, { "epoch": 1.6043956043956045, "grad_norm": 0.21423177058095494, "learning_rate": 7.785090464945514e-06, "loss": 0.4273, "num_tokens": 133737864.0, "step": 146 }, { "epoch": 1.6153846153846154, "grad_norm": 0.2029825946539759, "learning_rate": 7.590257663361516e-06, "loss": 0.4309, "num_tokens": 134655368.0, "step": 147 }, { "epoch": 1.6263736263736264, "grad_norm": 0.22231790835116305, "learning_rate": 7.4000159865281365e-06, "loss": 0.4421, "num_tokens": 135572872.0, "step": 148 }, { "epoch": 1.6373626373626373, "grad_norm": 0.2204500858397147, "learning_rate": 7.214426047831158e-06, "loss": 0.4371, "num_tokens": 136490376.0, "step": 149 }, { "epoch": 1.6483516483516483, "grad_norm": 0.19221591910101343, "learning_rate": 7.0335469785541845e-06, "loss": 0.4301, "num_tokens": 137407880.0, "step": 150 }, { "epoch": 1.6593406593406592, "grad_norm": 0.2200648566470767, "learning_rate": 6.857436409038738e-06, "loss": 0.4347, "num_tokens": 138325384.0, "step": 151 }, { "epoch": 1.6703296703296702, "grad_norm": 0.19865931046297322, "learning_rate": 6.686150450322449e-06, "loss": 0.429, "num_tokens": 139242888.0, "step": 152 }, { "epoch": 1.6813186813186813, "grad_norm": 0.2137535541376169, "learning_rate": 6.519743676261391e-06, "loss": 0.4311, "num_tokens": 140160392.0, "step": 153 }, { "epoch": 1.6923076923076923, "grad_norm": 0.21248466316302173, "learning_rate": 6.358269106142197e-06, "loss": 0.4297, "num_tokens": 141077896.0, "step": 154 }, { "epoch": 1.7032967032967035, "grad_norm": 0.19748425089266558, "learning_rate": 6.201778187789399e-06, "loss": 0.4279, "num_tokens": 141995400.0, "step": 155 }, { "epoch": 1.7142857142857144, "grad_norm": 0.19463743812765466, "learning_rate": 6.050320781173557e-06, "loss": 0.4298, "num_tokens": 142912904.0, "step": 156 }, { "epoch": 1.7252747252747254, "grad_norm": 0.20069413130304345, "learning_rate": 5.9039451425251985e-06, "loss": 0.4366, "num_tokens": 143812372.0, "step": 157 }, { "epoch": 1.7362637362637363, "grad_norm": 0.20034875750342657, "learning_rate": 5.762697908959817e-06, "loss": 0.4189, "num_tokens": 144729876.0, "step": 158 }, { "epoch": 1.7472527472527473, "grad_norm": 0.19215075611347193, "learning_rate": 5.626624083618669e-06, "loss": 0.4353, "num_tokens": 145647380.0, "step": 159 }, { "epoch": 1.7582417582417582, "grad_norm": 0.19260235989647062, "learning_rate": 5.495767021330247e-06, "loss": 0.4304, "num_tokens": 146564884.0, "step": 160 }, { "epoch": 1.7692307692307692, "grad_norm": 0.18810017570633555, "learning_rate": 5.370168414796839e-06, "loss": 0.4404, "num_tokens": 147482388.0, "step": 161 }, { "epoch": 1.7802197802197801, "grad_norm": 0.19029051095780564, "learning_rate": 5.249868281310756e-06, "loss": 0.4181, "num_tokens": 148399892.0, "step": 162 }, { "epoch": 1.791208791208791, "grad_norm": 0.18890363805567867, "learning_rate": 5.134904950004292e-06, "loss": 0.431, "num_tokens": 149317396.0, "step": 163 }, { "epoch": 1.8021978021978022, "grad_norm": 0.1864217458059224, "learning_rate": 5.0253150496375836e-06, "loss": 0.4251, "num_tokens": 150234900.0, "step": 164 }, { "epoch": 1.8131868131868132, "grad_norm": 0.18897338795078375, "learning_rate": 4.921133496928249e-06, "loss": 0.4326, "num_tokens": 151152404.0, "step": 165 }, { "epoch": 1.8241758241758241, "grad_norm": 0.19320879835217267, "learning_rate": 4.822393485426452e-06, "loss": 0.436, "num_tokens": 152069908.0, "step": 166 }, { "epoch": 1.8351648351648353, "grad_norm": 0.1899799248610807, "learning_rate": 4.72912647493905e-06, "loss": 0.4208, "num_tokens": 152987412.0, "step": 167 }, { "epoch": 1.8461538461538463, "grad_norm": 0.18585926786014914, "learning_rate": 4.641362181506075e-06, "loss": 0.4316, "num_tokens": 153904916.0, "step": 168 }, { "epoch": 1.8571428571428572, "grad_norm": 0.18544543317640078, "learning_rate": 4.55912856793287e-06, "loss": 0.4227, "num_tokens": 154822420.0, "step": 169 }, { "epoch": 1.8681318681318682, "grad_norm": 0.18321225894086932, "learning_rate": 4.482451834880739e-06, "loss": 0.4163, "num_tokens": 155739924.0, "step": 170 }, { "epoch": 1.879120879120879, "grad_norm": 0.19525860190829208, "learning_rate": 4.411356412519131e-06, "loss": 0.4355, "num_tokens": 156657428.0, "step": 171 }, { "epoch": 1.89010989010989, "grad_norm": 0.18997066507784008, "learning_rate": 4.345864952741853e-06, "loss": 0.4355, "num_tokens": 157574932.0, "step": 172 }, { "epoch": 1.901098901098901, "grad_norm": 0.19052605410789913, "learning_rate": 4.2859983219499105e-06, "loss": 0.4321, "num_tokens": 158492436.0, "step": 173 }, { "epoch": 1.912087912087912, "grad_norm": 0.1853455570615677, "learning_rate": 4.2317755944032005e-06, "loss": 0.4307, "num_tokens": 159409940.0, "step": 174 }, { "epoch": 1.9230769230769231, "grad_norm": 0.1887973737773016, "learning_rate": 4.1832140461432125e-06, "loss": 0.4267, "num_tokens": 160327444.0, "step": 175 }, { "epoch": 1.934065934065934, "grad_norm": 0.18555753371184378, "learning_rate": 4.140329149488669e-06, "loss": 0.4285, "num_tokens": 161244948.0, "step": 176 }, { "epoch": 1.945054945054945, "grad_norm": 0.18720636646833771, "learning_rate": 4.103134568105848e-06, "loss": 0.4276, "num_tokens": 162162452.0, "step": 177 }, { "epoch": 1.9560439560439562, "grad_norm": 0.18216577817621057, "learning_rate": 4.071642152655174e-06, "loss": 0.4212, "num_tokens": 163079956.0, "step": 178 }, { "epoch": 1.9670329670329672, "grad_norm": 0.1838279867433696, "learning_rate": 4.045861937015437e-06, "loss": 0.4292, "num_tokens": 163978482.0, "step": 179 }, { "epoch": 1.978021978021978, "grad_norm": 0.18500102000422938, "learning_rate": 4.025802135086881e-06, "loss": 0.4365, "num_tokens": 164895986.0, "step": 180 }, { "epoch": 1.989010989010989, "grad_norm": 0.18059160305577232, "learning_rate": 4.011469138174149e-06, "loss": 0.4265, "num_tokens": 165813490.0, "step": 181 }, { "epoch": 2.0, "grad_norm": 0.20158288785081682, "learning_rate": 4.002867512949942e-06, "loss": 0.4303, "num_tokens": 166602988.0, "step": 182 }, { "epoch": 2.0, "step": 182, "total_flos": 174931417497600.0, "train_loss": 0.511371944631849, "train_runtime": 8742.1099, "train_samples_per_second": 4.656, "train_steps_per_second": 0.021 } ], "logging_steps": 1, "max_steps": 182, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 174931417497600.0, "train_batch_size": 56, "trial_name": null, "trial_params": null }