|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.968, |
|
"eval_steps": 500, |
|
"global_step": 123, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 6.8362836837768555, |
|
"learning_rate": 7.692307692307694e-07, |
|
"loss": 1.2519, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 6.583799362182617, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 1.23, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 6.952097415924072, |
|
"learning_rate": 2.307692307692308e-06, |
|
"loss": 1.2785, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 6.371545314788818, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 1.2499, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.713391304016113, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 1.2259, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 3.5113883018493652, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 1.1784, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 2.6190996170043945, |
|
"learning_rate": 5.384615384615385e-06, |
|
"loss": 1.1433, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 4.008308410644531, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 1.1378, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 4.964306831359863, |
|
"learning_rate": 6.923076923076923e-06, |
|
"loss": 1.1729, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.513827323913574, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 1.1446, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 3.6050338745117188, |
|
"learning_rate": 8.461538461538462e-06, |
|
"loss": 1.0779, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 3.471524715423584, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 1.0671, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 2.42974591255188, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0223, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.8660697937011719, |
|
"learning_rate": 9.997960964140946e-06, |
|
"loss": 1.0106, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.853407621383667, |
|
"learning_rate": 9.991845519630679e-06, |
|
"loss": 1.0557, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.5663851499557495, |
|
"learning_rate": 9.981658654313458e-06, |
|
"loss": 0.9931, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 1.4195172786712646, |
|
"learning_rate": 9.96740867674275e-06, |
|
"loss": 0.993, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 1.153138518333435, |
|
"learning_rate": 9.949107209404664e-06, |
|
"loss": 1.0047, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 1.4434843063354492, |
|
"learning_rate": 9.926769179238467e-06, |
|
"loss": 0.9929, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.1956053972244263, |
|
"learning_rate": 9.900412805461968e-06, |
|
"loss": 0.9942, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 1.0062505006790161, |
|
"learning_rate": 9.870059584711668e-06, |
|
"loss": 0.9831, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.0054383277893066, |
|
"learning_rate": 9.835734273509787e-06, |
|
"loss": 0.9784, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 0.9153076410293579, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": 0.9665, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.9883344769477844, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 0.9633, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9623836874961853, |
|
"learning_rate": 9.709221818197626e-06, |
|
"loss": 0.9732, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.8564513325691223, |
|
"learning_rate": 9.659320146057263e-06, |
|
"loss": 0.9188, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 0.7836757302284241, |
|
"learning_rate": 9.60561826557425e-06, |
|
"loss": 0.9723, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.8154799938201904, |
|
"learning_rate": 9.548159976772593e-06, |
|
"loss": 0.9103, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 0.8699765205383301, |
|
"learning_rate": 9.486992143456792e-06, |
|
"loss": 0.9487, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6717429757118225, |
|
"learning_rate": 9.422164654989073e-06, |
|
"loss": 0.9657, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.7567694187164307, |
|
"learning_rate": 9.353730385598887e-06, |
|
"loss": 0.9785, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.7300078868865967, |
|
"learning_rate": 9.281745151257946e-06, |
|
"loss": 0.9331, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.8414798378944397, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.9369, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.7341222763061523, |
|
"learning_rate": 9.12735948481387e-06, |
|
"loss": 0.9246, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.7759191393852234, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.935, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.9772646427154541, |
|
"learning_rate": 8.959511229611377e-06, |
|
"loss": 0.9506, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 0.7034265995025635, |
|
"learning_rate": 8.870708053195414e-06, |
|
"loss": 0.9332, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.6552270650863647, |
|
"learning_rate": 8.778747871771293e-06, |
|
"loss": 0.8891, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 0.7311168313026428, |
|
"learning_rate": 8.683705689382025e-06, |
|
"loss": 0.9656, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6239702701568604, |
|
"learning_rate": 8.585659023794818e-06, |
|
"loss": 0.9594, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 0.7404515743255615, |
|
"learning_rate": 8.48468784327647e-06, |
|
"loss": 0.9467, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 1.015267252922058, |
|
"learning_rate": 8.380874501370098e-06, |
|
"loss": 1.4435, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.7437450885772705, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": 0.9858, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 0.5530978441238403, |
|
"learning_rate": 8.165062269044353e-06, |
|
"loss": 0.7881, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.6483109593391418, |
|
"learning_rate": 8.053239398177191e-06, |
|
"loss": 0.8529, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 0.6724779009819031, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 0.8778, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.7351576089859009, |
|
"learning_rate": 7.822216094333847e-06, |
|
"loss": 0.9103, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.6864327192306519, |
|
"learning_rate": 7.703204087277989e-06, |
|
"loss": 0.848, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.8479780554771423, |
|
"learning_rate": 7.5819873081948105e-06, |
|
"loss": 0.896, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 0.6257553100585938, |
|
"learning_rate": 7.45866462322802e-06, |
|
"loss": 0.8881, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 0.6846205592155457, |
|
"learning_rate": 7.333336616128369e-06, |
|
"loss": 0.9257, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 0.970287561416626, |
|
"learning_rate": 7.206105506216107e-06, |
|
"loss": 0.8823, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.6447298526763916, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 0.822, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 2.3928756713867188, |
|
"learning_rate": 6.946350531586959e-06, |
|
"loss": 0.7433, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 0.8941048979759216, |
|
"learning_rate": 6.814038526753205e-06, |
|
"loss": 0.9697, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 0.6773728728294373, |
|
"learning_rate": 6.680246966077151e-06, |
|
"loss": 0.8388, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.6578003764152527, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.898, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.674293041229248, |
|
"learning_rate": 6.408662784207149e-06, |
|
"loss": 0.9156, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 0.6401296854019165, |
|
"learning_rate": 6.271091670967437e-06, |
|
"loss": 0.8781, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 0.6086285710334778, |
|
"learning_rate": 6.132483837128823e-06, |
|
"loss": 0.8464, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.6196046471595764, |
|
"learning_rate": 5.9929523332287275e-06, |
|
"loss": 0.8604, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 0.6998230814933777, |
|
"learning_rate": 5.85261096316312e-06, |
|
"loss": 0.9152, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.645214855670929, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": 0.8257, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 0.7682178616523743, |
|
"learning_rate": 5.569957049452703e-06, |
|
"loss": 0.893, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.5541701912879944, |
|
"learning_rate": 5.4278750423942e-06, |
|
"loss": 0.8719, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 0.5373249053955078, |
|
"learning_rate": 5.285444054313841e-06, |
|
"loss": 0.8374, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 0.6957194805145264, |
|
"learning_rate": 5.142780253968481e-06, |
|
"loss": 0.9636, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.4742963910102844, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7453, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.6149660348892212, |
|
"learning_rate": 4.85721974603152e-06, |
|
"loss": 0.8727, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 0.6649055480957031, |
|
"learning_rate": 4.71455594568616e-06, |
|
"loss": 0.8197, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 0.5363019704818726, |
|
"learning_rate": 4.572124957605803e-06, |
|
"loss": 0.8977, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 0.5486201643943787, |
|
"learning_rate": 4.430042950547298e-06, |
|
"loss": 0.8524, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.7598503232002258, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": 0.8718, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 0.5470713973045349, |
|
"learning_rate": 4.147389036836881e-06, |
|
"loss": 0.8436, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 0.5422508120536804, |
|
"learning_rate": 4.007047666771274e-06, |
|
"loss": 0.9016, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 0.563028872013092, |
|
"learning_rate": 3.867516162871177e-06, |
|
"loss": 0.8374, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.5209132432937622, |
|
"learning_rate": 3.7289083290325668e-06, |
|
"loss": 0.833, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.5739846229553223, |
|
"learning_rate": 3.5913372157928515e-06, |
|
"loss": 0.9076, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 0.5330556035041809, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.8753, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 0.4843089282512665, |
|
"learning_rate": 3.319753033922849e-06, |
|
"loss": 0.8169, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.5041735172271729, |
|
"learning_rate": 3.1859614732467957e-06, |
|
"loss": 0.8877, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 0.5104906558990479, |
|
"learning_rate": 3.053649468413043e-06, |
|
"loss": 0.7909, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.008, |
|
"grad_norm": 1.0316483974456787, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 1.3689, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.5285851955413818, |
|
"learning_rate": 2.7938944937838924e-06, |
|
"loss": 0.8718, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.056, |
|
"grad_norm": 0.5638172626495361, |
|
"learning_rate": 2.6666633838716317e-06, |
|
"loss": 0.8144, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.4385935962200165, |
|
"learning_rate": 2.5413353767719805e-06, |
|
"loss": 0.7497, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.104, |
|
"grad_norm": 0.5555351376533508, |
|
"learning_rate": 2.418012691805191e-06, |
|
"loss": 0.9051, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.5720311999320984, |
|
"learning_rate": 2.296795912722014e-06, |
|
"loss": 0.8908, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.152, |
|
"grad_norm": 0.46188703179359436, |
|
"learning_rate": 2.1777839056661555e-06, |
|
"loss": 0.7216, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 0.48220935463905334, |
|
"learning_rate": 2.061073738537635e-06, |
|
"loss": 0.8029, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.49016672372817993, |
|
"learning_rate": 1.946760601822809e-06, |
|
"loss": 0.9015, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.44178417325019836, |
|
"learning_rate": 1.8349377309556487e-06, |
|
"loss": 0.7919, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.248, |
|
"grad_norm": 0.5433825850486755, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": 0.7993, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.39549779891967773, |
|
"learning_rate": 1.6191254986299044e-06, |
|
"loss": 0.7369, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.296, |
|
"grad_norm": 0.48636099696159363, |
|
"learning_rate": 1.5153121567235334e-06, |
|
"loss": 0.9214, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.4830661118030548, |
|
"learning_rate": 1.4143409762051829e-06, |
|
"loss": 0.8425, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.344, |
|
"grad_norm": 0.42800870537757874, |
|
"learning_rate": 1.3162943106179748e-06, |
|
"loss": 0.7417, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 0.46301019191741943, |
|
"learning_rate": 1.2212521282287093e-06, |
|
"loss": 0.8874, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 2.392, |
|
"grad_norm": 0.4442095458507538, |
|
"learning_rate": 1.1292919468045876e-06, |
|
"loss": 0.8052, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 0.49116918444633484, |
|
"learning_rate": 1.0404887703886252e-06, |
|
"loss": 0.8252, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.45165371894836426, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.7867, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.4509302079677582, |
|
"learning_rate": 8.7264051518613e-07, |
|
"loss": 0.9062, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.488, |
|
"grad_norm": 0.45219430327415466, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": 0.7844, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.39615580439567566, |
|
"learning_rate": 7.182548487420555e-07, |
|
"loss": 0.7702, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"grad_norm": 0.5083522796630859, |
|
"learning_rate": 6.462696144011149e-07, |
|
"loss": 0.9273, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.40505877137184143, |
|
"learning_rate": 5.778353450109286e-07, |
|
"loss": 0.8128, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.584, |
|
"grad_norm": 0.4423372447490692, |
|
"learning_rate": 5.130078565432089e-07, |
|
"loss": 0.7965, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 0.448724627494812, |
|
"learning_rate": 4.5184002322740784e-07, |
|
"loss": 0.7753, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.632, |
|
"grad_norm": 0.4123077690601349, |
|
"learning_rate": 3.9438173442575e-07, |
|
"loss": 0.8232, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 0.4331120550632477, |
|
"learning_rate": 3.406798539427386e-07, |
|
"loss": 0.8859, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.463981956243515, |
|
"learning_rate": 2.9077818180237693e-07, |
|
"loss": 0.81, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.4227185547351837, |
|
"learning_rate": 2.447174185242324e-07, |
|
"loss": 0.7866, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.7279999999999998, |
|
"grad_norm": 0.4594978392124176, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": 0.8428, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 0.4640582799911499, |
|
"learning_rate": 1.6426572649021477e-07, |
|
"loss": 0.8939, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.776, |
|
"grad_norm": 0.410780668258667, |
|
"learning_rate": 1.2994041528833267e-07, |
|
"loss": 0.7411, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.4083250164985657, |
|
"learning_rate": 9.958719453803278e-08, |
|
"loss": 0.8683, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.824, |
|
"grad_norm": 0.36987847089767456, |
|
"learning_rate": 7.32308207615351e-08, |
|
"loss": 0.7558, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 0.3830074965953827, |
|
"learning_rate": 5.089279059533658e-08, |
|
"loss": 0.8281, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.872, |
|
"grad_norm": 0.4141235947608948, |
|
"learning_rate": 3.25913232572489e-08, |
|
"loss": 0.8886, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.3976007103919983, |
|
"learning_rate": 1.834134568654333e-08, |
|
"loss": 0.7874, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.447281152009964, |
|
"learning_rate": 8.15448036932176e-09, |
|
"loss": 0.8983, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 0.3919880986213684, |
|
"learning_rate": 2.0390358590538507e-09, |
|
"loss": 0.6968, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.968, |
|
"grad_norm": 0.42730164527893066, |
|
"learning_rate": 0.0, |
|
"loss": 0.8298, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.968, |
|
"step": 123, |
|
"total_flos": 121724209528832.0, |
|
"train_loss": 0.9151467346563572, |
|
"train_runtime": 7021.3239, |
|
"train_samples_per_second": 1.709, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 123, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 121724209528832.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|