|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9907578558225508, |
|
"eval_steps": 500, |
|
"global_step": 67, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014787430683918669, |
|
"grad_norm": 2.2905805110931396, |
|
"learning_rate": 4.997252228714279e-05, |
|
"loss": 1.5801, |
|
"num_input_tokens_seen": 1572864, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.029574861367837338, |
|
"grad_norm": 1.8597325086593628, |
|
"learning_rate": 4.9890149550547454e-05, |
|
"loss": 1.3696, |
|
"num_input_tokens_seen": 3145728, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04436229205175601, |
|
"grad_norm": 1.7161543369293213, |
|
"learning_rate": 4.9753062863366276e-05, |
|
"loss": 1.2964, |
|
"num_input_tokens_seen": 4718592, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.059149722735674676, |
|
"grad_norm": 1.556989073753357, |
|
"learning_rate": 4.95615635718894e-05, |
|
"loss": 1.2465, |
|
"num_input_tokens_seen": 6291456, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07393715341959335, |
|
"grad_norm": 1.2705891132354736, |
|
"learning_rate": 4.931607263312032e-05, |
|
"loss": 1.1284, |
|
"num_input_tokens_seen": 7864320, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08872458410351201, |
|
"grad_norm": 1.1708149909973145, |
|
"learning_rate": 4.9017129689421e-05, |
|
"loss": 1.0455, |
|
"num_input_tokens_seen": 9437184, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.10351201478743069, |
|
"grad_norm": 1.0333069562911987, |
|
"learning_rate": 4.8665391882260856e-05, |
|
"loss": 0.985, |
|
"num_input_tokens_seen": 11010048, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.11829944547134935, |
|
"grad_norm": 1.0009125471115112, |
|
"learning_rate": 4.8261632407677174e-05, |
|
"loss": 0.957, |
|
"num_input_tokens_seen": 12582912, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.133086876155268, |
|
"grad_norm": 0.7598341107368469, |
|
"learning_rate": 4.780673881662242e-05, |
|
"loss": 0.9121, |
|
"num_input_tokens_seen": 14155776, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1478743068391867, |
|
"grad_norm": 0.366968035697937, |
|
"learning_rate": 4.730171106393466e-05, |
|
"loss": 0.8857, |
|
"num_input_tokens_seen": 15728640, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16266173752310537, |
|
"grad_norm": 0.2760382890701294, |
|
"learning_rate": 4.674765931021976e-05, |
|
"loss": 0.8871, |
|
"num_input_tokens_seen": 17301504, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.17744916820702403, |
|
"grad_norm": 0.23743785917758942, |
|
"learning_rate": 4.614580148147744e-05, |
|
"loss": 0.8668, |
|
"num_input_tokens_seen": 18874368, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1922365988909427, |
|
"grad_norm": 0.21216906607151031, |
|
"learning_rate": 4.5497460591835615e-05, |
|
"loss": 0.8264, |
|
"num_input_tokens_seen": 20447232, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.20702402957486138, |
|
"grad_norm": 0.202525332570076, |
|
"learning_rate": 4.480406183527823e-05, |
|
"loss": 0.8316, |
|
"num_input_tokens_seen": 22020096, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.22181146025878004, |
|
"grad_norm": 0.1785624921321869, |
|
"learning_rate": 4.406712945275955e-05, |
|
"loss": 0.8214, |
|
"num_input_tokens_seen": 23592960, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.2365988909426987, |
|
"grad_norm": 0.17254720628261566, |
|
"learning_rate": 4.328828338159173e-05, |
|
"loss": 0.8074, |
|
"num_input_tokens_seen": 25165824, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.2513863216266174, |
|
"grad_norm": 0.15905120968818665, |
|
"learning_rate": 4.2469235694471043e-05, |
|
"loss": 0.7947, |
|
"num_input_tokens_seen": 26738688, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.266173752310536, |
|
"grad_norm": 0.13924936950206757, |
|
"learning_rate": 4.161178683597054e-05, |
|
"loss": 0.7868, |
|
"num_input_tokens_seen": 28311552, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2809611829944547, |
|
"grad_norm": 0.12500160932540894, |
|
"learning_rate": 4.071782166477213e-05, |
|
"loss": 0.7657, |
|
"num_input_tokens_seen": 29884416, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.2957486136783734, |
|
"grad_norm": 0.11662258207798004, |
|
"learning_rate": 3.978930531033807e-05, |
|
"loss": 0.7821, |
|
"num_input_tokens_seen": 31457280, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.31053604436229204, |
|
"grad_norm": 0.10671637207269669, |
|
"learning_rate": 3.882827885312999e-05, |
|
"loss": 0.764, |
|
"num_input_tokens_seen": 33030144, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.32532347504621073, |
|
"grad_norm": 0.10863461345434189, |
|
"learning_rate": 3.783685483787105e-05, |
|
"loss": 0.7781, |
|
"num_input_tokens_seen": 34603008, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.34011090573012936, |
|
"grad_norm": 0.09509966522455215, |
|
"learning_rate": 3.681721262971413e-05, |
|
"loss": 0.7663, |
|
"num_input_tokens_seen": 36175872, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.35489833641404805, |
|
"grad_norm": 0.08742663264274597, |
|
"learning_rate": 3.5771593623524265e-05, |
|
"loss": 0.7303, |
|
"num_input_tokens_seen": 37748736, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.36968576709796674, |
|
"grad_norm": 0.09325291961431503, |
|
"learning_rate": 3.4702296316806244e-05, |
|
"loss": 0.7626, |
|
"num_input_tokens_seen": 39321600, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.3844731977818854, |
|
"grad_norm": 0.08703341335058212, |
|
"learning_rate": 3.361167125710832e-05, |
|
"loss": 0.7709, |
|
"num_input_tokens_seen": 40894464, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.39926062846580407, |
|
"grad_norm": 0.09138187021017075, |
|
"learning_rate": 3.2502115875008524e-05, |
|
"loss": 0.7638, |
|
"num_input_tokens_seen": 42467328, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.41404805914972276, |
|
"grad_norm": 0.08183719217777252, |
|
"learning_rate": 3.1376069214041913e-05, |
|
"loss": 0.7561, |
|
"num_input_tokens_seen": 44040192, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.4288354898336414, |
|
"grad_norm": 0.08267659693956375, |
|
"learning_rate": 3.0236006569153617e-05, |
|
"loss": 0.7372, |
|
"num_input_tokens_seen": 45613056, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4436229205175601, |
|
"grad_norm": 0.07969928532838821, |
|
"learning_rate": 2.9084434045463255e-05, |
|
"loss": 0.7281, |
|
"num_input_tokens_seen": 47185920, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4584103512014787, |
|
"grad_norm": 0.07706066220998764, |
|
"learning_rate": 2.792388304930207e-05, |
|
"loss": 0.7601, |
|
"num_input_tokens_seen": 48758784, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.4731977818853974, |
|
"grad_norm": 0.07695123553276062, |
|
"learning_rate": 2.6756904723632324e-05, |
|
"loss": 0.7296, |
|
"num_input_tokens_seen": 50331648, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.4879852125693161, |
|
"grad_norm": 0.07787525653839111, |
|
"learning_rate": 2.5586064340081516e-05, |
|
"loss": 0.741, |
|
"num_input_tokens_seen": 51904512, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5027726432532348, |
|
"grad_norm": 0.08225582540035248, |
|
"learning_rate": 2.441393565991849e-05, |
|
"loss": 0.7251, |
|
"num_input_tokens_seen": 53477376, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5175600739371534, |
|
"grad_norm": 0.0731961578130722, |
|
"learning_rate": 2.3243095276367685e-05, |
|
"loss": 0.7385, |
|
"num_input_tokens_seen": 55050240, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.532347504621072, |
|
"grad_norm": 0.08208758383989334, |
|
"learning_rate": 2.207611695069794e-05, |
|
"loss": 0.746, |
|
"num_input_tokens_seen": 56623104, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5471349353049908, |
|
"grad_norm": 0.07385499030351639, |
|
"learning_rate": 2.0915565954536744e-05, |
|
"loss": 0.7315, |
|
"num_input_tokens_seen": 58195968, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.5619223659889094, |
|
"grad_norm": 0.06937970221042633, |
|
"learning_rate": 1.9763993430846395e-05, |
|
"loss": 0.7267, |
|
"num_input_tokens_seen": 59768832, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.5767097966728281, |
|
"grad_norm": 0.0702456384897232, |
|
"learning_rate": 1.8623930785958092e-05, |
|
"loss": 0.7443, |
|
"num_input_tokens_seen": 61341696, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.5914972273567468, |
|
"grad_norm": 0.0659838542342186, |
|
"learning_rate": 1.749788412499149e-05, |
|
"loss": 0.7163, |
|
"num_input_tokens_seen": 62914560, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6062846580406654, |
|
"grad_norm": 0.07357968389987946, |
|
"learning_rate": 1.638832874289168e-05, |
|
"loss": 0.73, |
|
"num_input_tokens_seen": 64487424, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6210720887245841, |
|
"grad_norm": 0.0689447820186615, |
|
"learning_rate": 1.5297703683193752e-05, |
|
"loss": 0.723, |
|
"num_input_tokens_seen": 66060288, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6358595194085028, |
|
"grad_norm": 0.0671798512339592, |
|
"learning_rate": 1.4228406376475742e-05, |
|
"loss": 0.7176, |
|
"num_input_tokens_seen": 67633152, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6506469500924215, |
|
"grad_norm": 0.06556376069784164, |
|
"learning_rate": 1.3182787370285865e-05, |
|
"loss": 0.7066, |
|
"num_input_tokens_seen": 69206016, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.6654343807763401, |
|
"grad_norm": 0.07168299704790115, |
|
"learning_rate": 1.2163145162128947e-05, |
|
"loss": 0.7244, |
|
"num_input_tokens_seen": 70778880, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.6802218114602587, |
|
"grad_norm": 0.06598961353302002, |
|
"learning_rate": 1.1171721146870015e-05, |
|
"loss": 0.7349, |
|
"num_input_tokens_seen": 72351744, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.6950092421441775, |
|
"grad_norm": 0.06602618843317032, |
|
"learning_rate": 1.021069468966194e-05, |
|
"loss": 0.7487, |
|
"num_input_tokens_seen": 73924608, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.7097966728280961, |
|
"grad_norm": 0.0713997408747673, |
|
"learning_rate": 9.282178335227884e-06, |
|
"loss": 0.7381, |
|
"num_input_tokens_seen": 75497472, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7245841035120147, |
|
"grad_norm": 0.06555724889039993, |
|
"learning_rate": 8.38821316402946e-06, |
|
"loss": 0.7262, |
|
"num_input_tokens_seen": 77070336, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7393715341959335, |
|
"grad_norm": 0.06813663244247437, |
|
"learning_rate": 7.530764305528959e-06, |
|
"loss": 0.7473, |
|
"num_input_tokens_seen": 78643200, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7541589648798521, |
|
"grad_norm": 0.06930514425039291, |
|
"learning_rate": 6.711716618408281e-06, |
|
"loss": 0.6998, |
|
"num_input_tokens_seen": 80216064, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.7689463955637708, |
|
"grad_norm": 0.06492163240909576, |
|
"learning_rate": 5.932870547240454e-06, |
|
"loss": 0.7218, |
|
"num_input_tokens_seen": 81788928, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.7837338262476895, |
|
"grad_norm": 0.07155918329954147, |
|
"learning_rate": 5.1959381647217666e-06, |
|
"loss": 0.7314, |
|
"num_input_tokens_seen": 83361792, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.7985212569316081, |
|
"grad_norm": 0.06532897800207138, |
|
"learning_rate": 4.502539408164386e-06, |
|
"loss": 0.7028, |
|
"num_input_tokens_seen": 84934656, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8133086876155268, |
|
"grad_norm": 0.06727246940135956, |
|
"learning_rate": 3.8541985185225645e-06, |
|
"loss": 0.7084, |
|
"num_input_tokens_seen": 86507520, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8280961182994455, |
|
"grad_norm": 0.06698304414749146, |
|
"learning_rate": 3.252340689780245e-06, |
|
"loss": 0.7223, |
|
"num_input_tokens_seen": 88080384, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.8428835489833642, |
|
"grad_norm": 0.06450291723012924, |
|
"learning_rate": 2.6982889360653377e-06, |
|
"loss": 0.7195, |
|
"num_input_tokens_seen": 89653248, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.8576709796672828, |
|
"grad_norm": 0.06992805004119873, |
|
"learning_rate": 2.1932611833775846e-06, |
|
"loss": 0.7431, |
|
"num_input_tokens_seen": 91226112, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.8724584103512015, |
|
"grad_norm": 0.06958083808422089, |
|
"learning_rate": 1.738367592322837e-06, |
|
"loss": 0.732, |
|
"num_input_tokens_seen": 92798976, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.8872458410351202, |
|
"grad_norm": 0.0694640502333641, |
|
"learning_rate": 1.3346081177391472e-06, |
|
"loss": 0.7302, |
|
"num_input_tokens_seen": 94371840, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9020332717190388, |
|
"grad_norm": 0.07005713880062103, |
|
"learning_rate": 9.828703105789983e-07, |
|
"loss": 0.7197, |
|
"num_input_tokens_seen": 95944704, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9168207024029574, |
|
"grad_norm": 0.07030840963125229, |
|
"learning_rate": 6.839273668796747e-07, |
|
"loss": 0.7203, |
|
"num_input_tokens_seen": 97517568, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9316081330868762, |
|
"grad_norm": 0.0708225816488266, |
|
"learning_rate": 4.3843642811059737e-07, |
|
"loss": 0.7474, |
|
"num_input_tokens_seen": 99090432, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.9463955637707948, |
|
"grad_norm": 0.0676749050617218, |
|
"learning_rate": 2.4693713663372644e-07, |
|
"loss": 0.7403, |
|
"num_input_tokens_seen": 100663296, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.9611829944547134, |
|
"grad_norm": 0.06782912462949753, |
|
"learning_rate": 1.0985044945254764e-07, |
|
"loss": 0.7327, |
|
"num_input_tokens_seen": 102236160, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9759704251386322, |
|
"grad_norm": 0.06357243657112122, |
|
"learning_rate": 2.7477712857215677e-08, |
|
"loss": 0.725, |
|
"num_input_tokens_seen": 103809024, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.9907578558225508, |
|
"grad_norm": 0.07199209183454514, |
|
"learning_rate": 0.0, |
|
"loss": 0.7209, |
|
"num_input_tokens_seen": 105381888, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.9907578558225508, |
|
"num_input_tokens_seen": 105381888, |
|
"step": 67, |
|
"total_flos": 4.104162098269913e+18, |
|
"train_loss": 0.8075656152483243, |
|
"train_runtime": 10309.5741, |
|
"train_samples_per_second": 2.518, |
|
"train_steps_per_second": 0.006 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 67, |
|
"num_input_tokens_seen": 105381888, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.104162098269913e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|