{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9907578558225508,
"eval_steps": 500,
"global_step": 67,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014787430683918669,
"grad_norm": 2.2905805110931396,
"learning_rate": 4.997252228714279e-05,
"loss": 1.5801,
"num_input_tokens_seen": 1572864,
"step": 1
},
{
"epoch": 0.029574861367837338,
"grad_norm": 1.8597325086593628,
"learning_rate": 4.9890149550547454e-05,
"loss": 1.3696,
"num_input_tokens_seen": 3145728,
"step": 2
},
{
"epoch": 0.04436229205175601,
"grad_norm": 1.7161543369293213,
"learning_rate": 4.9753062863366276e-05,
"loss": 1.2964,
"num_input_tokens_seen": 4718592,
"step": 3
},
{
"epoch": 0.059149722735674676,
"grad_norm": 1.556989073753357,
"learning_rate": 4.95615635718894e-05,
"loss": 1.2465,
"num_input_tokens_seen": 6291456,
"step": 4
},
{
"epoch": 0.07393715341959335,
"grad_norm": 1.2705891132354736,
"learning_rate": 4.931607263312032e-05,
"loss": 1.1284,
"num_input_tokens_seen": 7864320,
"step": 5
},
{
"epoch": 0.08872458410351201,
"grad_norm": 1.1708149909973145,
"learning_rate": 4.9017129689421e-05,
"loss": 1.0455,
"num_input_tokens_seen": 9437184,
"step": 6
},
{
"epoch": 0.10351201478743069,
"grad_norm": 1.0333069562911987,
"learning_rate": 4.8665391882260856e-05,
"loss": 0.985,
"num_input_tokens_seen": 11010048,
"step": 7
},
{
"epoch": 0.11829944547134935,
"grad_norm": 1.0009125471115112,
"learning_rate": 4.8261632407677174e-05,
"loss": 0.957,
"num_input_tokens_seen": 12582912,
"step": 8
},
{
"epoch": 0.133086876155268,
"grad_norm": 0.7598341107368469,
"learning_rate": 4.780673881662242e-05,
"loss": 0.9121,
"num_input_tokens_seen": 14155776,
"step": 9
},
{
"epoch": 0.1478743068391867,
"grad_norm": 0.366968035697937,
"learning_rate": 4.730171106393466e-05,
"loss": 0.8857,
"num_input_tokens_seen": 15728640,
"step": 10
},
{
"epoch": 0.16266173752310537,
"grad_norm": 0.2760382890701294,
"learning_rate": 4.674765931021976e-05,
"loss": 0.8871,
"num_input_tokens_seen": 17301504,
"step": 11
},
{
"epoch": 0.17744916820702403,
"grad_norm": 0.23743785917758942,
"learning_rate": 4.614580148147744e-05,
"loss": 0.8668,
"num_input_tokens_seen": 18874368,
"step": 12
},
{
"epoch": 0.1922365988909427,
"grad_norm": 0.21216906607151031,
"learning_rate": 4.5497460591835615e-05,
"loss": 0.8264,
"num_input_tokens_seen": 20447232,
"step": 13
},
{
"epoch": 0.20702402957486138,
"grad_norm": 0.202525332570076,
"learning_rate": 4.480406183527823e-05,
"loss": 0.8316,
"num_input_tokens_seen": 22020096,
"step": 14
},
{
"epoch": 0.22181146025878004,
"grad_norm": 0.1785624921321869,
"learning_rate": 4.406712945275955e-05,
"loss": 0.8214,
"num_input_tokens_seen": 23592960,
"step": 15
},
{
"epoch": 0.2365988909426987,
"grad_norm": 0.17254720628261566,
"learning_rate": 4.328828338159173e-05,
"loss": 0.8074,
"num_input_tokens_seen": 25165824,
"step": 16
},
{
"epoch": 0.2513863216266174,
"grad_norm": 0.15905120968818665,
"learning_rate": 4.2469235694471043e-05,
"loss": 0.7947,
"num_input_tokens_seen": 26738688,
"step": 17
},
{
"epoch": 0.266173752310536,
"grad_norm": 0.13924936950206757,
"learning_rate": 4.161178683597054e-05,
"loss": 0.7868,
"num_input_tokens_seen": 28311552,
"step": 18
},
{
"epoch": 0.2809611829944547,
"grad_norm": 0.12500160932540894,
"learning_rate": 4.071782166477213e-05,
"loss": 0.7657,
"num_input_tokens_seen": 29884416,
"step": 19
},
{
"epoch": 0.2957486136783734,
"grad_norm": 0.11662258207798004,
"learning_rate": 3.978930531033807e-05,
"loss": 0.7821,
"num_input_tokens_seen": 31457280,
"step": 20
},
{
"epoch": 0.31053604436229204,
"grad_norm": 0.10671637207269669,
"learning_rate": 3.882827885312999e-05,
"loss": 0.764,
"num_input_tokens_seen": 33030144,
"step": 21
},
{
"epoch": 0.32532347504621073,
"grad_norm": 0.10863461345434189,
"learning_rate": 3.783685483787105e-05,
"loss": 0.7781,
"num_input_tokens_seen": 34603008,
"step": 22
},
{
"epoch": 0.34011090573012936,
"grad_norm": 0.09509966522455215,
"learning_rate": 3.681721262971413e-05,
"loss": 0.7663,
"num_input_tokens_seen": 36175872,
"step": 23
},
{
"epoch": 0.35489833641404805,
"grad_norm": 0.08742663264274597,
"learning_rate": 3.5771593623524265e-05,
"loss": 0.7303,
"num_input_tokens_seen": 37748736,
"step": 24
},
{
"epoch": 0.36968576709796674,
"grad_norm": 0.09325291961431503,
"learning_rate": 3.4702296316806244e-05,
"loss": 0.7626,
"num_input_tokens_seen": 39321600,
"step": 25
},
{
"epoch": 0.3844731977818854,
"grad_norm": 0.08703341335058212,
"learning_rate": 3.361167125710832e-05,
"loss": 0.7709,
"num_input_tokens_seen": 40894464,
"step": 26
},
{
"epoch": 0.39926062846580407,
"grad_norm": 0.09138187021017075,
"learning_rate": 3.2502115875008524e-05,
"loss": 0.7638,
"num_input_tokens_seen": 42467328,
"step": 27
},
{
"epoch": 0.41404805914972276,
"grad_norm": 0.08183719217777252,
"learning_rate": 3.1376069214041913e-05,
"loss": 0.7561,
"num_input_tokens_seen": 44040192,
"step": 28
},
{
"epoch": 0.4288354898336414,
"grad_norm": 0.08267659693956375,
"learning_rate": 3.0236006569153617e-05,
"loss": 0.7372,
"num_input_tokens_seen": 45613056,
"step": 29
},
{
"epoch": 0.4436229205175601,
"grad_norm": 0.07969928532838821,
"learning_rate": 2.9084434045463255e-05,
"loss": 0.7281,
"num_input_tokens_seen": 47185920,
"step": 30
},
{
"epoch": 0.4584103512014787,
"grad_norm": 0.07706066220998764,
"learning_rate": 2.792388304930207e-05,
"loss": 0.7601,
"num_input_tokens_seen": 48758784,
"step": 31
},
{
"epoch": 0.4731977818853974,
"grad_norm": 0.07695123553276062,
"learning_rate": 2.6756904723632324e-05,
"loss": 0.7296,
"num_input_tokens_seen": 50331648,
"step": 32
},
{
"epoch": 0.4879852125693161,
"grad_norm": 0.07787525653839111,
"learning_rate": 2.5586064340081516e-05,
"loss": 0.741,
"num_input_tokens_seen": 51904512,
"step": 33
},
{
"epoch": 0.5027726432532348,
"grad_norm": 0.08225582540035248,
"learning_rate": 2.441393565991849e-05,
"loss": 0.7251,
"num_input_tokens_seen": 53477376,
"step": 34
},
{
"epoch": 0.5175600739371534,
"grad_norm": 0.0731961578130722,
"learning_rate": 2.3243095276367685e-05,
"loss": 0.7385,
"num_input_tokens_seen": 55050240,
"step": 35
},
{
"epoch": 0.532347504621072,
"grad_norm": 0.08208758383989334,
"learning_rate": 2.207611695069794e-05,
"loss": 0.746,
"num_input_tokens_seen": 56623104,
"step": 36
},
{
"epoch": 0.5471349353049908,
"grad_norm": 0.07385499030351639,
"learning_rate": 2.0915565954536744e-05,
"loss": 0.7315,
"num_input_tokens_seen": 58195968,
"step": 37
},
{
"epoch": 0.5619223659889094,
"grad_norm": 0.06937970221042633,
"learning_rate": 1.9763993430846395e-05,
"loss": 0.7267,
"num_input_tokens_seen": 59768832,
"step": 38
},
{
"epoch": 0.5767097966728281,
"grad_norm": 0.0702456384897232,
"learning_rate": 1.8623930785958092e-05,
"loss": 0.7443,
"num_input_tokens_seen": 61341696,
"step": 39
},
{
"epoch": 0.5914972273567468,
"grad_norm": 0.0659838542342186,
"learning_rate": 1.749788412499149e-05,
"loss": 0.7163,
"num_input_tokens_seen": 62914560,
"step": 40
},
{
"epoch": 0.6062846580406654,
"grad_norm": 0.07357968389987946,
"learning_rate": 1.638832874289168e-05,
"loss": 0.73,
"num_input_tokens_seen": 64487424,
"step": 41
},
{
"epoch": 0.6210720887245841,
"grad_norm": 0.0689447820186615,
"learning_rate": 1.5297703683193752e-05,
"loss": 0.723,
"num_input_tokens_seen": 66060288,
"step": 42
},
{
"epoch": 0.6358595194085028,
"grad_norm": 0.0671798512339592,
"learning_rate": 1.4228406376475742e-05,
"loss": 0.7176,
"num_input_tokens_seen": 67633152,
"step": 43
},
{
"epoch": 0.6506469500924215,
"grad_norm": 0.06556376069784164,
"learning_rate": 1.3182787370285865e-05,
"loss": 0.7066,
"num_input_tokens_seen": 69206016,
"step": 44
},
{
"epoch": 0.6654343807763401,
"grad_norm": 0.07168299704790115,
"learning_rate": 1.2163145162128947e-05,
"loss": 0.7244,
"num_input_tokens_seen": 70778880,
"step": 45
},
{
"epoch": 0.6802218114602587,
"grad_norm": 0.06598961353302002,
"learning_rate": 1.1171721146870015e-05,
"loss": 0.7349,
"num_input_tokens_seen": 72351744,
"step": 46
},
{
"epoch": 0.6950092421441775,
"grad_norm": 0.06602618843317032,
"learning_rate": 1.021069468966194e-05,
"loss": 0.7487,
"num_input_tokens_seen": 73924608,
"step": 47
},
{
"epoch": 0.7097966728280961,
"grad_norm": 0.0713997408747673,
"learning_rate": 9.282178335227884e-06,
"loss": 0.7381,
"num_input_tokens_seen": 75497472,
"step": 48
},
{
"epoch": 0.7245841035120147,
"grad_norm": 0.06555724889039993,
"learning_rate": 8.38821316402946e-06,
"loss": 0.7262,
"num_input_tokens_seen": 77070336,
"step": 49
},
{
"epoch": 0.7393715341959335,
"grad_norm": 0.06813663244247437,
"learning_rate": 7.530764305528959e-06,
"loss": 0.7473,
"num_input_tokens_seen": 78643200,
"step": 50
},
{
"epoch": 0.7541589648798521,
"grad_norm": 0.06930514425039291,
"learning_rate": 6.711716618408281e-06,
"loss": 0.6998,
"num_input_tokens_seen": 80216064,
"step": 51
},
{
"epoch": 0.7689463955637708,
"grad_norm": 0.06492163240909576,
"learning_rate": 5.932870547240454e-06,
"loss": 0.7218,
"num_input_tokens_seen": 81788928,
"step": 52
},
{
"epoch": 0.7837338262476895,
"grad_norm": 0.07155918329954147,
"learning_rate": 5.1959381647217666e-06,
"loss": 0.7314,
"num_input_tokens_seen": 83361792,
"step": 53
},
{
"epoch": 0.7985212569316081,
"grad_norm": 0.06532897800207138,
"learning_rate": 4.502539408164386e-06,
"loss": 0.7028,
"num_input_tokens_seen": 84934656,
"step": 54
},
{
"epoch": 0.8133086876155268,
"grad_norm": 0.06727246940135956,
"learning_rate": 3.8541985185225645e-06,
"loss": 0.7084,
"num_input_tokens_seen": 86507520,
"step": 55
},
{
"epoch": 0.8280961182994455,
"grad_norm": 0.06698304414749146,
"learning_rate": 3.252340689780245e-06,
"loss": 0.7223,
"num_input_tokens_seen": 88080384,
"step": 56
},
{
"epoch": 0.8428835489833642,
"grad_norm": 0.06450291723012924,
"learning_rate": 2.6982889360653377e-06,
"loss": 0.7195,
"num_input_tokens_seen": 89653248,
"step": 57
},
{
"epoch": 0.8576709796672828,
"grad_norm": 0.06992805004119873,
"learning_rate": 2.1932611833775846e-06,
"loss": 0.7431,
"num_input_tokens_seen": 91226112,
"step": 58
},
{
"epoch": 0.8724584103512015,
"grad_norm": 0.06958083808422089,
"learning_rate": 1.738367592322837e-06,
"loss": 0.732,
"num_input_tokens_seen": 92798976,
"step": 59
},
{
"epoch": 0.8872458410351202,
"grad_norm": 0.0694640502333641,
"learning_rate": 1.3346081177391472e-06,
"loss": 0.7302,
"num_input_tokens_seen": 94371840,
"step": 60
},
{
"epoch": 0.9020332717190388,
"grad_norm": 0.07005713880062103,
"learning_rate": 9.828703105789983e-07,
"loss": 0.7197,
"num_input_tokens_seen": 95944704,
"step": 61
},
{
"epoch": 0.9168207024029574,
"grad_norm": 0.07030840963125229,
"learning_rate": 6.839273668796747e-07,
"loss": 0.7203,
"num_input_tokens_seen": 97517568,
"step": 62
},
{
"epoch": 0.9316081330868762,
"grad_norm": 0.0708225816488266,
"learning_rate": 4.3843642811059737e-07,
"loss": 0.7474,
"num_input_tokens_seen": 99090432,
"step": 63
},
{
"epoch": 0.9463955637707948,
"grad_norm": 0.0676749050617218,
"learning_rate": 2.4693713663372644e-07,
"loss": 0.7403,
"num_input_tokens_seen": 100663296,
"step": 64
},
{
"epoch": 0.9611829944547134,
"grad_norm": 0.06782912462949753,
"learning_rate": 1.0985044945254764e-07,
"loss": 0.7327,
"num_input_tokens_seen": 102236160,
"step": 65
},
{
"epoch": 0.9759704251386322,
"grad_norm": 0.06357243657112122,
"learning_rate": 2.7477712857215677e-08,
"loss": 0.725,
"num_input_tokens_seen": 103809024,
"step": 66
},
{
"epoch": 0.9907578558225508,
"grad_norm": 0.07199209183454514,
"learning_rate": 0.0,
"loss": 0.7209,
"num_input_tokens_seen": 105381888,
"step": 67
},
{
"epoch": 0.9907578558225508,
"num_input_tokens_seen": 105381888,
"step": 67,
"total_flos": 4.104162098269913e+18,
"train_loss": 0.8075656152483243,
"train_runtime": 10309.5741,
"train_samples_per_second": 2.518,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 67,
"num_input_tokens_seen": 105381888,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.104162098269913e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}