|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.8938906752411575, |
|
"eval_steps": 500, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03858520900321544, |
|
"grad_norm": 6.827114582061768, |
|
"learning_rate": 1.25e-06, |
|
"loss": 1.2682, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.07717041800643087, |
|
"grad_norm": 6.703758239746094, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.2274, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1157556270096463, |
|
"grad_norm": 6.74079704284668, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 1.2669, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.15434083601286175, |
|
"grad_norm": 5.965632438659668, |
|
"learning_rate": 5e-06, |
|
"loss": 1.2216, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.19292604501607716, |
|
"grad_norm": 4.468871593475342, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.1837, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2315112540192926, |
|
"grad_norm": 2.849684715270996, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 1.1514, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.27009646302250806, |
|
"grad_norm": 5.281802177429199, |
|
"learning_rate": 8.750000000000001e-06, |
|
"loss": 1.1846, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.3086816720257235, |
|
"grad_norm": 5.24518346786499, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1349, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.34726688102893893, |
|
"grad_norm": 4.561065673828125, |
|
"learning_rate": 9.994504457428557e-06, |
|
"loss": 1.0946, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3858520900321543, |
|
"grad_norm": 4.529706001281738, |
|
"learning_rate": 9.978029910109491e-06, |
|
"loss": 1.0995, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.42443729903536975, |
|
"grad_norm": 3.045126438140869, |
|
"learning_rate": 9.950612572673255e-06, |
|
"loss": 1.0659, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.4630225080385852, |
|
"grad_norm": 2.239044666290283, |
|
"learning_rate": 9.91231271437788e-06, |
|
"loss": 1.018, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5016077170418006, |
|
"grad_norm": 2.176582098007202, |
|
"learning_rate": 9.863214526624065e-06, |
|
"loss": 1.0193, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.5401929260450161, |
|
"grad_norm": 1.5574040412902832, |
|
"learning_rate": 9.803425937884202e-06, |
|
"loss": 0.9824, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.5787781350482315, |
|
"grad_norm": 1.3071755170822144, |
|
"learning_rate": 9.733078376452172e-06, |
|
"loss": 0.9843, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.617363344051447, |
|
"grad_norm": 1.1668739318847656, |
|
"learning_rate": 9.652326481535434e-06, |
|
"loss": 0.9856, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.6559485530546624, |
|
"grad_norm": 1.3676371574401855, |
|
"learning_rate": 9.561347763324484e-06, |
|
"loss": 0.9434, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.6945337620578779, |
|
"grad_norm": 1.3451117277145386, |
|
"learning_rate": 9.460342212786933e-06, |
|
"loss": 0.9642, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.7331189710610932, |
|
"grad_norm": 0.9681344032287598, |
|
"learning_rate": 9.349531862043952e-06, |
|
"loss": 0.9553, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.7717041800643086, |
|
"grad_norm": 1.0520753860473633, |
|
"learning_rate": 9.229160296295488e-06, |
|
"loss": 0.9128, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.8102893890675241, |
|
"grad_norm": 0.9027125239372253, |
|
"learning_rate": 9.099492118367123e-06, |
|
"loss": 0.9291, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.8488745980707395, |
|
"grad_norm": 0.8002101182937622, |
|
"learning_rate": 8.960812367055646e-06, |
|
"loss": 0.9195, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.887459807073955, |
|
"grad_norm": 0.787247896194458, |
|
"learning_rate": 8.81342589055191e-06, |
|
"loss": 0.9461, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.9260450160771704, |
|
"grad_norm": 0.7884970903396606, |
|
"learning_rate": 8.657656676318346e-06, |
|
"loss": 0.9381, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.9646302250803859, |
|
"grad_norm": 0.6913259625434875, |
|
"learning_rate": 8.49384713889421e-06, |
|
"loss": 0.8714, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.0032154340836013, |
|
"grad_norm": 0.7232531309127808, |
|
"learning_rate": 8.32235736719411e-06, |
|
"loss": 0.9519, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.0418006430868167, |
|
"grad_norm": 0.7492582201957703, |
|
"learning_rate": 8.143564332954426e-06, |
|
"loss": 0.8788, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.0803858520900322, |
|
"grad_norm": 0.6388415694236755, |
|
"learning_rate": 7.957861062067614e-06, |
|
"loss": 0.9166, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.1189710610932475, |
|
"grad_norm": 0.6874092817306519, |
|
"learning_rate": 7.765655770625997e-06, |
|
"loss": 0.8655, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.157556270096463, |
|
"grad_norm": 0.6590151190757751, |
|
"learning_rate": 7.56737096757421e-06, |
|
"loss": 0.8655, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.1961414790996785, |
|
"grad_norm": 0.6995782852172852, |
|
"learning_rate": 7.363442525942827e-06, |
|
"loss": 0.8524, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.234726688102894, |
|
"grad_norm": 0.8694888949394226, |
|
"learning_rate": 7.1543187247048525e-06, |
|
"loss": 0.9178, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.2733118971061093, |
|
"grad_norm": 0.72708660364151, |
|
"learning_rate": 6.9404592633612486e-06, |
|
"loss": 0.8379, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.3118971061093248, |
|
"grad_norm": 0.6599162220954895, |
|
"learning_rate": 6.722334251421665e-06, |
|
"loss": 0.9048, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.3504823151125402, |
|
"grad_norm": 0.7191936373710632, |
|
"learning_rate": 6.500423175001705e-06, |
|
"loss": 0.8702, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.3890675241157555, |
|
"grad_norm": 0.7617672085762024, |
|
"learning_rate": 6.275213842808383e-06, |
|
"loss": 0.875, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.427652733118971, |
|
"grad_norm": 0.5834156274795532, |
|
"learning_rate": 6.047201313830724e-06, |
|
"loss": 0.855, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.4662379421221865, |
|
"grad_norm": 0.6282949447631836, |
|
"learning_rate": 5.816886809092651e-06, |
|
"loss": 0.8703, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.504823151125402, |
|
"grad_norm": 0.6048595905303955, |
|
"learning_rate": 5.584776609860414e-06, |
|
"loss": 0.8615, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.5434083601286175, |
|
"grad_norm": 0.5948866605758667, |
|
"learning_rate": 5.351380944726465e-06, |
|
"loss": 0.8744, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.5819935691318328, |
|
"grad_norm": 0.5789873600006104, |
|
"learning_rate": 5.117212868016303e-06, |
|
"loss": 0.8056, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.6205787781350482, |
|
"grad_norm": 0.5074349045753479, |
|
"learning_rate": 4.882787131983698e-06, |
|
"loss": 0.8465, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.6591639871382635, |
|
"grad_norm": 0.5655978322029114, |
|
"learning_rate": 4.6486190552735375e-06, |
|
"loss": 0.9188, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.697749196141479, |
|
"grad_norm": 0.5455473065376282, |
|
"learning_rate": 4.415223390139588e-06, |
|
"loss": 0.8093, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.7363344051446945, |
|
"grad_norm": 0.6158671379089355, |
|
"learning_rate": 4.183113190907349e-06, |
|
"loss": 0.9393, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.77491961414791, |
|
"grad_norm": 0.5077454447746277, |
|
"learning_rate": 3.952798686169279e-06, |
|
"loss": 0.8114, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.8135048231511255, |
|
"grad_norm": 0.47759971022605896, |
|
"learning_rate": 3.7247861571916183e-06, |
|
"loss": 0.8585, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.852090032154341, |
|
"grad_norm": 0.4739689230918884, |
|
"learning_rate": 3.4995768249982975e-06, |
|
"loss": 0.8259, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.8906752411575563, |
|
"grad_norm": 0.5395683646202087, |
|
"learning_rate": 3.2776657485783357e-06, |
|
"loss": 0.8673, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.9292604501607717, |
|
"grad_norm": 0.5374310612678528, |
|
"learning_rate": 3.059540736638751e-06, |
|
"loss": 0.7906, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.967845659163987, |
|
"grad_norm": 0.4914711117744446, |
|
"learning_rate": 2.8456812752951483e-06, |
|
"loss": 0.9054, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.0064308681672025, |
|
"grad_norm": 0.521967887878418, |
|
"learning_rate": 2.636557474057173e-06, |
|
"loss": 0.8763, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.045016077170418, |
|
"grad_norm": 0.4540090262889862, |
|
"learning_rate": 2.4326290324257896e-06, |
|
"loss": 0.8473, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.0836012861736335, |
|
"grad_norm": 0.4512139856815338, |
|
"learning_rate": 2.234344229374003e-06, |
|
"loss": 0.7919, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.122186495176849, |
|
"grad_norm": 0.5352345705032349, |
|
"learning_rate": 2.042138937932388e-06, |
|
"loss": 0.8196, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.1607717041800645, |
|
"grad_norm": 0.44427061080932617, |
|
"learning_rate": 1.856435667045577e-06, |
|
"loss": 0.7881, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.19935691318328, |
|
"grad_norm": 0.5240706205368042, |
|
"learning_rate": 1.677642632805892e-06, |
|
"loss": 0.8631, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.237942122186495, |
|
"grad_norm": 0.5195144414901733, |
|
"learning_rate": 1.5061528611057917e-06, |
|
"loss": 0.7746, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.2765273311897105, |
|
"grad_norm": 0.43310004472732544, |
|
"learning_rate": 1.3423433236816563e-06, |
|
"loss": 0.8119, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.315112540192926, |
|
"grad_norm": 0.4922521710395813, |
|
"learning_rate": 1.186574109448091e-06, |
|
"loss": 0.8375, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.3536977491961415, |
|
"grad_norm": 0.4353758692741394, |
|
"learning_rate": 1.0391876329443534e-06, |
|
"loss": 0.7619, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.392282958199357, |
|
"grad_norm": 0.4733580946922302, |
|
"learning_rate": 9.005078816328772e-07, |
|
"loss": 0.8132, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.4308681672025725, |
|
"grad_norm": 0.43942683935165405, |
|
"learning_rate": 7.708397037045129e-07, |
|
"loss": 0.7784, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.469453376205788, |
|
"grad_norm": 0.46390509605407715, |
|
"learning_rate": 6.50468137956049e-07, |
|
"loss": 0.8832, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.508038585209003, |
|
"grad_norm": 0.44283559918403625, |
|
"learning_rate": 5.396577872130676e-07, |
|
"loss": 0.8102, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.5466237942122185, |
|
"grad_norm": 0.4312247037887573, |
|
"learning_rate": 4.386522366755169e-07, |
|
"loss": 0.7881, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.585209003215434, |
|
"grad_norm": 0.46290892362594604, |
|
"learning_rate": 3.4767351846456744e-07, |
|
"loss": 0.8807, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.6237942122186495, |
|
"grad_norm": 0.44196000695228577, |
|
"learning_rate": 2.669216235478295e-07, |
|
"loss": 0.7756, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.662379421221865, |
|
"grad_norm": 0.47514083981513977, |
|
"learning_rate": 1.9657406211579966e-07, |
|
"loss": 0.8685, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.7009646302250805, |
|
"grad_norm": 0.4315619468688965, |
|
"learning_rate": 1.3678547337593494e-07, |
|
"loss": 0.7501, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.739549839228296, |
|
"grad_norm": 0.4595395624637604, |
|
"learning_rate": 8.768728562211948e-08, |
|
"loss": 0.8121, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.778135048231511, |
|
"grad_norm": 0.4295967221260071, |
|
"learning_rate": 4.9387427326745287e-08, |
|
"loss": 0.7776, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.816720257234727, |
|
"grad_norm": 0.44801920652389526, |
|
"learning_rate": 2.1970089890509527e-08, |
|
"loss": 0.8149, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.855305466237942, |
|
"grad_norm": 0.4446917474269867, |
|
"learning_rate": 5.495542571443135e-09, |
|
"loss": 0.8291, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.8938906752411575, |
|
"grad_norm": 0.4660709500312805, |
|
"learning_rate": 0.0, |
|
"loss": 0.8268, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.8938906752411575, |
|
"step": 75, |
|
"total_flos": 66441907232768.0, |
|
"train_loss": 0.9123331260681152, |
|
"train_runtime": 3973.9028, |
|
"train_samples_per_second": 1.875, |
|
"train_steps_per_second": 0.019 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 75, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 66441907232768.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|