|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.976, |
|
"eval_steps": 500, |
|
"global_step": 93, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 6.902911186218262, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.274, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 6.865423202514648, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.2584, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 6.873320579528809, |
|
"learning_rate": 3e-06, |
|
"loss": 1.2997, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 6.20452880859375, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.2787, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.477367401123047, |
|
"learning_rate": 5e-06, |
|
"loss": 1.1893, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 2.854663133621216, |
|
"learning_rate": 6e-06, |
|
"loss": 1.1767, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 2.8232390880584717, |
|
"learning_rate": 7e-06, |
|
"loss": 1.1806, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 5.816140651702881, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.158, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 5.778530597686768, |
|
"learning_rate": 9e-06, |
|
"loss": 1.1352, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.842033863067627, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1387, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 5.925730228424072, |
|
"learning_rate": 9.996418774081658e-06, |
|
"loss": 1.1677, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 3.952319383621216, |
|
"learning_rate": 9.985680226398261e-06, |
|
"loss": 1.1415, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 2.6014835834503174, |
|
"learning_rate": 9.967799739815925e-06, |
|
"loss": 1.0675, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 2.5148086547851562, |
|
"learning_rate": 9.942802927959444e-06, |
|
"loss": 1.0578, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.052213430404663, |
|
"learning_rate": 9.910725598521014e-06, |
|
"loss": 1.0096, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 1.4976890087127686, |
|
"learning_rate": 9.871613701966067e-06, |
|
"loss": 1.0094, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.1822410821914673, |
|
"learning_rate": 9.825523265709667e-06, |
|
"loss": 1.0001, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 1.2348859310150146, |
|
"learning_rate": 9.772520313857777e-06, |
|
"loss": 0.9683, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 1.1384820938110352, |
|
"learning_rate": 9.712680772628365e-06, |
|
"loss": 0.983, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.017440915107727, |
|
"learning_rate": 9.646090361587828e-06, |
|
"loss": 0.97, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.9060385227203369, |
|
"learning_rate": 9.572844470858537e-06, |
|
"loss": 0.9922, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.9758835434913635, |
|
"learning_rate": 9.493048024473413e-06, |
|
"loss": 0.9464, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 1.1663302183151245, |
|
"learning_rate": 9.406815330073244e-06, |
|
"loss": 0.982, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.9243867993354797, |
|
"learning_rate": 9.314269915162115e-06, |
|
"loss": 0.9922, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9539313316345215, |
|
"learning_rate": 9.215544350155423e-06, |
|
"loss": 0.9401, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.7756137847900391, |
|
"learning_rate": 9.110780058474052e-06, |
|
"loss": 0.9571, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.7239847183227539, |
|
"learning_rate": 9.000127113956673e-06, |
|
"loss": 0.9731, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.7880154848098755, |
|
"learning_rate": 8.883744025880429e-06, |
|
"loss": 0.9277, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.7989363670349121, |
|
"learning_rate": 8.761797511897907e-06, |
|
"loss": 0.9404, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8506297469139099, |
|
"learning_rate": 8.634462259215719e-06, |
|
"loss": 0.9388, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.8318315744400024, |
|
"learning_rate": 8.501920674356755e-06, |
|
"loss": 0.9318, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 1.70693039894104, |
|
"learning_rate": 8.364362621864595e-06, |
|
"loss": 1.7235, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.7298498749732971, |
|
"learning_rate": 8.221985152324385e-06, |
|
"loss": 0.8678, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.7523313164710999, |
|
"learning_rate": 8.07499222008977e-06, |
|
"loss": 0.8889, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.7242347002029419, |
|
"learning_rate": 7.923594391120237e-06, |
|
"loss": 0.9157, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.8468252420425415, |
|
"learning_rate": 7.768008541347423e-06, |
|
"loss": 0.9081, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.6322146058082581, |
|
"learning_rate": 7.608457546002423e-06, |
|
"loss": 0.8875, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 0.7224799990653992, |
|
"learning_rate": 7.445169960349167e-06, |
|
"loss": 0.8787, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 0.8994417190551758, |
|
"learning_rate": 7.278379692281209e-06, |
|
"loss": 0.9457, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.5251118540763855, |
|
"learning_rate": 7.10832566725092e-06, |
|
"loss": 0.8881, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 0.7077475190162659, |
|
"learning_rate": 6.9352514860110876e-06, |
|
"loss": 0.8929, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.6465011835098267, |
|
"learning_rate": 6.759405075659165e-06, |
|
"loss": 0.7808, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.714865505695343, |
|
"learning_rate": 6.58103833448412e-06, |
|
"loss": 0.8973, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 0.5610604882240295, |
|
"learning_rate": 6.4004067711245366e-06, |
|
"loss": 0.9228, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.6022524833679199, |
|
"learning_rate": 6.2177691385549595e-06, |
|
"loss": 0.857, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.72687166929245, |
|
"learning_rate": 6.033387063424765e-06, |
|
"loss": 0.843, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.6550407409667969, |
|
"learning_rate": 5.8475246712804845e-06, |
|
"loss": 1.0347, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.5872577428817749, |
|
"learning_rate": 5.660448208208513e-06, |
|
"loss": 0.817, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.6337549090385437, |
|
"learning_rate": 5.472425659440157e-06, |
|
"loss": 0.8407, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.6403087377548218, |
|
"learning_rate": 5.2837263654653715e-06, |
|
"loss": 0.8921, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.6125657558441162, |
|
"learning_rate": 5.094620636205096e-06, |
|
"loss": 0.9579, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.5235335826873779, |
|
"learning_rate": 4.905379363794907e-06, |
|
"loss": 0.7933, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 0.6453654766082764, |
|
"learning_rate": 4.71627363453463e-06, |
|
"loss": 0.97, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 0.5285298228263855, |
|
"learning_rate": 4.527574340559844e-06, |
|
"loss": 0.8357, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.44597020745277405, |
|
"learning_rate": 4.33955179179149e-06, |
|
"loss": 0.828, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 0.6450824737548828, |
|
"learning_rate": 4.152475328719517e-06, |
|
"loss": 0.8928, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 0.5392605662345886, |
|
"learning_rate": 3.966612936575235e-06, |
|
"loss": 0.9052, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.5523377060890198, |
|
"learning_rate": 3.782230861445041e-06, |
|
"loss": 0.9586, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 0.537310779094696, |
|
"learning_rate": 3.5995932288754655e-06, |
|
"loss": 0.8434, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 10.150139808654785, |
|
"learning_rate": 3.4189616655158803e-06, |
|
"loss": 0.8777, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.5695199370384216, |
|
"learning_rate": 3.240594924340835e-06, |
|
"loss": 0.8699, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 0.5820671319961548, |
|
"learning_rate": 3.0647485139889145e-06, |
|
"loss": 0.8321, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 0.9486154913902283, |
|
"learning_rate": 2.89167433274908e-06, |
|
"loss": 1.5436, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 0.5092023611068726, |
|
"learning_rate": 2.721620307718793e-06, |
|
"loss": 0.8582, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.5217841863632202, |
|
"learning_rate": 2.554830039650834e-06, |
|
"loss": 0.8665, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 0.5346829891204834, |
|
"learning_rate": 2.391542453997578e-06, |
|
"loss": 0.8705, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 0.5108064413070679, |
|
"learning_rate": 2.2319914586525776e-06, |
|
"loss": 0.7992, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 0.4857538044452667, |
|
"learning_rate": 2.0764056088797646e-06, |
|
"loss": 0.8735, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 0.5583701729774475, |
|
"learning_rate": 1.9250077799102323e-06, |
|
"loss": 0.8379, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.44473886489868164, |
|
"learning_rate": 1.7780148476756148e-06, |
|
"loss": 0.8338, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.4547748267650604, |
|
"learning_rate": 1.6356373781354058e-06, |
|
"loss": 0.7843, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 0.4426772892475128, |
|
"learning_rate": 1.4980793256432474e-06, |
|
"loss": 0.8256, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 0.4569498896598816, |
|
"learning_rate": 1.3655377407842813e-06, |
|
"loss": 0.8289, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 0.46035248041152954, |
|
"learning_rate": 1.2382024881020937e-06, |
|
"loss": 0.8025, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.4538329839706421, |
|
"learning_rate": 1.1162559741195733e-06, |
|
"loss": 0.9191, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 0.4397728443145752, |
|
"learning_rate": 9.998728860433277e-07, |
|
"loss": 0.8216, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.46339666843414307, |
|
"learning_rate": 8.892199415259501e-07, |
|
"loss": 0.8265, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 0.4591324031352997, |
|
"learning_rate": 7.844556498445788e-07, |
|
"loss": 0.8353, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 0.4275136888027191, |
|
"learning_rate": 6.857300848378857e-07, |
|
"loss": 0.8233, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.42812904715538025, |
|
"learning_rate": 5.931846699267558e-07, |
|
"loss": 0.918, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 0.41314032673835754, |
|
"learning_rate": 5.0695197552659e-07, |
|
"loss": 0.7847, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 0.442142516374588, |
|
"learning_rate": 4.271555291414636e-07, |
|
"loss": 0.8077, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 0.43055617809295654, |
|
"learning_rate": 3.539096384121743e-07, |
|
"loss": 0.8602, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 0.39766573905944824, |
|
"learning_rate": 2.873192273716369e-07, |
|
"loss": 0.846, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.4134579002857208, |
|
"learning_rate": 2.274796861422246e-07, |
|
"loss": 0.808, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 0.39045432209968567, |
|
"learning_rate": 1.7447673429033361e-07, |
|
"loss": 0.8078, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 0.433301717042923, |
|
"learning_rate": 1.2838629803393343e-07, |
|
"loss": 0.8601, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 0.4349207282066345, |
|
"learning_rate": 8.927440147898703e-08, |
|
"loss": 0.8478, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 0.4573710560798645, |
|
"learning_rate": 5.7197072040557356e-08, |
|
"loss": 0.9191, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.40317612886428833, |
|
"learning_rate": 3.220026018407541e-08, |
|
"loss": 0.7321, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 0.4268146753311157, |
|
"learning_rate": 1.431977360173975e-08, |
|
"loss": 0.8163, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 0.4728689193725586, |
|
"learning_rate": 3.5812259183426457e-09, |
|
"loss": 0.8683, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 0.4204527735710144, |
|
"learning_rate": 0.0, |
|
"loss": 0.8822, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"step": 93, |
|
"total_flos": 91659381276672.0, |
|
"train_loss": 0.945604988323745, |
|
"train_runtime": 5318.5414, |
|
"train_samples_per_second": 1.692, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 93, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 91659381276672.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|