lemexp-processed-task1_min_symbols_lemma_command_small-deepseek-coder-1.3b-base / trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 3683,
  "global_step": 110472,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.027156202476645665,
      "grad_norm": 1.423619031906128,
      "learning_rate": 0.00019927699054107928,
      "loss": 0.804,
      "step": 500
    },
    {
      "epoch": 0.05431240495329133,
      "grad_norm": 2.0592901706695557,
      "learning_rate": 0.00019837096365020115,
      "loss": 0.7258,
      "step": 1000
    },
    {
      "epoch": 0.081468607429937,
      "grad_norm": 1.7999378442764282,
      "learning_rate": 0.00019746493675932303,
      "loss": 0.698,
      "step": 1500
    },
    {
      "epoch": 0.10862480990658266,
      "grad_norm": 1.6404426097869873,
      "learning_rate": 0.0001965589098684449,
      "loss": 0.6863,
      "step": 2000
    },
    {
      "epoch": 0.13578101238322832,
      "grad_norm": 2.424567222595215,
      "learning_rate": 0.00019565469503134855,
      "loss": 0.6638,
      "step": 2500
    },
    {
      "epoch": 0.162937214859874,
      "grad_norm": 2.102727174758911,
      "learning_rate": 0.00019475048019425216,
      "loss": 0.6554,
      "step": 3000
    },
    {
      "epoch": 0.19009341733651966,
      "grad_norm": 1.732860803604126,
      "learning_rate": 0.00019384445330337407,
      "loss": 0.6364,
      "step": 3500
    },
    {
      "epoch": 0.20003258744297198,
      "eval_loss": 0.6357121467590332,
      "eval_runtime": 25.1143,
      "eval_samples_per_second": 14.693,
      "eval_steps_per_second": 7.366,
      "step": 3683
    },
    {
      "epoch": 0.21724961981316532,
      "grad_norm": 1.5829988718032837,
      "learning_rate": 0.00019293842641249595,
      "loss": 0.6576,
      "step": 4000
    },
    {
      "epoch": 0.244405822289811,
      "grad_norm": 0.9155117869377136,
      "learning_rate": 0.0001920323995216178,
      "loss": 0.6241,
      "step": 4500
    },
    {
      "epoch": 0.27156202476645663,
      "grad_norm": 1.6804828643798828,
      "learning_rate": 0.0001911263726307397,
      "loss": 0.6369,
      "step": 5000
    },
    {
      "epoch": 0.2987182272431023,
      "grad_norm": 1.337156891822815,
      "learning_rate": 0.00019022034573986155,
      "loss": 0.6196,
      "step": 5500
    },
    {
      "epoch": 0.325874429719748,
      "grad_norm": 1.680059790611267,
      "learning_rate": 0.00018931431884898345,
      "loss": 0.6121,
      "step": 6000
    },
    {
      "epoch": 0.35303063219639363,
      "grad_norm": 2.251309871673584,
      "learning_rate": 0.00018840829195810533,
      "loss": 0.5984,
      "step": 6500
    },
    {
      "epoch": 0.3801868346730393,
      "grad_norm": 1.7084137201309204,
      "learning_rate": 0.0001875022650672272,
      "loss": 0.5857,
      "step": 7000
    },
    {
      "epoch": 0.40006517488594395,
      "eval_loss": 0.5826721787452698,
      "eval_runtime": 24.2431,
      "eval_samples_per_second": 15.221,
      "eval_steps_per_second": 7.631,
      "step": 7366
    },
    {
      "epoch": 0.407343037149685,
      "grad_norm": 1.7896497249603271,
      "learning_rate": 0.00018659623817634909,
      "loss": 0.5777,
      "step": 7500
    },
    {
      "epoch": 0.43449923962633064,
      "grad_norm": 1.6899892091751099,
      "learning_rate": 0.00018569021128547096,
      "loss": 0.584,
      "step": 8000
    },
    {
      "epoch": 0.4616554421029763,
      "grad_norm": 2.2623794078826904,
      "learning_rate": 0.00018478418439459284,
      "loss": 0.5731,
      "step": 8500
    },
    {
      "epoch": 0.488811644579622,
      "grad_norm": 2.9157674312591553,
      "learning_rate": 0.00018387996955749646,
      "loss": 0.5719,
      "step": 9000
    },
    {
      "epoch": 0.5159678470562676,
      "grad_norm": 1.841354489326477,
      "learning_rate": 0.00018297394266661836,
      "loss": 0.5765,
      "step": 9500
    },
    {
      "epoch": 0.5431240495329133,
      "grad_norm": 2.550917625427246,
      "learning_rate": 0.00018206791577574024,
      "loss": 0.552,
      "step": 10000
    },
    {
      "epoch": 0.570280252009559,
      "grad_norm": 1.5018529891967773,
      "learning_rate": 0.00018116188888486211,
      "loss": 0.5489,
      "step": 10500
    },
    {
      "epoch": 0.5974364544862046,
      "grad_norm": 3.648230791091919,
      "learning_rate": 0.00018025767404776576,
      "loss": 0.5682,
      "step": 11000
    },
    {
      "epoch": 0.6000977623289159,
      "eval_loss": 0.5515537858009338,
      "eval_runtime": 25.2059,
      "eval_samples_per_second": 14.639,
      "eval_steps_per_second": 7.34,
      "step": 11049
    },
    {
      "epoch": 0.6245926569628503,
      "grad_norm": 3.894047498703003,
      "learning_rate": 0.0001793516471568876,
      "loss": 0.5662,
      "step": 11500
    },
    {
      "epoch": 0.651748859439496,
      "grad_norm": 2.1958436965942383,
      "learning_rate": 0.0001784456202660095,
      "loss": 0.557,
      "step": 12000
    },
    {
      "epoch": 0.6789050619161416,
      "grad_norm": 1.6268092393875122,
      "learning_rate": 0.00017754140542891312,
      "loss": 0.5472,
      "step": 12500
    },
    {
      "epoch": 0.7060612643927873,
      "grad_norm": 1.6947818994522095,
      "learning_rate": 0.00017663537853803503,
      "loss": 0.5487,
      "step": 13000
    },
    {
      "epoch": 0.733217466869433,
      "grad_norm": 1.740544080734253,
      "learning_rate": 0.00017572935164715688,
      "loss": 0.5473,
      "step": 13500
    },
    {
      "epoch": 0.7603736693460786,
      "grad_norm": 2.6229496002197266,
      "learning_rate": 0.00017482332475627878,
      "loss": 0.5306,
      "step": 14000
    },
    {
      "epoch": 0.7875298718227243,
      "grad_norm": 1.760733723640442,
      "learning_rate": 0.00017391729786540066,
      "loss": 0.5421,
      "step": 14500
    },
    {
      "epoch": 0.8001303497718879,
      "eval_loss": 0.5292674899101257,
      "eval_runtime": 23.3828,
      "eval_samples_per_second": 15.781,
      "eval_steps_per_second": 7.912,
      "step": 14732
    },
    {
      "epoch": 0.81468607429937,
      "grad_norm": 2.7095932960510254,
      "learning_rate": 0.00017301308302830428,
      "loss": 0.5225,
      "step": 15000
    },
    {
      "epoch": 0.8418422767760156,
      "grad_norm": 3.7730860710144043,
      "learning_rate": 0.00017210705613742618,
      "loss": 0.536,
      "step": 15500
    },
    {
      "epoch": 0.8689984792526613,
      "grad_norm": 1.8944693803787231,
      "learning_rate": 0.00017120102924654803,
      "loss": 0.5123,
      "step": 16000
    },
    {
      "epoch": 0.896154681729307,
      "grad_norm": 2.137572765350342,
      "learning_rate": 0.00017029500235566993,
      "loss": 0.5411,
      "step": 16500
    },
    {
      "epoch": 0.9233108842059526,
      "grad_norm": 4.163636207580566,
      "learning_rate": 0.0001693889754647918,
      "loss": 0.5218,
      "step": 17000
    },
    {
      "epoch": 0.9504670866825983,
      "grad_norm": 2.492893934249878,
      "learning_rate": 0.0001684829485739137,
      "loss": 0.5171,
      "step": 17500
    },
    {
      "epoch": 0.977623289159244,
      "grad_norm": 1.2668529748916626,
      "learning_rate": 0.00016757692168303557,
      "loss": 0.5142,
      "step": 18000
    },
    {
      "epoch": 1.00016293721486,
      "eval_loss": 0.517742395401001,
      "eval_runtime": 24.0099,
      "eval_samples_per_second": 15.369,
      "eval_steps_per_second": 7.705,
      "step": 18415
    },
    {
      "epoch": 1.0047794916358896,
      "grad_norm": 4.145332336425781,
      "learning_rate": 0.00016667089479215742,
      "loss": 0.5248,
      "step": 18500
    },
    {
      "epoch": 1.0319356941125353,
      "grad_norm": 3.0422215461730957,
      "learning_rate": 0.00016576667995506109,
      "loss": 0.4571,
      "step": 19000
    },
    {
      "epoch": 1.059091896589181,
      "grad_norm": 2.034750461578369,
      "learning_rate": 0.00016486065306418294,
      "loss": 0.4635,
      "step": 19500
    },
    {
      "epoch": 1.0862480990658265,
      "grad_norm": 2.047473907470703,
      "learning_rate": 0.0001639564382270866,
      "loss": 0.47,
      "step": 20000
    },
    {
      "epoch": 1.1134043015424724,
      "grad_norm": 2.424201011657715,
      "learning_rate": 0.00016305041133620845,
      "loss": 0.4742,
      "step": 20500
    },
    {
      "epoch": 1.140560504019118,
      "grad_norm": 2.1113667488098145,
      "learning_rate": 0.00016214438444533036,
      "loss": 0.4628,
      "step": 21000
    },
    {
      "epoch": 1.1677167064957636,
      "grad_norm": 2.0212793350219727,
      "learning_rate": 0.00016123835755445224,
      "loss": 0.4636,
      "step": 21500
    },
    {
      "epoch": 1.1948729089724093,
      "grad_norm": 4.672229290008545,
      "learning_rate": 0.00016033233066357409,
      "loss": 0.4674,
      "step": 22000
    },
    {
      "epoch": 1.2001955246578317,
      "eval_loss": 0.5015310645103455,
      "eval_runtime": 25.0548,
      "eval_samples_per_second": 14.728,
      "eval_steps_per_second": 7.384,
      "step": 22098
    },
    {
      "epoch": 1.222029111449055,
      "grad_norm": 2.172687292098999,
      "learning_rate": 0.000159426303772696,
      "loss": 0.4583,
      "step": 22500
    },
    {
      "epoch": 1.2491853139257005,
      "grad_norm": 2.1510438919067383,
      "learning_rate": 0.00015852027688181784,
      "loss": 0.4558,
      "step": 23000
    },
    {
      "epoch": 1.2763415164023462,
      "grad_norm": 1.1689780950546265,
      "learning_rate": 0.00015761424999093975,
      "loss": 0.4502,
      "step": 23500
    },
    {
      "epoch": 1.303497718878992,
      "grad_norm": 2.7791380882263184,
      "learning_rate": 0.00015670822310006162,
      "loss": 0.4451,
      "step": 24000
    },
    {
      "epoch": 1.3306539213556376,
      "grad_norm": 2.7756049633026123,
      "learning_rate": 0.0001558021962091835,
      "loss": 0.4509,
      "step": 24500
    },
    {
      "epoch": 1.3578101238322833,
      "grad_norm": 2.263340950012207,
      "learning_rate": 0.00015489616931830538,
      "loss": 0.4472,
      "step": 25000
    },
    {
      "epoch": 1.384966326308929,
      "grad_norm": 3.0343711376190186,
      "learning_rate": 0.000153991954481209,
      "loss": 0.4615,
      "step": 25500
    },
    {
      "epoch": 1.4002281121008038,
      "eval_loss": 0.500033438205719,
      "eval_runtime": 24.7233,
      "eval_samples_per_second": 14.925,
      "eval_steps_per_second": 7.483,
      "step": 25781
    },
    {
      "epoch": 1.4121225287855745,
      "grad_norm": 3.176940441131592,
      "learning_rate": 0.0001530859275903309,
      "loss": 0.4561,
      "step": 26000
    },
    {
      "epoch": 1.4392787312622204,
      "grad_norm": 4.111068248748779,
      "learning_rate": 0.00015217990069945275,
      "loss": 0.491,
      "step": 26500
    },
    {
      "epoch": 1.466434933738866,
      "grad_norm": 5.199289321899414,
      "learning_rate": 0.00015127387380857465,
      "loss": 0.4721,
      "step": 27000
    },
    {
      "epoch": 1.4935911362155116,
      "grad_norm": 1.901997447013855,
      "learning_rate": 0.00015036965897147827,
      "loss": 0.4458,
      "step": 27500
    },
    {
      "epoch": 1.5207473386921573,
      "grad_norm": 3.2669544219970703,
      "learning_rate": 0.00014946363208060017,
      "loss": 0.4313,
      "step": 28000
    },
    {
      "epoch": 1.547903541168803,
      "grad_norm": 1.151863694190979,
      "learning_rate": 0.00014855760518972205,
      "loss": 0.4698,
      "step": 28500
    },
    {
      "epoch": 1.5750597436454488,
      "grad_norm": 2.112612724304199,
      "learning_rate": 0.0001476515782988439,
      "loss": 0.453,
      "step": 29000
    },
    {
      "epoch": 1.6002606995437758,
      "eval_loss": 0.47696688771247864,
      "eval_runtime": 22.9354,
      "eval_samples_per_second": 16.089,
      "eval_steps_per_second": 8.066,
      "step": 29464
    },
    {
      "epoch": 1.6022159461220942,
      "grad_norm": 2.558180570602417,
      "learning_rate": 0.0001467455514079658,
      "loss": 0.4575,
      "step": 29500
    },
    {
      "epoch": 1.62937214859874,
      "grad_norm": 2.6507065296173096,
      "learning_rate": 0.00014583952451708768,
      "loss": 0.459,
      "step": 30000
    },
    {
      "epoch": 1.6565283510753857,
      "grad_norm": 1.5638259649276733,
      "learning_rate": 0.00014493349762620956,
      "loss": 0.4288,
      "step": 30500
    },
    {
      "epoch": 1.6836845535520313,
      "grad_norm": 3.8055946826934814,
      "learning_rate": 0.00014402747073533143,
      "loss": 0.4514,
      "step": 31000
    },
    {
      "epoch": 1.710840756028677,
      "grad_norm": 3.0687201023101807,
      "learning_rate": 0.00014312325589823508,
      "loss": 0.4471,
      "step": 31500
    },
    {
      "epoch": 1.7379969585053225,
      "grad_norm": 2.26448655128479,
      "learning_rate": 0.0001422190410611387,
      "loss": 0.4447,
      "step": 32000
    },
    {
      "epoch": 1.7651531609819684,
      "grad_norm": 1.4060781002044678,
      "learning_rate": 0.00014131301417026057,
      "loss": 0.4361,
      "step": 32500
    },
    {
      "epoch": 1.7923093634586138,
      "grad_norm": 2.3706018924713135,
      "learning_rate": 0.00014040698727938247,
      "loss": 0.4506,
      "step": 33000
    },
    {
      "epoch": 1.8002932869867476,
      "eval_loss": 0.470061331987381,
      "eval_runtime": 27.1848,
      "eval_samples_per_second": 13.574,
      "eval_steps_per_second": 6.805,
      "step": 33147
    },
    {
      "epoch": 1.8194655659352597,
      "grad_norm": 2.880718946456909,
      "learning_rate": 0.00013950096038850432,
      "loss": 0.4439,
      "step": 33500
    },
    {
      "epoch": 1.8466217684119053,
      "grad_norm": 1.4225813150405884,
      "learning_rate": 0.000138596745551408,
      "loss": 0.4434,
      "step": 34000
    },
    {
      "epoch": 1.873777970888551,
      "grad_norm": 0.7051529884338379,
      "learning_rate": 0.00013769071866052984,
      "loss": 0.449,
      "step": 34500
    },
    {
      "epoch": 1.9009341733651968,
      "grad_norm": 4.5070648193359375,
      "learning_rate": 0.00013678469176965174,
      "loss": 0.4356,
      "step": 35000
    },
    {
      "epoch": 1.9280903758418422,
      "grad_norm": 1.354962944984436,
      "learning_rate": 0.00013587866487877362,
      "loss": 0.4509,
      "step": 35500
    },
    {
      "epoch": 1.955246578318488,
      "grad_norm": 2.919261932373047,
      "learning_rate": 0.00013497263798789547,
      "loss": 0.4399,
      "step": 36000
    },
    {
      "epoch": 1.9824027807951337,
      "grad_norm": 1.7376036643981934,
      "learning_rate": 0.00013406661109701738,
      "loss": 0.4309,
      "step": 36500
    },
    {
      "epoch": 2.00032587442972,
      "eval_loss": 0.4645754098892212,
      "eval_runtime": 26.5674,
      "eval_samples_per_second": 13.889,
      "eval_steps_per_second": 6.963,
      "step": 36830
    },
    {
      "epoch": 2.0095589832717793,
      "grad_norm": 2.448807954788208,
      "learning_rate": 0.000133162396259921,
      "loss": 0.4074,
      "step": 37000
    },
    {
      "epoch": 2.036715185748425,
      "grad_norm": 4.4545369148254395,
      "learning_rate": 0.0001322563693690429,
      "loss": 0.3784,
      "step": 37500
    },
    {
      "epoch": 2.0638713882250705,
      "grad_norm": 1.4585567712783813,
      "learning_rate": 0.0001313521545319465,
      "loss": 0.3802,
      "step": 38000
    },
    {
      "epoch": 2.0910275907017164,
      "grad_norm": 4.981091499328613,
      "learning_rate": 0.0001304461276410684,
      "loss": 0.3887,
      "step": 38500
    },
    {
      "epoch": 2.118183793178362,
      "grad_norm": 1.345459222793579,
      "learning_rate": 0.00012954010075019026,
      "loss": 0.3791,
      "step": 39000
    },
    {
      "epoch": 2.1453399956550077,
      "grad_norm": 2.339366912841797,
      "learning_rate": 0.00012863407385931214,
      "loss": 0.3895,
      "step": 39500
    },
    {
      "epoch": 2.172496198131653,
      "grad_norm": 2.1575145721435547,
      "learning_rate": 0.00012772804696843405,
      "loss": 0.4046,
      "step": 40000
    },
    {
      "epoch": 2.199652400608299,
      "grad_norm": 3.028726100921631,
      "learning_rate": 0.0001268220200775559,
      "loss": 0.3829,
      "step": 40500
    },
    {
      "epoch": 2.2003584618726917,
      "eval_loss": 0.46665239334106445,
      "eval_runtime": 22.994,
      "eval_samples_per_second": 16.048,
      "eval_steps_per_second": 8.046,
      "step": 40513
    },
    {
      "epoch": 2.2268086030849448,
      "grad_norm": 2.198944330215454,
      "learning_rate": 0.0001259159931866778,
      "loss": 0.404,
      "step": 41000
    },
    {
      "epoch": 2.25396480556159,
      "grad_norm": 1.4934983253479004,
      "learning_rate": 0.00012500996629579965,
      "loss": 0.3758,
      "step": 41500
    },
    {
      "epoch": 2.281121008038236,
      "grad_norm": 3.0350615978240967,
      "learning_rate": 0.0001241057514587033,
      "loss": 0.3851,
      "step": 42000
    },
    {
      "epoch": 2.3082772105148814,
      "grad_norm": 2.0013248920440674,
      "learning_rate": 0.00012319972456782517,
      "loss": 0.3874,
      "step": 42500
    },
    {
      "epoch": 2.3354334129915273,
      "grad_norm": 3.0805039405822754,
      "learning_rate": 0.00012229369767694705,
      "loss": 0.383,
      "step": 43000
    },
    {
      "epoch": 2.362589615468173,
      "grad_norm": 2.340902328491211,
      "learning_rate": 0.00012138767078606894,
      "loss": 0.3795,
      "step": 43500
    },
    {
      "epoch": 2.3897458179448186,
      "grad_norm": 1.4872759580612183,
      "learning_rate": 0.00012048164389519082,
      "loss": 0.3925,
      "step": 44000
    },
    {
      "epoch": 2.4003910493156635,
      "eval_loss": 0.45948517322540283,
      "eval_runtime": 23.0689,
      "eval_samples_per_second": 15.996,
      "eval_steps_per_second": 8.019,
      "step": 44196
    },
    {
      "epoch": 2.4169020204214644,
      "grad_norm": 2.0199170112609863,
      "learning_rate": 0.00011957742905809446,
      "loss": 0.387,
      "step": 44500
    },
    {
      "epoch": 2.44405822289811,
      "grad_norm": 2.0993518829345703,
      "learning_rate": 0.00011867140216721633,
      "loss": 0.3858,
      "step": 45000
    },
    {
      "epoch": 2.4712144253747557,
      "grad_norm": 2.5431594848632812,
      "learning_rate": 0.0001177653752763382,
      "loss": 0.3731,
      "step": 45500
    },
    {
      "epoch": 2.498370627851401,
      "grad_norm": 2.0377984046936035,
      "learning_rate": 0.00011685934838546009,
      "loss": 0.383,
      "step": 46000
    },
    {
      "epoch": 2.525526830328047,
      "grad_norm": 2.3051955699920654,
      "learning_rate": 0.00011595332149458195,
      "loss": 0.3865,
      "step": 46500
    },
    {
      "epoch": 2.5526830328046923,
      "grad_norm": 3.8095552921295166,
      "learning_rate": 0.00011504729460370384,
      "loss": 0.3726,
      "step": 47000
    },
    {
      "epoch": 2.579839235281338,
      "grad_norm": 2.2560086250305176,
      "learning_rate": 0.00011414307976660747,
      "loss": 0.3858,
      "step": 47500
    },
    {
      "epoch": 2.6004236367586357,
      "eval_loss": 0.45664411783218384,
      "eval_runtime": 22.9971,
      "eval_samples_per_second": 16.045,
      "eval_steps_per_second": 8.044,
      "step": 47879
    },
    {
      "epoch": 2.606995437757984,
      "grad_norm": 2.8991200923919678,
      "learning_rate": 0.00011323705287572936,
      "loss": 0.383,
      "step": 48000
    },
    {
      "epoch": 2.6341516402346294,
      "grad_norm": 4.307155132293701,
      "learning_rate": 0.00011233102598485124,
      "loss": 0.3941,
      "step": 48500
    },
    {
      "epoch": 2.6613078427112753,
      "grad_norm": 3.7580649852752686,
      "learning_rate": 0.0001114249990939731,
      "loss": 0.385,
      "step": 49000
    },
    {
      "epoch": 2.6884640451879207,
      "grad_norm": 2.604210615158081,
      "learning_rate": 0.000110518972203095,
      "loss": 0.3847,
      "step": 49500
    },
    {
      "epoch": 2.7156202476645666,
      "grad_norm": 1.8067151308059692,
      "learning_rate": 0.00010961475736599862,
      "loss": 0.3775,
      "step": 50000
    },
    {
      "epoch": 2.7427764501412124,
      "grad_norm": 2.4924516677856445,
      "learning_rate": 0.00010870873047512051,
      "loss": 0.392,
      "step": 50500
    },
    {
      "epoch": 2.769932652617858,
      "grad_norm": 2.7145466804504395,
      "learning_rate": 0.00010780270358424238,
      "loss": 0.3817,
      "step": 51000
    },
    {
      "epoch": 2.7970888550945037,
      "grad_norm": 3.6621336936950684,
      "learning_rate": 0.00010689667669336427,
      "loss": 0.3879,
      "step": 51500
    },
    {
      "epoch": 2.8004562242016076,
      "eval_loss": 0.4439272880554199,
      "eval_runtime": 22.9231,
      "eval_samples_per_second": 16.097,
      "eval_steps_per_second": 8.07,
      "step": 51562
    },
    {
      "epoch": 2.824245057571149,
      "grad_norm": 3.2784557342529297,
      "learning_rate": 0.00010599064980248614,
      "loss": 0.3775,
      "step": 52000
    },
    {
      "epoch": 2.851401260047795,
      "grad_norm": 2.4789769649505615,
      "learning_rate": 0.00010508643496538977,
      "loss": 0.3828,
      "step": 52500
    },
    {
      "epoch": 2.878557462524441,
      "grad_norm": 4.17576789855957,
      "learning_rate": 0.00010418040807451166,
      "loss": 0.3922,
      "step": 53000
    },
    {
      "epoch": 2.905713665001086,
      "grad_norm": 2.2692151069641113,
      "learning_rate": 0.00010327438118363353,
      "loss": 0.3684,
      "step": 53500
    },
    {
      "epoch": 2.932869867477732,
      "grad_norm": 3.434340238571167,
      "learning_rate": 0.00010236835429275542,
      "loss": 0.3703,
      "step": 54000
    },
    {
      "epoch": 2.9600260699543774,
      "grad_norm": 2.867629289627075,
      "learning_rate": 0.00010146232740187728,
      "loss": 0.3769,
      "step": 54500
    },
    {
      "epoch": 2.9871822724310233,
      "grad_norm": 2.588996171951294,
      "learning_rate": 0.00010055630051099917,
      "loss": 0.3764,
      "step": 55000
    },
    {
      "epoch": 3.00048881164458,
      "eval_loss": 0.43786150217056274,
      "eval_runtime": 23.1256,
      "eval_samples_per_second": 15.956,
      "eval_steps_per_second": 8.0,
      "step": 55245
    },
    {
      "epoch": 3.0143384749076687,
      "grad_norm": 2.558405876159668,
      "learning_rate": 9.96520856739028e-05,
      "loss": 0.3535,
      "step": 55500
    },
    {
      "epoch": 3.0414946773843146,
      "grad_norm": 2.3702216148376465,
      "learning_rate": 9.874605878302469e-05,
      "loss": 0.3299,
      "step": 56000
    },
    {
      "epoch": 3.0686508798609604,
      "grad_norm": 2.283313274383545,
      "learning_rate": 9.784003189214657e-05,
      "loss": 0.3366,
      "step": 56500
    },
    {
      "epoch": 3.095807082337606,
      "grad_norm": 2.421048641204834,
      "learning_rate": 9.693400500126845e-05,
      "loss": 0.3261,
      "step": 57000
    },
    {
      "epoch": 3.1229632848142517,
      "grad_norm": 2.0642685890197754,
      "learning_rate": 9.602979016417207e-05,
      "loss": 0.3335,
      "step": 57500
    },
    {
      "epoch": 3.150119487290897,
      "grad_norm": 3.4360289573669434,
      "learning_rate": 9.512376327329395e-05,
      "loss": 0.3287,
      "step": 58000
    },
    {
      "epoch": 3.177275689767543,
      "grad_norm": 3.9619264602661133,
      "learning_rate": 9.421773638241583e-05,
      "loss": 0.3267,
      "step": 58500
    },
    {
      "epoch": 3.2005213990875516,
      "eval_loss": 0.4501725733280182,
      "eval_runtime": 23.0376,
      "eval_samples_per_second": 16.017,
      "eval_steps_per_second": 8.03,
      "step": 58928
    },
    {
      "epoch": 3.2044318922441883,
      "grad_norm": 2.5098698139190674,
      "learning_rate": 9.331170949153772e-05,
      "loss": 0.3365,
      "step": 59000
    },
    {
      "epoch": 3.231588094720834,
      "grad_norm": 2.2651731967926025,
      "learning_rate": 9.24056826006596e-05,
      "loss": 0.3285,
      "step": 59500
    },
    {
      "epoch": 3.25874429719748,
      "grad_norm": 2.573915958404541,
      "learning_rate": 9.150146776356322e-05,
      "loss": 0.3421,
      "step": 60000
    },
    {
      "epoch": 3.2859004996741255,
      "grad_norm": 3.5748302936553955,
      "learning_rate": 9.059544087268512e-05,
      "loss": 0.3267,
      "step": 60500
    },
    {
      "epoch": 3.3130567021507713,
      "grad_norm": 2.8185431957244873,
      "learning_rate": 8.968941398180698e-05,
      "loss": 0.3225,
      "step": 61000
    },
    {
      "epoch": 3.3402129046274167,
      "grad_norm": 6.555810451507568,
      "learning_rate": 8.878338709092886e-05,
      "loss": 0.3174,
      "step": 61500
    },
    {
      "epoch": 3.3673691071040626,
      "grad_norm": 3.8243870735168457,
      "learning_rate": 8.787736020005073e-05,
      "loss": 0.3249,
      "step": 62000
    },
    {
      "epoch": 3.3945253095807084,
      "grad_norm": 1.514364242553711,
      "learning_rate": 8.697314536295438e-05,
      "loss": 0.3346,
      "step": 62500
    },
    {
      "epoch": 3.4005539865305234,
      "eval_loss": 0.4442519247531891,
      "eval_runtime": 22.857,
      "eval_samples_per_second": 16.144,
      "eval_steps_per_second": 8.094,
      "step": 62611
    },
    {
      "epoch": 3.421681512057354,
      "grad_norm": 2.1374149322509766,
      "learning_rate": 8.606711847207625e-05,
      "loss": 0.3231,
      "step": 63000
    },
    {
      "epoch": 3.4488377145339997,
      "grad_norm": 2.8971145153045654,
      "learning_rate": 8.516109158119814e-05,
      "loss": 0.3376,
      "step": 63500
    },
    {
      "epoch": 3.475993917010645,
      "grad_norm": 2.860117197036743,
      "learning_rate": 8.425506469032002e-05,
      "loss": 0.3295,
      "step": 64000
    },
    {
      "epoch": 3.503150119487291,
      "grad_norm": 1.976477026939392,
      "learning_rate": 8.335084985322365e-05,
      "loss": 0.3236,
      "step": 64500
    },
    {
      "epoch": 3.530306321963937,
      "grad_norm": 2.6291637420654297,
      "learning_rate": 8.244482296234553e-05,
      "loss": 0.3201,
      "step": 65000
    },
    {
      "epoch": 3.557462524440582,
      "grad_norm": 2.5785484313964844,
      "learning_rate": 8.15387960714674e-05,
      "loss": 0.3354,
      "step": 65500
    },
    {
      "epoch": 3.584618726917228,
      "grad_norm": 2.3802502155303955,
      "learning_rate": 8.063276918058928e-05,
      "loss": 0.3363,
      "step": 66000
    },
    {
      "epoch": 3.6005865739734957,
      "eval_loss": 0.43394023180007935,
      "eval_runtime": 23.107,
      "eval_samples_per_second": 15.969,
      "eval_steps_per_second": 8.006,
      "step": 66294
    },
    {
      "epoch": 3.6117749293938735,
      "grad_norm": 3.012232542037964,
      "learning_rate": 7.972674228971116e-05,
      "loss": 0.323,
      "step": 66500
    },
    {
      "epoch": 3.6389311318705193,
      "grad_norm": 2.5260913372039795,
      "learning_rate": 7.88225274526148e-05,
      "loss": 0.3316,
      "step": 67000
    },
    {
      "epoch": 3.666087334347165,
      "grad_norm": 3.0673775672912598,
      "learning_rate": 7.791650056173668e-05,
      "loss": 0.3194,
      "step": 67500
    },
    {
      "epoch": 3.6932435368238106,
      "grad_norm": 1.782955527305603,
      "learning_rate": 7.701047367085855e-05,
      "loss": 0.3268,
      "step": 68000
    },
    {
      "epoch": 3.720399739300456,
      "grad_norm": 3.0327773094177246,
      "learning_rate": 7.610444677998043e-05,
      "loss": 0.327,
      "step": 68500
    },
    {
      "epoch": 3.747555941777102,
      "grad_norm": 4.625910758972168,
      "learning_rate": 7.520023194288407e-05,
      "loss": 0.3231,
      "step": 69000
    },
    {
      "epoch": 3.7747121442537477,
      "grad_norm": 2.987931966781616,
      "learning_rate": 7.429420505200595e-05,
      "loss": 0.3321,
      "step": 69500
    },
    {
      "epoch": 3.8006191614164675,
      "eval_loss": 0.43500107526779175,
      "eval_runtime": 22.946,
      "eval_samples_per_second": 16.081,
      "eval_steps_per_second": 8.062,
      "step": 69977
    },
    {
      "epoch": 3.801868346730393,
      "grad_norm": 3.8928215503692627,
      "learning_rate": 7.338817816112783e-05,
      "loss": 0.3387,
      "step": 70000
    },
    {
      "epoch": 3.829024549207039,
      "grad_norm": 2.32753586769104,
      "learning_rate": 7.24821512702497e-05,
      "loss": 0.3327,
      "step": 70500
    },
    {
      "epoch": 3.8561807516836843,
      "grad_norm": 2.5396571159362793,
      "learning_rate": 7.157793643315333e-05,
      "loss": 0.3251,
      "step": 71000
    },
    {
      "epoch": 3.88333695416033,
      "grad_norm": 2.509148597717285,
      "learning_rate": 7.067190954227521e-05,
      "loss": 0.3225,
      "step": 71500
    },
    {
      "epoch": 3.910493156636976,
      "grad_norm": 1.7930841445922852,
      "learning_rate": 6.97658826513971e-05,
      "loss": 0.3392,
      "step": 72000
    },
    {
      "epoch": 3.9376493591136215,
      "grad_norm": 2.579759120941162,
      "learning_rate": 6.885985576051898e-05,
      "loss": 0.3415,
      "step": 72500
    },
    {
      "epoch": 3.9648055615902673,
      "grad_norm": 4.053764820098877,
      "learning_rate": 6.795564092342262e-05,
      "loss": 0.3373,
      "step": 73000
    },
    {
      "epoch": 3.9919617640669127,
      "grad_norm": 2.3885462284088135,
      "learning_rate": 6.70496140325445e-05,
      "loss": 0.3423,
      "step": 73500
    },
    {
      "epoch": 4.00065174885944,
      "eval_loss": 0.42881426215171814,
      "eval_runtime": 23.0588,
      "eval_samples_per_second": 16.003,
      "eval_steps_per_second": 8.023,
      "step": 73660
    },
    {
      "epoch": 4.019117966543559,
      "grad_norm": 1.8718838691711426,
      "learning_rate": 6.614358714166636e-05,
      "loss": 0.2902,
      "step": 74000
    },
    {
      "epoch": 4.046274169020204,
      "grad_norm": 3.1479783058166504,
      "learning_rate": 6.523756025078824e-05,
      "loss": 0.2817,
      "step": 74500
    },
    {
      "epoch": 4.07343037149685,
      "grad_norm": 2.8043808937072754,
      "learning_rate": 6.433153335991013e-05,
      "loss": 0.28,
      "step": 75000
    },
    {
      "epoch": 4.100586573973495,
      "grad_norm": 0.6163878440856934,
      "learning_rate": 6.342550646903201e-05,
      "loss": 0.283,
      "step": 75500
    },
    {
      "epoch": 4.127742776450141,
      "grad_norm": 1.6441878080368042,
      "learning_rate": 6.252129163193563e-05,
      "loss": 0.2731,
      "step": 76000
    },
    {
      "epoch": 4.154898978926787,
      "grad_norm": 3.012065887451172,
      "learning_rate": 6.161526474105753e-05,
      "loss": 0.2757,
      "step": 76500
    },
    {
      "epoch": 4.182055181403433,
      "grad_norm": 2.1326332092285156,
      "learning_rate": 6.07092378501794e-05,
      "loss": 0.2789,
      "step": 77000
    },
    {
      "epoch": 4.200684336302412,
      "eval_loss": 0.44576430320739746,
      "eval_runtime": 23.0355,
      "eval_samples_per_second": 16.019,
      "eval_steps_per_second": 8.031,
      "step": 77343
    },
    {
      "epoch": 4.209211383880078,
      "grad_norm": 3.3734445571899414,
      "learning_rate": 5.9803210959301273e-05,
      "loss": 0.2729,
      "step": 77500
    },
    {
      "epoch": 4.236367586356724,
      "grad_norm": 2.7482869625091553,
      "learning_rate": 5.889718406842315e-05,
      "loss": 0.2924,
      "step": 78000
    },
    {
      "epoch": 4.2635237888333695,
      "grad_norm": 2.5796825885772705,
      "learning_rate": 5.799115717754503e-05,
      "loss": 0.2843,
      "step": 78500
    },
    {
      "epoch": 4.290679991310015,
      "grad_norm": 3.74029541015625,
      "learning_rate": 5.708513028666691e-05,
      "loss": 0.2889,
      "step": 79000
    },
    {
      "epoch": 4.317836193786661,
      "grad_norm": 3.763978958129883,
      "learning_rate": 5.617910339578879e-05,
      "loss": 0.2812,
      "step": 79500
    },
    {
      "epoch": 4.344992396263306,
      "grad_norm": 2.851184844970703,
      "learning_rate": 5.527488855869243e-05,
      "loss": 0.283,
      "step": 80000
    },
    {
      "epoch": 4.372148598739952,
      "grad_norm": 3.071202278137207,
      "learning_rate": 5.436886166781431e-05,
      "loss": 0.2911,
      "step": 80500
    },
    {
      "epoch": 4.399304801216598,
      "grad_norm": 3.962803602218628,
      "learning_rate": 5.3464646830717936e-05,
      "loss": 0.2928,
      "step": 81000
    },
    {
      "epoch": 4.400716923745383,
      "eval_loss": 0.4378789961338043,
      "eval_runtime": 22.9566,
      "eval_samples_per_second": 16.074,
      "eval_steps_per_second": 8.059,
      "step": 81026
    },
    {
      "epoch": 4.426461003693244,
      "grad_norm": 2.5465190410614014,
      "learning_rate": 5.2558619939839814e-05,
      "loss": 0.269,
      "step": 81500
    },
    {
      "epoch": 4.4536172061698895,
      "grad_norm": 3.322237491607666,
      "learning_rate": 5.16525930489617e-05,
      "loss": 0.2883,
      "step": 82000
    },
    {
      "epoch": 4.4807734086465345,
      "grad_norm": 1.5292987823486328,
      "learning_rate": 5.0746566158083575e-05,
      "loss": 0.2796,
      "step": 82500
    },
    {
      "epoch": 4.50792961112318,
      "grad_norm": 2.0258724689483643,
      "learning_rate": 4.984053926720545e-05,
      "loss": 0.2766,
      "step": 83000
    },
    {
      "epoch": 4.535085813599826,
      "grad_norm": 2.583266019821167,
      "learning_rate": 4.893451237632733e-05,
      "loss": 0.2975,
      "step": 83500
    },
    {
      "epoch": 4.562242016076472,
      "grad_norm": 2.7614002227783203,
      "learning_rate": 4.802848548544921e-05,
      "loss": 0.2846,
      "step": 84000
    },
    {
      "epoch": 4.589398218553118,
      "grad_norm": 4.259634971618652,
      "learning_rate": 4.712245859457109e-05,
      "loss": 0.2963,
      "step": 84500
    },
    {
      "epoch": 4.600749511188355,
      "eval_loss": 0.43254056572914124,
      "eval_runtime": 22.8989,
      "eval_samples_per_second": 16.114,
      "eval_steps_per_second": 8.079,
      "step": 84709
    },
    {
      "epoch": 4.616554421029763,
      "grad_norm": 1.8035340309143066,
      "learning_rate": 4.621643170369297e-05,
      "loss": 0.2854,
      "step": 85000
    },
    {
      "epoch": 4.643710623506409,
      "grad_norm": 3.2322275638580322,
      "learning_rate": 4.53122168665966e-05,
      "loss": 0.287,
      "step": 85500
    },
    {
      "epoch": 4.670866825983055,
      "grad_norm": 7.430004119873047,
      "learning_rate": 4.440618997571848e-05,
      "loss": 0.2805,
      "step": 86000
    },
    {
      "epoch": 4.6980230284597,
      "grad_norm": 2.2691986560821533,
      "learning_rate": 4.3500163084840364e-05,
      "loss": 0.2874,
      "step": 86500
    },
    {
      "epoch": 4.725179230936346,
      "grad_norm": 2.7627906799316406,
      "learning_rate": 4.2594136193962235e-05,
      "loss": 0.2818,
      "step": 87000
    },
    {
      "epoch": 4.752335433412991,
      "grad_norm": 3.7362864017486572,
      "learning_rate": 4.1689921356865876e-05,
      "loss": 0.2827,
      "step": 87500
    },
    {
      "epoch": 4.779491635889637,
      "grad_norm": 4.409236907958984,
      "learning_rate": 4.0783894465987754e-05,
      "loss": 0.2887,
      "step": 88000
    },
    {
      "epoch": 4.800782098631327,
      "eval_loss": 0.42746320366859436,
      "eval_runtime": 23.0563,
      "eval_samples_per_second": 16.004,
      "eval_steps_per_second": 8.024,
      "step": 88392
    },
    {
      "epoch": 4.806647838366283,
      "grad_norm": 4.065585136413574,
      "learning_rate": 3.987786757510963e-05,
      "loss": 0.2905,
      "step": 88500
    },
    {
      "epoch": 4.833804040842929,
      "grad_norm": 3.655996799468994,
      "learning_rate": 3.897184068423151e-05,
      "loss": 0.2716,
      "step": 89000
    },
    {
      "epoch": 4.860960243319575,
      "grad_norm": 4.297955513000488,
      "learning_rate": 3.806762584713515e-05,
      "loss": 0.29,
      "step": 89500
    },
    {
      "epoch": 4.88811644579622,
      "grad_norm": 3.1703717708587646,
      "learning_rate": 3.716159895625702e-05,
      "loss": 0.2754,
      "step": 90000
    },
    {
      "epoch": 4.9152726482728655,
      "grad_norm": 3.771336078643799,
      "learning_rate": 3.62555720653789e-05,
      "loss": 0.2839,
      "step": 90500
    },
    {
      "epoch": 4.942428850749511,
      "grad_norm": 3.908500909805298,
      "learning_rate": 3.534954517450078e-05,
      "loss": 0.2744,
      "step": 91000
    },
    {
      "epoch": 4.969585053226157,
      "grad_norm": 3.199415445327759,
      "learning_rate": 3.444351828362266e-05,
      "loss": 0.2834,
      "step": 91500
    },
    {
      "epoch": 4.996741255702802,
      "grad_norm": 3.1083319187164307,
      "learning_rate": 3.3539303446526294e-05,
      "loss": 0.2949,
      "step": 92000
    },
    {
      "epoch": 5.0008146860743,
      "eval_loss": 0.4291832447052002,
      "eval_runtime": 23.525,
      "eval_samples_per_second": 15.685,
      "eval_steps_per_second": 7.864,
      "step": 92075
    },
    {
      "epoch": 5.023897458179448,
      "grad_norm": 6.121253490447998,
      "learning_rate": 3.263327655564817e-05,
      "loss": 0.2289,
      "step": 92500
    },
    {
      "epoch": 5.051053660656094,
      "grad_norm": 2.5016486644744873,
      "learning_rate": 3.1727249664770055e-05,
      "loss": 0.248,
      "step": 93000
    },
    {
      "epoch": 5.07820986313274,
      "grad_norm": 2.344914197921753,
      "learning_rate": 3.0821222773891926e-05,
      "loss": 0.2315,
      "step": 93500
    },
    {
      "epoch": 5.1053660656093856,
      "grad_norm": 3.519299268722534,
      "learning_rate": 2.9917007936795567e-05,
      "loss": 0.2516,
      "step": 94000
    },
    {
      "epoch": 5.1325222680860305,
      "grad_norm": 3.192281484603882,
      "learning_rate": 2.9010981045917445e-05,
      "loss": 0.2368,
      "step": 94500
    },
    {
      "epoch": 5.159678470562676,
      "grad_norm": 3.7645487785339355,
      "learning_rate": 2.8104954155039322e-05,
      "loss": 0.2573,
      "step": 95000
    },
    {
      "epoch": 5.186834673039322,
      "grad_norm": 4.5175275802612305,
      "learning_rate": 2.71989272641612e-05,
      "loss": 0.2437,
      "step": 95500
    },
    {
      "epoch": 5.2008472735172715,
      "eval_loss": 0.4366357922554016,
      "eval_runtime": 23.1107,
      "eval_samples_per_second": 15.967,
      "eval_steps_per_second": 8.005,
      "step": 95758
    },
    {
      "epoch": 5.213990875515968,
      "grad_norm": 4.234988212585449,
      "learning_rate": 2.629290037328308e-05,
      "loss": 0.2439,
      "step": 96000
    },
    {
      "epoch": 5.241147077992614,
      "grad_norm": 3.174309492111206,
      "learning_rate": 2.538687348240496e-05,
      "loss": 0.2523,
      "step": 96500
    },
    {
      "epoch": 5.268303280469259,
      "grad_norm": 3.7519733905792236,
      "learning_rate": 2.4480846591526838e-05,
      "loss": 0.2463,
      "step": 97000
    },
    {
      "epoch": 5.295459482945905,
      "grad_norm": 2.9701130390167236,
      "learning_rate": 2.357481970064872e-05,
      "loss": 0.2519,
      "step": 97500
    },
    {
      "epoch": 5.322615685422551,
      "grad_norm": 5.130082130432129,
      "learning_rate": 2.2672416917334107e-05,
      "loss": 0.2486,
      "step": 98000
    },
    {
      "epoch": 5.349771887899196,
      "grad_norm": 3.390826463699341,
      "learning_rate": 2.1766390026455985e-05,
      "loss": 0.2478,
      "step": 98500
    },
    {
      "epoch": 5.376928090375841,
      "grad_norm": 2.6151483058929443,
      "learning_rate": 2.0860363135577865e-05,
      "loss": 0.2424,
      "step": 99000
    },
    {
      "epoch": 5.400879860960243,
      "eval_loss": 0.43580135703086853,
      "eval_runtime": 23.7346,
      "eval_samples_per_second": 15.547,
      "eval_steps_per_second": 7.795,
      "step": 99441
    },
    {
      "epoch": 5.404084292852487,
      "grad_norm": 3.701735496520996,
      "learning_rate": 1.9954336244699743e-05,
      "loss": 0.2443,
      "step": 99500
    },
    {
      "epoch": 5.431240495329133,
      "grad_norm": 3.8400754928588867,
      "learning_rate": 1.9048309353821623e-05,
      "loss": 0.2276,
      "step": 100000
    },
    {
      "epoch": 5.458396697805779,
      "grad_norm": 2.5460264682769775,
      "learning_rate": 1.81422824629435e-05,
      "loss": 0.2313,
      "step": 100500
    },
    {
      "epoch": 5.485552900282425,
      "grad_norm": 5.040457725524902,
      "learning_rate": 1.7236255572065378e-05,
      "loss": 0.238,
      "step": 101000
    },
    {
      "epoch": 5.51270910275907,
      "grad_norm": 4.061932563781738,
      "learning_rate": 1.633022868118726e-05,
      "loss": 0.2558,
      "step": 101500
    },
    {
      "epoch": 5.539865305235716,
      "grad_norm": 4.28571081161499,
      "learning_rate": 1.5424201790309136e-05,
      "loss": 0.2531,
      "step": 102000
    },
    {
      "epoch": 5.5670215077123615,
      "grad_norm": 4.26746129989624,
      "learning_rate": 1.4519986953212772e-05,
      "loss": 0.2487,
      "step": 102500
    },
    {
      "epoch": 5.594177710189007,
      "grad_norm": 1.4005869626998901,
      "learning_rate": 1.3613960062334651e-05,
      "loss": 0.2528,
      "step": 103000
    },
    {
      "epoch": 5.600912448403215,
      "eval_loss": 0.4331228733062744,
      "eval_runtime": 25.1727,
      "eval_samples_per_second": 14.659,
      "eval_steps_per_second": 7.349,
      "step": 103124
    },
    {
      "epoch": 5.621333912665653,
      "grad_norm": 3.8620026111602783,
      "learning_rate": 1.2707933171456529e-05,
      "loss": 0.248,
      "step": 103500
    },
    {
      "epoch": 5.648490115142298,
      "grad_norm": 4.398037433624268,
      "learning_rate": 1.1803718334360163e-05,
      "loss": 0.2394,
      "step": 104000
    },
    {
      "epoch": 5.675646317618944,
      "grad_norm": 2.4203145503997803,
      "learning_rate": 1.0897691443482042e-05,
      "loss": 0.2344,
      "step": 104500
    },
    {
      "epoch": 5.70280252009559,
      "grad_norm": 3.2735469341278076,
      "learning_rate": 9.991664552603922e-06,
      "loss": 0.2391,
      "step": 105000
    },
    {
      "epoch": 5.729958722572236,
      "grad_norm": 3.202352523803711,
      "learning_rate": 9.0856376617258e-06,
      "loss": 0.2503,
      "step": 105500
    },
    {
      "epoch": 5.757114925048882,
      "grad_norm": 2.457843065261841,
      "learning_rate": 8.17961077084768e-06,
      "loss": 0.233,
      "step": 106000
    },
    {
      "epoch": 5.7842711275255265,
      "grad_norm": 2.1440610885620117,
      "learning_rate": 7.273583879969558e-06,
      "loss": 0.2477,
      "step": 106500
    },
    {
      "epoch": 5.800945035846187,
      "eval_loss": 0.43289270997047424,
      "eval_runtime": 25.7135,
      "eval_samples_per_second": 14.35,
      "eval_steps_per_second": 7.195,
      "step": 106807
    },
    {
      "epoch": 5.811427330002172,
      "grad_norm": 2.6855876445770264,
      "learning_rate": 6.367556989091436e-06,
      "loss": 0.231,
      "step": 107000
    },
    {
      "epoch": 5.838583532478818,
      "grad_norm": 5.511388778686523,
      "learning_rate": 5.461530098213316e-06,
      "loss": 0.2399,
      "step": 107500
    },
    {
      "epoch": 5.865739734955464,
      "grad_norm": 2.992866277694702,
      "learning_rate": 4.555503207335194e-06,
      "loss": 0.2367,
      "step": 108000
    },
    {
      "epoch": 5.89289593743211,
      "grad_norm": 2.2536861896514893,
      "learning_rate": 3.651288370238829e-06,
      "loss": 0.2545,
      "step": 108500
    },
    {
      "epoch": 5.920052139908755,
      "grad_norm": 3.6174511909484863,
      "learning_rate": 2.745261479360707e-06,
      "loss": 0.2576,
      "step": 109000
    },
    {
      "epoch": 5.947208342385401,
      "grad_norm": 2.4859135150909424,
      "learning_rate": 1.8392345884825864e-06,
      "loss": 0.2448,
      "step": 109500
    },
    {
      "epoch": 5.974364544862047,
      "grad_norm": 1.783007025718689,
      "learning_rate": 9.350197513862211e-07,
      "loss": 0.2347,
      "step": 110000
    },
    {
      "epoch": 6.0,
      "step": 110472,
      "total_flos": 7.299634402197504e+17,
      "train_loss": 0.3804842073002838,
      "train_runtime": 59722.3514,
      "train_samples_per_second": 3.699,
      "train_steps_per_second": 1.85
    }
  ],
  "logging_steps": 500,
  "max_steps": 110472,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.299634402197504e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
} | |
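
A minimal sketch (not part of the checkpoint itself) of how a trainer_state.json like the one above can be summarized: entries in "log_history" with a "loss" key are periodic training logs, while entries with an "eval_loss" key come from evaluation passes. The filename and working directory are assumptions.

import json

# Load the Trainer state saved alongside the checkpoint (assumed to be in the
# current directory; adjust the path for your checkpoint layout).
with open("trainer_state.json") as f:
    state = json.load(f)

# Split the mixed log: training entries carry "loss", eval entries "eval_loss".
# The final summary entry uses "train_loss" and is excluded from both lists.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"trained {state['epoch']} epochs, {state['global_step']} steps")
for e in eval_logs:
    print(f"step {e['step']:>6}  epoch {e['epoch']:.2f}  eval_loss {e['eval_loss']:.4f}")

Run against the state above, this prints one line per evaluation, making the eval-loss trend (0.636 at step 3683 down to roughly 0.433 by the final epochs) easy to scan.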