|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 228, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 99.10416793823242, |
|
"epoch": 0.013157894736842105, |
|
"grad_norm": 1.1334748507973709, |
|
"kl": 0.0, |
|
"learning_rate": 7.142857142857142e-08, |
|
"loss": -0.0, |
|
"reward": 0.6663198471069336, |
|
"reward_std": 0.4498612582683563, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.4163198471069336, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 91.58333587646484, |
|
"epoch": 0.02631578947368421, |
|
"grad_norm": 16.060906203241963, |
|
"kl": 0.0, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": -0.0, |
|
"reward": 0.9228171110153198, |
|
"reward_std": 0.4594407230615616, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.38115040957927704, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 72.02083587646484, |
|
"epoch": 0.039473684210526314, |
|
"grad_norm": 3.1753635339709736, |
|
"kl": 0.00012445449829101562, |
|
"learning_rate": 2.1428571428571426e-07, |
|
"loss": 0.0, |
|
"reward": 0.6041666865348816, |
|
"reward_std": 0.25199543684720993, |
|
"rewards/correct_code_reward_func": 0.1666666679084301, |
|
"rewards/len_reward_func": 0.4375000149011612, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 87.33333396911621, |
|
"epoch": 0.05263157894736842, |
|
"grad_norm": 5.760299497142578, |
|
"kl": -1.5407800674438477e-05, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": -0.0, |
|
"reward": 0.7341760993003845, |
|
"reward_std": 0.37608518451452255, |
|
"rewards/correct_code_reward_func": 0.31250002048909664, |
|
"rewards/len_reward_func": 0.4216760843992233, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 71.85416793823242, |
|
"epoch": 0.06578947368421052, |
|
"grad_norm": 12.020598074533623, |
|
"kl": 0.00014781951904296875, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 0.0, |
|
"reward": 0.5998998582363129, |
|
"reward_std": 0.20088782534003258, |
|
"rewards/correct_code_reward_func": 0.14583333395421505, |
|
"rewards/len_reward_func": 0.45406651496887207, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 89.77083587646484, |
|
"epoch": 0.07894736842105263, |
|
"grad_norm": 0.8550201889336397, |
|
"kl": 4.374980926513672e-05, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.5605520606040955, |
|
"reward_std": 0.2179059162735939, |
|
"rewards/correct_code_reward_func": 0.1041666679084301, |
|
"rewards/len_reward_func": 0.45638537406921387, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 72.79167175292969, |
|
"epoch": 0.09210526315789473, |
|
"grad_norm": 3.0170118497826293, |
|
"kl": 0.00016558170318603516, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.9927114844322205, |
|
"reward_std": 0.40002472698688507, |
|
"rewards/correct_code_reward_func": 0.5625, |
|
"rewards/len_reward_func": 0.4302114397287369, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 81.66667175292969, |
|
"epoch": 0.10526315789473684, |
|
"grad_norm": 4.916856440024865, |
|
"kl": 7.331371307373047e-05, |
|
"learning_rate": 4.999747408985249e-07, |
|
"loss": 0.0, |
|
"reward": 0.9147771000862122, |
|
"reward_std": 0.43089577555656433, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.39394378662109375, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 106.1875057220459, |
|
"epoch": 0.11842105263157894, |
|
"grad_norm": 7.051099519560641, |
|
"kl": 0.00026702880859375, |
|
"learning_rate": 4.998989686982771e-07, |
|
"loss": 0.0, |
|
"reward": 0.9007994532585144, |
|
"reward_std": 0.30253712832927704, |
|
"rewards/correct_code_reward_func": 0.4791666716337204, |
|
"rewards/len_reward_func": 0.4216327518224716, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 89.25, |
|
"epoch": 0.13157894736842105, |
|
"grad_norm": 3.0953263442901586, |
|
"kl": 0.00014472007751464844, |
|
"learning_rate": 4.997726987107581e-07, |
|
"loss": 0.0, |
|
"reward": 0.8582921326160431, |
|
"reward_std": 0.4781641513109207, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.4416254311800003, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 61.0625, |
|
"epoch": 0.14473684210526316, |
|
"grad_norm": 26.321796409287952, |
|
"kl": 0.00046253204345703125, |
|
"learning_rate": 4.995959564516996e-07, |
|
"loss": 0.0, |
|
"reward": 1.1543937921524048, |
|
"reward_std": 0.46843117475509644, |
|
"rewards/correct_code_reward_func": 0.7708333432674408, |
|
"rewards/len_reward_func": 0.3835604339838028, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 57.687503814697266, |
|
"epoch": 0.15789473684210525, |
|
"grad_norm": 2.6899417118195452, |
|
"kl": 0.000301361083984375, |
|
"learning_rate": 4.993687776359066e-07, |
|
"loss": 0.0, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.37565621733665466, |
|
"rewards/correct_code_reward_func": 0.395833358168602, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 160.35416793823242, |
|
"epoch": 0.17105263157894737, |
|
"grad_norm": 2.247154751130096, |
|
"kl": 0.0006351470947265625, |
|
"learning_rate": 4.990912081700413e-07, |
|
"loss": 0.0, |
|
"reward": 0.5879067778587341, |
|
"reward_std": 0.3584153801202774, |
|
"rewards/correct_code_reward_func": 0.2291666679084301, |
|
"rewards/len_reward_func": 0.3587401211261749, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 54.916669845581055, |
|
"epoch": 0.18421052631578946, |
|
"grad_norm": 7.0002606987315685, |
|
"kl": 0.000507354736328125, |
|
"learning_rate": 4.987633041433461e-07, |
|
"loss": 0.0, |
|
"reward": 0.9894838631153107, |
|
"reward_std": 0.5111564844846725, |
|
"rewards/correct_code_reward_func": 0.5416666716337204, |
|
"rewards/len_reward_func": 0.4478171765804291, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 63.06250190734863, |
|
"epoch": 0.19736842105263158, |
|
"grad_norm": 2.6443252242541284, |
|
"kl": 0.001739501953125, |
|
"learning_rate": 4.983851318163097e-07, |
|
"loss": 0.0, |
|
"reward": 0.8326185643672943, |
|
"reward_std": 0.29923243820667267, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.4367852360010147, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 99.39583587646484, |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 1.0079451181922614, |
|
"kl": 0.00141143798828125, |
|
"learning_rate": 4.979567676072775e-07, |
|
"loss": 0.0, |
|
"reward": 0.852995753288269, |
|
"reward_std": 0.34659716486930847, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.45716238021850586, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 53.08333396911621, |
|
"epoch": 0.2236842105263158, |
|
"grad_norm": 9.967406022456908, |
|
"kl": 0.0040435791015625, |
|
"learning_rate": 4.9747829807701e-07, |
|
"loss": 0.0, |
|
"reward": 0.8658536076545715, |
|
"reward_std": 0.426578089594841, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.44918693602085114, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 143.7916717529297, |
|
"epoch": 0.23684210526315788, |
|
"grad_norm": 1.9448270150109064, |
|
"kl": 0.001434326171875, |
|
"learning_rate": 4.9694981991119e-07, |
|
"loss": 0.0, |
|
"reward": 0.7298745512962341, |
|
"reward_std": 0.28270113095641136, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.39654120802879333, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 61.041669845581055, |
|
"epoch": 0.25, |
|
"grad_norm": 3.1720724842373182, |
|
"kl": 0.00389862060546875, |
|
"learning_rate": 4.963714399008868e-07, |
|
"loss": 0.0, |
|
"reward": 0.718452662229538, |
|
"reward_std": 0.37869784235954285, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.46845264732837677, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 79.89583587646484, |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 1.7632061082251036, |
|
"kl": 0.00446319580078125, |
|
"learning_rate": 4.957432749209755e-07, |
|
"loss": 0.0, |
|
"reward": 0.7348142862319946, |
|
"reward_std": 0.1320388102903962, |
|
"rewards/correct_code_reward_func": 0.25, |
|
"rewards/len_reward_func": 0.48481428623199463, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 81.18750381469727, |
|
"epoch": 0.27631578947368424, |
|
"grad_norm": 2.5858288436684593, |
|
"kl": 0.00498199462890625, |
|
"learning_rate": 4.950654519065195e-07, |
|
"loss": 0.0, |
|
"reward": 0.6811683177947998, |
|
"reward_std": 0.1932540312409401, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.472834974527359, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 87.56250381469727, |
|
"epoch": 0.2894736842105263, |
|
"grad_norm": 18.2585507937141, |
|
"kl": 0.03173828125, |
|
"learning_rate": 4.943381078271214e-07, |
|
"loss": 0.0, |
|
"reward": 0.6875, |
|
"reward_std": 0.28126099705696106, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 134.64583587646484, |
|
"epoch": 0.3026315789473684, |
|
"grad_norm": 17.07507069634021, |
|
"kl": 0.12744140625, |
|
"learning_rate": 4.935613896592445e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9976747632026672, |
|
"reward_std": 0.22606945782899857, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.45600807666778564, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 52.25000190734863, |
|
"epoch": 0.3157894736842105, |
|
"grad_norm": 5.933528514518437, |
|
"kl": 0.05426025390625, |
|
"learning_rate": 4.92735454356513e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.2994871214032173, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 96.29167175292969, |
|
"epoch": 0.32894736842105265, |
|
"grad_norm": 0.7449673513689808, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 4.918604688179958e-07, |
|
"loss": 0.0, |
|
"reward": 0.5824915617704391, |
|
"reward_std": 0.1475645862519741, |
|
"rewards/correct_code_reward_func": 0.1041666716337204, |
|
"rewards/len_reward_func": 0.47832491993904114, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 57.04166793823242, |
|
"epoch": 0.34210526315789475, |
|
"grad_norm": 7.282800644316476, |
|
"kl": 0.05474853515625, |
|
"learning_rate": 4.90936609854481e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.4474997818470001, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 69.29166984558105, |
|
"epoch": 0.35526315789473684, |
|
"grad_norm": 11.501413945029437, |
|
"kl": 0.037353515625, |
|
"learning_rate": 4.89964064152747e-07, |
|
"loss": 0.0, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.2730504274368286, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 52.708335876464844, |
|
"epoch": 0.3684210526315789, |
|
"grad_norm": 1.4375830290890173, |
|
"kl": 0.05767822265625, |
|
"learning_rate": 4.889430282378384e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7291666865348816, |
|
"reward_std": 0.21322893351316452, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.4583333432674408, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 84.875, |
|
"epoch": 0.3815789473684211, |
|
"grad_norm": 1.8947890037281183, |
|
"kl": 0.079345703125, |
|
"learning_rate": 4.878737084333535e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7291666865348816, |
|
"reward_std": 0.30859363824129105, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.4583333432674408, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 91.70833587646484, |
|
"epoch": 0.39473684210526316, |
|
"grad_norm": 1.2731739041206802, |
|
"kl": 0.03851318359375, |
|
"learning_rate": 4.867563208197519e-07, |
|
"loss": 0.0, |
|
"reward": 0.799913227558136, |
|
"reward_std": 0.3069130629301071, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.4457465559244156, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 72.29166793823242, |
|
"epoch": 0.40789473684210525, |
|
"grad_norm": 1.4676508584854064, |
|
"kl": 0.04327392578125, |
|
"learning_rate": 4.855910911906906e-07, |
|
"loss": 0.0, |
|
"reward": 0.9658634960651398, |
|
"reward_std": 0.3308482989668846, |
|
"rewards/correct_code_reward_func": 0.520833358168602, |
|
"rewards/len_reward_func": 0.4450301378965378, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 76.93750381469727, |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 4.605001948885057, |
|
"kl": 0.087890625, |
|
"learning_rate": 4.843782550073969e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.22516431659460068, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 55.77083396911621, |
|
"epoch": 0.4342105263157895, |
|
"grad_norm": 1.0747031450058906, |
|
"kl": 0.234375, |
|
"learning_rate": 4.83118057351089e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6458333432674408, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.1666666716337204, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 58.02083396911621, |
|
"epoch": 0.4473684210526316, |
|
"grad_norm": 3.450736535043091, |
|
"kl": 0.06573486328125, |
|
"learning_rate": 4.818107528734503e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9100490808486938, |
|
"reward_std": 0.49446116387844086, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.47254903614521027, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 64.47916793823242, |
|
"epoch": 0.4605263157894737, |
|
"grad_norm": 3.8860905534674637, |
|
"kl": 0.0714111328125, |
|
"learning_rate": 4.804566057451729e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8531287312507629, |
|
"reward_std": 0.3388620540499687, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.49896204471588135, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 49.06250286102295, |
|
"epoch": 0.47368421052631576, |
|
"grad_norm": 118.2111884355624, |
|
"kl": 0.9837646484375, |
|
"learning_rate": 4.790558896025742e-07, |
|
"loss": 0.001, |
|
"reward": 0.8689024448394775, |
|
"reward_std": 0.31673362106084824, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.49390244483947754, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 82.68750381469727, |
|
"epoch": 0.4868421052631579, |
|
"grad_norm": 1.2725798743612686, |
|
"kl": 0.0997314453125, |
|
"learning_rate": 4.776088874923041e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7270315587520599, |
|
"reward_std": 0.36151498556137085, |
|
"rewards/correct_code_reward_func": 0.229166679084301, |
|
"rewards/len_reward_func": 0.49786490201950073, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 67.29166984558105, |
|
"epoch": 0.5, |
|
"grad_norm": 0.6682530578144538, |
|
"kl": 0.24072265625, |
|
"learning_rate": 4.7611589181414734e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.19500282034277916, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 82.89583587646484, |
|
"epoch": 0.5131578947368421, |
|
"grad_norm": 1.896117750609313, |
|
"kl": 0.109375, |
|
"learning_rate": 4.745772042619388e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7291666865348816, |
|
"reward_std": 0.28126102685928345, |
|
"rewards/correct_code_reward_func": 0.229166679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 58.06250190734863, |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 118.0980466511993, |
|
"kl": 44.5732421875, |
|
"learning_rate": 4.729931357625986e-07, |
|
"loss": 0.0447, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.3177001550793648, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 74.12500381469727, |
|
"epoch": 0.5394736842105263, |
|
"grad_norm": 0.7236725923896844, |
|
"kl": 0.41094970703125, |
|
"learning_rate": 4.7136400641330245e-07, |
|
"loss": 0.0004, |
|
"reward": 0.6250000149011612, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/correct_code_reward_func": 0.1458333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 46.895835876464844, |
|
"epoch": 0.5526315789473685, |
|
"grad_norm": 4.631491683010619, |
|
"kl": 1.0185546875, |
|
"learning_rate": 4.696901454167988e-07, |
|
"loss": 0.001, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.2903675064444542, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 60.79166793823242, |
|
"epoch": 0.5657894736842105, |
|
"grad_norm": 3.3754705022635965, |
|
"kl": 0.15283203125, |
|
"learning_rate": 4.6797189101488576e-07, |
|
"loss": 0.0002, |
|
"reward": 0.979166716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 43.583335876464844, |
|
"epoch": 0.5789473684210527, |
|
"grad_norm": 95.77831861506337, |
|
"kl": 2.21484375, |
|
"learning_rate": 4.662095904200617e-07, |
|
"loss": 0.0022, |
|
"reward": 0.9375000596046448, |
|
"reward_std": 0.3310800567269325, |
|
"rewards/correct_code_reward_func": 0.4375000111758709, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 64.0625, |
|
"epoch": 0.5921052631578947, |
|
"grad_norm": 0.9899941998859535, |
|
"kl": 0.0555419921875, |
|
"learning_rate": 4.6440359974536304e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7708333730697632, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 53.04166793823242, |
|
"epoch": 0.6052631578947368, |
|
"grad_norm": 3.552556057212125, |
|
"kl": 0.229736328125, |
|
"learning_rate": 4.6255428393240354e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.19500282034277916, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 40.52083396911621, |
|
"epoch": 0.618421052631579, |
|
"grad_norm": 27.248705920564117, |
|
"kl": 7.90234375, |
|
"learning_rate": 4.606620166776294e-07, |
|
"loss": 0.0079, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.16623876243829727, |
|
"rewards/correct_code_reward_func": 0.3750000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 54.68750190734863, |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 6.398561296574553, |
|
"kl": 0.744384765625, |
|
"learning_rate": 4.587271803568055e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5416666865348816, |
|
"reward_std": 0.1451837718486786, |
|
"rewards/correct_code_reward_func": 0.0625, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 48.29166793823242, |
|
"epoch": 0.6447368421052632, |
|
"grad_norm": 4.1122533170941535, |
|
"kl": 0.148681640625, |
|
"learning_rate": 4.567501659477476e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8958333730697632, |
|
"reward_std": 0.3205290287733078, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 39.3125, |
|
"epoch": 0.6578947368421053, |
|
"grad_norm": 53.38113189457624, |
|
"kl": 13.2625732421875, |
|
"learning_rate": 4.5473137295131625e-07, |
|
"loss": 0.0133, |
|
"reward": 0.9375, |
|
"reward_std": 0.22516433894634247, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 42.83333396911621, |
|
"epoch": 0.6710526315789473, |
|
"grad_norm": 18178.959749921032, |
|
"kl": 4768.09375, |
|
"learning_rate": 4.526712093106887e-07, |
|
"loss": 4.7525, |
|
"reward": 1.0208333432674408, |
|
"reward_std": 0.2041093371808529, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 71.66667175292969, |
|
"epoch": 0.6842105263157895, |
|
"grad_norm": 1.9579852812558598, |
|
"kl": 0.474609375, |
|
"learning_rate": 4.5057009132892455e-07, |
|
"loss": 0.0005, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.4583333432674408, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 57.83333396911621, |
|
"epoch": 0.6973684210526315, |
|
"grad_norm": 3.4888745953712426, |
|
"kl": 0.9111328125, |
|
"learning_rate": 4.4842844358484233e-07, |
|
"loss": 0.0009, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.22516431659460068, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 86.72916793823242, |
|
"epoch": 0.7105263157894737, |
|
"grad_norm": 3.0697908089632087, |
|
"kl": 0.12158203125, |
|
"learning_rate": 4.4624669884722364e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.31142252683639526, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 53.895835876464844, |
|
"epoch": 0.7236842105263158, |
|
"grad_norm": 1.4628769742470222, |
|
"kl": 0.137939453125, |
|
"learning_rate": 4.4402529798736214e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0, |
|
"reward_std": 0.2994871214032173, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 51.770835876464844, |
|
"epoch": 0.7368421052631579, |
|
"grad_norm": 19.218612806133308, |
|
"kl": 6.68017578125, |
|
"learning_rate": 4.417646898899758e-07, |
|
"loss": 0.0067, |
|
"reward": 0.7708333432674408, |
|
"reward_std": 0.2041093371808529, |
|
"rewards/correct_code_reward_func": 0.2708333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 48.875, |
|
"epoch": 0.75, |
|
"grad_norm": 1.1426530146279454, |
|
"kl": 0.115234375, |
|
"learning_rate": 4.394653313624992e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.3478616699576378, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 39.91666793823242, |
|
"epoch": 0.7631578947368421, |
|
"grad_norm": 1.424776813399897, |
|
"kl": 2.8671875, |
|
"learning_rate": 4.3712768704277524e-07, |
|
"loss": 0.0029, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.22233545035123825, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 70.31250190734863, |
|
"epoch": 0.7763157894736842, |
|
"grad_norm": 3.1751573512407827, |
|
"kl": 0.115234375, |
|
"learning_rate": 4.3475222930516473e-07, |
|
"loss": 0.0001, |
|
"reward": 0.873985230922699, |
|
"reward_std": 0.25253836810588837, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.4781518578529358, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 54.54166793823242, |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 58.36302253834897, |
|
"kl": 1.03515625, |
|
"learning_rate": 4.32339438165092e-07, |
|
"loss": 0.001, |
|
"reward": 0.916666716337204, |
|
"reward_std": 0.42927367985248566, |
|
"rewards/correct_code_reward_func": 0.416666679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 63.583335876464844, |
|
"epoch": 0.8026315789473685, |
|
"grad_norm": 2.827911135736344, |
|
"kl": 0.29736328125, |
|
"learning_rate": 4.29889801182047e-07, |
|
"loss": 0.0003, |
|
"reward": 0.791666716337204, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.29166667722165585, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 65.41666793823242, |
|
"epoch": 0.8157894736842105, |
|
"grad_norm": 1.5958266464282538, |
|
"kl": 0.065673828125, |
|
"learning_rate": 4.274038133610628e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 0.15430335700511932, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 31.270834922790527, |
|
"epoch": 0.8289473684210527, |
|
"grad_norm": 6.489845534414021, |
|
"kl": 1.37109375, |
|
"learning_rate": 4.248819770526884e-07, |
|
"loss": 0.0014, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.376638799905777, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 27.750000953674316, |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 1.7297180083254418, |
|
"kl": 4.314453125, |
|
"learning_rate": 4.223248018514777e-07, |
|
"loss": 0.0043, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.30859364569187164, |
|
"rewards/correct_code_reward_func": 0.6458333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 47.35416793823242, |
|
"epoch": 0.8552631578947368, |
|
"grad_norm": 15.360105674599206, |
|
"kl": 0.5107421875, |
|
"learning_rate": 4.1973280449301364e-07, |
|
"loss": 0.0005, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.3177001550793648, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 34.500000953674316, |
|
"epoch": 0.868421052631579, |
|
"grad_norm": 0.8012424945081584, |
|
"kl": 0.1363525390625, |
|
"learning_rate": 4.171065087494909e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7083333730697632, |
|
"reward_std": 0.07715167850255966, |
|
"rewards/correct_code_reward_func": 0.2083333395421505, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 68.00000190734863, |
|
"epoch": 0.881578947368421, |
|
"grad_norm": 1.8876444286085294, |
|
"kl": 0.2359619140625, |
|
"learning_rate": 4.144464453238748e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.36751919239759445, |
|
"rewards/correct_code_reward_func": 0.4791666716337204, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 111.93750762939453, |
|
"epoch": 0.8947368421052632, |
|
"grad_norm": 0.7306458054174331, |
|
"kl": 0.102294921875, |
|
"learning_rate": 4.1175315174266135e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5625000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/correct_code_reward_func": 0.06250000186264515, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 92.27083969116211, |
|
"epoch": 0.9078947368421053, |
|
"grad_norm": 1.8168097569883273, |
|
"kl": 0.185546875, |
|
"learning_rate": 4.090271722472576e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.28126100450754166, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 86.77083587646484, |
|
"epoch": 0.9210526315789473, |
|
"grad_norm": 2.3675233810359515, |
|
"kl": 1.0478515625, |
|
"learning_rate": 4.062690576840051e-07, |
|
"loss": 0.001, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.32964862883090973, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 57.66666793823242, |
|
"epoch": 0.9342105263157895, |
|
"grad_norm": 51.022759357987056, |
|
"kl": 8.755859375, |
|
"learning_rate": 4.0347936539286874e-07, |
|
"loss": 0.0088, |
|
"reward": 1.0, |
|
"reward_std": 0.19500282034277916, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 79.37500381469727, |
|
"epoch": 0.9473684210526315, |
|
"grad_norm": 1.1663678155323538, |
|
"kl": 0.1044921875, |
|
"learning_rate": 4.006586590948141e-07, |
|
"loss": 0.0001, |
|
"reward": 1.048139214515686, |
|
"reward_std": 0.17669584602117538, |
|
"rewards/correct_code_reward_func": 0.5625000149011612, |
|
"rewards/len_reward_func": 0.48563915491104126, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 35.62500190734863, |
|
"epoch": 0.9605263157894737, |
|
"grad_norm": 1.1660255366197148, |
|
"kl": 0.2127685546875, |
|
"learning_rate": 3.9780750877789394e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9375000596046448, |
|
"reward_std": 0.24056154489517212, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 63.70833396911621, |
|
"epoch": 0.9736842105263158, |
|
"grad_norm": 1.1033519234822637, |
|
"kl": 0.115234375, |
|
"learning_rate": 3.9492649058206964e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 0.3491683229804039, |
|
"rewards/correct_code_reward_func": 0.3541666679084301, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 76.02083396911621, |
|
"epoch": 0.9868421052631579, |
|
"grad_norm": 1.0147417730649864, |
|
"kl": 0.085693359375, |
|
"learning_rate": 3.920161866827889e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9223043918609619, |
|
"reward_std": 0.26814398169517517, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.4848043918609619, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 37.04166793823242, |
|
"epoch": 1.0, |
|
"grad_norm": 53.40321846717754, |
|
"kl": 1.2109375, |
|
"learning_rate": 3.8907718517334396e-07, |
|
"loss": 0.0012, |
|
"reward": 1.125, |
|
"reward_std": 0.2994871288537979, |
|
"rewards/correct_code_reward_func": 0.625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 109.77083587646484, |
|
"epoch": 1.013157894736842, |
|
"grad_norm": 1.2927063784991473, |
|
"kl": 0.1328125, |
|
"learning_rate": 3.861100799460336e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6458333730697632, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.1458333395421505, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 55.6875, |
|
"epoch": 1.0263157894736843, |
|
"grad_norm": 1.118771460054152, |
|
"kl": 0.130859375, |
|
"learning_rate": 3.831154705721541e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0, |
|
"reward_std": 0.2342708371579647, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 46.25000190734863, |
|
"epoch": 1.0394736842105263, |
|
"grad_norm": 6.523486555479139, |
|
"kl": 0.462158203125, |
|
"learning_rate": 3.800939621808419e-07, |
|
"loss": 0.0005, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.22233543917536736, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 77.52083587646484, |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 1.2611203201996803, |
|
"kl": 0.10211181640625, |
|
"learning_rate": 3.7704616533679334e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7708333432674408, |
|
"reward_std": 0.438380166888237, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 31.666666984558105, |
|
"epoch": 1.0657894736842106, |
|
"grad_norm": 6780.981931743591, |
|
"kl": 2705.0, |
|
"learning_rate": 3.7397269591688663e-07, |
|
"loss": 2.7153, |
|
"reward": 0.9375, |
|
"reward_std": 0.13607724383473396, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 60.08333396911621, |
|
"epoch": 1.0789473684210527, |
|
"grad_norm": 1.4225091724950767, |
|
"kl": 0.09033203125, |
|
"learning_rate": 3.7087417498572944e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.34018656611442566, |
|
"rewards/correct_code_reward_func": 0.2916666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 47.95833396911621, |
|
"epoch": 1.0921052631578947, |
|
"grad_norm": 39.92181658742887, |
|
"kl": 21.05078125, |
|
"learning_rate": 3.6775122867015865e-07, |
|
"loss": 0.021, |
|
"reward": 1.1666667461395264, |
|
"reward_std": 0.34503278136253357, |
|
"rewards/correct_code_reward_func": 0.6666666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 30.291667938232422, |
|
"epoch": 1.1052631578947367, |
|
"grad_norm": 1.8228407124659103, |
|
"kl": 0.380859375, |
|
"learning_rate": 3.6460448803271754e-07, |
|
"loss": 0.0004, |
|
"reward": 1.1875000596046448, |
|
"reward_std": 0.2931964099407196, |
|
"rewards/correct_code_reward_func": 0.6875000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 48.1875, |
|
"epoch": 1.118421052631579, |
|
"grad_norm": 0.7759500560241989, |
|
"kl": 0.359130859375, |
|
"learning_rate": 3.614345889441346e-07, |
|
"loss": 0.0004, |
|
"reward": 0.7019230872392654, |
|
"reward_std": 0.09528262168169022, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.49358974397182465, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 31.58333396911621, |
|
"epoch": 1.131578947368421, |
|
"grad_norm": 522.7574671201318, |
|
"kl": 133.271484375, |
|
"learning_rate": 3.5824217195483176e-07, |
|
"loss": 0.1335, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.13607724383473396, |
|
"rewards/correct_code_reward_func": 0.3541666679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 41.29166793823242, |
|
"epoch": 1.1447368421052633, |
|
"grad_norm": 1.2754536374690895, |
|
"kl": 0.64208984375, |
|
"learning_rate": 3.550278821654866e-07, |
|
"loss": 0.0006, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.520833358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 49.52083396911621, |
|
"epoch": 1.1578947368421053, |
|
"grad_norm": 1.1388596254339887, |
|
"kl": 0.1265869140625, |
|
"learning_rate": 3.5179236909667464e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/correct_code_reward_func": 0.5, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 56.104169845581055, |
|
"epoch": 1.1710526315789473, |
|
"grad_norm": 4.8354182485526875, |
|
"kl": 1.86669921875, |
|
"learning_rate": 3.485362865576194e-07, |
|
"loss": 0.0019, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.3541666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 55.562503814697266, |
|
"epoch": 1.1842105263157894, |
|
"grad_norm": 1.8509215827690364, |
|
"kl": 0.501953125, |
|
"learning_rate": 3.4526029251407505e-07, |
|
"loss": 0.0005, |
|
"reward": 0.6250000298023224, |
|
"reward_std": 0.22233543917536736, |
|
"rewards/correct_code_reward_func": 0.12500000558793545, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 56.89583396911621, |
|
"epoch": 1.1973684210526316, |
|
"grad_norm": 3.479508392096908, |
|
"kl": 0.2423095703125, |
|
"learning_rate": 3.4196504895536943e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.24383928626775742, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 49.79166793823242, |
|
"epoch": 1.2105263157894737, |
|
"grad_norm": 1.436722605439725, |
|
"kl": 0.130126953125, |
|
"learning_rate": 3.3865122176063385e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0625000298023224, |
|
"reward_std": 0.2041093371808529, |
|
"rewards/correct_code_reward_func": 0.5625000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 57.58333396911621, |
|
"epoch": 1.2236842105263157, |
|
"grad_norm": 3.889581407116151, |
|
"kl": 1.8251953125, |
|
"learning_rate": 3.3531948056424764e-07, |
|
"loss": 0.0018, |
|
"reward": 0.7708333730697632, |
|
"reward_std": 0.204109326004982, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 33.33333492279053, |
|
"epoch": 1.236842105263158, |
|
"grad_norm": 67.13793060279492, |
|
"kl": 26.9716796875, |
|
"learning_rate": 3.319704986205223e-07, |
|
"loss": 0.0269, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.22233543917536736, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 68.54166793823242, |
|
"epoch": 1.25, |
|
"grad_norm": 0.8593629362399875, |
|
"kl": 0.2412109375, |
|
"learning_rate": 3.2860495266765597e-07, |
|
"loss": 0.0002, |
|
"reward": 0.75, |
|
"reward_std": 0.26726123690605164, |
|
"rewards/correct_code_reward_func": 0.25, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 114.70833587646484, |
|
"epoch": 1.263157894736842, |
|
"grad_norm": 0.4762384719502881, |
|
"kl": 0.31884765625, |
|
"learning_rate": 3.252235227909825e-07, |
|
"loss": 0.0003, |
|
"reward": 0.5833333432674408, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.08333333395421505, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 69.75, |
|
"epoch": 1.2763157894736843, |
|
"grad_norm": 0.9237831880116578, |
|
"kl": 0.156494140625, |
|
"learning_rate": 3.2182689228554515e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0416667461395264, |
|
"reward_std": 0.24966806918382645, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 56.708335876464844, |
|
"epoch": 1.2894736842105263, |
|
"grad_norm": 86.96758033365275, |
|
"kl": 16.03466796875, |
|
"learning_rate": 3.184157475180207e-07, |
|
"loss": 0.016, |
|
"reward": 1.0416667461395264, |
|
"reward_std": 0.2994871214032173, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 66.27083587646484, |
|
"epoch": 1.3026315789473684, |
|
"grad_norm": 2.4485962428130286, |
|
"kl": 0.5283203125, |
|
"learning_rate": 3.1499077778802387e-07, |
|
"loss": 0.0005, |
|
"reward": 0.8958333730697632, |
|
"reward_std": 0.4068002998828888, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 53.770835876464844, |
|
"epoch": 1.3157894736842106, |
|
"grad_norm": 2.0445270805974585, |
|
"kl": 1.6337890625, |
|
"learning_rate": 3.115526751888181e-07, |
|
"loss": 0.0016, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.30859364569187164, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 119.14583587646484, |
|
"epoch": 1.3289473684210527, |
|
"grad_norm": 1.2383154787127502, |
|
"kl": 0.08740234375, |
|
"learning_rate": 3.081021344674632e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9476010203361511, |
|
"reward_std": 0.2919590622186661, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.4892676770687103, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 101.10416793823242, |
|
"epoch": 1.3421052631578947, |
|
"grad_norm": 408.9106300781395, |
|
"kl": 77.0654296875, |
|
"learning_rate": 3.0463985288442474e-07, |
|
"loss": 0.0775, |
|
"reward": 0.5625, |
|
"reward_std": 0.08625819534063339, |
|
"rewards/correct_code_reward_func": 0.0625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 106.85416984558105, |
|
"epoch": 1.3552631578947367, |
|
"grad_norm": 1.2917146685193897, |
|
"kl": 0.157958984375, |
|
"learning_rate": 3.011665300726775e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.28126102685928345, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 54.458335876464844, |
|
"epoch": 1.368421052631579, |
|
"grad_norm": 488.53335548972836, |
|
"kl": 259.2265625, |
|
"learning_rate": 2.976828678963284e-07, |
|
"loss": 0.2594, |
|
"reward": 0.6458333432674408, |
|
"reward_std": 0.28126102685928345, |
|
"rewards/correct_code_reward_func": 0.1666666716337204, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 78.89583396911621, |
|
"epoch": 1.381578947368421, |
|
"grad_norm": 2.403194346252636, |
|
"kl": 0.272705078125, |
|
"learning_rate": 2.941895703087887e-07, |
|
"loss": 0.0003, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.2903675436973572, |
|
"rewards/correct_code_reward_func": 0.2083333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 70.89583587646484, |
|
"epoch": 1.3947368421052633, |
|
"grad_norm": 5.590757587987271, |
|
"kl": 0.32861328125, |
|
"learning_rate": 2.906873432105244e-07, |
|
"loss": 0.0003, |
|
"reward": 0.979166716337204, |
|
"reward_std": 0.48676779866218567, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 66.0625, |
|
"epoch": 1.4078947368421053, |
|
"grad_norm": 1.8305343201041324, |
|
"kl": 0.4862060546875, |
|
"learning_rate": 2.871768943064129e-07, |
|
"loss": 0.0005, |
|
"reward": 1.0000000596046448, |
|
"reward_std": 0.2721545100212097, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 54.06250190734863, |
|
"epoch": 1.4210526315789473, |
|
"grad_norm": 0.6615096170114014, |
|
"kl": 0.1812744140625, |
|
"learning_rate": 2.8365893296273487e-07, |
|
"loss": 0.0002, |
|
"reward": 0.875, |
|
"reward_std": 0.28215693682432175, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 73.29166984558105, |
|
"epoch": 1.4342105263157894, |
|
"grad_norm": 13.375888897910476, |
|
"kl": 2.66064453125, |
|
"learning_rate": 2.801341700638307e-07, |
|
"loss": 0.0027, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.07715167850255966, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 70.45833587646484, |
|
"epoch": 1.4473684210526316, |
|
"grad_norm": 0.8193690701749492, |
|
"kl": 0.42578125, |
|
"learning_rate": 2.766033178684506e-07, |
|
"loss": 0.0004, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.24339043349027634, |
|
"rewards/correct_code_reward_func": 0.2500000111758709, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 78.39583587646484, |
|
"epoch": 1.4605263157894737, |
|
"grad_norm": 811.8284865728368, |
|
"kl": 149.04150390625, |
|
"learning_rate": 2.730670898658255e-07, |
|
"loss": 0.1496, |
|
"reward": 0.5625, |
|
"reward_std": 0.08625819534063339, |
|
"rewards/correct_code_reward_func": 0.0625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 91.58333396911621, |
|
"epoch": 1.4736842105263157, |
|
"grad_norm": 44.2662547081144, |
|
"kl": 3.12255859375, |
|
"learning_rate": 2.6952620063149116e-07, |
|
"loss": 0.0031, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.3731769919395447, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 58.895835876464844, |
|
"epoch": 1.486842105263158, |
|
"grad_norm": 4.041649610812466, |
|
"kl": 0.2001953125, |
|
"learning_rate": 2.6598136568289144e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7708333730697632, |
|
"reward_std": 0.22516431659460068, |
|
"rewards/correct_code_reward_func": 0.2708333395421505, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 62.20833396911621, |
|
"epoch": 1.5, |
|
"grad_norm": 2.891055687279037, |
|
"kl": 0.6484375, |
|
"learning_rate": 2.624333013347917e-07, |
|
"loss": 0.0006, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.1480126492679119, |
|
"rewards/correct_code_reward_func": 0.3958333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 45.31250190734863, |
|
"epoch": 1.513157894736842, |
|
"grad_norm": 0.7564583724153957, |
|
"kl": 0.769287109375, |
|
"learning_rate": 2.5888272455453133e-07, |
|
"loss": 0.0008, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 59.270835876464844, |
|
"epoch": 1.526315789473684, |
|
"grad_norm": 1.5880223387121624, |
|
"kl": 1.41796875, |
|
"learning_rate": 2.5533035281714365e-07, |
|
"loss": 0.0014, |
|
"reward": 0.6041666865348816, |
|
"reward_std": 0.25392838567495346, |
|
"rewards/correct_code_reward_func": 0.1041666679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 57.583335876464844, |
|
"epoch": 1.5394736842105263, |
|
"grad_norm": 16.81790414189794, |
|
"kl": 10.380859375, |
|
"learning_rate": 2.5177690396037436e-07, |
|
"loss": 0.0104, |
|
"reward": 1.0625, |
|
"reward_std": 0.1480126492679119, |
|
"rewards/correct_code_reward_func": 0.5625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 103.41667175292969, |
|
"epoch": 1.5526315789473686, |
|
"grad_norm": 6.8725768759144845, |
|
"kl": 0.1064453125, |
|
"learning_rate": 2.482230960396256e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.4173382371664047, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 64.25000190734863, |
|
"epoch": 1.5657894736842106, |
|
"grad_norm": 1.6956587222545918, |
|
"kl": 0.2694091796875, |
|
"learning_rate": 2.4466964718285633e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.21322892233729362, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 48.208335876464844, |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 0.8688231987899243, |
|
"kl": 0.05615234375, |
|
"learning_rate": 2.411172754454688e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.2994871288537979, |
|
"rewards/correct_code_reward_func": 0.5000000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 31.479166984558105, |
|
"epoch": 1.5921052631578947, |
|
"grad_norm": 1.0911022619943564, |
|
"kl": 0.193359375, |
|
"learning_rate": 2.375666986652083e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4166666865348816, |
|
"reward_std": 0.19500282034277916, |
|
"rewards/correct_code_reward_func": 0.9166666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 49.91666793823242, |
|
"epoch": 1.6052631578947367, |
|
"grad_norm": 10.939422149518188, |
|
"kl": 0.50537109375, |
|
"learning_rate": 2.3401863431710862e-07, |
|
"loss": 0.0005, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.28408990427851677, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 47.72916793823242, |
|
"epoch": 1.618421052631579, |
|
"grad_norm": 1.2714992325565007, |
|
"kl": 0.097412109375, |
|
"learning_rate": 2.3047379936850882e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.2931964248418808, |
|
"rewards/correct_code_reward_func": 0.645833358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 76.47916984558105, |
|
"epoch": 1.631578947368421, |
|
"grad_norm": 18.033727216842927, |
|
"kl": 5.46875, |
|
"learning_rate": 2.2693291013417452e-07, |
|
"loss": 0.0055, |
|
"reward": 0.8958333730697632, |
|
"reward_std": 0.3177132308483124, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 57.583335876464844, |
|
"epoch": 1.6447368421052633, |
|
"grad_norm": 10.905825675003781, |
|
"kl": 10.9765625, |
|
"learning_rate": 2.2339668213154941e-07, |
|
"loss": 0.011, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.28408990427851677, |
|
"rewards/correct_code_reward_func": 0.2916666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 38.31250190734863, |
|
"epoch": 1.6578947368421053, |
|
"grad_norm": 1.7809035413529761, |
|
"kl": 0.0531005859375, |
|
"learning_rate": 2.1986582993616925e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.3535194396972656, |
|
"rewards/correct_code_reward_func": 0.520833358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 60.60416793823242, |
|
"epoch": 1.6710526315789473, |
|
"grad_norm": 0.8551909967151841, |
|
"kl": 0.1240234375, |
|
"learning_rate": 2.1634106703726518e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9375, |
|
"reward_std": 0.30231600999832153, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 146.62500762939453, |
|
"epoch": 1.6842105263157894, |
|
"grad_norm": 0.5963722529446751, |
|
"kl": 0.032470703125, |
|
"learning_rate": 2.1282310569358704e-07, |
|
"loss": 0.0, |
|
"reward": 0.854166716337204, |
|
"reward_std": 0.23144196718931198, |
|
"rewards/correct_code_reward_func": 0.354166679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 79.31250381469727, |
|
"epoch": 1.6973684210526314, |
|
"grad_norm": 11.657983637810885, |
|
"kl": 5.33203125, |
|
"learning_rate": 2.093126567894755e-07, |
|
"loss": 0.0053, |
|
"reward": 0.75, |
|
"reward_std": 0.22233545035123825, |
|
"rewards/correct_code_reward_func": 0.25, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 40.54166793823242, |
|
"epoch": 1.7105263157894737, |
|
"grad_norm": 2.9861227183589163, |
|
"kl": 1.69921875, |
|
"learning_rate": 2.0581042969121132e-07, |
|
"loss": 0.0017, |
|
"reward": 0.9375, |
|
"reward_std": 0.320529043674469, |
|
"rewards/correct_code_reward_func": 0.4375, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 35.64583396911621, |
|
"epoch": 1.723684210526316, |
|
"grad_norm": 1.119123230040456, |
|
"kl": 1.509521484375, |
|
"learning_rate": 2.023171321036716e-07, |
|
"loss": 0.0015, |
|
"reward": 1.3541666865348816, |
|
"reward_std": 0.23144196718931198, |
|
"rewards/correct_code_reward_func": 0.8541666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 63.04166793823242, |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 1.328286772215648, |
|
"kl": 0.2412109375, |
|
"learning_rate": 1.9883346992732254e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.4096161276102066, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 50.29166793823242, |
|
"epoch": 1.75, |
|
"grad_norm": 16.589627428183928, |
|
"kl": 14.0625, |
|
"learning_rate": 1.9536014711557526e-07, |
|
"loss": 0.0141, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 80.91666793823242, |
|
"epoch": 1.763157894736842, |
|
"grad_norm": 0.6658576872819127, |
|
"kl": 0.183837890625, |
|
"learning_rate": 1.9189786553253687e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.25392838940024376, |
|
"rewards/correct_code_reward_func": 0.4375000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 44.06250190734863, |
|
"epoch": 1.776315789473684, |
|
"grad_norm": 20.037402776427545, |
|
"kl": 1.3486328125, |
|
"learning_rate": 1.884473248111818e-07, |
|
"loss": 0.0013, |
|
"reward": 1.0208333432674408, |
|
"reward_std": 0.32964862883090973, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 38.9375, |
|
"epoch": 1.7894736842105263, |
|
"grad_norm": 1.4127858285242418, |
|
"kl": 0.097900390625, |
|
"learning_rate": 1.8500922221197619e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.320529043674469, |
|
"rewards/correct_code_reward_func": 0.6041666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 61.75, |
|
"epoch": 1.8026315789473686, |
|
"grad_norm": 19.7909578978993, |
|
"kl": 0.2247314453125, |
|
"learning_rate": 1.8158425248197928e-07, |
|
"loss": 0.0002, |
|
"reward": 1.1041667461395264, |
|
"reward_std": 0.3219604715704918, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 60.625, |
|
"epoch": 1.8157894736842106, |
|
"grad_norm": 1.5841720040680045, |
|
"kl": 0.15380859375, |
|
"learning_rate": 1.7817310771445488e-07, |
|
"loss": 0.0002, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.583333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 48.97916793823242, |
|
"epoch": 1.8289473684210527, |
|
"grad_norm": 2.2749307920378903, |
|
"kl": 1.35400390625, |
|
"learning_rate": 1.7477647720901746e-07, |
|
"loss": 0.0014, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.32964862883090973, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 29.666667938232422, |
|
"epoch": 1.8421052631578947, |
|
"grad_norm": 40.21756072339013, |
|
"kl": 2.6141357421875, |
|
"learning_rate": 1.7139504733234412e-07, |
|
"loss": 0.0026, |
|
"reward": 1.104166716337204, |
|
"reward_std": 0.30859363824129105, |
|
"rewards/correct_code_reward_func": 0.6041666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 84.70833587646484, |
|
"epoch": 1.8552631578947367, |
|
"grad_norm": 37.379935422229906, |
|
"kl": 7.72314453125, |
|
"learning_rate": 1.680295013794778e-07, |
|
"loss": 0.0078, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.4159068167209625, |
|
"rewards/correct_code_reward_func": 0.2916666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 116.58333969116211, |
|
"epoch": 1.868421052631579, |
|
"grad_norm": 21.10627645969623, |
|
"kl": 8.8427734375, |
|
"learning_rate": 1.646805194357524e-07, |
|
"loss": 0.0088, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 47.06250190734863, |
|
"epoch": 1.881578947368421, |
|
"grad_norm": 193.23023765526983, |
|
"kl": 42.0615234375, |
|
"learning_rate": 1.6134877823936607e-07, |
|
"loss": 0.0421, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.3794676959514618, |
|
"rewards/correct_code_reward_func": 0.4791666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 32.20833396911621, |
|
"epoch": 1.8947368421052633, |
|
"grad_norm": 1.3678936634574226, |
|
"kl": 0.09912109375, |
|
"learning_rate": 1.580349510446306e-07, |
|
"loss": 0.0001, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.2721545025706291, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 71.08333587646484, |
|
"epoch": 1.9078947368421053, |
|
"grad_norm": 1.2737509817094077, |
|
"kl": 0.795654296875, |
|
"learning_rate": 1.547397074859249e-07, |
|
"loss": 0.0008, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 0.0, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 65.10416984558105, |
|
"epoch": 1.9210526315789473, |
|
"grad_norm": 1.0612643430461282, |
|
"kl": 0.25439453125, |
|
"learning_rate": 1.514637134423806e-07, |
|
"loss": 0.0003, |
|
"reward": 0.6250000298023224, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.12500000558793545, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 50.70833396911621, |
|
"epoch": 1.9342105263157894, |
|
"grad_norm": 0.9018149237891075, |
|
"kl": 0.42431640625, |
|
"learning_rate": 1.482076309033254e-07, |
|
"loss": 0.0004, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.19500280916690826, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 77.93750381469727, |
|
"epoch": 1.9473684210526314, |
|
"grad_norm": 0.8274466826539039, |
|
"kl": 0.40673828125, |
|
"learning_rate": 1.4497211783451352e-07, |
|
"loss": 0.0004, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.22233543917536736, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 53.458335876464844, |
|
"epoch": 1.9605263157894737, |
|
"grad_norm": 47.18259760797994, |
|
"kl": 21.875, |
|
"learning_rate": 1.4175782804516822e-07, |
|
"loss": 0.0219, |
|
"reward": 1.0625, |
|
"reward_std": 0.42016713321208954, |
|
"rewards/correct_code_reward_func": 0.5625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 50.5625, |
|
"epoch": 1.973684210526316, |
|
"grad_norm": 3.162565983897937, |
|
"kl": 0.3603515625, |
|
"learning_rate": 1.3856541105586545e-07, |
|
"loss": 0.0004, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.2342708334326744, |
|
"rewards/correct_code_reward_func": 0.8333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 43.81250190734863, |
|
"epoch": 1.986842105263158, |
|
"grad_norm": 1.4665389553089878, |
|
"kl": 0.22265625, |
|
"learning_rate": 1.3539551196728249e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.3128539249300957, |
|
"rewards/correct_code_reward_func": 0.3750000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 54.72916793823242, |
|
"epoch": 2.0, |
|
"grad_norm": 1.6783852431327184, |
|
"kl": 0.2939453125, |
|
"learning_rate": 1.322487713298413e-07, |
|
"loss": 0.0003, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.33592625707387924, |
|
"rewards/correct_code_reward_func": 0.4791666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 65.62500381469727, |
|
"epoch": 2.013157894736842, |
|
"grad_norm": 0.870944500815664, |
|
"kl": 0.1328125, |
|
"learning_rate": 1.2912582501427061e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6041666865348816, |
|
"reward_std": 0.21322893351316452, |
|
"rewards/correct_code_reward_func": 0.1041666679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 48.68750286102295, |
|
"epoch": 2.026315789473684, |
|
"grad_norm": 32.567721507115095, |
|
"kl": 11.8994140625, |
|
"learning_rate": 1.260273040831134e-07, |
|
"loss": 0.0119, |
|
"reward": 0.8058712482452393, |
|
"reward_std": 0.07767461240291595, |
|
"rewards/correct_code_reward_func": 0.3125000149011612, |
|
"rewards/len_reward_func": 0.49337121844291687, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 55.52083396911621, |
|
"epoch": 2.039473684210526, |
|
"grad_norm": 5.082828282284441, |
|
"kl": 1.729736328125, |
|
"learning_rate": 1.2295383466320674e-07, |
|
"loss": 0.0017, |
|
"reward": 0.9278694987297058, |
|
"reward_std": 0.3939250260591507, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.4903694987297058, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 41.666666984558105, |
|
"epoch": 2.0526315789473686, |
|
"grad_norm": 4.028610834972993, |
|
"kl": 0.8974609375, |
|
"learning_rate": 1.1990603781915814e-07, |
|
"loss": 0.0009, |
|
"reward": 1.0000000596046448, |
|
"reward_std": 0.350690558552742, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 69.0625, |
|
"epoch": 2.0657894736842106, |
|
"grad_norm": 1.314671666949959, |
|
"kl": 0.165283203125, |
|
"learning_rate": 1.1688452942784591e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9375000596046448, |
|
"reward_std": 0.24056155234575272, |
|
"rewards/correct_code_reward_func": 0.4375000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 46.95833492279053, |
|
"epoch": 2.0789473684210527, |
|
"grad_norm": 2.979789760395441, |
|
"kl": 2.431640625, |
|
"learning_rate": 1.138899200539663e-07, |
|
"loss": 0.0024, |
|
"reward": 0.6875, |
|
"reward_std": 0.13607725501060486, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 41.72916793823242, |
|
"epoch": 2.0921052631578947, |
|
"grad_norm": 2.3176148256976137, |
|
"kl": 0.5712890625, |
|
"learning_rate": 1.10922814826656e-07, |
|
"loss": 0.0006, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.2903675064444542, |
|
"rewards/correct_code_reward_func": 0.2083333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 58.625003814697266, |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 1.4397948211346696, |
|
"kl": 0.56298828125, |
|
"learning_rate": 1.0798381331721107e-07, |
|
"loss": 0.0006, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.39140307903289795, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 106.64583396911621, |
|
"epoch": 2.1184210526315788, |
|
"grad_norm": 16.776871927679135, |
|
"kl": 4.4949951171875, |
|
"learning_rate": 1.0507350941793042e-07, |
|
"loss": 0.0045, |
|
"reward": 0.6666666865348816, |
|
"reward_std": 0.24339043349027634, |
|
"rewards/correct_code_reward_func": 0.1875, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 59.9375, |
|
"epoch": 2.1315789473684212, |
|
"grad_norm": 13.001510018724911, |
|
"kl": 9.50634765625, |
|
"learning_rate": 1.0219249122210619e-07, |
|
"loss": 0.0095, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.2840898856520653, |
|
"rewards/correct_code_reward_func": 0.3750000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 35.39583396911621, |
|
"epoch": 2.1447368421052633, |
|
"grad_norm": 2.008422665082507, |
|
"kl": 1.0521240234375, |
|
"learning_rate": 9.934134090518592e-08, |
|
"loss": 0.001, |
|
"reward": 1.1250000596046448, |
|
"reward_std": 0.2721545100212097, |
|
"rewards/correct_code_reward_func": 0.6250000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 41.72916793823242, |
|
"epoch": 2.1578947368421053, |
|
"grad_norm": 3.008857027150773, |
|
"kl": 0.1142578125, |
|
"learning_rate": 9.652063460713117e-08, |
|
"loss": 0.0001, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.15430335700511932, |
|
"rewards/correct_code_reward_func": 0.5000000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 52.54166793823242, |
|
"epoch": 2.1710526315789473, |
|
"grad_norm": 2.2942300832362656, |
|
"kl": 1.458984375, |
|
"learning_rate": 9.37309423159949e-08, |
|
"loss": 0.0015, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.2931964062154293, |
|
"rewards/correct_code_reward_func": 0.520833358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 67.10416793823242, |
|
"epoch": 2.1842105263157894, |
|
"grad_norm": 0.7847964783320711, |
|
"kl": 0.3076171875, |
|
"learning_rate": 9.097282775274238e-08, |
|
"loss": 0.0003, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.22516433894634247, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 62.458335876464844, |
|
"epoch": 2.1973684210526314, |
|
"grad_norm": 67.89371008742586, |
|
"kl": 38.031982421875, |
|
"learning_rate": 8.824684825733863e-08, |
|
"loss": 0.038, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.28408990800380707, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 47.79166793823242, |
|
"epoch": 2.2105263157894735, |
|
"grad_norm": 24.24181613825729, |
|
"kl": 12.8359375, |
|
"learning_rate": 8.555355467612527e-08, |
|
"loss": 0.0128, |
|
"reward": 0.7708333432674408, |
|
"reward_std": 0.2041093371808529, |
|
"rewards/correct_code_reward_func": 0.2708333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 44.68750190734863, |
|
"epoch": 2.223684210526316, |
|
"grad_norm": 237.339379162307, |
|
"kl": 158.1015625, |
|
"learning_rate": 8.289349125050913e-08, |
|
"loss": 0.1585, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.36751919239759445, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 66.95833587646484, |
|
"epoch": 2.236842105263158, |
|
"grad_norm": 1.2301856222728829, |
|
"kl": 0.20703125, |
|
"learning_rate": 8.026719550698627e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 40.08333396911621, |
|
"epoch": 2.25, |
|
"grad_norm": 2.792062080070077, |
|
"kl": 0.27099609375, |
|
"learning_rate": 7.767519814852233e-08, |
|
"loss": 0.0003, |
|
"reward": 1.1458333432674408, |
|
"reward_std": 0.34158404916524887, |
|
"rewards/correct_code_reward_func": 0.6458333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 64.04166984558105, |
|
"epoch": 2.263157894736842, |
|
"grad_norm": 1.3110888523745599, |
|
"kl": 0.81640625, |
|
"learning_rate": 7.511802294731159e-08, |
|
"loss": 0.0008, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.33875514566898346, |
|
"rewards/correct_code_reward_func": 0.2916666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 33.85416793823242, |
|
"epoch": 2.276315789473684, |
|
"grad_norm": 1.8190810652171645, |
|
"kl": 0.2119140625, |
|
"learning_rate": 7.259618663893724e-08, |
|
"loss": 0.0002, |
|
"reward": 1.1875000596046448, |
|
"reward_std": 0.2903806045651436, |
|
"rewards/correct_code_reward_func": 0.6875000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 49.06250190734863, |
|
"epoch": 2.2894736842105265, |
|
"grad_norm": 0.9842986413475313, |
|
"kl": 1.07421875, |
|
"learning_rate": 7.011019881795297e-08, |
|
"loss": 0.0011, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 0.17251639068126678, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 65.04166793823242, |
|
"epoch": 2.3026315789473686, |
|
"grad_norm": 4.44211038726181, |
|
"kl": 1.56640625, |
|
"learning_rate": 6.766056183490798e-08, |
|
"loss": 0.0016, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.2931964062154293, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 54.791669845581055, |
|
"epoch": 2.3157894736842106, |
|
"grad_norm": 2.5493155442330098, |
|
"kl": 1.79150390625, |
|
"learning_rate": 6.524777069483525e-08, |
|
"loss": 0.0018, |
|
"reward": 0.8958333730697632, |
|
"reward_std": 0.30231600999832153, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 62.083335876464844, |
|
"epoch": 2.3289473684210527, |
|
"grad_norm": 319.0871377811938, |
|
"kl": 194.21875, |
|
"learning_rate": 6.28723129572247e-08, |
|
"loss": 0.1953, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.31142252683639526, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 46.06250190734863, |
|
"epoch": 2.3421052631578947, |
|
"grad_norm": 6.171508505395978, |
|
"kl": 4.849609375, |
|
"learning_rate": 6.053466863750084e-08, |
|
"loss": 0.0048, |
|
"reward": 0.625, |
|
"reward_std": 0.16623875498771667, |
|
"rewards/correct_code_reward_func": 0.125, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 45.10416793823242, |
|
"epoch": 2.3552631578947367, |
|
"grad_norm": 7.899530101559774, |
|
"kl": 4.39013671875, |
|
"learning_rate": 5.8235310110024224e-08, |
|
"loss": 0.0044, |
|
"reward": 0.979166716337204, |
|
"reward_std": 0.30231600999832153, |
|
"rewards/correct_code_reward_func": 0.479166679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 63.95833396911621, |
|
"epoch": 2.3684210526315788, |
|
"grad_norm": 19.13925515641779, |
|
"kl": 6.4049072265625, |
|
"learning_rate": 5.597470201263782e-08, |
|
"loss": 0.0064, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.34018659591674805, |
|
"rewards/correct_code_reward_func": 0.4583333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 95.70833587646484, |
|
"epoch": 2.3815789473684212, |
|
"grad_norm": 2.2349939552948057, |
|
"kl": 0.345458984375, |
|
"learning_rate": 5.3753301152776347e-08, |
|
"loss": 0.0003, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.16623875498771667, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 59.375, |
|
"epoch": 2.3947368421052633, |
|
"grad_norm": 2.542273591175544, |
|
"kl": 0.755615234375, |
|
"learning_rate": 5.1571556415157646e-08, |
|
"loss": 0.0008, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.43129096925258636, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 66.375, |
|
"epoch": 2.4078947368421053, |
|
"grad_norm": 1.1341732814888108, |
|
"kl": 0.12890625, |
|
"learning_rate": 4.942990867107547e-08, |
|
"loss": 0.0001, |
|
"reward": 0.8541666865348816, |
|
"reward_std": 0.3177132308483124, |
|
"rewards/correct_code_reward_func": 0.3541666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 61.29166793823242, |
|
"epoch": 2.4210526315789473, |
|
"grad_norm": 484.5704062949344, |
|
"kl": 301.046875, |
|
"learning_rate": 4.732879068931131e-08, |
|
"loss": 0.3011, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.2630349025130272, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 43.58333396911621, |
|
"epoch": 2.4342105263157894, |
|
"grad_norm": 4.437414420601948, |
|
"kl": 3.1494140625, |
|
"learning_rate": 4.526862704868376e-08, |
|
"loss": 0.0032, |
|
"reward": 0.8750000596046448, |
|
"reward_std": 0.24966806918382645, |
|
"rewards/correct_code_reward_func": 0.3750000111758709, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 71.14583396911621, |
|
"epoch": 2.4473684210526314, |
|
"grad_norm": 90.82877113902249, |
|
"kl": 0.159912109375, |
|
"learning_rate": 4.324983405225235e-08, |
|
"loss": 0.0002, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.19500280916690826, |
|
"rewards/correct_code_reward_func": 0.458333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 78.04166793823242, |
|
"epoch": 2.4605263157894735, |
|
"grad_norm": 283.12841990209046, |
|
"kl": 57.2578125, |
|
"learning_rate": 4.1272819643194456e-08, |
|
"loss": 0.0573, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.4958743155002594, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 47.00000190734863, |
|
"epoch": 2.473684210526316, |
|
"grad_norm": 1.1669700382103856, |
|
"kl": 0.20751953125, |
|
"learning_rate": 3.933798332237059e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.1451837718486786, |
|
"rewards/correct_code_reward_func": 0.2500000074505806, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 110.87500381469727, |
|
"epoch": 2.486842105263158, |
|
"grad_norm": 1.125444972266526, |
|
"kl": 0.0595703125, |
|
"learning_rate": 3.74457160675965e-08, |
|
"loss": 0.0001, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.22233545035123825, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 122.5625, |
|
"epoch": 2.5, |
|
"grad_norm": 1.2043716850566297, |
|
"kl": 0.3837890625, |
|
"learning_rate": 3.5596400254637035e-08, |
|
"loss": 0.0004, |
|
"reward": 0.6041666865348816, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.1041666679084301, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 65.66666793823242, |
|
"epoch": 2.513157894736842, |
|
"grad_norm": 1.3296548993223685, |
|
"kl": 0.513916015625, |
|
"learning_rate": 3.379040957993834e-08, |
|
"loss": 0.0005, |
|
"reward": 0.8750000298023224, |
|
"reward_std": 0.2630349025130272, |
|
"rewards/correct_code_reward_func": 0.3750000223517418, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 48.041666984558105, |
|
"epoch": 2.526315789473684, |
|
"grad_norm": 284.52007908369717, |
|
"kl": 93.12890625, |
|
"learning_rate": 3.2028108985114235e-08, |
|
"loss": 0.0934, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.2122463434934616, |
|
"rewards/correct_code_reward_func": 0.5000000149011612, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 57.666666984558105, |
|
"epoch": 2.5394736842105265, |
|
"grad_norm": 5.140729023126808, |
|
"kl": 0.083740234375, |
|
"learning_rate": 3.030985458320118e-08, |
|
"loss": 0.0001, |
|
"reward": 1.1041666865348816, |
|
"reward_std": 0.24056155234575272, |
|
"rewards/correct_code_reward_func": 0.6041666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 71.08333587646484, |
|
"epoch": 2.5526315789473686, |
|
"grad_norm": 1.728517309004394, |
|
"kl": 2.6953125, |
|
"learning_rate": 2.863599358669755e-08, |
|
"loss": 0.0027, |
|
"reward": 0.7083333432674408, |
|
"reward_std": 0.2960253022611141, |
|
"rewards/correct_code_reward_func": 0.20833333395421505, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 45.83333396911621, |
|
"epoch": 2.5657894736842106, |
|
"grad_norm": 2.095306352134131, |
|
"kl": 2.639404296875, |
|
"learning_rate": 2.7006864237401423e-08, |
|
"loss": 0.0026, |
|
"reward": 1.0625, |
|
"reward_std": 0.3584126979112625, |
|
"rewards/correct_code_reward_func": 0.5625, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 55.18750190734863, |
|
"epoch": 2.5789473684210527, |
|
"grad_norm": 144.09595199637047, |
|
"kl": 78.4609375, |
|
"learning_rate": 2.542279573806122e-08, |
|
"loss": 0.0784, |
|
"reward": 0.5833333730697632, |
|
"reward_std": 0.15430335700511932, |
|
"rewards/correct_code_reward_func": 0.0833333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 55.37500190734863, |
|
"epoch": 2.5921052631578947, |
|
"grad_norm": 1.595820339822705, |
|
"kl": 0.9705810546875, |
|
"learning_rate": 2.3884108185852626e-08, |
|
"loss": 0.001, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.3233579248189926, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 91.14583587646484, |
|
"epoch": 2.6052631578947367, |
|
"grad_norm": 2.359631401946909, |
|
"kl": 0.7138671875, |
|
"learning_rate": 2.2391112507695876e-08, |
|
"loss": 0.0007, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.4130779355764389, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 31.812500953674316, |
|
"epoch": 2.6184210526315788, |
|
"grad_norm": 9.107326597955607, |
|
"kl": 4.916015625, |
|
"learning_rate": 2.094411039742569e-08, |
|
"loss": 0.0049, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.19500280916690826, |
|
"rewards/correct_code_reward_func": 0.7500000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 62.60416793823242, |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 2.350577703191847, |
|
"kl": 0.904296875, |
|
"learning_rate": 1.954339425482712e-08, |
|
"loss": 0.0009, |
|
"reward": 1.1875000596046448, |
|
"reward_std": 0.25392838567495346, |
|
"rewards/correct_code_reward_func": 0.6875000298023224, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 48.79166793823242, |
|
"epoch": 2.6447368421052633, |
|
"grad_norm": 3.477458112159059, |
|
"kl": 0.35595703125, |
|
"learning_rate": 1.818924712654965e-08, |
|
"loss": 0.0004, |
|
"reward": 0.8125, |
|
"reward_std": 0.23144196718931198, |
|
"rewards/correct_code_reward_func": 0.3125, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 74.52083587646484, |
|
"epoch": 2.6578947368421053, |
|
"grad_norm": 0.7180881930260916, |
|
"kl": 0.181884765625, |
|
"learning_rate": 1.6881942648911074e-08, |
|
"loss": 0.0002, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.16340987384319305, |
|
"rewards/correct_code_reward_func": 0.5416666865348816, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 69.25000381469727, |
|
"epoch": 2.6710526315789473, |
|
"grad_norm": 1.195470821288021, |
|
"kl": 0.26025390625, |
|
"learning_rate": 1.5621744992603047e-08, |
|
"loss": 0.0003, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.4446708858013153, |
|
"rewards/correct_code_reward_func": 0.4166666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 92.64583587646484, |
|
"epoch": 2.6842105263157894, |
|
"grad_norm": 0.5914043963888233, |
|
"kl": 0.1630859375, |
|
"learning_rate": 1.4408908809309422e-08, |
|
"loss": 0.0002, |
|
"reward": 0.7291666865348816, |
|
"reward_std": 0.08625819534063339, |
|
"rewards/correct_code_reward_func": 0.2291666716337204, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 67.5, |
|
"epoch": 2.6973684210526314, |
|
"grad_norm": 0.5891628759615603, |
|
"kl": 0.06036376953125, |
|
"learning_rate": 1.3243679180248075e-08, |
|
"loss": 0.0001, |
|
"reward": 0.9791666865348816, |
|
"reward_std": 0.22516432031989098, |
|
"rewards/correct_code_reward_func": 0.4791666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 37.52083396911621, |
|
"epoch": 2.7105263157894735, |
|
"grad_norm": 3.112281199267652, |
|
"kl": 2.693359375, |
|
"learning_rate": 1.2126291566646462e-08, |
|
"loss": 0.0027, |
|
"reward": 0.875, |
|
"reward_std": 0.1451837718486786, |
|
"rewards/correct_code_reward_func": 0.375, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 69.02083396911621, |
|
"epoch": 2.723684210526316, |
|
"grad_norm": 1.4964559285333816, |
|
"kl": 0.83251953125, |
|
"learning_rate": 1.1056971762161583e-08, |
|
"loss": 0.0008, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.376638799905777, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 41.68750190734863, |
|
"epoch": 2.736842105263158, |
|
"grad_norm": 47.69500240874892, |
|
"kl": 33.90625, |
|
"learning_rate": 1.0035935847253012e-08, |
|
"loss": 0.0339, |
|
"reward": 1.0208333730697632, |
|
"reward_std": 0.28126102685928345, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 49.895835876464844, |
|
"epoch": 2.75, |
|
"grad_norm": 3.62772385527898, |
|
"kl": 0.6689453125, |
|
"learning_rate": 9.063390145519018e-09, |
|
"loss": 0.0007, |
|
"reward": 1.1851695775985718, |
|
"reward_std": 0.3313498795032501, |
|
"rewards/correct_code_reward_func": 0.6875000298023224, |
|
"rewards/len_reward_func": 0.4976695030927658, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 78.97917175292969, |
|
"epoch": 2.763157894736842, |
|
"grad_norm": 13.13764745425958, |
|
"kl": 5.2373046875, |
|
"learning_rate": 8.139531182004222e-09, |
|
"loss": 0.0052, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.2994871288537979, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 33.35416793823242, |
|
"epoch": 2.776315789473684, |
|
"grad_norm": 1.5959152998105355, |
|
"kl": 1.5703125, |
|
"learning_rate": 7.2645456434869965e-09, |
|
"loss": 0.0016, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.2994871214032173, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 64.68750381469727, |
|
"epoch": 2.7894736842105265, |
|
"grad_norm": 109.88019118611281, |
|
"kl": 0.3125, |
|
"learning_rate": 6.438610340755462e-09, |
|
"loss": 0.0003, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.376638799905777, |
|
"rewards/correct_code_reward_func": 0.4166666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 73.08333587646484, |
|
"epoch": 2.8026315789473686, |
|
"grad_norm": 0.9800418792331905, |
|
"kl": 0.9185791015625, |
|
"learning_rate": 5.6618921728786015e-09, |
|
"loss": 0.0009, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.22233543917536736, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 61.625, |
|
"epoch": 2.8157894736842106, |
|
"grad_norm": 1.5620983389420056, |
|
"kl": 0.1494140625, |
|
"learning_rate": 4.934548093480511e-09, |
|
"loss": 0.0001, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.31142252683639526, |
|
"rewards/correct_code_reward_func": 0.4583333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 72.25000190734863, |
|
"epoch": 2.8289473684210527, |
|
"grad_norm": 0.976086181884898, |
|
"kl": 0.1263427734375, |
|
"learning_rate": 4.256725079024553e-09, |
|
"loss": 0.0001, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.3177132233977318, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 44.08333396911621, |
|
"epoch": 2.8421052631578947, |
|
"grad_norm": 1.4377976203336797, |
|
"kl": 0.2220458984375, |
|
"learning_rate": 3.6285600991131095e-09, |
|
"loss": 0.0002, |
|
"reward": 0.70163694024086, |
|
"reward_std": 0.09609203785657883, |
|
"rewards/correct_code_reward_func": 0.2083333395421505, |
|
"rewards/len_reward_func": 0.493303582072258, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 41.10416793823242, |
|
"epoch": 2.8552631578947367, |
|
"grad_norm": 1.728700023377306, |
|
"kl": 0.392578125, |
|
"learning_rate": 3.0501800888099726e-09, |
|
"loss": 0.0004, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.21322892233729362, |
|
"rewards/correct_code_reward_func": 0.3958333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 57.58333396911621, |
|
"epoch": 2.8684210526315788, |
|
"grad_norm": 2.083207612873304, |
|
"kl": 0.4912109375, |
|
"learning_rate": 2.5217019229900606e-09, |
|
"loss": 0.0005, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.22233545035123825, |
|
"rewards/correct_code_reward_func": 0.5833333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 39.54166793823242, |
|
"epoch": 2.8815789473684212, |
|
"grad_norm": 0.9145475753101857, |
|
"kl": 0.1865234375, |
|
"learning_rate": 2.043232392722388e-09, |
|
"loss": 0.0002, |
|
"reward": 1.0208333432674408, |
|
"reward_std": 0.2041093371808529, |
|
"rewards/correct_code_reward_func": 0.5208333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 49.37500190734863, |
|
"epoch": 2.8947368421052633, |
|
"grad_norm": 1.081844737309092, |
|
"kl": 0.203125, |
|
"learning_rate": 1.614868183690249e-09, |
|
"loss": 0.0002, |
|
"reward": 0.7500000298023224, |
|
"reward_std": 0.1451837606728077, |
|
"rewards/correct_code_reward_func": 0.2500000149011612, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 61.125003814697266, |
|
"epoch": 2.9078947368421053, |
|
"grad_norm": 2.1759674723609286, |
|
"kl": 1.7373046875, |
|
"learning_rate": 1.2366958566538688e-09, |
|
"loss": 0.0017, |
|
"reward": 0.6458333432674408, |
|
"reward_std": 0.13607725501060486, |
|
"rewards/correct_code_reward_func": 0.1458333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 73.47916984558105, |
|
"epoch": 2.9210526315789473, |
|
"grad_norm": 0.8216969877474546, |
|
"kl": 0.073974609375, |
|
"learning_rate": 9.087918299586772e-10, |
|
"loss": 0.0001, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.23709972202777863, |
|
"rewards/correct_code_reward_func": 0.6458333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 41.60416793823242, |
|
"epoch": 2.9342105263157894, |
|
"grad_norm": 1.2661888490492268, |
|
"kl": 0.1630859375, |
|
"learning_rate": 6.312223640933811e-10, |
|
"loss": 0.0002, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.24966806918382645, |
|
"rewards/correct_code_reward_func": 0.6666666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 58.895835876464844, |
|
"epoch": 2.9473684210526314, |
|
"grad_norm": 10.649202570243657, |
|
"kl": 1.75439453125, |
|
"learning_rate": 4.0404354830042563e-10, |
|
"loss": 0.0018, |
|
"reward": 0.6875000298023224, |
|
"reward_std": 0.30231600254774094, |
|
"rewards/correct_code_reward_func": 0.1875000111758709, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 73.64583587646484, |
|
"epoch": 2.9605263157894735, |
|
"grad_norm": 3.7010298062572695, |
|
"kl": 0.350341796875, |
|
"learning_rate": 2.2730128924186574e-10, |
|
"loss": 0.0004, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.19500282034277916, |
|
"rewards/correct_code_reward_func": 0.2916666865348816, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 47.64583396911621, |
|
"epoch": 2.973684210526316, |
|
"grad_norm": 2.797986645774147, |
|
"kl": 0.597412109375, |
|
"learning_rate": 1.010313017229536e-10, |
|
"loss": 0.0006, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.31142251938581467, |
|
"rewards/correct_code_reward_func": 0.7083333432674408, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 49.18750190734863, |
|
"epoch": 2.986842105263158, |
|
"grad_norm": 29.33011919112973, |
|
"kl": 2.07421875, |
|
"learning_rate": 2.5259101475161304e-11, |
|
"loss": 0.0021, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.3268197476863861, |
|
"rewards/correct_code_reward_func": 0.458333358168602, |
|
"rewards/len_reward_func": 0.5, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 75.37500381469727, |
|
"epoch": 3.0, |
|
"grad_norm": 1.4657552454229807, |
|
"kl": 0.4228515625, |
|
"learning_rate": 0.0, |
|
"loss": 0.0004, |
|
"reward": 0.8125, |
|
"reward_std": 0.24185511097311974, |
|
"rewards/correct_code_reward_func": 0.3333333432674408, |
|
"rewards/len_reward_func": 0.4791666716337204, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 228, |
|
"total_flos": 0.0, |
|
"train_loss": 0.04159710054096379, |
|
"train_runtime": 3805.4602, |
|
"train_samples_per_second": 0.359, |
|
"train_steps_per_second": 0.06 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 228, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 76, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|