|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"episode": 131072, |
|
"epoch": 1.122960932145305, |
|
"eval_steps": 500, |
|
"global_step": 256, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"episode": 512, |
|
"epoch": 0.004386566141192597, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0013769641518592834, |
|
"loss/value_avg": 1.9423742294311523, |
|
"lr": 3e-06, |
|
"objective/entropy": -56.19151306152344, |
|
"objective/kl": 2.391636371612549e-06, |
|
"objective/non_score_reward": -1.1958181289628556e-07, |
|
"objective/rlhf_reward": 4.956933975219727, |
|
"objective/scores": 4.956933975219727, |
|
"policy/approxkl_avg": 0.001525376457720995, |
|
"policy/clipfrac_avg": 0.022334059700369835, |
|
"policy/entropy_avg": 1.2976276874542236, |
|
"step": 1, |
|
"val/clipfrac_avg": 0.006576753221452236, |
|
"val/num_eos_tokens": 13845, |
|
"val/ratio": 0.9997893571853638, |
|
"val/ratio_var": 4.300739419704769e-06 |
|
}, |
|
{ |
|
"episode": 1024, |
|
"epoch": 0.008773132282385195, |
|
"eps": 6, |
|
"loss/policy_avg": 0.0031125713139772415, |
|
"loss/value_avg": 1.3287988901138306, |
|
"lr": 2.9882812500000002e-06, |
|
"objective/entropy": 7.951052665710449, |
|
"objective/kl": 0.7057143449783325, |
|
"objective/non_score_reward": -0.035285718739032745, |
|
"objective/rlhf_reward": 5.132782936096191, |
|
"objective/scores": 5.168068885803223, |
|
"policy/approxkl_avg": 0.0028971810825169086, |
|
"policy/clipfrac_avg": 0.02630750462412834, |
|
"policy/entropy_avg": 0.9777142405509949, |
|
"step": 2, |
|
"val/clipfrac_avg": 0.017064230516552925, |
|
"val/num_eos_tokens": 8612, |
|
"val/ratio": 1.000025749206543, |
|
"val/ratio_var": 1.2729425179713871e-05 |
|
}, |
|
{ |
|
"episode": 1536, |
|
"epoch": 0.013159698423577794, |
|
"eps": 6, |
|
"loss/policy_avg": 0.004569530487060547, |
|
"loss/value_avg": 1.8551630973815918, |
|
"lr": 2.9765625e-06, |
|
"objective/entropy": 6.086846351623535, |
|
"objective/kl": 1.7537505626678467, |
|
"objective/non_score_reward": -0.08768752217292786, |
|
"objective/rlhf_reward": 5.270642280578613, |
|
"objective/scores": 5.358329772949219, |
|
"policy/approxkl_avg": 0.0035523450933396816, |
|
"policy/clipfrac_avg": 0.02881922945380211, |
|
"policy/entropy_avg": 0.99476557970047, |
|
"step": 3, |
|
"val/clipfrac_avg": 0.023893361911177635, |
|
"val/num_eos_tokens": 8606, |
|
"val/ratio": 0.999504804611206, |
|
"val/ratio_var": 2.043731728917919e-05 |
|
}, |
|
{ |
|
"episode": 2048, |
|
"epoch": 0.01754626456477039, |
|
"eps": 6, |
|
"loss/policy_avg": 0.00686279870569706, |
|
"loss/value_avg": 2.0762438774108887, |
|
"lr": 2.96484375e-06, |
|
"objective/entropy": -72.34222412109375, |
|
"objective/kl": 2.7035207748413086, |
|
"objective/non_score_reward": -0.13517603278160095, |
|
"objective/rlhf_reward": 5.987885475158691, |
|
"objective/scores": 6.123061656951904, |
|
"policy/approxkl_avg": 0.0016497070901095867, |
|
"policy/clipfrac_avg": 0.02908516302704811, |
|
"policy/entropy_avg": 1.7031397819519043, |
|
"step": 4, |
|
"val/clipfrac_avg": 0.027677347883582115, |
|
"val/num_eos_tokens": 14870, |
|
"val/ratio": 1.000091552734375, |
|
"val/ratio_var": 2.1516761989914812e-06 |
|
}, |
|
{ |
|
"episode": 2560, |
|
"epoch": 0.02193283070596299, |
|
"eps": 6, |
|
"loss/policy_avg": 0.00263179000467062, |
|
"loss/value_avg": 1.606541633605957, |
|
"lr": 2.953125e-06, |
|
"objective/entropy": -65.28475952148438, |
|
"objective/kl": 4.556520462036133, |
|
"objective/non_score_reward": -0.22782602906227112, |
|
"objective/rlhf_reward": 6.3742218017578125, |
|
"objective/scores": 6.602047920227051, |
|
"policy/approxkl_avg": 0.0017431321321055293, |
|
"policy/clipfrac_avg": 0.02620251104235649, |
|
"policy/entropy_avg": 2.0425772666931152, |
|
"step": 5, |
|
"val/clipfrac_avg": 0.023111186921596527, |
|
"val/num_eos_tokens": 13325, |
|
"val/ratio": 0.9996423721313477, |
|
"val/ratio_var": 2.722375029406976e-06 |
|
}, |
|
{ |
|
"episode": 3072, |
|
"epoch": 0.026319396847155587, |
|
"eps": 6, |
|
"loss/policy_avg": 0.0040515996515750885, |
|
"loss/value_avg": 1.1282833814620972, |
|
"lr": 2.94140625e-06, |
|
"objective/entropy": -56.7620964050293, |
|
"objective/kl": 10.2549467086792, |
|
"objective/non_score_reward": -0.5127473473548889, |
|
"objective/rlhf_reward": 7.096510887145996, |
|
"objective/scores": 7.60925817489624, |
|
"policy/approxkl_avg": 0.002651135204359889, |
|
"policy/clipfrac_avg": 0.02610427513718605, |
|
"policy/entropy_avg": 2.47774600982666, |
|
"step": 6, |
|
"val/clipfrac_avg": 0.01816035807132721, |
|
"val/num_eos_tokens": 20916, |
|
"val/ratio": 0.9997857809066772, |
|
"val/ratio_var": 3.7805486954312073e-06 |
|
}, |
|
{ |
|
"episode": 3584, |
|
"epoch": 0.030705962988348184, |
|
"eps": 6, |
|
"loss/policy_avg": -0.0022932137362658978, |
|
"loss/value_avg": 0.9223657846450806, |
|
"lr": 2.9296875e-06, |
|
"objective/entropy": -51.8396110534668, |
|
"objective/kl": 15.692605972290039, |
|
"objective/non_score_reward": -0.784630298614502, |
|
"objective/rlhf_reward": 7.749014377593994, |
|
"objective/scores": 8.533644676208496, |
|
"policy/approxkl_avg": 0.002274099038913846, |
|
"policy/clipfrac_avg": 0.025012236088514328, |
|
"policy/entropy_avg": 2.5470399856567383, |
|
"step": 7, |
|
"val/clipfrac_avg": 0.018111344426870346, |
|
"val/num_eos_tokens": 27788, |
|
"val/ratio": 1.0004068613052368, |
|
"val/ratio_var": 3.3165326840389753e-06 |
|
}, |
|
{ |
|
"episode": 4096, |
|
"epoch": 0.03509252912954078, |
|
"eps": 5, |
|
"loss/policy_avg": -0.02864566072821617, |
|
"loss/value_avg": 0.8586666584014893, |
|
"lr": 2.91796875e-06, |
|
"objective/entropy": -44.25556564331055, |
|
"objective/kl": 18.9859619140625, |
|
"objective/non_score_reward": -0.9492980241775513, |
|
"objective/rlhf_reward": 7.566014289855957, |
|
"objective/scores": 8.515312194824219, |
|
"policy/approxkl_avg": 0.0024859202094376087, |
|
"policy/clipfrac_avg": 0.02374047227203846, |
|
"policy/entropy_avg": 2.4501914978027344, |
|
"step": 8, |
|
"val/clipfrac_avg": 0.012094689533114433, |
|
"val/num_eos_tokens": 39697, |
|
"val/ratio": 0.9997825622558594, |
|
"val/ratio_var": 2.589023324617301e-06 |
|
}, |
|
{ |
|
"episode": 4608, |
|
"epoch": 0.03947909527073338, |
|
"eps": 5, |
|
"loss/policy_avg": -0.03993895649909973, |
|
"loss/value_avg": 0.8039427995681763, |
|
"lr": 2.90625e-06, |
|
"objective/entropy": -41.25428771972656, |
|
"objective/kl": 21.300134658813477, |
|
"objective/non_score_reward": -1.0650067329406738, |
|
"objective/rlhf_reward": 7.658048152923584, |
|
"objective/scores": 8.723054885864258, |
|
"policy/approxkl_avg": 0.002434398978948593, |
|
"policy/clipfrac_avg": 0.020647358149290085, |
|
"policy/entropy_avg": 2.318600654602051, |
|
"step": 9, |
|
"val/clipfrac_avg": 0.012456808239221573, |
|
"val/num_eos_tokens": 38279, |
|
"val/ratio": 0.9994508028030396, |
|
"val/ratio_var": 2.497590912753367e-06 |
|
}, |
|
{ |
|
"episode": 5120, |
|
"epoch": 0.04386566141192598, |
|
"eps": 5, |
|
"loss/policy_avg": -0.038270700722932816, |
|
"loss/value_avg": 0.7831208109855652, |
|
"lr": 2.89453125e-06, |
|
"objective/entropy": -40.375099182128906, |
|
"objective/kl": 23.66585922241211, |
|
"objective/non_score_reward": -1.1832929849624634, |
|
"objective/rlhf_reward": 7.1149516105651855, |
|
"objective/scores": 8.29824447631836, |
|
"policy/approxkl_avg": 0.0023879953660070896, |
|
"policy/clipfrac_avg": 0.02045682817697525, |
|
"policy/entropy_avg": 2.171109199523926, |
|
"step": 10, |
|
"val/clipfrac_avg": 0.01572496071457863, |
|
"val/num_eos_tokens": 36087, |
|
"val/ratio": 0.9995651245117188, |
|
"val/ratio_var": 4.97030487167649e-06 |
|
}, |
|
{ |
|
"episode": 5632, |
|
"epoch": 0.048252227553118573, |
|
"eps": 5, |
|
"loss/policy_avg": -0.03474128991365433, |
|
"loss/value_avg": 0.7656794786453247, |
|
"lr": 2.8828125e-06, |
|
"objective/entropy": -43.84745788574219, |
|
"objective/kl": 24.729175567626953, |
|
"objective/non_score_reward": -1.2364587783813477, |
|
"objective/rlhf_reward": 7.341504096984863, |
|
"objective/scores": 8.577962875366211, |
|
"policy/approxkl_avg": 0.0027273856103420258, |
|
"policy/clipfrac_avg": 0.022427301853895187, |
|
"policy/entropy_avg": 2.208101272583008, |
|
"step": 11, |
|
"val/clipfrac_avg": 0.016871843487024307, |
|
"val/num_eos_tokens": 36666, |
|
"val/ratio": 0.9991950988769531, |
|
"val/ratio_var": 4.898628048977116e-06 |
|
}, |
|
{ |
|
"episode": 6144, |
|
"epoch": 0.052638793694311174, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0249569620937109, |
|
"loss/value_avg": 0.7158781290054321, |
|
"lr": 2.87109375e-06, |
|
"objective/entropy": -48.97076416015625, |
|
"objective/kl": 25.502344131469727, |
|
"objective/non_score_reward": -1.27511727809906, |
|
"objective/rlhf_reward": 7.928722858428955, |
|
"objective/scores": 9.203840255737305, |
|
"policy/approxkl_avg": 0.0023700755555182695, |
|
"policy/clipfrac_avg": 0.02416691742837429, |
|
"policy/entropy_avg": 2.2255916595458984, |
|
"step": 12, |
|
"val/clipfrac_avg": 0.014078151434659958, |
|
"val/num_eos_tokens": 37746, |
|
"val/ratio": 0.9998907446861267, |
|
"val/ratio_var": 5.1026213441218715e-06 |
|
}, |
|
{ |
|
"episode": 6656, |
|
"epoch": 0.05702535983550377, |
|
"eps": 5, |
|
"loss/policy_avg": -0.018949007615447044, |
|
"loss/value_avg": 0.7686400413513184, |
|
"lr": 2.859375e-06, |
|
"objective/entropy": -47.11943817138672, |
|
"objective/kl": 27.999988555908203, |
|
"objective/non_score_reward": -1.3999994993209839, |
|
"objective/rlhf_reward": 7.812624931335449, |
|
"objective/scores": 9.212624549865723, |
|
"policy/approxkl_avg": 0.002762872725725174, |
|
"policy/clipfrac_avg": 0.024748779833316803, |
|
"policy/entropy_avg": 2.155306339263916, |
|
"step": 13, |
|
"val/clipfrac_avg": 0.015288694761693478, |
|
"val/num_eos_tokens": 36580, |
|
"val/ratio": 1.000169277191162, |
|
"val/ratio_var": 5.652353593177395e-06 |
|
}, |
|
{ |
|
"episode": 7168, |
|
"epoch": 0.06141192597669637, |
|
"eps": 5, |
|
"loss/policy_avg": -0.009994697757065296, |
|
"loss/value_avg": 0.7283180952072144, |
|
"lr": 2.84765625e-06, |
|
"objective/entropy": -53.18806457519531, |
|
"objective/kl": 27.922462463378906, |
|
"objective/non_score_reward": -1.3961231708526611, |
|
"objective/rlhf_reward": 8.238632202148438, |
|
"objective/scores": 9.63475513458252, |
|
"policy/approxkl_avg": 0.002478944603353739, |
|
"policy/clipfrac_avg": 0.021354785189032555, |
|
"policy/entropy_avg": 2.150880813598633, |
|
"step": 14, |
|
"val/clipfrac_avg": 0.012931845150887966, |
|
"val/num_eos_tokens": 36004, |
|
"val/ratio": 1.0006251335144043, |
|
"val/ratio_var": 7.0911496550252195e-06 |
|
}, |
|
{ |
|
"episode": 7680, |
|
"epoch": 0.06579849211788896, |
|
"eps": 5, |
|
"loss/policy_avg": -0.006545604206621647, |
|
"loss/value_avg": 0.6837283968925476, |
|
"lr": 2.8359375e-06, |
|
"objective/entropy": -51.048362731933594, |
|
"objective/kl": 29.153331756591797, |
|
"objective/non_score_reward": -1.4576666355133057, |
|
"objective/rlhf_reward": 8.508513450622559, |
|
"objective/scores": 9.966179847717285, |
|
"policy/approxkl_avg": 0.003793728072196245, |
|
"policy/clipfrac_avg": 0.023298773914575577, |
|
"policy/entropy_avg": 2.1069443225860596, |
|
"step": 15, |
|
"val/clipfrac_avg": 0.016515308991074562, |
|
"val/num_eos_tokens": 33784, |
|
"val/ratio": 1.0005180835723877, |
|
"val/ratio_var": 1.2400751074892469e-05 |
|
}, |
|
{ |
|
"episode": 8192, |
|
"epoch": 0.07018505825908156, |
|
"eps": 5, |
|
"loss/policy_avg": -0.010740559548139572, |
|
"loss/value_avg": 0.6928367614746094, |
|
"lr": 2.82421875e-06, |
|
"objective/entropy": -50.633888244628906, |
|
"objective/kl": 29.69721794128418, |
|
"objective/non_score_reward": -1.484860897064209, |
|
"objective/rlhf_reward": 8.630350112915039, |
|
"objective/scores": 10.115211486816406, |
|
"policy/approxkl_avg": 0.002925678389146924, |
|
"policy/clipfrac_avg": 0.022546332329511642, |
|
"policy/entropy_avg": 2.0426688194274902, |
|
"step": 16, |
|
"val/clipfrac_avg": 0.01313821505755186, |
|
"val/num_eos_tokens": 34515, |
|
"val/ratio": 0.9999200105667114, |
|
"val/ratio_var": 3.5139350984536577e-06 |
|
}, |
|
{ |
|
"episode": 8704, |
|
"epoch": 0.07457162440027416, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0003042006865143776, |
|
"loss/value_avg": 0.630737841129303, |
|
"lr": 2.8125e-06, |
|
"objective/entropy": -54.703208923339844, |
|
"objective/kl": 29.418441772460938, |
|
"objective/non_score_reward": -1.4709219932556152, |
|
"objective/rlhf_reward": 8.782114028930664, |
|
"objective/scores": 10.253036499023438, |
|
"policy/approxkl_avg": 0.0034269755706191063, |
|
"policy/clipfrac_avg": 0.021969493478536606, |
|
"policy/entropy_avg": 2.0263655185699463, |
|
"step": 17, |
|
"val/clipfrac_avg": 0.013666579499840736, |
|
"val/num_eos_tokens": 32440, |
|
"val/ratio": 0.9990756511688232, |
|
"val/ratio_var": 5.783027063444024e-06 |
|
}, |
|
{ |
|
"episode": 9216, |
|
"epoch": 0.07895819054146676, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0013809381052851677, |
|
"loss/value_avg": 0.625673234462738, |
|
"lr": 2.80078125e-06, |
|
"objective/entropy": -50.05335235595703, |
|
"objective/kl": 32.24420166015625, |
|
"objective/non_score_reward": -1.6122100353240967, |
|
"objective/rlhf_reward": 8.869917869567871, |
|
"objective/scores": 10.482128143310547, |
|
"policy/approxkl_avg": 0.0037200015503913164, |
|
"policy/clipfrac_avg": 0.023221854120492935, |
|
"policy/entropy_avg": 1.9334304332733154, |
|
"step": 18, |
|
"val/clipfrac_avg": 0.012253889814019203, |
|
"val/num_eos_tokens": 32755, |
|
"val/ratio": 1.0010600090026855, |
|
"val/ratio_var": 1.8864358935388736e-05 |
|
}, |
|
{ |
|
"episode": 9728, |
|
"epoch": 0.08334475668265935, |
|
"eps": 5, |
|
"loss/policy_avg": -0.001600255724042654, |
|
"loss/value_avg": 0.5548046231269836, |
|
"lr": 2.7890625e-06, |
|
"objective/entropy": -52.448543548583984, |
|
"objective/kl": 31.141429901123047, |
|
"objective/non_score_reward": -1.5570714473724365, |
|
"objective/rlhf_reward": 9.025203704833984, |
|
"objective/scores": 10.582275390625, |
|
"policy/approxkl_avg": 0.00342507753521204, |
|
"policy/clipfrac_avg": 0.022851863875985146, |
|
"policy/entropy_avg": 1.9168764352798462, |
|
"step": 19, |
|
"val/clipfrac_avg": 0.011491503566503525, |
|
"val/num_eos_tokens": 33170, |
|
"val/ratio": 0.999735951423645, |
|
"val/ratio_var": 5.808557489217492e-06 |
|
}, |
|
{ |
|
"episode": 10240, |
|
"epoch": 0.08773132282385196, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00035352353006601334, |
|
"loss/value_avg": 0.5776740312576294, |
|
"lr": 2.77734375e-06, |
|
"objective/entropy": -49.64569854736328, |
|
"objective/kl": 32.26124572753906, |
|
"objective/non_score_reward": -1.6130623817443848, |
|
"objective/rlhf_reward": 9.020620346069336, |
|
"objective/scores": 10.633682250976562, |
|
"policy/approxkl_avg": 0.003732402576133609, |
|
"policy/clipfrac_avg": 0.02131691202521324, |
|
"policy/entropy_avg": 1.8495761156082153, |
|
"step": 20, |
|
"val/clipfrac_avg": 0.014484411105513573, |
|
"val/num_eos_tokens": 32124, |
|
"val/ratio": 1.000256061553955, |
|
"val/ratio_var": 1.3850245522917248e-05 |
|
}, |
|
{ |
|
"episode": 10752, |
|
"epoch": 0.09211788896504455, |
|
"eps": 5, |
|
"loss/policy_avg": 0.004326590336859226, |
|
"loss/value_avg": 0.5091855525970459, |
|
"lr": 2.765625e-06, |
|
"objective/entropy": -52.526283264160156, |
|
"objective/kl": 32.356727600097656, |
|
"objective/non_score_reward": -1.6178364753723145, |
|
"objective/rlhf_reward": 8.98788070678711, |
|
"objective/scores": 10.605716705322266, |
|
"policy/approxkl_avg": 0.002898063976317644, |
|
"policy/clipfrac_avg": 0.02005818486213684, |
|
"policy/entropy_avg": 1.8272948265075684, |
|
"step": 21, |
|
"val/clipfrac_avg": 0.010025454685091972, |
|
"val/num_eos_tokens": 31636, |
|
"val/ratio": 0.9998326301574707, |
|
"val/ratio_var": 4.743439149024198e-06 |
|
}, |
|
{ |
|
"episode": 11264, |
|
"epoch": 0.09650445510623715, |
|
"eps": 5, |
|
"loss/policy_avg": 0.001326502999290824, |
|
"loss/value_avg": 0.5260515213012695, |
|
"lr": 2.75390625e-06, |
|
"objective/entropy": -50.08943557739258, |
|
"objective/kl": 32.325721740722656, |
|
"objective/non_score_reward": -1.6162861585617065, |
|
"objective/rlhf_reward": 9.08689022064209, |
|
"objective/scores": 10.703176498413086, |
|
"policy/approxkl_avg": 0.004229161888360977, |
|
"policy/clipfrac_avg": 0.019012872129678726, |
|
"policy/entropy_avg": 1.7281861305236816, |
|
"step": 22, |
|
"val/clipfrac_avg": 0.012183602899312973, |
|
"val/num_eos_tokens": 30727, |
|
"val/ratio": 0.9997547268867493, |
|
"val/ratio_var": 5.818676982016768e-06 |
|
}, |
|
{ |
|
"episode": 11776, |
|
"epoch": 0.10089102124742974, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0023662205785512924, |
|
"loss/value_avg": 0.4883805811405182, |
|
"lr": 2.7421875e-06, |
|
"objective/entropy": -53.46156692504883, |
|
"objective/kl": 31.849475860595703, |
|
"objective/non_score_reward": -1.5924739837646484, |
|
"objective/rlhf_reward": 9.090110778808594, |
|
"objective/scores": 10.682584762573242, |
|
"policy/approxkl_avg": 0.003692931029945612, |
|
"policy/clipfrac_avg": 0.01948227360844612, |
|
"policy/entropy_avg": 1.7976675033569336, |
|
"step": 23, |
|
"val/clipfrac_avg": 0.012527575716376305, |
|
"val/num_eos_tokens": 31978, |
|
"val/ratio": 1.0005892515182495, |
|
"val/ratio_var": 5.0292437663301826e-05 |
|
}, |
|
{ |
|
"episode": 12288, |
|
"epoch": 0.10527758738862235, |
|
"eps": 5, |
|
"loss/policy_avg": -0.002815414220094681, |
|
"loss/value_avg": 0.47801172733306885, |
|
"lr": 2.73046875e-06, |
|
"objective/entropy": -54.20009231567383, |
|
"objective/kl": 30.75657081604004, |
|
"objective/non_score_reward": -1.5378286838531494, |
|
"objective/rlhf_reward": 9.155134201049805, |
|
"objective/scores": 10.692962646484375, |
|
"policy/approxkl_avg": 0.0031356574036180973, |
|
"policy/clipfrac_avg": 0.01814776286482811, |
|
"policy/entropy_avg": 1.7910568714141846, |
|
"step": 24, |
|
"val/clipfrac_avg": 0.0124925896525383, |
|
"val/num_eos_tokens": 30355, |
|
"val/ratio": 0.9999936819076538, |
|
"val/ratio_var": 3.8795255932200234e-06 |
|
}, |
|
{ |
|
"episode": 12800, |
|
"epoch": 0.10966415352981494, |
|
"eps": 5, |
|
"loss/policy_avg": 0.004107598215341568, |
|
"loss/value_avg": 0.42799514532089233, |
|
"lr": 2.71875e-06, |
|
"objective/entropy": -56.50166320800781, |
|
"objective/kl": 29.329524993896484, |
|
"objective/non_score_reward": -1.466476321220398, |
|
"objective/rlhf_reward": 9.282110214233398, |
|
"objective/scores": 10.748586654663086, |
|
"policy/approxkl_avg": 0.0033666701056063175, |
|
"policy/clipfrac_avg": 0.018708810210227966, |
|
"policy/entropy_avg": 1.836845874786377, |
|
"step": 25, |
|
"val/clipfrac_avg": 0.007121690083295107, |
|
"val/num_eos_tokens": 29066, |
|
"val/ratio": 0.9996716976165771, |
|
"val/ratio_var": 5.8772338888957165e-06 |
|
}, |
|
{ |
|
"episode": 13312, |
|
"epoch": 0.11405071967100754, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00938927847892046, |
|
"loss/value_avg": 0.3949218690395355, |
|
"lr": 2.70703125e-06, |
|
"objective/entropy": -60.591705322265625, |
|
"objective/kl": 28.337303161621094, |
|
"objective/non_score_reward": -1.4168651103973389, |
|
"objective/rlhf_reward": 9.06982707977295, |
|
"objective/scores": 10.486692428588867, |
|
"policy/approxkl_avg": 0.0028319661505520344, |
|
"policy/clipfrac_avg": 0.021056555211544037, |
|
"policy/entropy_avg": 1.9272569417953491, |
|
"step": 26, |
|
"val/clipfrac_avg": 0.00876462459564209, |
|
"val/num_eos_tokens": 28183, |
|
"val/ratio": 0.9999039173126221, |
|
"val/ratio_var": 4.642893145501148e-06 |
|
}, |
|
{ |
|
"episode": 13824, |
|
"epoch": 0.11843728581220014, |
|
"eps": 5, |
|
"loss/policy_avg": 0.006120752543210983, |
|
"loss/value_avg": 0.3891026973724365, |
|
"lr": 2.6953125e-06, |
|
"objective/entropy": -60.93333435058594, |
|
"objective/kl": 28.110050201416016, |
|
"objective/non_score_reward": -1.4055025577545166, |
|
"objective/rlhf_reward": 9.255789756774902, |
|
"objective/scores": 10.66129207611084, |
|
"policy/approxkl_avg": 0.0032745348289608955, |
|
"policy/clipfrac_avg": 0.0199459008872509, |
|
"policy/entropy_avg": 1.9059574604034424, |
|
"step": 27, |
|
"val/clipfrac_avg": 0.007393369916826487, |
|
"val/num_eos_tokens": 29010, |
|
"val/ratio": 0.999842643737793, |
|
"val/ratio_var": 8.019253073143773e-06 |
|
}, |
|
{ |
|
"episode": 14336, |
|
"epoch": 0.12282385195339274, |
|
"eps": 5, |
|
"loss/policy_avg": 0.008166640996932983, |
|
"loss/value_avg": 0.33365607261657715, |
|
"lr": 2.68359375e-06, |
|
"objective/entropy": -61.89513397216797, |
|
"objective/kl": 27.650426864624023, |
|
"objective/non_score_reward": -1.382521390914917, |
|
"objective/rlhf_reward": 9.297162055969238, |
|
"objective/scores": 10.679683685302734, |
|
"policy/approxkl_avg": 0.0037113018333911896, |
|
"policy/clipfrac_avg": 0.020008713006973267, |
|
"policy/entropy_avg": 1.9356969594955444, |
|
"step": 28, |
|
"val/clipfrac_avg": 0.008162135258316994, |
|
"val/num_eos_tokens": 25073, |
|
"val/ratio": 1.0000048875808716, |
|
"val/ratio_var": 6.840070909674978e-06 |
|
}, |
|
{ |
|
"episode": 14848, |
|
"epoch": 0.12721041809458533, |
|
"eps": 5, |
|
"loss/policy_avg": 0.01543242298066616, |
|
"loss/value_avg": 0.3317902088165283, |
|
"lr": 2.671875e-06, |
|
"objective/entropy": -53.287864685058594, |
|
"objective/kl": 28.098434448242188, |
|
"objective/non_score_reward": -1.4049216508865356, |
|
"objective/rlhf_reward": 9.339518547058105, |
|
"objective/scores": 10.744440078735352, |
|
"policy/approxkl_avg": 0.00311127002350986, |
|
"policy/clipfrac_avg": 0.02064812183380127, |
|
"policy/entropy_avg": 1.7959502935409546, |
|
"step": 29, |
|
"val/clipfrac_avg": 0.00857294537127018, |
|
"val/num_eos_tokens": 24235, |
|
"val/ratio": 1.000222086906433, |
|
"val/ratio_var": 5.494624929269776e-06 |
|
}, |
|
{ |
|
"episode": 15360, |
|
"epoch": 0.13159698423577793, |
|
"eps": 5, |
|
"loss/policy_avg": 0.010417070239782333, |
|
"loss/value_avg": 0.34546566009521484, |
|
"lr": 2.66015625e-06, |
|
"objective/entropy": -58.94676971435547, |
|
"objective/kl": 28.113197326660156, |
|
"objective/non_score_reward": -1.4056599140167236, |
|
"objective/rlhf_reward": 9.112338066101074, |
|
"objective/scores": 10.517997741699219, |
|
"policy/approxkl_avg": 0.003508294001221657, |
|
"policy/clipfrac_avg": 0.021732624620199203, |
|
"policy/entropy_avg": 1.8900206089019775, |
|
"step": 30, |
|
"val/clipfrac_avg": 0.006745144259184599, |
|
"val/num_eos_tokens": 26990, |
|
"val/ratio": 0.9994415044784546, |
|
"val/ratio_var": 4.539244855550351e-06 |
|
}, |
|
{ |
|
"episode": 15872, |
|
"epoch": 0.13598355037697052, |
|
"eps": 5, |
|
"loss/policy_avg": 0.011747404932975769, |
|
"loss/value_avg": 0.32529520988464355, |
|
"lr": 2.6484375e-06, |
|
"objective/entropy": -50.62657165527344, |
|
"objective/kl": 28.064453125, |
|
"objective/non_score_reward": -1.403222680091858, |
|
"objective/rlhf_reward": 9.3327054977417, |
|
"objective/scores": 10.735928535461426, |
|
"policy/approxkl_avg": 0.0032155239023268223, |
|
"policy/clipfrac_avg": 0.02317969501018524, |
|
"policy/entropy_avg": 1.7806997299194336, |
|
"step": 31, |
|
"val/clipfrac_avg": 0.006999637931585312, |
|
"val/num_eos_tokens": 23939, |
|
"val/ratio": 1.0002076625823975, |
|
"val/ratio_var": 2.5230063329217955e-05 |
|
}, |
|
{ |
|
"episode": 16384, |
|
"epoch": 0.1403701165181631, |
|
"eps": 5, |
|
"loss/policy_avg": 0.005137978587299585, |
|
"loss/value_avg": 0.32520678639411926, |
|
"lr": 2.63671875e-06, |
|
"objective/entropy": -57.196502685546875, |
|
"objective/kl": 28.533008575439453, |
|
"objective/non_score_reward": -1.4266504049301147, |
|
"objective/rlhf_reward": 9.323891639709473, |
|
"objective/scores": 10.750541687011719, |
|
"policy/approxkl_avg": 0.003107226686552167, |
|
"policy/clipfrac_avg": 0.02019701898097992, |
|
"policy/entropy_avg": 1.8475944995880127, |
|
"step": 32, |
|
"val/clipfrac_avg": 0.006430475041270256, |
|
"val/num_eos_tokens": 25239, |
|
"val/ratio": 1.0004234313964844, |
|
"val/ratio_var": 4.5365977712208405e-06 |
|
}, |
|
{ |
|
"episode": 16896, |
|
"epoch": 0.14475668265935573, |
|
"eps": 5, |
|
"loss/policy_avg": 0.01028348132967949, |
|
"loss/value_avg": 0.35146623849868774, |
|
"lr": 2.6250000000000003e-06, |
|
"objective/entropy": -54.473472595214844, |
|
"objective/kl": 30.472434997558594, |
|
"objective/non_score_reward": -1.523621678352356, |
|
"objective/rlhf_reward": 9.17609977722168, |
|
"objective/scores": 10.699721336364746, |
|
"policy/approxkl_avg": 0.003178014885634184, |
|
"policy/clipfrac_avg": 0.0210000891238451, |
|
"policy/entropy_avg": 1.8384833335876465, |
|
"step": 33, |
|
"val/clipfrac_avg": 0.005109312012791634, |
|
"val/num_eos_tokens": 24597, |
|
"val/ratio": 1.000359058380127, |
|
"val/ratio_var": 6.385233064065687e-06 |
|
}, |
|
{ |
|
"episode": 17408, |
|
"epoch": 0.14914324880054833, |
|
"eps": 5, |
|
"loss/policy_avg": 0.004060306120663881, |
|
"loss/value_avg": 0.3234812617301941, |
|
"lr": 2.61328125e-06, |
|
"objective/entropy": -51.67649841308594, |
|
"objective/kl": 31.204999923706055, |
|
"objective/non_score_reward": -1.5602500438690186, |
|
"objective/rlhf_reward": 9.3241605758667, |
|
"objective/scores": 10.884410858154297, |
|
"policy/approxkl_avg": 0.0030823112465441227, |
|
"policy/clipfrac_avg": 0.02090715430676937, |
|
"policy/entropy_avg": 1.7866475582122803, |
|
"step": 34, |
|
"val/clipfrac_avg": 0.006479810923337936, |
|
"val/num_eos_tokens": 26150, |
|
"val/ratio": 0.9998917579650879, |
|
"val/ratio_var": 4.709030235972023e-06 |
|
}, |
|
{ |
|
"episode": 17920, |
|
"epoch": 0.15352981494174092, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0002043084241449833, |
|
"loss/value_avg": 0.331778347492218, |
|
"lr": 2.6015625e-06, |
|
"objective/entropy": -50.791839599609375, |
|
"objective/kl": 32.52722930908203, |
|
"objective/non_score_reward": -1.62636137008667, |
|
"objective/rlhf_reward": 9.229442596435547, |
|
"objective/scores": 10.855804443359375, |
|
"policy/approxkl_avg": 0.0038988732267171144, |
|
"policy/clipfrac_avg": 0.02275794744491577, |
|
"policy/entropy_avg": 1.7861764430999756, |
|
"step": 35, |
|
"val/clipfrac_avg": 0.0063492972403764725, |
|
"val/num_eos_tokens": 24757, |
|
"val/ratio": 0.999879002571106, |
|
"val/ratio_var": 6.5008221099560615e-06 |
|
}, |
|
{ |
|
"episode": 18432, |
|
"epoch": 0.15791638108293352, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0009975926950573921, |
|
"loss/value_avg": 0.3189573884010315, |
|
"lr": 2.5898437500000003e-06, |
|
"objective/entropy": -48.66301727294922, |
|
"objective/kl": 33.829776763916016, |
|
"objective/non_score_reward": -1.6914888620376587, |
|
"objective/rlhf_reward": 9.39903736114502, |
|
"objective/scores": 11.090526580810547, |
|
"policy/approxkl_avg": 0.0024533928371965885, |
|
"policy/clipfrac_avg": 0.020511234179139137, |
|
"policy/entropy_avg": 1.7290668487548828, |
|
"step": 36, |
|
"val/clipfrac_avg": 0.007418747525662184, |
|
"val/num_eos_tokens": 25660, |
|
"val/ratio": 1.0001412630081177, |
|
"val/ratio_var": 4.908704795525409e-06 |
|
}, |
|
{ |
|
"episode": 18944, |
|
"epoch": 0.1623029472241261, |
|
"eps": 5, |
|
"loss/policy_avg": -0.00018343282863497734, |
|
"loss/value_avg": 0.3374910354614258, |
|
"lr": 2.578125e-06, |
|
"objective/entropy": -48.08941650390625, |
|
"objective/kl": 34.32440948486328, |
|
"objective/non_score_reward": -1.716220498085022, |
|
"objective/rlhf_reward": 9.393345832824707, |
|
"objective/scores": 11.109566688537598, |
|
"policy/approxkl_avg": 0.002999143209308386, |
|
"policy/clipfrac_avg": 0.02006850577890873, |
|
"policy/entropy_avg": 1.6796194314956665, |
|
"step": 37, |
|
"val/clipfrac_avg": 0.006178020033985376, |
|
"val/num_eos_tokens": 23563, |
|
"val/ratio": 1.0002793073654175, |
|
"val/ratio_var": 4.9771865633374546e-06 |
|
}, |
|
{ |
|
"episode": 19456, |
|
"epoch": 0.1666895133653187, |
|
"eps": 5, |
|
"loss/policy_avg": 0.006254453212022781, |
|
"loss/value_avg": 0.35177081823349, |
|
"lr": 2.56640625e-06, |
|
"objective/entropy": -48.374507904052734, |
|
"objective/kl": 34.51807403564453, |
|
"objective/non_score_reward": -1.725903868675232, |
|
"objective/rlhf_reward": 9.309269905090332, |
|
"objective/scores": 11.035173416137695, |
|
"policy/approxkl_avg": 0.0031045477371662855, |
|
"policy/clipfrac_avg": 0.020648740231990814, |
|
"policy/entropy_avg": 1.7128504514694214, |
|
"step": 38, |
|
"val/clipfrac_avg": 0.008920802734792233, |
|
"val/num_eos_tokens": 25658, |
|
"val/ratio": 0.9995635747909546, |
|
"val/ratio_var": 6.544411462527933e-06 |
|
}, |
|
{ |
|
"episode": 19968, |
|
"epoch": 0.1710760795065113, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0010195476934313774, |
|
"loss/value_avg": 0.3460981845855713, |
|
"lr": 2.5546875000000003e-06, |
|
"objective/entropy": -51.683372497558594, |
|
"objective/kl": 32.773372650146484, |
|
"objective/non_score_reward": -1.6386685371398926, |
|
"objective/rlhf_reward": 9.41845703125, |
|
"objective/scores": 11.057125091552734, |
|
"policy/approxkl_avg": 0.0031053770799189806, |
|
"policy/clipfrac_avg": 0.02040484920144081, |
|
"policy/entropy_avg": 1.729933261871338, |
|
"step": 39, |
|
"val/clipfrac_avg": 0.006355272606015205, |
|
"val/num_eos_tokens": 24834, |
|
"val/ratio": 0.9998283982276917, |
|
"val/ratio_var": 4.5907336243544705e-06 |
|
}, |
|
{ |
|
"episode": 20480, |
|
"epoch": 0.17546264564770392, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0011649285443127155, |
|
"loss/value_avg": 0.32062214612960815, |
|
"lr": 2.54296875e-06, |
|
"objective/entropy": -53.09362030029297, |
|
"objective/kl": 33.102630615234375, |
|
"objective/non_score_reward": -1.6551315784454346, |
|
"objective/rlhf_reward": 9.409041404724121, |
|
"objective/scores": 11.064172744750977, |
|
"policy/approxkl_avg": 0.0030981849413365126, |
|
"policy/clipfrac_avg": 0.01863136701285839, |
|
"policy/entropy_avg": 1.7126240730285645, |
|
"step": 40, |
|
"val/clipfrac_avg": 0.007702820934355259, |
|
"val/num_eos_tokens": 24091, |
|
"val/ratio": 0.9998593330383301, |
|
"val/ratio_var": 5.052448614151217e-06 |
|
}, |
|
{ |
|
"episode": 20992, |
|
"epoch": 0.1798492117888965, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0019053546711802483, |
|
"loss/value_avg": 0.3188174068927765, |
|
"lr": 2.53125e-06, |
|
"objective/entropy": -54.4024543762207, |
|
"objective/kl": 32.57151794433594, |
|
"objective/non_score_reward": -1.6285758018493652, |
|
"objective/rlhf_reward": 9.431205749511719, |
|
"objective/scores": 11.059782028198242, |
|
"policy/approxkl_avg": 0.0028727450408041477, |
|
"policy/clipfrac_avg": 0.01854875311255455, |
|
"policy/entropy_avg": 1.7159607410430908, |
|
"step": 41, |
|
"val/clipfrac_avg": 0.008048100396990776, |
|
"val/num_eos_tokens": 24981, |
|
"val/ratio": 0.9997231960296631, |
|
"val/ratio_var": 4.052901658724295e-06 |
|
}, |
|
{ |
|
"episode": 21504, |
|
"epoch": 0.1842357779300891, |
|
"eps": 5, |
|
"loss/policy_avg": 0.001745171844959259, |
|
"loss/value_avg": 0.35853347182273865, |
|
"lr": 2.5195312500000003e-06, |
|
"objective/entropy": -54.93397521972656, |
|
"objective/kl": 32.59296417236328, |
|
"objective/non_score_reward": -1.6296483278274536, |
|
"objective/rlhf_reward": 9.271963119506836, |
|
"objective/scores": 10.901611328125, |
|
"policy/approxkl_avg": 0.002658254001289606, |
|
"policy/clipfrac_avg": 0.01734079420566559, |
|
"policy/entropy_avg": 1.7013740539550781, |
|
"step": 42, |
|
"val/clipfrac_avg": 0.005504803732037544, |
|
"val/num_eos_tokens": 27864, |
|
"val/ratio": 1.0002515316009521, |
|
"val/ratio_var": 4.718350737675792e-06 |
|
}, |
|
{ |
|
"episode": 22016, |
|
"epoch": 0.1886223440712817, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0031983088701963425, |
|
"loss/value_avg": 0.32714200019836426, |
|
"lr": 2.5078125e-06, |
|
"objective/entropy": -56.087486267089844, |
|
"objective/kl": 31.602561950683594, |
|
"objective/non_score_reward": -1.5801280736923218, |
|
"objective/rlhf_reward": 9.48602294921875, |
|
"objective/scores": 11.066150665283203, |
|
"policy/approxkl_avg": 0.0033945119939744473, |
|
"policy/clipfrac_avg": 0.017319316044449806, |
|
"policy/entropy_avg": 1.7106884717941284, |
|
"step": 43, |
|
"val/clipfrac_avg": 0.00773418415337801, |
|
"val/num_eos_tokens": 27303, |
|
"val/ratio": 0.9996844530105591, |
|
"val/ratio_var": 4.000376975454856e-06 |
|
}, |
|
{ |
|
"episode": 22528, |
|
"epoch": 0.1930089102124743, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0024800747632980347, |
|
"loss/value_avg": 0.3157750368118286, |
|
"lr": 2.49609375e-06, |
|
"objective/entropy": -58.142826080322266, |
|
"objective/kl": 31.376550674438477, |
|
"objective/non_score_reward": -1.5688276290893555, |
|
"objective/rlhf_reward": 9.588839530944824, |
|
"objective/scores": 11.15766716003418, |
|
"policy/approxkl_avg": 0.0025360295549035072, |
|
"policy/clipfrac_avg": 0.015292399562895298, |
|
"policy/entropy_avg": 1.6972548961639404, |
|
"step": 44, |
|
"val/clipfrac_avg": 0.007428675889968872, |
|
"val/num_eos_tokens": 26608, |
|
"val/ratio": 1.0000505447387695, |
|
"val/ratio_var": 4.718193395092385e-06 |
|
}, |
|
{ |
|
"episode": 23040, |
|
"epoch": 0.1973954763536669, |
|
"eps": 5, |
|
"loss/policy_avg": 0.001205185428261757, |
|
"loss/value_avg": 0.3156132996082306, |
|
"lr": 2.4843750000000002e-06, |
|
"objective/entropy": -58.6962890625, |
|
"objective/kl": 30.90795135498047, |
|
"objective/non_score_reward": -1.5453976392745972, |
|
"objective/rlhf_reward": 9.614949226379395, |
|
"objective/scores": 11.160346984863281, |
|
"policy/approxkl_avg": 0.0026144087314605713, |
|
"policy/clipfrac_avg": 0.016292206943035126, |
|
"policy/entropy_avg": 1.7031623125076294, |
|
"step": 45, |
|
"val/clipfrac_avg": 0.0070708440616726875, |
|
"val/num_eos_tokens": 26980, |
|
"val/ratio": 0.9996762275695801, |
|
"val/ratio_var": 4.505399829213275e-06 |
|
}, |
|
{ |
|
"episode": 23552, |
|
"epoch": 0.20178204249485948, |
|
"eps": 5, |
|
"loss/policy_avg": 0.004359962418675423, |
|
"loss/value_avg": 0.32259702682495117, |
|
"lr": 2.47265625e-06, |
|
"objective/entropy": -61.484619140625, |
|
"objective/kl": 29.10392189025879, |
|
"objective/non_score_reward": -1.4551961421966553, |
|
"objective/rlhf_reward": 9.753060340881348, |
|
"objective/scores": 11.208256721496582, |
|
"policy/approxkl_avg": 0.002397476462647319, |
|
"policy/clipfrac_avg": 0.013912687078118324, |
|
"policy/entropy_avg": 1.727178931236267, |
|
"step": 46, |
|
"val/clipfrac_avg": 0.00816885195672512, |
|
"val/num_eos_tokens": 28750, |
|
"val/ratio": 0.999366283416748, |
|
"val/ratio_var": 4.691919002652867e-06 |
|
}, |
|
{ |
|
"episode": 24064, |
|
"epoch": 0.2061686086360521, |
|
"eps": 5, |
|
"loss/policy_avg": 0.001963222399353981, |
|
"loss/value_avg": 0.31864023208618164, |
|
"lr": 2.4609375e-06, |
|
"objective/entropy": -61.265846252441406, |
|
"objective/kl": 29.692306518554688, |
|
"objective/non_score_reward": -1.4846153259277344, |
|
"objective/rlhf_reward": 9.666860580444336, |
|
"objective/scores": 11.15147590637207, |
|
"policy/approxkl_avg": 0.002565302886068821, |
|
"policy/clipfrac_avg": 0.013390684500336647, |
|
"policy/entropy_avg": 1.7250237464904785, |
|
"step": 47, |
|
"val/clipfrac_avg": 0.007603655569255352, |
|
"val/num_eos_tokens": 28052, |
|
"val/ratio": 1.0003018379211426, |
|
"val/ratio_var": 5.1418255679891445e-06 |
|
}, |
|
{ |
|
"episode": 24576, |
|
"epoch": 0.2105551747772447, |
|
"eps": 5, |
|
"loss/policy_avg": 0.007894441485404968, |
|
"loss/value_avg": 0.32393255829811096, |
|
"lr": 2.4492187500000002e-06, |
|
"objective/entropy": -61.45293426513672, |
|
"objective/kl": 29.473636627197266, |
|
"objective/non_score_reward": -1.4736818075180054, |
|
"objective/rlhf_reward": 9.70655345916748, |
|
"objective/scores": 11.180234909057617, |
|
"policy/approxkl_avg": 0.0018561662873253226, |
|
"policy/clipfrac_avg": 0.012259826064109802, |
|
"policy/entropy_avg": 1.7109990119934082, |
|
"step": 48, |
|
"val/clipfrac_avg": 0.007823488675057888, |
|
"val/num_eos_tokens": 30985, |
|
"val/ratio": 1.0001144409179688, |
|
"val/ratio_var": 3.26874760503415e-06 |
|
}, |
|
{ |
|
"episode": 25088, |
|
"epoch": 0.2149417409184373, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00292217917740345, |
|
"loss/value_avg": 0.3294008672237396, |
|
"lr": 2.4375e-06, |
|
"objective/entropy": -60.33291244506836, |
|
"objective/kl": 30.310314178466797, |
|
"objective/non_score_reward": -1.5155158042907715, |
|
"objective/rlhf_reward": 9.636428833007812, |
|
"objective/scores": 11.151945114135742, |
|
"policy/approxkl_avg": 0.0021009996999055147, |
|
"policy/clipfrac_avg": 0.012688988819718361, |
|
"policy/entropy_avg": 1.6571977138519287, |
|
"step": 49, |
|
"val/clipfrac_avg": 0.0057515716180205345, |
|
"val/num_eos_tokens": 27460, |
|
"val/ratio": 1.0001912117004395, |
|
"val/ratio_var": 3.742007493201527e-06 |
|
}, |
|
{ |
|
"episode": 25600, |
|
"epoch": 0.21932830705962988, |
|
"eps": 5, |
|
"loss/policy_avg": 0.003821900114417076, |
|
"loss/value_avg": 0.30453431606292725, |
|
"lr": 2.42578125e-06, |
|
"objective/entropy": -59.68878173828125, |
|
"objective/kl": 30.681987762451172, |
|
"objective/non_score_reward": -1.5340994596481323, |
|
"objective/rlhf_reward": 9.784141540527344, |
|
"objective/scores": 11.318241119384766, |
|
"policy/approxkl_avg": 0.0016853193519636989, |
|
"policy/clipfrac_avg": 0.011813260614871979, |
|
"policy/entropy_avg": 1.642488956451416, |
|
"step": 50, |
|
"val/clipfrac_avg": 0.00796700082719326, |
|
"val/num_eos_tokens": 28255, |
|
"val/ratio": 1.0001150369644165, |
|
"val/ratio_var": 3.880189069604967e-06 |
|
}, |
|
{ |
|
"episode": 26112, |
|
"epoch": 0.22371487320082248, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0011352086439728737, |
|
"loss/value_avg": 0.30040693283081055, |
|
"lr": 2.4140625000000002e-06, |
|
"objective/entropy": -55.53452682495117, |
|
"objective/kl": 33.050025939941406, |
|
"objective/non_score_reward": -1.6525013446807861, |
|
"objective/rlhf_reward": 9.85799503326416, |
|
"objective/scores": 11.510496139526367, |
|
"policy/approxkl_avg": 0.0019366666674613953, |
|
"policy/clipfrac_avg": 0.011372741311788559, |
|
"policy/entropy_avg": 1.5299839973449707, |
|
"step": 51, |
|
"val/clipfrac_avg": 0.008255399763584137, |
|
"val/num_eos_tokens": 27884, |
|
"val/ratio": 0.9998023509979248, |
|
"val/ratio_var": 4.162790446571307e-06 |
|
}, |
|
{ |
|
"episode": 26624, |
|
"epoch": 0.22810143934201507, |
|
"eps": 5, |
|
"loss/policy_avg": 0.008498594164848328, |
|
"loss/value_avg": 0.2648542523384094, |
|
"lr": 2.40234375e-06, |
|
"objective/entropy": -56.34510803222656, |
|
"objective/kl": 32.5233154296875, |
|
"objective/non_score_reward": -1.626165747642517, |
|
"objective/rlhf_reward": 9.983355522155762, |
|
"objective/scores": 11.60952091217041, |
|
"policy/approxkl_avg": 0.0027015511877834797, |
|
"policy/clipfrac_avg": 0.011898016557097435, |
|
"policy/entropy_avg": 1.5591082572937012, |
|
"step": 52, |
|
"val/clipfrac_avg": 0.006908833980560303, |
|
"val/num_eos_tokens": 27153, |
|
"val/ratio": 0.9998370409011841, |
|
"val/ratio_var": 2.8140475478721783e-06 |
|
}, |
|
{ |
|
"episode": 27136, |
|
"epoch": 0.23248800548320767, |
|
"eps": 5, |
|
"loss/policy_avg": -0.0007923748344182968, |
|
"loss/value_avg": 0.2664443850517273, |
|
"lr": 2.390625e-06, |
|
"objective/entropy": -52.48090362548828, |
|
"objective/kl": 35.079097747802734, |
|
"objective/non_score_reward": -1.7539548873901367, |
|
"objective/rlhf_reward": 9.7537202835083, |
|
"objective/scores": 11.507675170898438, |
|
"policy/approxkl_avg": 0.002516430802643299, |
|
"policy/clipfrac_avg": 0.012202695943415165, |
|
"policy/entropy_avg": 1.4821527004241943, |
|
"step": 53, |
|
"val/clipfrac_avg": 0.005511893425136805, |
|
"val/num_eos_tokens": 27268, |
|
"val/ratio": 1.0001299381256104, |
|
"val/ratio_var": 2.9662376164196758e-06 |
|
}, |
|
{ |
|
"episode": 27648, |
|
"epoch": 0.2368745716244003, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00248061865568161, |
|
"loss/value_avg": 0.29047083854675293, |
|
"lr": 2.3789062500000002e-06, |
|
"objective/entropy": -50.3958740234375, |
|
"objective/kl": 35.103553771972656, |
|
"objective/non_score_reward": -1.755177617073059, |
|
"objective/rlhf_reward": 9.844841003417969, |
|
"objective/scores": 11.600018501281738, |
|
"policy/approxkl_avg": 0.002680136589333415, |
|
"policy/clipfrac_avg": 0.010980302467942238, |
|
"policy/entropy_avg": 1.4363842010498047, |
|
"step": 54, |
|
"val/clipfrac_avg": 0.006265181582421064, |
|
"val/num_eos_tokens": 26313, |
|
"val/ratio": 1.0002726316452026, |
|
"val/ratio_var": 3.265275154262781e-06 |
|
}, |
|
{ |
|
"episode": 28160, |
|
"epoch": 0.24126113776559288, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00040969252586364746, |
|
"loss/value_avg": 0.27804386615753174, |
|
"lr": 2.3671875e-06, |
|
"objective/entropy": -50.88187789916992, |
|
"objective/kl": 34.17361068725586, |
|
"objective/non_score_reward": -1.7086806297302246, |
|
"objective/rlhf_reward": 9.788152694702148, |
|
"objective/scores": 11.496833801269531, |
|
"policy/approxkl_avg": 0.00247185374610126, |
|
"policy/clipfrac_avg": 0.011061925441026688, |
|
"policy/entropy_avg": 1.4272186756134033, |
|
"step": 55, |
|
"val/clipfrac_avg": 0.004378362558782101, |
|
"val/num_eos_tokens": 28037, |
|
"val/ratio": 0.9997042417526245, |
|
"val/ratio_var": 3.098047955063521e-06 |
|
}, |
|
{ |
|
"episode": 28672, |
|
"epoch": 0.24564770390678548, |
|
"eps": 5, |
|
"loss/policy_avg": 0.001188849564641714, |
|
"loss/value_avg": 0.2825721502304077, |
|
"lr": 2.35546875e-06, |
|
"objective/entropy": -52.136688232421875, |
|
"objective/kl": 33.818946838378906, |
|
"objective/non_score_reward": -1.690947413444519, |
|
"objective/rlhf_reward": 9.844747543334961, |
|
"objective/scores": 11.53569507598877, |
|
"policy/approxkl_avg": 0.0022125309333205223, |
|
"policy/clipfrac_avg": 0.011263608932495117, |
|
"policy/entropy_avg": 1.4427647590637207, |
|
"step": 56, |
|
"val/clipfrac_avg": 0.004626412410289049, |
|
"val/num_eos_tokens": 27546, |
|
"val/ratio": 0.999881386756897, |
|
"val/ratio_var": 4.78684432891896e-06 |
|
}, |
|
{ |
|
"episode": 29184, |
|
"epoch": 0.25003427004797807, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009166279807686806, |
|
"loss/value_avg": 0.2479139119386673, |
|
"lr": 2.3437500000000002e-06, |
|
"objective/entropy": -54.860206604003906, |
|
"objective/kl": 32.90901184082031, |
|
"objective/non_score_reward": -1.6454508304595947, |
|
"objective/rlhf_reward": 9.943949699401855, |
|
"objective/scores": 11.589400291442871, |
|
"policy/approxkl_avg": 0.0023442034143954515, |
|
"policy/clipfrac_avg": 0.011214188300073147, |
|
"policy/entropy_avg": 1.4958158731460571, |
|
"step": 57, |
|
"val/clipfrac_avg": 0.005244936794042587, |
|
"val/num_eos_tokens": 26152, |
|
"val/ratio": 1.0008132457733154, |
|
"val/ratio_var": 4.664201696868986e-06 |
|
}, |
|
{ |
|
"episode": 29696, |
|
"epoch": 0.25442083618917066, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004646984860301018, |
|
"loss/value_avg": 0.2771589457988739, |
|
"lr": 2.33203125e-06, |
|
"objective/entropy": -53.552146911621094, |
|
"objective/kl": 33.505615234375, |
|
"objective/non_score_reward": -1.6752808094024658, |
|
"objective/rlhf_reward": 9.639711380004883, |
|
"objective/scores": 11.31499195098877, |
|
"policy/approxkl_avg": 0.0028129604179412127, |
|
"policy/clipfrac_avg": 0.013663064688444138, |
|
"policy/entropy_avg": 1.483880639076233, |
|
"step": 58, |
|
"val/clipfrac_avg": 0.004003824666142464, |
|
"val/num_eos_tokens": 26445, |
|
"val/ratio": 0.9997650980949402, |
|
"val/ratio_var": 2.8647132239711937e-06 |
|
}, |
|
{ |
|
"episode": 30208, |
|
"epoch": 0.25880740233036326, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006868647411465645, |
|
"loss/value_avg": 0.25104641914367676, |
|
"lr": 2.3203125e-06, |
|
"objective/entropy": -57.64221954345703, |
|
"objective/kl": 30.967973709106445, |
|
"objective/non_score_reward": -1.548398733139038, |
|
"objective/rlhf_reward": 9.892714500427246, |
|
"objective/scores": 11.441113471984863, |
|
"policy/approxkl_avg": 0.001667265547439456, |
|
"policy/clipfrac_avg": 0.010815152898430824, |
|
"policy/entropy_avg": 1.515237808227539, |
|
"step": 59, |
|
"val/clipfrac_avg": 0.004705238156020641, |
|
"val/num_eos_tokens": 25173, |
|
"val/ratio": 0.999830424785614, |
|
"val/ratio_var": 3.4093393423972884e-06 |
|
}, |
|
{ |
|
"episode": 30720, |
|
"epoch": 0.26319396847155585, |
|
"eps": 4, |
|
"loss/policy_avg": 0.011243993416428566, |
|
"loss/value_avg": 0.2495395541191101, |
|
"lr": 2.30859375e-06, |
|
"objective/entropy": -60.84779357910156, |
|
"objective/kl": 31.15279769897461, |
|
"objective/non_score_reward": -1.5576398372650146, |
|
"objective/rlhf_reward": 9.788341522216797, |
|
"objective/scores": 11.34598159790039, |
|
"policy/approxkl_avg": 0.002522544004023075, |
|
"policy/clipfrac_avg": 0.01168464682996273, |
|
"policy/entropy_avg": 1.5892558097839355, |
|
"step": 60, |
|
"val/clipfrac_avg": 0.0036916325334459543, |
|
"val/num_eos_tokens": 25030, |
|
"val/ratio": 1.000048041343689, |
|
"val/ratio_var": 5.555901225307025e-06 |
|
}, |
|
{ |
|
"episode": 31232, |
|
"epoch": 0.26758053461274844, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005039035342633724, |
|
"loss/value_avg": 0.23521627485752106, |
|
"lr": 2.296875e-06, |
|
"objective/entropy": -52.48558807373047, |
|
"objective/kl": 29.97555923461914, |
|
"objective/non_score_reward": -1.498777985572815, |
|
"objective/rlhf_reward": 9.8745756149292, |
|
"objective/scores": 11.373353958129883, |
|
"policy/approxkl_avg": 0.0022459528408944607, |
|
"policy/clipfrac_avg": 0.011031204834580421, |
|
"policy/entropy_avg": 1.452695608139038, |
|
"step": 61, |
|
"val/clipfrac_avg": 0.0035397186875343323, |
|
"val/num_eos_tokens": 22719, |
|
"val/ratio": 1.0000181198120117, |
|
"val/ratio_var": 7.56223380449228e-06 |
|
}, |
|
{ |
|
"episode": 31744, |
|
"epoch": 0.27196710075394104, |
|
"eps": 5, |
|
"loss/policy_avg": 0.003956328146159649, |
|
"loss/value_avg": 0.2286529839038849, |
|
"lr": 2.28515625e-06, |
|
"objective/entropy": -62.272193908691406, |
|
"objective/kl": 30.68024444580078, |
|
"objective/non_score_reward": -1.5340123176574707, |
|
"objective/rlhf_reward": 9.862707138061523, |
|
"objective/scores": 11.396718978881836, |
|
"policy/approxkl_avg": 0.0018089789664372802, |
|
"policy/clipfrac_avg": 0.01095383707433939, |
|
"policy/entropy_avg": 1.5233193635940552, |
|
"step": 62, |
|
"val/clipfrac_avg": 0.0031363165471702814, |
|
"val/num_eos_tokens": 21682, |
|
"val/ratio": 1.0003973245620728, |
|
"val/ratio_var": 3.887281764036743e-06 |
|
}, |
|
{ |
|
"episode": 32256, |
|
"epoch": 0.27635366689513363, |
|
"eps": 5, |
|
"loss/policy_avg": 0.006541299633681774, |
|
"loss/value_avg": 0.2510289251804352, |
|
"lr": 2.2734375e-06, |
|
"objective/entropy": -63.541847229003906, |
|
"objective/kl": 29.277267456054688, |
|
"objective/non_score_reward": -1.4638633728027344, |
|
"objective/rlhf_reward": 9.708379745483398, |
|
"objective/scores": 11.172243118286133, |
|
"policy/approxkl_avg": 0.0018989848904311657, |
|
"policy/clipfrac_avg": 0.01218925230205059, |
|
"policy/entropy_avg": 1.6170835494995117, |
|
"step": 63, |
|
"val/clipfrac_avg": 0.0036716386675834656, |
|
"val/num_eos_tokens": 23597, |
|
"val/ratio": 0.999724268913269, |
|
"val/ratio_var": 3.221219003535225e-06 |
|
}, |
|
{ |
|
"episode": 32768, |
|
"epoch": 0.2807402330363262, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00918935239315033, |
|
"loss/value_avg": 0.2206374704837799, |
|
"lr": 2.26171875e-06, |
|
"objective/entropy": -62.05379867553711, |
|
"objective/kl": 29.984851837158203, |
|
"objective/non_score_reward": -1.4992427825927734, |
|
"objective/rlhf_reward": 9.81045913696289, |
|
"objective/scores": 11.309701919555664, |
|
"policy/approxkl_avg": 0.0021062542218714952, |
|
"policy/clipfrac_avg": 0.010909339413046837, |
|
"policy/entropy_avg": 1.581649661064148, |
|
"step": 64, |
|
"val/clipfrac_avg": 0.0033828443847596645, |
|
"val/num_eos_tokens": 25070, |
|
"val/ratio": 0.9998538494110107, |
|
"val/ratio_var": 2.1343398657336365e-06 |
|
}, |
|
{ |
|
"episode": 33280, |
|
"epoch": 0.2851267991775189, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0031117182224988937, |
|
"loss/value_avg": 0.2214275449514389, |
|
"lr": 2.25e-06, |
|
"objective/entropy": -61.279544830322266, |
|
"objective/kl": 30.740419387817383, |
|
"objective/non_score_reward": -1.5370210409164429, |
|
"objective/rlhf_reward": 9.933812141418457, |
|
"objective/scores": 11.470832824707031, |
|
"policy/approxkl_avg": 0.003304574405774474, |
|
"policy/clipfrac_avg": 0.011563178151845932, |
|
"policy/entropy_avg": 1.5370697975158691, |
|
"step": 65, |
|
"val/clipfrac_avg": 0.0039400579407811165, |
|
"val/num_eos_tokens": 23425, |
|
"val/ratio": 1.0000369548797607, |
|
"val/ratio_var": 2.214629830632475e-06 |
|
}, |
|
{ |
|
"episode": 33792, |
|
"epoch": 0.28951336531871147, |
|
"eps": 5, |
|
"loss/policy_avg": 0.006491166073828936, |
|
"loss/value_avg": 0.22057856619358063, |
|
"lr": 2.23828125e-06, |
|
"objective/entropy": -61.981842041015625, |
|
"objective/kl": 30.143644332885742, |
|
"objective/non_score_reward": -1.5071823596954346, |
|
"objective/rlhf_reward": 9.982759475708008, |
|
"objective/scores": 11.489941596984863, |
|
"policy/approxkl_avg": 0.0016894883010536432, |
|
"policy/clipfrac_avg": 0.012134966440498829, |
|
"policy/entropy_avg": 1.5720046758651733, |
|
"step": 66, |
|
"val/clipfrac_avg": 0.0033682635985314846, |
|
"val/num_eos_tokens": 22234, |
|
"val/ratio": 1.0001835823059082, |
|
"val/ratio_var": 5.471536951517919e-06 |
|
}, |
|
{ |
|
"episode": 34304, |
|
"epoch": 0.29389993145990406, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00841301865875721, |
|
"loss/value_avg": 0.2318386435508728, |
|
"lr": 2.2265625e-06, |
|
"objective/entropy": -60.53614044189453, |
|
"objective/kl": 31.55874252319336, |
|
"objective/non_score_reward": -1.577937126159668, |
|
"objective/rlhf_reward": 9.732769012451172, |
|
"objective/scores": 11.31070613861084, |
|
"policy/approxkl_avg": 0.0018887519836425781, |
|
"policy/clipfrac_avg": 0.011964268051087856, |
|
"policy/entropy_avg": 1.5244885683059692, |
|
"step": 67, |
|
"val/clipfrac_avg": 0.0025918553583323956, |
|
"val/num_eos_tokens": 22458, |
|
"val/ratio": 1.0001020431518555, |
|
"val/ratio_var": 1.6006388250389136e-06 |
|
}, |
|
{ |
|
"episode": 34816, |
|
"epoch": 0.29828649760109666, |
|
"eps": 5, |
|
"loss/policy_avg": 0.004491077736020088, |
|
"loss/value_avg": 0.21774470806121826, |
|
"lr": 2.21484375e-06, |
|
"objective/entropy": -61.31328582763672, |
|
"objective/kl": 30.977596282958984, |
|
"objective/non_score_reward": -1.548879861831665, |
|
"objective/rlhf_reward": 9.94593620300293, |
|
"objective/scores": 11.494815826416016, |
|
"policy/approxkl_avg": 0.002278459956869483, |
|
"policy/clipfrac_avg": 0.01156328059732914, |
|
"policy/entropy_avg": 1.5633031129837036, |
|
"step": 68, |
|
"val/clipfrac_avg": 0.004319990985095501, |
|
"val/num_eos_tokens": 22420, |
|
"val/ratio": 0.9998349547386169, |
|
"val/ratio_var": 2.804338009809726e-06 |
|
}, |
|
{ |
|
"episode": 35328, |
|
"epoch": 0.30267306374228925, |
|
"eps": 5, |
|
"loss/policy_avg": 0.012953916564583778, |
|
"loss/value_avg": 0.22159670293331146, |
|
"lr": 2.203125e-06, |
|
"objective/entropy": -60.40982437133789, |
|
"objective/kl": 31.161331176757812, |
|
"objective/non_score_reward": -1.5580666065216064, |
|
"objective/rlhf_reward": 9.979460716247559, |
|
"objective/scores": 11.537527084350586, |
|
"policy/approxkl_avg": 0.002316855126991868, |
|
"policy/clipfrac_avg": 0.012466225773096085, |
|
"policy/entropy_avg": 1.5732831954956055, |
|
"step": 69, |
|
"val/clipfrac_avg": 0.004431965295225382, |
|
"val/num_eos_tokens": 23475, |
|
"val/ratio": 0.9999319314956665, |
|
"val/ratio_var": 2.884213245124556e-06 |
|
}, |
|
{ |
|
"episode": 35840, |
|
"epoch": 0.30705962988348184, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0026891864836215973, |
|
"loss/value_avg": 0.22345781326293945, |
|
"lr": 2.19140625e-06, |
|
"objective/entropy": -59.21501541137695, |
|
"objective/kl": 31.3795166015625, |
|
"objective/non_score_reward": -1.568975806236267, |
|
"objective/rlhf_reward": 10.109546661376953, |
|
"objective/scores": 11.678522109985352, |
|
"policy/approxkl_avg": 0.0033681951463222504, |
|
"policy/clipfrac_avg": 0.01226731576025486, |
|
"policy/entropy_avg": 1.559905767440796, |
|
"step": 70, |
|
"val/clipfrac_avg": 0.004147649277001619, |
|
"val/num_eos_tokens": 26871, |
|
"val/ratio": 0.9998610615730286, |
|
"val/ratio_var": 3.8059629332565237e-06 |
|
}, |
|
{ |
|
"episode": 36352, |
|
"epoch": 0.31144619602467444, |
|
"eps": 5, |
|
"loss/policy_avg": 0.010949812829494476, |
|
"loss/value_avg": 0.21867918968200684, |
|
"lr": 2.1796875e-06, |
|
"objective/entropy": -55.472938537597656, |
|
"objective/kl": 33.11960220336914, |
|
"objective/non_score_reward": -1.655980110168457, |
|
"objective/rlhf_reward": 9.923175811767578, |
|
"objective/scores": 11.579155921936035, |
|
"policy/approxkl_avg": 0.0021974374540150166, |
|
"policy/clipfrac_avg": 0.013162683695554733, |
|
"policy/entropy_avg": 1.446916103363037, |
|
"step": 71, |
|
"val/clipfrac_avg": 0.004135113209486008, |
|
"val/num_eos_tokens": 25201, |
|
"val/ratio": 1.0002843141555786, |
|
"val/ratio_var": 5.781992513220757e-06 |
|
}, |
|
{ |
|
"episode": 36864, |
|
"epoch": 0.31583276216586703, |
|
"eps": 5, |
|
"loss/policy_avg": 0.008956819772720337, |
|
"loss/value_avg": 0.22117221355438232, |
|
"lr": 2.16796875e-06, |
|
"objective/entropy": -57.94511413574219, |
|
"objective/kl": 32.885406494140625, |
|
"objective/non_score_reward": -1.6442703008651733, |
|
"objective/rlhf_reward": 9.936177253723145, |
|
"objective/scores": 11.58044719696045, |
|
"policy/approxkl_avg": 0.0023873518221080303, |
|
"policy/clipfrac_avg": 0.011646779254078865, |
|
"policy/entropy_avg": 1.5099616050720215, |
|
"step": 72, |
|
"val/clipfrac_avg": 0.003031653817743063, |
|
"val/num_eos_tokens": 23619, |
|
"val/ratio": 0.9997458457946777, |
|
"val/ratio_var": 3.188845766999293e-06 |
|
}, |
|
{ |
|
"episode": 37376, |
|
"epoch": 0.3202193283070596, |
|
"eps": 5, |
|
"loss/policy_avg": 0.006841774098575115, |
|
"loss/value_avg": 0.22015002369880676, |
|
"lr": 2.15625e-06, |
|
"objective/entropy": -53.92641830444336, |
|
"objective/kl": 34.366966247558594, |
|
"objective/non_score_reward": -1.7183483839035034, |
|
"objective/rlhf_reward": 10.033120155334473, |
|
"objective/scores": 11.751468658447266, |
|
"policy/approxkl_avg": 0.003308027284219861, |
|
"policy/clipfrac_avg": 0.011116940528154373, |
|
"policy/entropy_avg": 1.41293466091156, |
|
"step": 73, |
|
"val/clipfrac_avg": 0.004825121723115444, |
|
"val/num_eos_tokens": 23706, |
|
"val/ratio": 1.0000604391098022, |
|
"val/ratio_var": 3.1772717647982063e-06 |
|
}, |
|
{ |
|
"episode": 37888, |
|
"epoch": 0.3246058944482522, |
|
"eps": 5, |
|
"loss/policy_avg": 8.680112659931183e-05, |
|
"loss/value_avg": 0.22518639266490936, |
|
"lr": 2.14453125e-06, |
|
"objective/entropy": -51.67547607421875, |
|
"objective/kl": 33.763038635253906, |
|
"objective/non_score_reward": -1.6881520748138428, |
|
"objective/rlhf_reward": 10.125733375549316, |
|
"objective/scores": 11.813885688781738, |
|
"policy/approxkl_avg": 0.002383920131251216, |
|
"policy/clipfrac_avg": 0.012023954652249813, |
|
"policy/entropy_avg": 1.3859004974365234, |
|
"step": 74, |
|
"val/clipfrac_avg": 0.0035750826355069876, |
|
"val/num_eos_tokens": 27803, |
|
"val/ratio": 0.9996781945228577, |
|
"val/ratio_var": 4.62918205812457e-06 |
|
}, |
|
{ |
|
"episode": 38400, |
|
"epoch": 0.3289924605894448, |
|
"eps": 5, |
|
"loss/policy_avg": 0.00683591328561306, |
|
"loss/value_avg": 0.19435633718967438, |
|
"lr": 2.1328125e-06, |
|
"objective/entropy": -55.22687911987305, |
|
"objective/kl": 33.985172271728516, |
|
"objective/non_score_reward": -1.69925856590271, |
|
"objective/rlhf_reward": 10.095266342163086, |
|
"objective/scores": 11.794525146484375, |
|
"policy/approxkl_avg": 0.0020253900438547134, |
|
"policy/clipfrac_avg": 0.011549219489097595, |
|
"policy/entropy_avg": 1.4170671701431274, |
|
"step": 75, |
|
"val/clipfrac_avg": 0.005189975723624229, |
|
"val/num_eos_tokens": 23555, |
|
"val/ratio": 1.0000282526016235, |
|
"val/ratio_var": 2.6403824904264184e-06 |
|
}, |
|
{ |
|
"episode": 38912, |
|
"epoch": 0.3333790267306374, |
|
"eps": 5, |
|
"loss/policy_avg": 0.0018177703022956848, |
|
"loss/value_avg": 0.2192225456237793, |
|
"lr": 2.12109375e-06, |
|
"objective/entropy": -52.719966888427734, |
|
"objective/kl": 34.939414978027344, |
|
"objective/non_score_reward": -1.746970772743225, |
|
"objective/rlhf_reward": 9.940811157226562, |
|
"objective/scores": 11.687782287597656, |
|
"policy/approxkl_avg": 0.003144835354760289, |
|
"policy/clipfrac_avg": 0.011414816603064537, |
|
"policy/entropy_avg": 1.3688035011291504, |
|
"step": 76, |
|
"val/clipfrac_avg": 0.00428430363535881, |
|
"val/num_eos_tokens": 25195, |
|
"val/ratio": 0.9994131326675415, |
|
"val/ratio_var": 3.1442646104551386e-06 |
|
}, |
|
{ |
|
"episode": 39424, |
|
"epoch": 0.33776559287183, |
|
"eps": 5, |
|
"loss/policy_avg": 0.002414613962173462, |
|
"loss/value_avg": 0.22705943882465363, |
|
"lr": 2.109375e-06, |
|
"objective/entropy": -52.9849853515625, |
|
"objective/kl": 35.76004409790039, |
|
"objective/non_score_reward": -1.7880022525787354, |
|
"objective/rlhf_reward": 10.003686904907227, |
|
"objective/scores": 11.791688919067383, |
|
"policy/approxkl_avg": 0.0024730274453759193, |
|
"policy/clipfrac_avg": 0.012745586223900318, |
|
"policy/entropy_avg": 1.3729814291000366, |
|
"step": 77, |
|
"val/clipfrac_avg": 0.004229060374200344, |
|
"val/num_eos_tokens": 26826, |
|
"val/ratio": 0.9999415278434753, |
|
"val/ratio_var": 5.440687800728483e-06 |
|
}, |
|
{ |
|
"episode": 39936, |
|
"epoch": 0.3421521590130226, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007603425532579422, |
|
"loss/value_avg": 0.20695388317108154, |
|
"lr": 2.09765625e-06, |
|
"objective/entropy": -52.369571685791016, |
|
"objective/kl": 35.140933990478516, |
|
"objective/non_score_reward": -1.7570466995239258, |
|
"objective/rlhf_reward": 10.074074745178223, |
|
"objective/scores": 11.831121444702148, |
|
"policy/approxkl_avg": 0.002142505254596472, |
|
"policy/clipfrac_avg": 0.010965963825583458, |
|
"policy/entropy_avg": 1.3412797451019287, |
|
"step": 78, |
|
"val/clipfrac_avg": 0.0038626650348305702, |
|
"val/num_eos_tokens": 23611, |
|
"val/ratio": 0.9996312856674194, |
|
"val/ratio_var": 4.994772552890936e-06 |
|
}, |
|
{ |
|
"episode": 40448, |
|
"epoch": 0.34653872515421524, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0018220320343971252, |
|
"loss/value_avg": 0.21862611174583435, |
|
"lr": 2.0859375e-06, |
|
"objective/entropy": -51.15296936035156, |
|
"objective/kl": 35.11440658569336, |
|
"objective/non_score_reward": -1.7557203769683838, |
|
"objective/rlhf_reward": 10.102788925170898, |
|
"objective/scores": 11.858509063720703, |
|
"policy/approxkl_avg": 0.0024529770016670227, |
|
"policy/clipfrac_avg": 0.012614256702363491, |
|
"policy/entropy_avg": 1.3262717723846436, |
|
"step": 79, |
|
"val/clipfrac_avg": 0.003596197348088026, |
|
"val/num_eos_tokens": 24645, |
|
"val/ratio": 1.000266194343567, |
|
"val/ratio_var": 6.255781045183539e-06 |
|
}, |
|
{ |
|
"episode": 40960, |
|
"epoch": 0.35092529129540784, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0023819338530302048, |
|
"loss/value_avg": 0.22472721338272095, |
|
"lr": 2.07421875e-06, |
|
"objective/entropy": -50.92372512817383, |
|
"objective/kl": 36.36195373535156, |
|
"objective/non_score_reward": -1.8180978298187256, |
|
"objective/rlhf_reward": 10.099588394165039, |
|
"objective/scores": 11.917686462402344, |
|
"policy/approxkl_avg": 0.002738222246989608, |
|
"policy/clipfrac_avg": 0.01162832509726286, |
|
"policy/entropy_avg": 1.3125125169754028, |
|
"step": 80, |
|
"val/clipfrac_avg": 0.005824130028486252, |
|
"val/num_eos_tokens": 22979, |
|
"val/ratio": 0.9995706677436829, |
|
"val/ratio_var": 3.720711902133189e-06 |
|
}, |
|
{ |
|
"episode": 41472, |
|
"epoch": 0.35531185743660043, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004819877445697784, |
|
"loss/value_avg": 0.22295227646827698, |
|
"lr": 2.0625e-06, |
|
"objective/entropy": -50.77427291870117, |
|
"objective/kl": 35.64800262451172, |
|
"objective/non_score_reward": -1.782400131225586, |
|
"objective/rlhf_reward": 9.891586303710938, |
|
"objective/scores": 11.673986434936523, |
|
"policy/approxkl_avg": 0.0026750012766569853, |
|
"policy/clipfrac_avg": 0.012281844392418861, |
|
"policy/entropy_avg": 1.3098926544189453, |
|
"step": 81, |
|
"val/clipfrac_avg": 0.003562133526429534, |
|
"val/num_eos_tokens": 22481, |
|
"val/ratio": 1.0002925395965576, |
|
"val/ratio_var": 3.08896505885059e-06 |
|
}, |
|
{ |
|
"episode": 41984, |
|
"epoch": 0.359698423577793, |
|
"eps": 4, |
|
"loss/policy_avg": 0.003094850108027458, |
|
"loss/value_avg": 0.20467893779277802, |
|
"lr": 2.05078125e-06, |
|
"objective/entropy": -49.50064468383789, |
|
"objective/kl": 35.082889556884766, |
|
"objective/non_score_reward": -1.7541444301605225, |
|
"objective/rlhf_reward": 10.12310791015625, |
|
"objective/scores": 11.877252578735352, |
|
"policy/approxkl_avg": 0.0031247385777533054, |
|
"policy/clipfrac_avg": 0.010663332417607307, |
|
"policy/entropy_avg": 1.2899810075759888, |
|
"step": 82, |
|
"val/clipfrac_avg": 0.003626835998147726, |
|
"val/num_eos_tokens": 23688, |
|
"val/ratio": 0.9997555613517761, |
|
"val/ratio_var": 4.246959178999532e-06 |
|
}, |
|
{ |
|
"episode": 42496, |
|
"epoch": 0.3640849897189856, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006415246054530144, |
|
"loss/value_avg": 0.2166299670934677, |
|
"lr": 2.0390625e-06, |
|
"objective/entropy": -49.80467987060547, |
|
"objective/kl": 35.149269104003906, |
|
"objective/non_score_reward": -1.7574634552001953, |
|
"objective/rlhf_reward": 9.97119140625, |
|
"objective/scores": 11.728654861450195, |
|
"policy/approxkl_avg": 0.002845556242391467, |
|
"policy/clipfrac_avg": 0.013346903957426548, |
|
"policy/entropy_avg": 1.293116569519043, |
|
"step": 83, |
|
"val/clipfrac_avg": 0.004545033443719149, |
|
"val/num_eos_tokens": 24437, |
|
"val/ratio": 0.9999039173126221, |
|
"val/ratio_var": 6.112253231549403e-06 |
|
}, |
|
{ |
|
"episode": 43008, |
|
"epoch": 0.3684715558601782, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004797011613845825, |
|
"loss/value_avg": 0.2133270502090454, |
|
"lr": 2.02734375e-06, |
|
"objective/entropy": -51.009857177734375, |
|
"objective/kl": 35.441497802734375, |
|
"objective/non_score_reward": -1.7720749378204346, |
|
"objective/rlhf_reward": 10.048232078552246, |
|
"objective/scores": 11.820306777954102, |
|
"policy/approxkl_avg": 0.002602731343358755, |
|
"policy/clipfrac_avg": 0.012205180712044239, |
|
"policy/entropy_avg": 1.3177757263183594, |
|
"step": 84, |
|
"val/clipfrac_avg": 0.004008126445114613, |
|
"val/num_eos_tokens": 24990, |
|
"val/ratio": 1.0002726316452026, |
|
"val/ratio_var": 5.879127456864808e-06 |
|
}, |
|
{ |
|
"episode": 43520, |
|
"epoch": 0.3728581220013708, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004787225276231766, |
|
"loss/value_avg": 0.213937908411026, |
|
"lr": 2.015625e-06, |
|
"objective/entropy": -48.71440124511719, |
|
"objective/kl": 35.934600830078125, |
|
"objective/non_score_reward": -1.7967300415039062, |
|
"objective/rlhf_reward": 10.027629852294922, |
|
"objective/scores": 11.824359893798828, |
|
"policy/approxkl_avg": 0.0032878173515200615, |
|
"policy/clipfrac_avg": 0.01084982417523861, |
|
"policy/entropy_avg": 1.26298189163208, |
|
"step": 85, |
|
"val/clipfrac_avg": 0.004145544022321701, |
|
"val/num_eos_tokens": 25423, |
|
"val/ratio": 0.9995351433753967, |
|
"val/ratio_var": 4.676774096878944e-06 |
|
}, |
|
{ |
|
"episode": 44032, |
|
"epoch": 0.3772446881425634, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009194480255246162, |
|
"loss/value_avg": 0.19612735509872437, |
|
"lr": 2.00390625e-06, |
|
"objective/entropy": -50.50733947753906, |
|
"objective/kl": 35.42319869995117, |
|
"objective/non_score_reward": -1.7711601257324219, |
|
"objective/rlhf_reward": 10.026796340942383, |
|
"objective/scores": 11.797956466674805, |
|
"policy/approxkl_avg": 0.0025596285704523325, |
|
"policy/clipfrac_avg": 0.011088266968727112, |
|
"policy/entropy_avg": 1.2955846786499023, |
|
"step": 86, |
|
"val/clipfrac_avg": 0.002618049271404743, |
|
"val/num_eos_tokens": 24385, |
|
"val/ratio": 1.0004734992980957, |
|
"val/ratio_var": 7.4206191129633226e-06 |
|
}, |
|
{ |
|
"episode": 44544, |
|
"epoch": 0.381631254283756, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0011375932954251766, |
|
"loss/value_avg": 0.19349417090415955, |
|
"lr": 1.9921875e-06, |
|
"objective/entropy": -48.215614318847656, |
|
"objective/kl": 36.09561538696289, |
|
"objective/non_score_reward": -1.8047807216644287, |
|
"objective/rlhf_reward": 10.043390274047852, |
|
"objective/scores": 11.84817123413086, |
|
"policy/approxkl_avg": 0.002257507061585784, |
|
"policy/clipfrac_avg": 0.011135649867355824, |
|
"policy/entropy_avg": 1.2575373649597168, |
|
"step": 87, |
|
"val/clipfrac_avg": 0.0035062048118561506, |
|
"val/num_eos_tokens": 26536, |
|
"val/ratio": 1.0005232095718384, |
|
"val/ratio_var": 7.836673830752261e-06 |
|
}, |
|
{ |
|
"episode": 45056, |
|
"epoch": 0.3860178204249486, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004499746486544609, |
|
"loss/value_avg": 0.20375367999076843, |
|
"lr": 1.98046875e-06, |
|
"objective/entropy": -48.469757080078125, |
|
"objective/kl": 35.357276916503906, |
|
"objective/non_score_reward": -1.7678639888763428, |
|
"objective/rlhf_reward": 10.101089477539062, |
|
"objective/scores": 11.868953704833984, |
|
"policy/approxkl_avg": 0.0027951907832175493, |
|
"policy/clipfrac_avg": 0.011439654044806957, |
|
"policy/entropy_avg": 1.2559094429016113, |
|
"step": 88, |
|
"val/clipfrac_avg": 0.0032751811668276787, |
|
"val/num_eos_tokens": 24904, |
|
"val/ratio": 0.999579668045044, |
|
"val/ratio_var": 3.818234290520195e-06 |
|
}, |
|
{ |
|
"episode": 45568, |
|
"epoch": 0.3904043865661412, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004391288384795189, |
|
"loss/value_avg": 0.19215956330299377, |
|
"lr": 1.96875e-06, |
|
"objective/entropy": -47.49878692626953, |
|
"objective/kl": 35.51283645629883, |
|
"objective/non_score_reward": -1.775641918182373, |
|
"objective/rlhf_reward": 10.171991348266602, |
|
"objective/scores": 11.947633743286133, |
|
"policy/approxkl_avg": 0.003075700718909502, |
|
"policy/clipfrac_avg": 0.011072011664509773, |
|
"policy/entropy_avg": 1.2150685787200928, |
|
"step": 89, |
|
"val/clipfrac_avg": 0.0027455922681838274, |
|
"val/num_eos_tokens": 23593, |
|
"val/ratio": 0.9998563528060913, |
|
"val/ratio_var": 3.37912251779926e-06 |
|
}, |
|
{ |
|
"episode": 46080, |
|
"epoch": 0.3947909527073338, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0004821214824914932, |
|
"loss/value_avg": 0.21164780855178833, |
|
"lr": 1.95703125e-06, |
|
"objective/entropy": -49.131282806396484, |
|
"objective/kl": 35.567405700683594, |
|
"objective/non_score_reward": -1.7783703804016113, |
|
"objective/rlhf_reward": 10.056915283203125, |
|
"objective/scores": 11.835285186767578, |
|
"policy/approxkl_avg": 0.0025346523616462946, |
|
"policy/clipfrac_avg": 0.01132851280272007, |
|
"policy/entropy_avg": 1.2548737525939941, |
|
"step": 90, |
|
"val/clipfrac_avg": 0.0035444346722215414, |
|
"val/num_eos_tokens": 23956, |
|
"val/ratio": 1.0001569986343384, |
|
"val/ratio_var": 4.048593382321997e-06 |
|
}, |
|
{ |
|
"episode": 46592, |
|
"epoch": 0.39917751884852637, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002457218011841178, |
|
"loss/value_avg": 0.20488248765468597, |
|
"lr": 1.9453125e-06, |
|
"objective/entropy": -50.223350524902344, |
|
"objective/kl": 34.380897521972656, |
|
"objective/non_score_reward": -1.719044804573059, |
|
"objective/rlhf_reward": 10.15352725982666, |
|
"objective/scores": 11.87257194519043, |
|
"policy/approxkl_avg": 0.0025883247144520283, |
|
"policy/clipfrac_avg": 0.01095154695212841, |
|
"policy/entropy_avg": 1.2806235551834106, |
|
"step": 91, |
|
"val/clipfrac_avg": 0.0030915343668311834, |
|
"val/num_eos_tokens": 24619, |
|
"val/ratio": 1.000389814376831, |
|
"val/ratio_var": 5.21131823916221e-06 |
|
}, |
|
{ |
|
"episode": 47104, |
|
"epoch": 0.40356408498971896, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009746008552610874, |
|
"loss/value_avg": 0.19489187002182007, |
|
"lr": 1.93359375e-06, |
|
"objective/entropy": -49.78590774536133, |
|
"objective/kl": 34.79002380371094, |
|
"objective/non_score_reward": -1.7395012378692627, |
|
"objective/rlhf_reward": 10.149117469787598, |
|
"objective/scores": 11.888618469238281, |
|
"policy/approxkl_avg": 0.0028350851498544216, |
|
"policy/clipfrac_avg": 0.012067590840160847, |
|
"policy/entropy_avg": 1.321395993232727, |
|
"step": 92, |
|
"val/clipfrac_avg": 0.0034278968814760447, |
|
"val/num_eos_tokens": 24707, |
|
"val/ratio": 1.000132441520691, |
|
"val/ratio_var": 3.817275683104526e-06 |
|
}, |
|
{ |
|
"episode": 47616, |
|
"epoch": 0.40795065113091156, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007891923189163208, |
|
"loss/value_avg": 0.18601296842098236, |
|
"lr": 1.921875e-06, |
|
"objective/entropy": -51.31802749633789, |
|
"objective/kl": 33.889923095703125, |
|
"objective/non_score_reward": -1.6944962739944458, |
|
"objective/rlhf_reward": 10.027242660522461, |
|
"objective/scores": 11.721738815307617, |
|
"policy/approxkl_avg": 0.0023833350278437138, |
|
"policy/clipfrac_avg": 0.011782050132751465, |
|
"policy/entropy_avg": 1.3522666692733765, |
|
"step": 93, |
|
"val/clipfrac_avg": 0.002397140022367239, |
|
"val/num_eos_tokens": 23485, |
|
"val/ratio": 0.9999991655349731, |
|
"val/ratio_var": 4.472914497455349e-06 |
|
}, |
|
{ |
|
"episode": 48128, |
|
"epoch": 0.4123372172721042, |
|
"eps": 4, |
|
"loss/policy_avg": 0.008123669773340225, |
|
"loss/value_avg": 0.19486477971076965, |
|
"lr": 1.91015625e-06, |
|
"objective/entropy": -51.119022369384766, |
|
"objective/kl": 33.58214569091797, |
|
"objective/non_score_reward": -1.679107427597046, |
|
"objective/rlhf_reward": 10.173272132873535, |
|
"objective/scores": 11.85237979888916, |
|
"policy/approxkl_avg": 0.002184953773394227, |
|
"policy/clipfrac_avg": 0.010806472972035408, |
|
"policy/entropy_avg": 1.3613462448120117, |
|
"step": 94, |
|
"val/clipfrac_avg": 0.0037152436561882496, |
|
"val/num_eos_tokens": 24130, |
|
"val/ratio": 1.0001533031463623, |
|
"val/ratio_var": 7.5301450124243274e-06 |
|
}, |
|
{ |
|
"episode": 48640, |
|
"epoch": 0.4167237834132968, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006111526861786842, |
|
"loss/value_avg": 0.18791520595550537, |
|
"lr": 1.8984375e-06, |
|
"objective/entropy": -50.709373474121094, |
|
"objective/kl": 34.28173065185547, |
|
"objective/non_score_reward": -1.7140867710113525, |
|
"objective/rlhf_reward": 10.056640625, |
|
"objective/scores": 11.770727157592773, |
|
"policy/approxkl_avg": 0.0028409743681550026, |
|
"policy/clipfrac_avg": 0.010896073654294014, |
|
"policy/entropy_avg": 1.3718466758728027, |
|
"step": 95, |
|
"val/clipfrac_avg": 0.0028323421720415354, |
|
"val/num_eos_tokens": 24401, |
|
"val/ratio": 1.0004030466079712, |
|
"val/ratio_var": 5.722152764064958e-06 |
|
}, |
|
{ |
|
"episode": 49152, |
|
"epoch": 0.4211103495544894, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006566773168742657, |
|
"loss/value_avg": 0.21990007162094116, |
|
"lr": 1.8867187500000001e-06, |
|
"objective/entropy": -51.272518157958984, |
|
"objective/kl": 33.29633712768555, |
|
"objective/non_score_reward": -1.6648168563842773, |
|
"objective/rlhf_reward": 10.015409469604492, |
|
"objective/scores": 11.68022632598877, |
|
"policy/approxkl_avg": 0.002582971705123782, |
|
"policy/clipfrac_avg": 0.012000022456049919, |
|
"policy/entropy_avg": 1.3877381086349487, |
|
"step": 96, |
|
"val/clipfrac_avg": 0.00376589922234416, |
|
"val/num_eos_tokens": 26887, |
|
"val/ratio": 0.9997319579124451, |
|
"val/ratio_var": 3.3458989037171705e-06 |
|
}, |
|
{ |
|
"episode": 49664, |
|
"epoch": 0.425496915695682, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00574074499309063, |
|
"loss/value_avg": 0.20088031888008118, |
|
"lr": 1.875e-06, |
|
"objective/entropy": -50.57700729370117, |
|
"objective/kl": 33.54800033569336, |
|
"objective/non_score_reward": -1.67739999294281, |
|
"objective/rlhf_reward": 10.143403053283691, |
|
"objective/scores": 11.820802688598633, |
|
"policy/approxkl_avg": 0.00242623221129179, |
|
"policy/clipfrac_avg": 0.011794717982411385, |
|
"policy/entropy_avg": 1.3870760202407837, |
|
"step": 97, |
|
"val/clipfrac_avg": 0.002550810342654586, |
|
"val/num_eos_tokens": 26478, |
|
"val/ratio": 1.0001178979873657, |
|
"val/ratio_var": 3.899105195159791e-06 |
|
}, |
|
{ |
|
"episode": 50176, |
|
"epoch": 0.4298834818368746, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002134094014763832, |
|
"loss/value_avg": 0.2105971872806549, |
|
"lr": 1.86328125e-06, |
|
"objective/entropy": -50.969947814941406, |
|
"objective/kl": 32.82494354248047, |
|
"objective/non_score_reward": -1.6412471532821655, |
|
"objective/rlhf_reward": 10.093246459960938, |
|
"objective/scores": 11.734493255615234, |
|
"policy/approxkl_avg": 0.002924936590716243, |
|
"policy/clipfrac_avg": 0.012574190273880959, |
|
"policy/entropy_avg": 1.4206920862197876, |
|
"step": 98, |
|
"val/clipfrac_avg": 0.0022323690354824066, |
|
"val/num_eos_tokens": 25788, |
|
"val/ratio": 1.0000821352005005, |
|
"val/ratio_var": 4.282260761101497e-06 |
|
}, |
|
{ |
|
"episode": 50688, |
|
"epoch": 0.4342700479780672, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002779986709356308, |
|
"loss/value_avg": 0.21304886043071747, |
|
"lr": 1.8515625000000001e-06, |
|
"objective/entropy": -48.35157012939453, |
|
"objective/kl": 34.29575729370117, |
|
"objective/non_score_reward": -1.7147879600524902, |
|
"objective/rlhf_reward": 10.013525009155273, |
|
"objective/scores": 11.728313446044922, |
|
"policy/approxkl_avg": 0.0031486451625823975, |
|
"policy/clipfrac_avg": 0.011947648599743843, |
|
"policy/entropy_avg": 1.3698291778564453, |
|
"step": 99, |
|
"val/clipfrac_avg": 0.003086227923631668, |
|
"val/num_eos_tokens": 27226, |
|
"val/ratio": 1.0002012252807617, |
|
"val/ratio_var": 3.848301275866106e-06 |
|
}, |
|
{ |
|
"episode": 51200, |
|
"epoch": 0.43865661411925977, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002698383294045925, |
|
"loss/value_avg": 0.1958826333284378, |
|
"lr": 1.83984375e-06, |
|
"objective/entropy": -51.114051818847656, |
|
"objective/kl": 33.804664611816406, |
|
"objective/non_score_reward": -1.6902332305908203, |
|
"objective/rlhf_reward": 10.000195503234863, |
|
"objective/scores": 11.690428733825684, |
|
"policy/approxkl_avg": 0.002505134791135788, |
|
"policy/clipfrac_avg": 0.013553831726312637, |
|
"policy/entropy_avg": 1.435497522354126, |
|
"step": 100, |
|
"val/clipfrac_avg": 0.0029125860892236233, |
|
"val/num_eos_tokens": 27134, |
|
"val/ratio": 0.9996867179870605, |
|
"val/ratio_var": 4.967731001670472e-06 |
|
}, |
|
{ |
|
"episode": 51712, |
|
"epoch": 0.44304318026045236, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00523423682898283, |
|
"loss/value_avg": 0.20203697681427002, |
|
"lr": 1.828125e-06, |
|
"objective/entropy": -53.56639862060547, |
|
"objective/kl": 32.214115142822266, |
|
"objective/non_score_reward": -1.610705852508545, |
|
"objective/rlhf_reward": 10.233478546142578, |
|
"objective/scores": 11.844184875488281, |
|
"policy/approxkl_avg": 0.003430293407291174, |
|
"policy/clipfrac_avg": 0.013302096165716648, |
|
"policy/entropy_avg": 1.4647541046142578, |
|
"step": 101, |
|
"val/clipfrac_avg": 0.0026878612115979195, |
|
"val/num_eos_tokens": 24773, |
|
"val/ratio": 1.0002996921539307, |
|
"val/ratio_var": 1.0531030056881718e-05 |
|
}, |
|
{ |
|
"episode": 52224, |
|
"epoch": 0.44742974640164496, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007096525281667709, |
|
"loss/value_avg": 0.2158614844083786, |
|
"lr": 1.81640625e-06, |
|
"objective/entropy": -51.106422424316406, |
|
"objective/kl": 33.817195892333984, |
|
"objective/non_score_reward": -1.6908597946166992, |
|
"objective/rlhf_reward": 10.014439582824707, |
|
"objective/scores": 11.705299377441406, |
|
"policy/approxkl_avg": 0.0026541282422840595, |
|
"policy/clipfrac_avg": 0.012738144025206566, |
|
"policy/entropy_avg": 1.4463238716125488, |
|
"step": 102, |
|
"val/clipfrac_avg": 0.0025819321162998676, |
|
"val/num_eos_tokens": 26794, |
|
"val/ratio": 1.0001683235168457, |
|
"val/ratio_var": 4.307699327910086e-06 |
|
}, |
|
{ |
|
"episode": 52736, |
|
"epoch": 0.45181631254283755, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0076528494246304035, |
|
"loss/value_avg": 0.22683024406433105, |
|
"lr": 1.8046875e-06, |
|
"objective/entropy": -51.95180130004883, |
|
"objective/kl": 32.943721771240234, |
|
"objective/non_score_reward": -1.647186040878296, |
|
"objective/rlhf_reward": 9.92637825012207, |
|
"objective/scores": 11.573564529418945, |
|
"policy/approxkl_avg": 0.002516075037419796, |
|
"policy/clipfrac_avg": 0.012894165702164173, |
|
"policy/entropy_avg": 1.4716830253601074, |
|
"step": 103, |
|
"val/clipfrac_avg": 0.003462804015725851, |
|
"val/num_eos_tokens": 27258, |
|
"val/ratio": 0.9998781681060791, |
|
"val/ratio_var": 3.6184294458507793e-06 |
|
}, |
|
{ |
|
"episode": 53248, |
|
"epoch": 0.45620287868403014, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0059026554226875305, |
|
"loss/value_avg": 0.18866774439811707, |
|
"lr": 1.79296875e-06, |
|
"objective/entropy": -51.065303802490234, |
|
"objective/kl": 33.99561309814453, |
|
"objective/non_score_reward": -1.699780821800232, |
|
"objective/rlhf_reward": 10.005899429321289, |
|
"objective/scores": 11.705679893493652, |
|
"policy/approxkl_avg": 0.003799548838287592, |
|
"policy/clipfrac_avg": 0.011900994926691055, |
|
"policy/entropy_avg": 1.453720211982727, |
|
"step": 104, |
|
"val/clipfrac_avg": 0.002593266312032938, |
|
"val/num_eos_tokens": 25703, |
|
"val/ratio": 1.0001416206359863, |
|
"val/ratio_var": 5.38453923581983e-06 |
|
}, |
|
{ |
|
"episode": 53760, |
|
"epoch": 0.46058944482522274, |
|
"eps": 4, |
|
"loss/policy_avg": -0.00120542012155056, |
|
"loss/value_avg": 0.20909513533115387, |
|
"lr": 1.78125e-06, |
|
"objective/entropy": -51.853965759277344, |
|
"objective/kl": 33.504188537597656, |
|
"objective/non_score_reward": -1.6752095222473145, |
|
"objective/rlhf_reward": 9.993560791015625, |
|
"objective/scores": 11.668770790100098, |
|
"policy/approxkl_avg": 0.0022638142108917236, |
|
"policy/clipfrac_avg": 0.012438900768756866, |
|
"policy/entropy_avg": 1.4844257831573486, |
|
"step": 105, |
|
"val/clipfrac_avg": 0.0025224490091204643, |
|
"val/num_eos_tokens": 28217, |
|
"val/ratio": 0.9998506307601929, |
|
"val/ratio_var": 4.211783107166411e-06 |
|
}, |
|
{ |
|
"episode": 54272, |
|
"epoch": 0.46497601096641533, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0032410603016614914, |
|
"loss/value_avg": 0.19734174013137817, |
|
"lr": 1.76953125e-06, |
|
"objective/entropy": -49.862060546875, |
|
"objective/kl": 34.40680694580078, |
|
"objective/non_score_reward": -1.7203404903411865, |
|
"objective/rlhf_reward": 10.194497108459473, |
|
"objective/scores": 11.914837837219238, |
|
"policy/approxkl_avg": 0.00206130463629961, |
|
"policy/clipfrac_avg": 0.01056149136275053, |
|
"policy/entropy_avg": 1.3804571628570557, |
|
"step": 106, |
|
"val/clipfrac_avg": 0.0024648173712193966, |
|
"val/num_eos_tokens": 25190, |
|
"val/ratio": 1.000199794769287, |
|
"val/ratio_var": 4.576507308229338e-06 |
|
}, |
|
{ |
|
"episode": 54784, |
|
"epoch": 0.4693625771076079, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0007011368870735168, |
|
"loss/value_avg": 0.21626059710979462, |
|
"lr": 1.7578125e-06, |
|
"objective/entropy": -49.26763916015625, |
|
"objective/kl": 35.990623474121094, |
|
"objective/non_score_reward": -1.7995312213897705, |
|
"objective/rlhf_reward": 10.014336585998535, |
|
"objective/scores": 11.813867568969727, |
|
"policy/approxkl_avg": 0.002608464565128088, |
|
"policy/clipfrac_avg": 0.011158171109855175, |
|
"policy/entropy_avg": 1.4025707244873047, |
|
"step": 107, |
|
"val/clipfrac_avg": 0.0035355808213353157, |
|
"val/num_eos_tokens": 26415, |
|
"val/ratio": 0.999910831451416, |
|
"val/ratio_var": 3.4601300740177976e-06 |
|
}, |
|
{ |
|
"episode": 55296, |
|
"epoch": 0.4737491432488006, |
|
"eps": 4, |
|
"loss/policy_avg": 0.010623332113027573, |
|
"loss/value_avg": 0.22851604223251343, |
|
"lr": 1.74609375e-06, |
|
"objective/entropy": -48.97447967529297, |
|
"objective/kl": 35.410743713378906, |
|
"objective/non_score_reward": -1.7705371379852295, |
|
"objective/rlhf_reward": 9.869864463806152, |
|
"objective/scores": 11.640401840209961, |
|
"policy/approxkl_avg": 0.0030459933914244175, |
|
"policy/clipfrac_avg": 0.010695299133658409, |
|
"policy/entropy_avg": 1.3910768032073975, |
|
"step": 108, |
|
"val/clipfrac_avg": 0.003275398164987564, |
|
"val/num_eos_tokens": 27971, |
|
"val/ratio": 0.9999659061431885, |
|
"val/ratio_var": 4.657621047954308e-06 |
|
}, |
|
{ |
|
"episode": 55808, |
|
"epoch": 0.47813570938999317, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0018949531950056553, |
|
"loss/value_avg": 0.2573157548904419, |
|
"lr": 1.734375e-06, |
|
"objective/entropy": -47.27846145629883, |
|
"objective/kl": 36.20323181152344, |
|
"objective/non_score_reward": -1.8101614713668823, |
|
"objective/rlhf_reward": 9.8946533203125, |
|
"objective/scores": 11.704814910888672, |
|
"policy/approxkl_avg": 0.0025291882921010256, |
|
"policy/clipfrac_avg": 0.010728440247476101, |
|
"policy/entropy_avg": 1.3530826568603516, |
|
"step": 109, |
|
"val/clipfrac_avg": 0.003135326784104109, |
|
"val/num_eos_tokens": 28041, |
|
"val/ratio": 1.000365972518921, |
|
"val/ratio_var": 1.0167857908527367e-05 |
|
}, |
|
{ |
|
"episode": 56320, |
|
"epoch": 0.48252227553118576, |
|
"eps": 4, |
|
"loss/policy_avg": -0.000798303633928299, |
|
"loss/value_avg": 0.24519430100917816, |
|
"lr": 1.72265625e-06, |
|
"objective/entropy": -46.93909454345703, |
|
"objective/kl": 35.551170349121094, |
|
"objective/non_score_reward": -1.7775585651397705, |
|
"objective/rlhf_reward": 10.142352104187012, |
|
"objective/scores": 11.919910430908203, |
|
"policy/approxkl_avg": 0.003023324767127633, |
|
"policy/clipfrac_avg": 0.01054347399622202, |
|
"policy/entropy_avg": 1.3338515758514404, |
|
"step": 110, |
|
"val/clipfrac_avg": 0.00204864121042192, |
|
"val/num_eos_tokens": 27167, |
|
"val/ratio": 0.9999716281890869, |
|
"val/ratio_var": 3.4549011616036296e-06 |
|
}, |
|
{ |
|
"episode": 56832, |
|
"epoch": 0.48690884167237836, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0019484013319015503, |
|
"loss/value_avg": 0.23634591698646545, |
|
"lr": 1.7109375e-06, |
|
"objective/entropy": -48.95277404785156, |
|
"objective/kl": 35.153472900390625, |
|
"objective/non_score_reward": -1.7576735019683838, |
|
"objective/rlhf_reward": 10.170576095581055, |
|
"objective/scores": 11.92824935913086, |
|
"policy/approxkl_avg": 0.0029592744540423155, |
|
"policy/clipfrac_avg": 0.010565382428467274, |
|
"policy/entropy_avg": 1.3721097707748413, |
|
"step": 111, |
|
"val/clipfrac_avg": 0.002308458089828491, |
|
"val/num_eos_tokens": 26862, |
|
"val/ratio": 1.0011966228485107, |
|
"val/ratio_var": 8.257182344095781e-05 |
|
}, |
|
{ |
|
"episode": 57344, |
|
"epoch": 0.49129540781357095, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00030158646404743195, |
|
"loss/value_avg": 0.22391614317893982, |
|
"lr": 1.69921875e-06, |
|
"objective/entropy": -49.59033966064453, |
|
"objective/kl": 35.19733428955078, |
|
"objective/non_score_reward": -1.759866714477539, |
|
"objective/rlhf_reward": 10.102191925048828, |
|
"objective/scores": 11.862058639526367, |
|
"policy/approxkl_avg": 0.0021578953601419926, |
|
"policy/clipfrac_avg": 0.009181549772620201, |
|
"policy/entropy_avg": 1.3748302459716797, |
|
"step": 112, |
|
"val/clipfrac_avg": 0.002646082080900669, |
|
"val/num_eos_tokens": 26964, |
|
"val/ratio": 1.0000518560409546, |
|
"val/ratio_var": 3.8748953556932975e-06 |
|
}, |
|
{ |
|
"episode": 57856, |
|
"epoch": 0.49568197395476354, |
|
"eps": 4, |
|
"loss/policy_avg": -0.00021987548097968102, |
|
"loss/value_avg": 0.22045229375362396, |
|
"lr": 1.6875e-06, |
|
"objective/entropy": -45.642669677734375, |
|
"objective/kl": 36.55120849609375, |
|
"objective/non_score_reward": -1.827560544013977, |
|
"objective/rlhf_reward": 10.042106628417969, |
|
"objective/scores": 11.869667053222656, |
|
"policy/approxkl_avg": 0.0021457457914948463, |
|
"policy/clipfrac_avg": 0.010356370359659195, |
|
"policy/entropy_avg": 1.2991323471069336, |
|
"step": 113, |
|
"val/clipfrac_avg": 0.003576356451958418, |
|
"val/num_eos_tokens": 26990, |
|
"val/ratio": 1.0002024173736572, |
|
"val/ratio_var": 7.810693205101416e-06 |
|
}, |
|
{ |
|
"episode": 58368, |
|
"epoch": 0.5000685400959561, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0011025657877326012, |
|
"loss/value_avg": 0.20879928767681122, |
|
"lr": 1.67578125e-06, |
|
"objective/entropy": -46.918296813964844, |
|
"objective/kl": 36.73741912841797, |
|
"objective/non_score_reward": -1.8368710279464722, |
|
"objective/rlhf_reward": 10.020646095275879, |
|
"objective/scores": 11.85751724243164, |
|
"policy/approxkl_avg": 0.0025296411477029324, |
|
"policy/clipfrac_avg": 0.009327422827482224, |
|
"policy/entropy_avg": 1.3181732892990112, |
|
"step": 114, |
|
"val/clipfrac_avg": 0.0026597436517477036, |
|
"val/num_eos_tokens": 26972, |
|
"val/ratio": 1.0004398822784424, |
|
"val/ratio_var": 1.119426087825559e-05 |
|
}, |
|
{ |
|
"episode": 58880, |
|
"epoch": 0.5044551062371487, |
|
"eps": 4, |
|
"loss/policy_avg": -0.00237057963386178, |
|
"loss/value_avg": 0.2178419530391693, |
|
"lr": 1.6640625e-06, |
|
"objective/entropy": -46.66753387451172, |
|
"objective/kl": 36.253822326660156, |
|
"objective/non_score_reward": -1.8126912117004395, |
|
"objective/rlhf_reward": 10.09880256652832, |
|
"objective/scores": 11.911494255065918, |
|
"policy/approxkl_avg": 0.002091196598485112, |
|
"policy/clipfrac_avg": 0.01094783004373312, |
|
"policy/entropy_avg": 1.3274283409118652, |
|
"step": 115, |
|
"val/clipfrac_avg": 0.004409347660839558, |
|
"val/num_eos_tokens": 27424, |
|
"val/ratio": 0.9999520778656006, |
|
"val/ratio_var": 4.443759280547965e-06 |
|
}, |
|
{ |
|
"episode": 59392, |
|
"epoch": 0.5088416723783413, |
|
"eps": 4, |
|
"loss/policy_avg": -0.005248534493148327, |
|
"loss/value_avg": 0.2638506293296814, |
|
"lr": 1.6523437500000001e-06, |
|
"objective/entropy": -47.24037170410156, |
|
"objective/kl": 36.8753776550293, |
|
"objective/non_score_reward": -1.8437689542770386, |
|
"objective/rlhf_reward": 9.983234405517578, |
|
"objective/scores": 11.827003479003906, |
|
"policy/approxkl_avg": 0.0032648907508701086, |
|
"policy/clipfrac_avg": 0.011387192644178867, |
|
"policy/entropy_avg": 1.3188259601593018, |
|
"step": 116, |
|
"val/clipfrac_avg": 0.004241817630827427, |
|
"val/num_eos_tokens": 25302, |
|
"val/ratio": 0.999230146408081, |
|
"val/ratio_var": 4.158954197919229e-06 |
|
}, |
|
{ |
|
"episode": 59904, |
|
"epoch": 0.5132282385195339, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0031682229600846767, |
|
"loss/value_avg": 0.23800501227378845, |
|
"lr": 1.640625e-06, |
|
"objective/entropy": -47.64235305786133, |
|
"objective/kl": 35.08871078491211, |
|
"objective/non_score_reward": -1.7544355392456055, |
|
"objective/rlhf_reward": 10.015641212463379, |
|
"objective/scores": 11.770076751708984, |
|
"policy/approxkl_avg": 0.002452462911605835, |
|
"policy/clipfrac_avg": 0.011082207784056664, |
|
"policy/entropy_avg": 1.320603609085083, |
|
"step": 117, |
|
"val/clipfrac_avg": 0.0038690143264830112, |
|
"val/num_eos_tokens": 26752, |
|
"val/ratio": 0.9997324347496033, |
|
"val/ratio_var": 5.267690085020149e-06 |
|
}, |
|
{ |
|
"episode": 60416, |
|
"epoch": 0.5176148046607265, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00654706871137023, |
|
"loss/value_avg": 0.18882016837596893, |
|
"lr": 1.62890625e-06, |
|
"objective/entropy": -49.72483825683594, |
|
"objective/kl": 34.498836517333984, |
|
"objective/non_score_reward": -1.7249417304992676, |
|
"objective/rlhf_reward": 10.259115219116211, |
|
"objective/scores": 11.98405647277832, |
|
"policy/approxkl_avg": 0.0024695878382772207, |
|
"policy/clipfrac_avg": 0.009761758148670197, |
|
"policy/entropy_avg": 1.3732898235321045, |
|
"step": 118, |
|
"val/clipfrac_avg": 0.0033802662510424852, |
|
"val/num_eos_tokens": 26374, |
|
"val/ratio": 1.0005288124084473, |
|
"val/ratio_var": 1.0841821676876862e-05 |
|
}, |
|
{ |
|
"episode": 60928, |
|
"epoch": 0.5220013708019191, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0007685907185077667, |
|
"loss/value_avg": 0.21882620453834534, |
|
"lr": 1.6171875000000001e-06, |
|
"objective/entropy": -49.3972053527832, |
|
"objective/kl": 35.464508056640625, |
|
"objective/non_score_reward": -1.7732254266738892, |
|
"objective/rlhf_reward": 9.918214797973633, |
|
"objective/scores": 11.69144058227539, |
|
"policy/approxkl_avg": 0.002548103453591466, |
|
"policy/clipfrac_avg": 0.010540506802499294, |
|
"policy/entropy_avg": 1.3686912059783936, |
|
"step": 119, |
|
"val/clipfrac_avg": 0.002275804989039898, |
|
"val/num_eos_tokens": 25251, |
|
"val/ratio": 0.9998582005500793, |
|
"val/ratio_var": 4.747326329379575e-06 |
|
}, |
|
{ |
|
"episode": 61440, |
|
"epoch": 0.5263879369431117, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004093550145626068, |
|
"loss/value_avg": 0.19169041514396667, |
|
"lr": 1.60546875e-06, |
|
"objective/entropy": -50.880088806152344, |
|
"objective/kl": 34.82711410522461, |
|
"objective/non_score_reward": -1.7413556575775146, |
|
"objective/rlhf_reward": 10.035165786743164, |
|
"objective/scores": 11.776521682739258, |
|
"policy/approxkl_avg": 0.002437584102153778, |
|
"policy/clipfrac_avg": 0.010569003410637379, |
|
"policy/entropy_avg": 1.3898723125457764, |
|
"step": 120, |
|
"val/clipfrac_avg": 0.003308035433292389, |
|
"val/num_eos_tokens": 25212, |
|
"val/ratio": 1.0002849102020264, |
|
"val/ratio_var": 3.5936700442107394e-05 |
|
}, |
|
{ |
|
"episode": 61952, |
|
"epoch": 0.5307745030843043, |
|
"eps": 4, |
|
"loss/policy_avg": 0.003542997408658266, |
|
"loss/value_avg": 0.18405793607234955, |
|
"lr": 1.59375e-06, |
|
"objective/entropy": -52.583984375, |
|
"objective/kl": 34.274208068847656, |
|
"objective/non_score_reward": -1.7137104272842407, |
|
"objective/rlhf_reward": 10.16425895690918, |
|
"objective/scores": 11.877969741821289, |
|
"policy/approxkl_avg": 0.0019055928569287062, |
|
"policy/clipfrac_avg": 0.010142171755433083, |
|
"policy/entropy_avg": 1.3990683555603027, |
|
"step": 121, |
|
"val/clipfrac_avg": 0.003069917904213071, |
|
"val/num_eos_tokens": 24678, |
|
"val/ratio": 0.9998403787612915, |
|
"val/ratio_var": 2.1590326468867715e-06 |
|
}, |
|
{ |
|
"episode": 62464, |
|
"epoch": 0.5351610692254969, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0034955721348524094, |
|
"loss/value_avg": 0.19773662090301514, |
|
"lr": 1.5820312500000001e-06, |
|
"objective/entropy": -52.74757766723633, |
|
"objective/kl": 33.07254409790039, |
|
"objective/non_score_reward": -1.6536272764205933, |
|
"objective/rlhf_reward": 10.184989929199219, |
|
"objective/scores": 11.838617324829102, |
|
"policy/approxkl_avg": 0.0024753790348768234, |
|
"policy/clipfrac_avg": 0.010373384691774845, |
|
"policy/entropy_avg": 1.4190127849578857, |
|
"step": 122, |
|
"val/clipfrac_avg": 0.0018084857147186995, |
|
"val/num_eos_tokens": 27268, |
|
"val/ratio": 1.00020432472229, |
|
"val/ratio_var": 4.94942651130259e-06 |
|
}, |
|
{ |
|
"episode": 62976, |
|
"epoch": 0.5395476353666895, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0025432901456952095, |
|
"loss/value_avg": 0.15946492552757263, |
|
"lr": 1.5703125e-06, |
|
"objective/entropy": -54.125823974609375, |
|
"objective/kl": 33.593475341796875, |
|
"objective/non_score_reward": -1.679673671722412, |
|
"objective/rlhf_reward": 10.296588897705078, |
|
"objective/scores": 11.976263046264648, |
|
"policy/approxkl_avg": 0.003026704303920269, |
|
"policy/clipfrac_avg": 0.0093125831335783, |
|
"policy/entropy_avg": 1.4351716041564941, |
|
"step": 123, |
|
"val/clipfrac_avg": 0.002426933031529188, |
|
"val/num_eos_tokens": 26522, |
|
"val/ratio": 0.9995898604393005, |
|
"val/ratio_var": 4.661137154471362e-06 |
|
}, |
|
{ |
|
"episode": 63488, |
|
"epoch": 0.5439342015078821, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0016291006468236446, |
|
"loss/value_avg": 0.1786973923444748, |
|
"lr": 1.55859375e-06, |
|
"objective/entropy": -55.55234909057617, |
|
"objective/kl": 33.02119827270508, |
|
"objective/non_score_reward": -1.651059865951538, |
|
"objective/rlhf_reward": 10.201143264770508, |
|
"objective/scores": 11.852203369140625, |
|
"policy/approxkl_avg": 0.0020746339578181505, |
|
"policy/clipfrac_avg": 0.010415926575660706, |
|
"policy/entropy_avg": 1.4947787523269653, |
|
"step": 124, |
|
"val/clipfrac_avg": 0.003645282005891204, |
|
"val/num_eos_tokens": 25188, |
|
"val/ratio": 1.00013267993927, |
|
"val/ratio_var": 3.663065626824391e-06 |
|
}, |
|
{ |
|
"episode": 64000, |
|
"epoch": 0.5483207676490747, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006848334334790707, |
|
"loss/value_avg": 0.17819851636886597, |
|
"lr": 1.5468750000000001e-06, |
|
"objective/entropy": -56.54252624511719, |
|
"objective/kl": 32.34886932373047, |
|
"objective/non_score_reward": -1.617443561553955, |
|
"objective/rlhf_reward": 10.11642074584961, |
|
"objective/scores": 11.733863830566406, |
|
"policy/approxkl_avg": 0.002013370394706726, |
|
"policy/clipfrac_avg": 0.011284420266747475, |
|
"policy/entropy_avg": 1.5066087245941162, |
|
"step": 125, |
|
"val/clipfrac_avg": 0.003249499946832657, |
|
"val/num_eos_tokens": 26126, |
|
"val/ratio": 0.9999648928642273, |
|
"val/ratio_var": 4.107211225345964e-06 |
|
}, |
|
{ |
|
"episode": 64512, |
|
"epoch": 0.5527073337902673, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005298146046698093, |
|
"loss/value_avg": 0.19392237067222595, |
|
"lr": 1.53515625e-06, |
|
"objective/entropy": -57.55706024169922, |
|
"objective/kl": 32.114959716796875, |
|
"objective/non_score_reward": -1.605747938156128, |
|
"objective/rlhf_reward": 10.174212455749512, |
|
"objective/scores": 11.779960632324219, |
|
"policy/approxkl_avg": 0.0027915926184505224, |
|
"policy/clipfrac_avg": 0.010949358344078064, |
|
"policy/entropy_avg": 1.5326869487762451, |
|
"step": 126, |
|
"val/clipfrac_avg": 0.002645657164976001, |
|
"val/num_eos_tokens": 24475, |
|
"val/ratio": 1.000093698501587, |
|
"val/ratio_var": 3.1418253456649836e-06 |
|
}, |
|
{ |
|
"episode": 65024, |
|
"epoch": 0.5570938999314599, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007277632597833872, |
|
"loss/value_avg": 0.17173150181770325, |
|
"lr": 1.5234375e-06, |
|
"objective/entropy": -58.10830307006836, |
|
"objective/kl": 32.09362030029297, |
|
"objective/non_score_reward": -1.6046810150146484, |
|
"objective/rlhf_reward": 10.152120590209961, |
|
"objective/scores": 11.75680160522461, |
|
"policy/approxkl_avg": 0.0019359358120709658, |
|
"policy/clipfrac_avg": 0.012156989425420761, |
|
"policy/entropy_avg": 1.539255142211914, |
|
"step": 127, |
|
"val/clipfrac_avg": 0.0033526804763823748, |
|
"val/num_eos_tokens": 24531, |
|
"val/ratio": 1.0003376007080078, |
|
"val/ratio_var": 5.461680757434806e-06 |
|
}, |
|
{ |
|
"episode": 65536, |
|
"epoch": 0.5614804660726525, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005196425132453442, |
|
"loss/value_avg": 0.17728489637374878, |
|
"lr": 1.5117187500000001e-06, |
|
"objective/entropy": -57.962921142578125, |
|
"objective/kl": 31.692747116088867, |
|
"objective/non_score_reward": -1.5846374034881592, |
|
"objective/rlhf_reward": 9.997382164001465, |
|
"objective/scores": 11.582019805908203, |
|
"policy/approxkl_avg": 0.0022988603450357914, |
|
"policy/clipfrac_avg": 0.012291998602449894, |
|
"policy/entropy_avg": 1.5566446781158447, |
|
"step": 128, |
|
"val/clipfrac_avg": 0.003016907721757889, |
|
"val/num_eos_tokens": 24547, |
|
"val/ratio": 1.0000189542770386, |
|
"val/ratio_var": 4.059977982251439e-06 |
|
}, |
|
{ |
|
"episode": 66048, |
|
"epoch": 0.565867032213845, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00474149826914072, |
|
"loss/value_avg": 0.16934943199157715, |
|
"lr": 1.5e-06, |
|
"objective/entropy": -60.81864929199219, |
|
"objective/kl": 30.907930374145508, |
|
"objective/non_score_reward": -1.5453965663909912, |
|
"objective/rlhf_reward": 10.061457633972168, |
|
"objective/scores": 11.606854438781738, |
|
"policy/approxkl_avg": 0.0017903585685417056, |
|
"policy/clipfrac_avg": 0.011585026048123837, |
|
"policy/entropy_avg": 1.6131141185760498, |
|
"step": 129, |
|
"val/clipfrac_avg": 0.0023407491389662027, |
|
"val/num_eos_tokens": 24418, |
|
"val/ratio": 1.0001299381256104, |
|
"val/ratio_var": 4.394762527226703e-06 |
|
}, |
|
{ |
|
"episode": 66560, |
|
"epoch": 0.5702535983550377, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002869675401598215, |
|
"loss/value_avg": 0.17518723011016846, |
|
"lr": 1.48828125e-06, |
|
"objective/entropy": -60.761024475097656, |
|
"objective/kl": 30.685794830322266, |
|
"objective/non_score_reward": -1.534289836883545, |
|
"objective/rlhf_reward": 10.12808609008789, |
|
"objective/scores": 11.662375450134277, |
|
"policy/approxkl_avg": 0.002911779098212719, |
|
"policy/clipfrac_avg": 0.01113644428551197, |
|
"policy/entropy_avg": 1.5905413627624512, |
|
"step": 130, |
|
"val/clipfrac_avg": 0.0030311732552945614, |
|
"val/num_eos_tokens": 23255, |
|
"val/ratio": 0.9994995594024658, |
|
"val/ratio_var": 4.020673713966971e-06 |
|
}, |
|
{ |
|
"episode": 67072, |
|
"epoch": 0.5746401644962303, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005674917250871658, |
|
"loss/value_avg": 0.1688387095928192, |
|
"lr": 1.4765625e-06, |
|
"objective/entropy": -62.38867950439453, |
|
"objective/kl": 30.885807037353516, |
|
"objective/non_score_reward": -1.5442904233932495, |
|
"objective/rlhf_reward": 9.997440338134766, |
|
"objective/scores": 11.541730880737305, |
|
"policy/approxkl_avg": 0.0019931201823055744, |
|
"policy/clipfrac_avg": 0.01189956534653902, |
|
"policy/entropy_avg": 1.6408116817474365, |
|
"step": 131, |
|
"val/clipfrac_avg": 0.0037230595480650663, |
|
"val/num_eos_tokens": 22733, |
|
"val/ratio": 0.9997772574424744, |
|
"val/ratio_var": 4.117905518796761e-06 |
|
}, |
|
{ |
|
"episode": 67584, |
|
"epoch": 0.5790267306374229, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0006350036710500717, |
|
"loss/value_avg": 0.18697667121887207, |
|
"lr": 1.46484375e-06, |
|
"objective/entropy": -59.36350631713867, |
|
"objective/kl": 31.597064971923828, |
|
"objective/non_score_reward": -1.5798532962799072, |
|
"objective/rlhf_reward": 10.01123046875, |
|
"objective/scores": 11.591083526611328, |
|
"policy/approxkl_avg": 0.0020872685126960278, |
|
"policy/clipfrac_avg": 0.010825317353010178, |
|
"policy/entropy_avg": 1.5486820936203003, |
|
"step": 132, |
|
"val/clipfrac_avg": 0.0028165532276034355, |
|
"val/num_eos_tokens": 24545, |
|
"val/ratio": 1.0000473260879517, |
|
"val/ratio_var": 4.7966873353288975e-06 |
|
}, |
|
{ |
|
"episode": 68096, |
|
"epoch": 0.5834132967786155, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004328216426074505, |
|
"loss/value_avg": 0.16442391276359558, |
|
"lr": 1.453125e-06, |
|
"objective/entropy": -59.58063507080078, |
|
"objective/kl": 31.654491424560547, |
|
"objective/non_score_reward": -1.5827245712280273, |
|
"objective/rlhf_reward": 10.193343162536621, |
|
"objective/scores": 11.776067733764648, |
|
"policy/approxkl_avg": 0.0024430665653198957, |
|
"policy/clipfrac_avg": 0.01117191556841135, |
|
"policy/entropy_avg": 1.5335578918457031, |
|
"step": 133, |
|
"val/clipfrac_avg": 0.00234953872859478, |
|
"val/num_eos_tokens": 22553, |
|
"val/ratio": 0.9998908042907715, |
|
"val/ratio_var": 3.7154718484089244e-06 |
|
}, |
|
{ |
|
"episode": 68608, |
|
"epoch": 0.5877998629198081, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00497487373650074, |
|
"loss/value_avg": 0.16096718609333038, |
|
"lr": 1.44140625e-06, |
|
"objective/entropy": -56.16926574707031, |
|
"objective/kl": 32.786376953125, |
|
"objective/non_score_reward": -1.639318823814392, |
|
"objective/rlhf_reward": 10.189998626708984, |
|
"objective/scores": 11.829317092895508, |
|
"policy/approxkl_avg": 0.0026530069299042225, |
|
"policy/clipfrac_avg": 0.011592323891818523, |
|
"policy/entropy_avg": 1.4631075859069824, |
|
"step": 134, |
|
"val/clipfrac_avg": 0.0031634648330509663, |
|
"val/num_eos_tokens": 23151, |
|
"val/ratio": 0.9998040795326233, |
|
"val/ratio_var": 4.819019522983581e-06 |
|
}, |
|
{ |
|
"episode": 69120, |
|
"epoch": 0.5921864290610007, |
|
"eps": 4, |
|
"loss/policy_avg": 0.013725158758461475, |
|
"loss/value_avg": 0.16442811489105225, |
|
"lr": 1.4296875e-06, |
|
"objective/entropy": -58.683013916015625, |
|
"objective/kl": 31.694040298461914, |
|
"objective/non_score_reward": -1.5847020149230957, |
|
"objective/rlhf_reward": 10.19643783569336, |
|
"objective/scores": 11.781139373779297, |
|
"policy/approxkl_avg": 0.0022610300220549107, |
|
"policy/clipfrac_avg": 0.010460296645760536, |
|
"policy/entropy_avg": 1.5235949754714966, |
|
"step": 135, |
|
"val/clipfrac_avg": 0.002330533927306533, |
|
"val/num_eos_tokens": 25330, |
|
"val/ratio": 1.0005124807357788, |
|
"val/ratio_var": 1.425193840987049e-05 |
|
}, |
|
{ |
|
"episode": 69632, |
|
"epoch": 0.5965729952021933, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007436072453856468, |
|
"loss/value_avg": 0.174909770488739, |
|
"lr": 1.41796875e-06, |
|
"objective/entropy": -59.78578186035156, |
|
"objective/kl": 32.64409637451172, |
|
"objective/non_score_reward": -1.6322047710418701, |
|
"objective/rlhf_reward": 10.095362663269043, |
|
"objective/scores": 11.727567672729492, |
|
"policy/approxkl_avg": 0.00211188942193985, |
|
"policy/clipfrac_avg": 0.01192308496683836, |
|
"policy/entropy_avg": 1.5522668361663818, |
|
"step": 136, |
|
"val/clipfrac_avg": 0.002892076037824154, |
|
"val/num_eos_tokens": 24760, |
|
"val/ratio": 0.9996564984321594, |
|
"val/ratio_var": 5.224440883466741e-06 |
|
}, |
|
{ |
|
"episode": 70144, |
|
"epoch": 0.6009595613433859, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009540164843201637, |
|
"loss/value_avg": 0.178801029920578, |
|
"lr": 1.40625e-06, |
|
"objective/entropy": -56.64319610595703, |
|
"objective/kl": 33.44987869262695, |
|
"objective/non_score_reward": -1.6724939346313477, |
|
"objective/rlhf_reward": 10.061247825622559, |
|
"objective/scores": 11.733741760253906, |
|
"policy/approxkl_avg": 0.002490841317921877, |
|
"policy/clipfrac_avg": 0.012106543406844139, |
|
"policy/entropy_avg": 1.4971638917922974, |
|
"step": 137, |
|
"val/clipfrac_avg": 0.0025153912138193846, |
|
"val/num_eos_tokens": 25677, |
|
"val/ratio": 1.0003026723861694, |
|
"val/ratio_var": 4.707488187705167e-06 |
|
}, |
|
{ |
|
"episode": 70656, |
|
"epoch": 0.6053461274845785, |
|
"eps": 4, |
|
"loss/policy_avg": 0.010735518299043179, |
|
"loss/value_avg": 0.17893172800540924, |
|
"lr": 1.39453125e-06, |
|
"objective/entropy": -58.26459503173828, |
|
"objective/kl": 32.324424743652344, |
|
"objective/non_score_reward": -1.6162214279174805, |
|
"objective/rlhf_reward": 10.123574256896973, |
|
"objective/scores": 11.739795684814453, |
|
"policy/approxkl_avg": 0.002283570822328329, |
|
"policy/clipfrac_avg": 0.010066288523375988, |
|
"policy/entropy_avg": 1.4838917255401611, |
|
"step": 138, |
|
"val/clipfrac_avg": 0.0031754274386912584, |
|
"val/num_eos_tokens": 21488, |
|
"val/ratio": 0.9996895790100098, |
|
"val/ratio_var": 3.484929720798391e-06 |
|
}, |
|
{ |
|
"episode": 71168, |
|
"epoch": 0.6097326936257711, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004294028505682945, |
|
"loss/value_avg": 0.1728200614452362, |
|
"lr": 1.3828125e-06, |
|
"objective/entropy": -58.1663818359375, |
|
"objective/kl": 33.08858871459961, |
|
"objective/non_score_reward": -1.6544294357299805, |
|
"objective/rlhf_reward": 10.134689331054688, |
|
"objective/scores": 11.789118766784668, |
|
"policy/approxkl_avg": 0.003025288227945566, |
|
"policy/clipfrac_avg": 0.010935579426586628, |
|
"policy/entropy_avg": 1.4885742664337158, |
|
"step": 139, |
|
"val/clipfrac_avg": 0.002140995115041733, |
|
"val/num_eos_tokens": 24079, |
|
"val/ratio": 0.9999958276748657, |
|
"val/ratio_var": 3.371385446371278e-06 |
|
}, |
|
{ |
|
"episode": 71680, |
|
"epoch": 0.6141192597669637, |
|
"eps": 4, |
|
"loss/policy_avg": 0.012265619821846485, |
|
"loss/value_avg": 0.15826916694641113, |
|
"lr": 1.37109375e-06, |
|
"objective/entropy": -53.690528869628906, |
|
"objective/kl": 33.63441848754883, |
|
"objective/non_score_reward": -1.6817208528518677, |
|
"objective/rlhf_reward": 10.200325965881348, |
|
"objective/scores": 11.882046699523926, |
|
"policy/approxkl_avg": 0.0019431847613304853, |
|
"policy/clipfrac_avg": 0.010160792618989944, |
|
"policy/entropy_avg": 1.4183900356292725, |
|
"step": 140, |
|
"val/clipfrac_avg": 0.0023755324073135853, |
|
"val/num_eos_tokens": 22014, |
|
"val/ratio": 1.0000615119934082, |
|
"val/ratio_var": 2.799153207888594e-06 |
|
}, |
|
{ |
|
"episode": 72192, |
|
"epoch": 0.6185058259081563, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007404782343655825, |
|
"loss/value_avg": 0.17414774000644684, |
|
"lr": 1.359375e-06, |
|
"objective/entropy": -56.323875427246094, |
|
"objective/kl": 34.29399490356445, |
|
"objective/non_score_reward": -1.7146997451782227, |
|
"objective/rlhf_reward": 10.027562141418457, |
|
"objective/scores": 11.74226188659668, |
|
"policy/approxkl_avg": 0.0020079202950000763, |
|
"policy/clipfrac_avg": 0.011949660256505013, |
|
"policy/entropy_avg": 1.4914710521697998, |
|
"step": 141, |
|
"val/clipfrac_avg": 0.0026488695293664932, |
|
"val/num_eos_tokens": 26006, |
|
"val/ratio": 0.9997506141662598, |
|
"val/ratio_var": 3.4570130083011463e-06 |
|
}, |
|
{ |
|
"episode": 72704, |
|
"epoch": 0.6228923920493489, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009324302896857262, |
|
"loss/value_avg": 0.17025059461593628, |
|
"lr": 1.34765625e-06, |
|
"objective/entropy": -56.78107452392578, |
|
"objective/kl": 33.23863220214844, |
|
"objective/non_score_reward": -1.6619315147399902, |
|
"objective/rlhf_reward": 10.156038284301758, |
|
"objective/scores": 11.817970275878906, |
|
"policy/approxkl_avg": 0.0020708302035927773, |
|
"policy/clipfrac_avg": 0.012341851368546486, |
|
"policy/entropy_avg": 1.486853837966919, |
|
"step": 142, |
|
"val/clipfrac_avg": 0.002559303306043148, |
|
"val/num_eos_tokens": 22403, |
|
"val/ratio": 1.0002156496047974, |
|
"val/ratio_var": 4.204774540994549e-06 |
|
}, |
|
{ |
|
"episode": 73216, |
|
"epoch": 0.6272789581905415, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004553023725748062, |
|
"loss/value_avg": 0.1775711476802826, |
|
"lr": 1.3359375e-06, |
|
"objective/entropy": -53.44776153564453, |
|
"objective/kl": 34.82638931274414, |
|
"objective/non_score_reward": -1.7413194179534912, |
|
"objective/rlhf_reward": 10.207488059997559, |
|
"objective/scores": 11.948807716369629, |
|
"policy/approxkl_avg": 0.0024012078065425158, |
|
"policy/clipfrac_avg": 0.012094511650502682, |
|
"policy/entropy_avg": 1.4035837650299072, |
|
"step": 143, |
|
"val/clipfrac_avg": 0.0028213425539433956, |
|
"val/num_eos_tokens": 25504, |
|
"val/ratio": 0.999752402305603, |
|
"val/ratio_var": 3.635649363786797e-06 |
|
}, |
|
{ |
|
"episode": 73728, |
|
"epoch": 0.6316655243317341, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00827038660645485, |
|
"loss/value_avg": 0.15853461623191833, |
|
"lr": 1.32421875e-06, |
|
"objective/entropy": -55.35993194580078, |
|
"objective/kl": 34.32511520385742, |
|
"objective/non_score_reward": -1.716255784034729, |
|
"objective/rlhf_reward": 10.100241661071777, |
|
"objective/scores": 11.816497802734375, |
|
"policy/approxkl_avg": 0.0027391049079596996, |
|
"policy/clipfrac_avg": 0.01181262731552124, |
|
"policy/entropy_avg": 1.4438374042510986, |
|
"step": 144, |
|
"val/clipfrac_avg": 0.0024406672455370426, |
|
"val/num_eos_tokens": 22691, |
|
"val/ratio": 1.0007078647613525, |
|
"val/ratio_var": 1.7917764125741087e-05 |
|
}, |
|
{ |
|
"episode": 74240, |
|
"epoch": 0.6360520904729267, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009457225911319256, |
|
"loss/value_avg": 0.157231405377388, |
|
"lr": 1.3125000000000001e-06, |
|
"objective/entropy": -54.185794830322266, |
|
"objective/kl": 34.9151496887207, |
|
"objective/non_score_reward": -1.7457575798034668, |
|
"objective/rlhf_reward": 10.300085067749023, |
|
"objective/scores": 12.045843124389648, |
|
"policy/approxkl_avg": 0.0025434326380491257, |
|
"policy/clipfrac_avg": 0.012178627774119377, |
|
"policy/entropy_avg": 1.4010084867477417, |
|
"step": 145, |
|
"val/clipfrac_avg": 0.0034856563434004784, |
|
"val/num_eos_tokens": 23737, |
|
"val/ratio": 1.0001429319381714, |
|
"val/ratio_var": 4.342706688476028e-06 |
|
}, |
|
{ |
|
"episode": 74752, |
|
"epoch": 0.6404386566141193, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005285304039716721, |
|
"loss/value_avg": 0.17335200309753418, |
|
"lr": 1.30078125e-06, |
|
"objective/entropy": -53.034061431884766, |
|
"objective/kl": 34.402244567871094, |
|
"objective/non_score_reward": -1.7201124429702759, |
|
"objective/rlhf_reward": 10.186295509338379, |
|
"objective/scores": 11.906408309936523, |
|
"policy/approxkl_avg": 0.0023625961039215326, |
|
"policy/clipfrac_avg": 0.011160846799612045, |
|
"policy/entropy_avg": 1.373847484588623, |
|
"step": 146, |
|
"val/clipfrac_avg": 0.002591262571513653, |
|
"val/num_eos_tokens": 23870, |
|
"val/ratio": 0.9997913837432861, |
|
"val/ratio_var": 3.1251217933458975e-06 |
|
}, |
|
{ |
|
"episode": 75264, |
|
"epoch": 0.6448252227553118, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0033188331872224808, |
|
"loss/value_avg": 0.178251251578331, |
|
"lr": 1.2890625e-06, |
|
"objective/entropy": -52.30128479003906, |
|
"objective/kl": 35.336875915527344, |
|
"objective/non_score_reward": -1.7668437957763672, |
|
"objective/rlhf_reward": 10.239588737487793, |
|
"objective/scores": 12.00643253326416, |
|
"policy/approxkl_avg": 0.002232671482488513, |
|
"policy/clipfrac_avg": 0.01211271807551384, |
|
"policy/entropy_avg": 1.3423898220062256, |
|
"step": 147, |
|
"val/clipfrac_avg": 0.0049108765088021755, |
|
"val/num_eos_tokens": 24415, |
|
"val/ratio": 0.999698281288147, |
|
"val/ratio_var": 4.744644684251398e-06 |
|
}, |
|
{ |
|
"episode": 75776, |
|
"epoch": 0.6492117888965044, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0045684343203902245, |
|
"loss/value_avg": 0.1551057994365692, |
|
"lr": 1.2773437500000001e-06, |
|
"objective/entropy": -42.2551383972168, |
|
"objective/kl": 36.38901901245117, |
|
"objective/non_score_reward": -1.8194509744644165, |
|
"objective/rlhf_reward": 10.190171241760254, |
|
"objective/scores": 12.009622573852539, |
|
"policy/approxkl_avg": 0.0027887800242751837, |
|
"policy/clipfrac_avg": 0.01179465465247631, |
|
"policy/entropy_avg": 1.1810457706451416, |
|
"step": 148, |
|
"val/clipfrac_avg": 0.002859487198293209, |
|
"val/num_eos_tokens": 21310, |
|
"val/ratio": 0.9999205470085144, |
|
"val/ratio_var": 4.294446171115851e-06 |
|
}, |
|
{ |
|
"episode": 76288, |
|
"epoch": 0.653598355037697, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0034446939826011658, |
|
"loss/value_avg": 0.1887798309326172, |
|
"lr": 1.265625e-06, |
|
"objective/entropy": -48.354408264160156, |
|
"objective/kl": 36.67412185668945, |
|
"objective/non_score_reward": -1.833706259727478, |
|
"objective/rlhf_reward": 10.170305252075195, |
|
"objective/scores": 12.004011154174805, |
|
"policy/approxkl_avg": 0.0017848997376859188, |
|
"policy/clipfrac_avg": 0.011014001443982124, |
|
"policy/entropy_avg": 1.2652117013931274, |
|
"step": 149, |
|
"val/clipfrac_avg": 0.0032921340316534042, |
|
"val/num_eos_tokens": 23606, |
|
"val/ratio": 0.9999603033065796, |
|
"val/ratio_var": 2.988560936501017e-06 |
|
}, |
|
{ |
|
"episode": 76800, |
|
"epoch": 0.6579849211788896, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0008725142106413841, |
|
"loss/value_avg": 0.1778767853975296, |
|
"lr": 1.25390625e-06, |
|
"objective/entropy": -48.64875793457031, |
|
"objective/kl": 35.84824752807617, |
|
"objective/non_score_reward": -1.792412519454956, |
|
"objective/rlhf_reward": 10.273049354553223, |
|
"objective/scores": 12.065462112426758, |
|
"policy/approxkl_avg": 0.002263781614601612, |
|
"policy/clipfrac_avg": 0.010905838571488857, |
|
"policy/entropy_avg": 1.2818143367767334, |
|
"step": 150, |
|
"val/clipfrac_avg": 0.003908317536115646, |
|
"val/num_eos_tokens": 25626, |
|
"val/ratio": 0.999974250793457, |
|
"val/ratio_var": 3.1995639346860116e-06 |
|
}, |
|
{ |
|
"episode": 77312, |
|
"epoch": 0.6623714873200822, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004413328133523464, |
|
"loss/value_avg": 0.1777411848306656, |
|
"lr": 1.2421875000000001e-06, |
|
"objective/entropy": -46.26213836669922, |
|
"objective/kl": 36.80657196044922, |
|
"objective/non_score_reward": -1.8403284549713135, |
|
"objective/rlhf_reward": 10.09887409210205, |
|
"objective/scores": 11.939202308654785, |
|
"policy/approxkl_avg": 0.0024394330102950335, |
|
"policy/clipfrac_avg": 0.012261416763067245, |
|
"policy/entropy_avg": 1.2491027116775513, |
|
"step": 151, |
|
"val/clipfrac_avg": 0.002783268690109253, |
|
"val/num_eos_tokens": 23946, |
|
"val/ratio": 0.9999766945838928, |
|
"val/ratio_var": 3.928019395971205e-06 |
|
}, |
|
{ |
|
"episode": 77824, |
|
"epoch": 0.6667580534612748, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0010167919099330902, |
|
"loss/value_avg": 0.17760787904262543, |
|
"lr": 1.23046875e-06, |
|
"objective/entropy": -45.87760925292969, |
|
"objective/kl": 36.983638763427734, |
|
"objective/non_score_reward": -1.849181890487671, |
|
"objective/rlhf_reward": 10.17978572845459, |
|
"objective/scores": 12.02896785736084, |
|
"policy/approxkl_avg": 0.002851827535778284, |
|
"policy/clipfrac_avg": 0.01199465710669756, |
|
"policy/entropy_avg": 1.2081284523010254, |
|
"step": 152, |
|
"val/clipfrac_avg": 0.0026916628703475, |
|
"val/num_eos_tokens": 24294, |
|
"val/ratio": 1.000422716140747, |
|
"val/ratio_var": 6.217844656930538e-06 |
|
}, |
|
{ |
|
"episode": 78336, |
|
"epoch": 0.6711446196024674, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002063746564090252, |
|
"loss/value_avg": 0.1728123426437378, |
|
"lr": 1.21875e-06, |
|
"objective/entropy": -44.998619079589844, |
|
"objective/kl": 37.763389587402344, |
|
"objective/non_score_reward": -1.888169765472412, |
|
"objective/rlhf_reward": 10.178922653198242, |
|
"objective/scores": 12.067092895507812, |
|
"policy/approxkl_avg": 0.002793453633785248, |
|
"policy/clipfrac_avg": 0.012047767639160156, |
|
"policy/entropy_avg": 1.2028660774230957, |
|
"step": 153, |
|
"val/clipfrac_avg": 0.0025438859593123198, |
|
"val/num_eos_tokens": 23546, |
|
"val/ratio": 1.0003751516342163, |
|
"val/ratio_var": 1.4593482774216682e-05 |
|
}, |
|
{ |
|
"episode": 78848, |
|
"epoch": 0.67553118574366, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0007819309830665588, |
|
"loss/value_avg": 0.1686643660068512, |
|
"lr": 1.2070312500000001e-06, |
|
"objective/entropy": -45.0665283203125, |
|
"objective/kl": 36.91722869873047, |
|
"objective/non_score_reward": -1.8458614349365234, |
|
"objective/rlhf_reward": 10.167716026306152, |
|
"objective/scores": 12.013577461242676, |
|
"policy/approxkl_avg": 0.002328127156943083, |
|
"policy/clipfrac_avg": 0.01130404882133007, |
|
"policy/entropy_avg": 1.207916498184204, |
|
"step": 154, |
|
"val/clipfrac_avg": 0.002523067407310009, |
|
"val/num_eos_tokens": 23131, |
|
"val/ratio": 0.9997349977493286, |
|
"val/ratio_var": 2.9115451525285607e-06 |
|
}, |
|
{ |
|
"episode": 79360, |
|
"epoch": 0.6799177518848526, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0004192842170596123, |
|
"loss/value_avg": 0.1749456375837326, |
|
"lr": 1.1953125e-06, |
|
"objective/entropy": -44.818763732910156, |
|
"objective/kl": 37.74750518798828, |
|
"objective/non_score_reward": -1.8873754739761353, |
|
"objective/rlhf_reward": 10.325039863586426, |
|
"objective/scores": 12.21241569519043, |
|
"policy/approxkl_avg": 0.00260849017649889, |
|
"policy/clipfrac_avg": 0.011585136875510216, |
|
"policy/entropy_avg": 1.2088732719421387, |
|
"step": 155, |
|
"val/clipfrac_avg": 0.001912396401166916, |
|
"val/num_eos_tokens": 23505, |
|
"val/ratio": 0.999869704246521, |
|
"val/ratio_var": 3.1894157928036293e-06 |
|
}, |
|
{ |
|
"episode": 79872, |
|
"epoch": 0.6843043180260452, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0022659683600068092, |
|
"loss/value_avg": 0.17857833206653595, |
|
"lr": 1.18359375e-06, |
|
"objective/entropy": -44.684295654296875, |
|
"objective/kl": 37.606056213378906, |
|
"objective/non_score_reward": -1.880302906036377, |
|
"objective/rlhf_reward": 10.136545181274414, |
|
"objective/scores": 12.01684856414795, |
|
"policy/approxkl_avg": 0.002529420889914036, |
|
"policy/clipfrac_avg": 0.010681129060685635, |
|
"policy/entropy_avg": 1.2111512422561646, |
|
"step": 156, |
|
"val/clipfrac_avg": 0.0028712116181850433, |
|
"val/num_eos_tokens": 23424, |
|
"val/ratio": 0.9998527765274048, |
|
"val/ratio_var": 3.897734586644219e-06 |
|
}, |
|
{ |
|
"episode": 80384, |
|
"epoch": 0.6886908841672378, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007091144565492868, |
|
"loss/value_avg": 0.17015790939331055, |
|
"lr": 1.1718750000000001e-06, |
|
"objective/entropy": -45.67634582519531, |
|
"objective/kl": 37.143402099609375, |
|
"objective/non_score_reward": -1.8571701049804688, |
|
"objective/rlhf_reward": 10.208694458007812, |
|
"objective/scores": 12.065864562988281, |
|
"policy/approxkl_avg": 0.0023084133863449097, |
|
"policy/clipfrac_avg": 0.011631271801888943, |
|
"policy/entropy_avg": 1.2184652090072632, |
|
"step": 157, |
|
"val/clipfrac_avg": 0.0025410668458789587, |
|
"val/num_eos_tokens": 23929, |
|
"val/ratio": 1.0000377893447876, |
|
"val/ratio_var": 4.961769263900351e-06 |
|
}, |
|
{ |
|
"episode": 80896, |
|
"epoch": 0.6930774503084305, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0037001436576247215, |
|
"loss/value_avg": 0.16299593448638916, |
|
"lr": 1.16015625e-06, |
|
"objective/entropy": -44.94884490966797, |
|
"objective/kl": 36.945613861083984, |
|
"objective/non_score_reward": -1.847280502319336, |
|
"objective/rlhf_reward": 10.337121963500977, |
|
"objective/scores": 12.184402465820312, |
|
"policy/approxkl_avg": 0.002592534990981221, |
|
"policy/clipfrac_avg": 0.010881590656936169, |
|
"policy/entropy_avg": 1.1857258081436157, |
|
"step": 158, |
|
"val/clipfrac_avg": 0.002265874994918704, |
|
"val/num_eos_tokens": 24409, |
|
"val/ratio": 0.9998010993003845, |
|
"val/ratio_var": 4.288477157388115e-06 |
|
}, |
|
{ |
|
"episode": 81408, |
|
"epoch": 0.6974640164496231, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0009241420775651932, |
|
"loss/value_avg": 0.1695103943347931, |
|
"lr": 1.1484375e-06, |
|
"objective/entropy": -45.550392150878906, |
|
"objective/kl": 36.468170166015625, |
|
"objective/non_score_reward": -1.8234084844589233, |
|
"objective/rlhf_reward": 10.236851692199707, |
|
"objective/scores": 12.060259819030762, |
|
"policy/approxkl_avg": 0.002230257960036397, |
|
"policy/clipfrac_avg": 0.010667338967323303, |
|
"policy/entropy_avg": 1.2040106058120728, |
|
"step": 159, |
|
"val/clipfrac_avg": 0.00268998509272933, |
|
"val/num_eos_tokens": 23921, |
|
"val/ratio": 1.0000641345977783, |
|
"val/ratio_var": 2.7710232188837836e-06 |
|
}, |
|
{ |
|
"episode": 81920, |
|
"epoch": 0.7018505825908157, |
|
"eps": 4, |
|
"loss/policy_avg": -0.00038310326635837555, |
|
"loss/value_avg": 0.18818724155426025, |
|
"lr": 1.13671875e-06, |
|
"objective/entropy": -45.712406158447266, |
|
"objective/kl": 35.81027603149414, |
|
"objective/non_score_reward": -1.7905137538909912, |
|
"objective/rlhf_reward": 10.11474609375, |
|
"objective/scores": 11.90526008605957, |
|
"policy/approxkl_avg": 0.002487615682184696, |
|
"policy/clipfrac_avg": 0.01108371652662754, |
|
"policy/entropy_avg": 1.212557315826416, |
|
"step": 160, |
|
"val/clipfrac_avg": 0.003145547118037939, |
|
"val/num_eos_tokens": 25204, |
|
"val/ratio": 0.999794065952301, |
|
"val/ratio_var": 4.027490376756759e-06 |
|
}, |
|
{ |
|
"episode": 82432, |
|
"epoch": 0.7062371487320083, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006643541157245636, |
|
"loss/value_avg": 0.18656548857688904, |
|
"lr": 1.125e-06, |
|
"objective/entropy": -47.61738586425781, |
|
"objective/kl": 35.84968566894531, |
|
"objective/non_score_reward": -1.7924842834472656, |
|
"objective/rlhf_reward": 10.187285423278809, |
|
"objective/scores": 11.979769706726074, |
|
"policy/approxkl_avg": 0.0026129959151148796, |
|
"policy/clipfrac_avg": 0.011652868241071701, |
|
"policy/entropy_avg": 1.2548754215240479, |
|
"step": 161, |
|
"val/clipfrac_avg": 0.0028428449295461178, |
|
"val/num_eos_tokens": 24230, |
|
"val/ratio": 0.9996559619903564, |
|
"val/ratio_var": 5.3404851314553525e-06 |
|
}, |
|
{ |
|
"episode": 82944, |
|
"epoch": 0.7106237148732009, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006890101823955774, |
|
"loss/value_avg": 0.16082976758480072, |
|
"lr": 1.11328125e-06, |
|
"objective/entropy": -50.946983337402344, |
|
"objective/kl": 35.273223876953125, |
|
"objective/non_score_reward": -1.7636611461639404, |
|
"objective/rlhf_reward": 10.322636604309082, |
|
"objective/scores": 12.086297988891602, |
|
"policy/approxkl_avg": 0.001959030982106924, |
|
"policy/clipfrac_avg": 0.012122605927288532, |
|
"policy/entropy_avg": 1.3099424839019775, |
|
"step": 162, |
|
"val/clipfrac_avg": 0.002845605369657278, |
|
"val/num_eos_tokens": 23063, |
|
"val/ratio": 0.9995248317718506, |
|
"val/ratio_var": 2.941842012660345e-06 |
|
}, |
|
{ |
|
"episode": 83456, |
|
"epoch": 0.7150102810143935, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0033896267414093018, |
|
"loss/value_avg": 0.16125231981277466, |
|
"lr": 1.1015625e-06, |
|
"objective/entropy": -50.64572525024414, |
|
"objective/kl": 34.75807189941406, |
|
"objective/non_score_reward": -1.7379035949707031, |
|
"objective/rlhf_reward": 10.104735374450684, |
|
"objective/scores": 11.842638969421387, |
|
"policy/approxkl_avg": 0.0020693184342235327, |
|
"policy/clipfrac_avg": 0.011038804426789284, |
|
"policy/entropy_avg": 1.307016134262085, |
|
"step": 163, |
|
"val/clipfrac_avg": 0.0025604660622775555, |
|
"val/num_eos_tokens": 22017, |
|
"val/ratio": 0.9999017119407654, |
|
"val/ratio_var": 2.949772351712454e-06 |
|
}, |
|
{ |
|
"episode": 83968, |
|
"epoch": 0.719396847155586, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0018947692587971687, |
|
"loss/value_avg": 0.16481269896030426, |
|
"lr": 1.08984375e-06, |
|
"objective/entropy": -50.58066940307617, |
|
"objective/kl": 34.62723159790039, |
|
"objective/non_score_reward": -1.7313616275787354, |
|
"objective/rlhf_reward": 10.169866561889648, |
|
"objective/scores": 11.901227951049805, |
|
"policy/approxkl_avg": 0.002324402565136552, |
|
"policy/clipfrac_avg": 0.011050723493099213, |
|
"policy/entropy_avg": 1.3137667179107666, |
|
"step": 164, |
|
"val/clipfrac_avg": 0.003440006636083126, |
|
"val/num_eos_tokens": 23923, |
|
"val/ratio": 0.9997102618217468, |
|
"val/ratio_var": 5.237521236267639e-06 |
|
}, |
|
{ |
|
"episode": 84480, |
|
"epoch": 0.7237834132967786, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00799738522619009, |
|
"loss/value_avg": 0.16653640568256378, |
|
"lr": 1.078125e-06, |
|
"objective/entropy": -52.23271179199219, |
|
"objective/kl": 34.09307098388672, |
|
"objective/non_score_reward": -1.7046536207199097, |
|
"objective/rlhf_reward": 10.162192344665527, |
|
"objective/scores": 11.866846084594727, |
|
"policy/approxkl_avg": 0.001924938871525228, |
|
"policy/clipfrac_avg": 0.010750483721494675, |
|
"policy/entropy_avg": 1.3296667337417603, |
|
"step": 165, |
|
"val/clipfrac_avg": 0.003241895930841565, |
|
"val/num_eos_tokens": 24023, |
|
"val/ratio": 0.9998387098312378, |
|
"val/ratio_var": 6.901913366164081e-06 |
|
}, |
|
{ |
|
"episode": 84992, |
|
"epoch": 0.7281699794379712, |
|
"eps": 4, |
|
"loss/policy_avg": 0.008404719643294811, |
|
"loss/value_avg": 0.1683107316493988, |
|
"lr": 1.06640625e-06, |
|
"objective/entropy": -52.128395080566406, |
|
"objective/kl": 33.078575134277344, |
|
"objective/non_score_reward": -1.6539287567138672, |
|
"objective/rlhf_reward": 10.260198593139648, |
|
"objective/scores": 11.914127349853516, |
|
"policy/approxkl_avg": 0.002755087101832032, |
|
"policy/clipfrac_avg": 0.010908817872405052, |
|
"policy/entropy_avg": 1.3239178657531738, |
|
"step": 166, |
|
"val/clipfrac_avg": 0.002500710543245077, |
|
"val/num_eos_tokens": 25626, |
|
"val/ratio": 1.0002832412719727, |
|
"val/ratio_var": 3.7921972761978395e-06 |
|
}, |
|
{ |
|
"episode": 85504, |
|
"epoch": 0.7325565455791638, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005474764853715897, |
|
"loss/value_avg": 0.15045103430747986, |
|
"lr": 1.0546875e-06, |
|
"objective/entropy": -51.094482421875, |
|
"objective/kl": 33.334014892578125, |
|
"objective/non_score_reward": -1.666700839996338, |
|
"objective/rlhf_reward": 10.26999282836914, |
|
"objective/scores": 11.93669319152832, |
|
"policy/approxkl_avg": 0.0023759384639561176, |
|
"policy/clipfrac_avg": 0.011528071947395802, |
|
"policy/entropy_avg": 1.3099522590637207, |
|
"step": 167, |
|
"val/clipfrac_avg": 0.0018291361629962921, |
|
"val/num_eos_tokens": 22563, |
|
"val/ratio": 1.0007615089416504, |
|
"val/ratio_var": 6.840497462690109e-06 |
|
}, |
|
{ |
|
"episode": 86016, |
|
"epoch": 0.7369431117203564, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00405261293053627, |
|
"loss/value_avg": 0.12342571467161179, |
|
"lr": 1.04296875e-06, |
|
"objective/entropy": -48.19590759277344, |
|
"objective/kl": 33.93254089355469, |
|
"objective/non_score_reward": -1.696627140045166, |
|
"objective/rlhf_reward": 10.41012954711914, |
|
"objective/scores": 12.106756210327148, |
|
"policy/approxkl_avg": 0.002511480124667287, |
|
"policy/clipfrac_avg": 0.01289713941514492, |
|
"policy/entropy_avg": 1.2494816780090332, |
|
"step": 168, |
|
"val/clipfrac_avg": 0.0024599945172667503, |
|
"val/num_eos_tokens": 23483, |
|
"val/ratio": 1.0000154972076416, |
|
"val/ratio_var": 1.347678426100174e-05 |
|
}, |
|
{ |
|
"episode": 86528, |
|
"epoch": 0.741329677861549, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004242723807692528, |
|
"loss/value_avg": 0.15402936935424805, |
|
"lr": 1.03125e-06, |
|
"objective/entropy": -51.71031188964844, |
|
"objective/kl": 34.08534240722656, |
|
"objective/non_score_reward": -1.7042670249938965, |
|
"objective/rlhf_reward": 10.246797561645508, |
|
"objective/scores": 11.951065063476562, |
|
"policy/approxkl_avg": 0.0018741288222372532, |
|
"policy/clipfrac_avg": 0.011544827371835709, |
|
"policy/entropy_avg": 1.304296851158142, |
|
"step": 169, |
|
"val/clipfrac_avg": 0.002608256647363305, |
|
"val/num_eos_tokens": 23311, |
|
"val/ratio": 1.0001602172851562, |
|
"val/ratio_var": 5.876947398064658e-06 |
|
}, |
|
{ |
|
"episode": 87040, |
|
"epoch": 0.7457162440027416, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006287221796810627, |
|
"loss/value_avg": 0.15575343370437622, |
|
"lr": 1.01953125e-06, |
|
"objective/entropy": -50.226593017578125, |
|
"objective/kl": 34.273902893066406, |
|
"objective/non_score_reward": -1.7136950492858887, |
|
"objective/rlhf_reward": 10.271360397338867, |
|
"objective/scores": 11.985054969787598, |
|
"policy/approxkl_avg": 0.002464515157043934, |
|
"policy/clipfrac_avg": 0.012612780556082726, |
|
"policy/entropy_avg": 1.2805957794189453, |
|
"step": 170, |
|
"val/clipfrac_avg": 0.0021345310378819704, |
|
"val/num_eos_tokens": 24292, |
|
"val/ratio": 1.0004091262817383, |
|
"val/ratio_var": 5.745941962231882e-06 |
|
}, |
|
{ |
|
"episode": 87552, |
|
"epoch": 0.7501028101439342, |
|
"eps": 4, |
|
"loss/policy_avg": 0.003506988286972046, |
|
"loss/value_avg": 0.1364804208278656, |
|
"lr": 1.0078125e-06, |
|
"objective/entropy": -51.53871154785156, |
|
"objective/kl": 33.873558044433594, |
|
"objective/non_score_reward": -1.6936776638031006, |
|
"objective/rlhf_reward": 10.34697151184082, |
|
"objective/scores": 12.0406494140625, |
|
"policy/approxkl_avg": 0.00181733223143965, |
|
"policy/clipfrac_avg": 0.011671137996017933, |
|
"policy/entropy_avg": 1.3100472688674927, |
|
"step": 171, |
|
"val/clipfrac_avg": 0.002693683374673128, |
|
"val/num_eos_tokens": 25970, |
|
"val/ratio": 1.0000426769256592, |
|
"val/ratio_var": 3.860561264446005e-06 |
|
}, |
|
{ |
|
"episode": 88064, |
|
"epoch": 0.7544893762851268, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009347271174192429, |
|
"loss/value_avg": 0.1529439389705658, |
|
"lr": 9.9609375e-07, |
|
"objective/entropy": -50.7225456237793, |
|
"objective/kl": 33.92055130004883, |
|
"objective/non_score_reward": -1.6960275173187256, |
|
"objective/rlhf_reward": 10.297569274902344, |
|
"objective/scores": 11.993597030639648, |
|
"policy/approxkl_avg": 0.0020273206755518913, |
|
"policy/clipfrac_avg": 0.012135770171880722, |
|
"policy/entropy_avg": 1.3071811199188232, |
|
"step": 172, |
|
"val/clipfrac_avg": 0.002652028575539589, |
|
"val/num_eos_tokens": 25641, |
|
"val/ratio": 1.0000050067901611, |
|
"val/ratio_var": 3.2831298995006364e-06 |
|
}, |
|
{ |
|
"episode": 88576, |
|
"epoch": 0.7588759424263194, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006738151423633099, |
|
"loss/value_avg": 0.15998482704162598, |
|
"lr": 9.84375e-07, |
|
"objective/entropy": -49.821937561035156, |
|
"objective/kl": 34.85704040527344, |
|
"objective/non_score_reward": -1.7428522109985352, |
|
"objective/rlhf_reward": 10.277626037597656, |
|
"objective/scores": 12.020478248596191, |
|
"policy/approxkl_avg": 0.0023002480156719685, |
|
"policy/clipfrac_avg": 0.010766479186713696, |
|
"policy/entropy_avg": 1.2641386985778809, |
|
"step": 173, |
|
"val/clipfrac_avg": 0.0034392056986689568, |
|
"val/num_eos_tokens": 24298, |
|
"val/ratio": 1.0001941919326782, |
|
"val/ratio_var": 7.138915862014983e-06 |
|
}, |
|
{ |
|
"episode": 89088, |
|
"epoch": 0.763262508567512, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0019265972077846527, |
|
"loss/value_avg": 0.16067391633987427, |
|
"lr": 9.7265625e-07, |
|
"objective/entropy": -48.73914337158203, |
|
"objective/kl": 34.770301818847656, |
|
"objective/non_score_reward": -1.7385151386260986, |
|
"objective/rlhf_reward": 10.19167709350586, |
|
"objective/scores": 11.930191993713379, |
|
"policy/approxkl_avg": 0.0023067870642989874, |
|
"policy/clipfrac_avg": 0.011778725311160088, |
|
"policy/entropy_avg": 1.2403905391693115, |
|
"step": 174, |
|
"val/clipfrac_avg": 0.002369035966694355, |
|
"val/num_eos_tokens": 24369, |
|
"val/ratio": 1.0003368854522705, |
|
"val/ratio_var": 9.785385373106692e-06 |
|
}, |
|
{ |
|
"episode": 89600, |
|
"epoch": 0.7676490747087046, |
|
"eps": 4, |
|
"loss/policy_avg": 0.009770754724740982, |
|
"loss/value_avg": 0.18742433190345764, |
|
"lr": 9.609375e-07, |
|
"objective/entropy": -48.57078552246094, |
|
"objective/kl": 34.64019012451172, |
|
"objective/non_score_reward": -1.7320095300674438, |
|
"objective/rlhf_reward": 10.104455947875977, |
|
"objective/scores": 11.836465835571289, |
|
"policy/approxkl_avg": 0.002243851777166128, |
|
"policy/clipfrac_avg": 0.011341418139636517, |
|
"policy/entropy_avg": 1.2522807121276855, |
|
"step": 175, |
|
"val/clipfrac_avg": 0.0024377312511205673, |
|
"val/num_eos_tokens": 25931, |
|
"val/ratio": 1.0000677108764648, |
|
"val/ratio_var": 2.8293659397604642e-06 |
|
}, |
|
{ |
|
"episode": 90112, |
|
"epoch": 0.7720356408498972, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0009953724220395088, |
|
"loss/value_avg": 0.16015103459358215, |
|
"lr": 9.4921875e-07, |
|
"objective/entropy": -47.74214172363281, |
|
"objective/kl": 35.312191009521484, |
|
"objective/non_score_reward": -1.7656095027923584, |
|
"objective/rlhf_reward": 10.254161834716797, |
|
"objective/scores": 12.019771575927734, |
|
"policy/approxkl_avg": 0.002973256167024374, |
|
"policy/clipfrac_avg": 0.012829918414354324, |
|
"policy/entropy_avg": 1.2157658338546753, |
|
"step": 176, |
|
"val/clipfrac_avg": 0.0022666973527520895, |
|
"val/num_eos_tokens": 25047, |
|
"val/ratio": 0.9998872876167297, |
|
"val/ratio_var": 1.183138920168858e-05 |
|
}, |
|
{ |
|
"episode": 90624, |
|
"epoch": 0.7764222069910898, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001389509066939354, |
|
"loss/value_avg": 0.17181482911109924, |
|
"lr": 9.375e-07, |
|
"objective/entropy": -46.99293518066406, |
|
"objective/kl": 35.26284408569336, |
|
"objective/non_score_reward": -1.7631421089172363, |
|
"objective/rlhf_reward": 10.230562210083008, |
|
"objective/scores": 11.993703842163086, |
|
"policy/approxkl_avg": 0.002044677734375, |
|
"policy/clipfrac_avg": 0.010602492839097977, |
|
"policy/entropy_avg": 1.1966073513031006, |
|
"step": 177, |
|
"val/clipfrac_avg": 0.002445896854624152, |
|
"val/num_eos_tokens": 24650, |
|
"val/ratio": 1.000166893005371, |
|
"val/ratio_var": 3.4964016322192037e-06 |
|
}, |
|
{ |
|
"episode": 91136, |
|
"epoch": 0.7808087731322824, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0006192019209265709, |
|
"loss/value_avg": 0.18841782212257385, |
|
"lr": 9.257812500000001e-07, |
|
"objective/entropy": -45.32579040527344, |
|
"objective/kl": 35.63456726074219, |
|
"objective/non_score_reward": -1.7817286252975464, |
|
"objective/rlhf_reward": 10.122578620910645, |
|
"objective/scores": 11.90430736541748, |
|
"policy/approxkl_avg": 0.0024878934491425753, |
|
"policy/clipfrac_avg": 0.011196051724255085, |
|
"policy/entropy_avg": 1.2084518671035767, |
|
"step": 178, |
|
"val/clipfrac_avg": 0.002982205478474498, |
|
"val/num_eos_tokens": 25471, |
|
"val/ratio": 0.9997743368148804, |
|
"val/ratio_var": 5.401115686254343e-06 |
|
}, |
|
{ |
|
"episode": 91648, |
|
"epoch": 0.785195339273475, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0005537644028663635, |
|
"loss/value_avg": 0.17692416906356812, |
|
"lr": 9.140625e-07, |
|
"objective/entropy": -44.86452102661133, |
|
"objective/kl": 35.01612091064453, |
|
"objective/non_score_reward": -1.7508059740066528, |
|
"objective/rlhf_reward": 10.318327903747559, |
|
"objective/scores": 12.069133758544922, |
|
"policy/approxkl_avg": 0.0021444151643663645, |
|
"policy/clipfrac_avg": 0.012152086943387985, |
|
"policy/entropy_avg": 1.1637636423110962, |
|
"step": 179, |
|
"val/clipfrac_avg": 0.0027516535483300686, |
|
"val/num_eos_tokens": 25029, |
|
"val/ratio": 0.9998598694801331, |
|
"val/ratio_var": 3.8798657442384865e-06 |
|
}, |
|
{ |
|
"episode": 92160, |
|
"epoch": 0.7895819054146676, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001009856816381216, |
|
"loss/value_avg": 0.16210728883743286, |
|
"lr": 9.0234375e-07, |
|
"objective/entropy": -44.95887756347656, |
|
"objective/kl": 35.57145309448242, |
|
"objective/non_score_reward": -1.7785727977752686, |
|
"objective/rlhf_reward": 10.33371639251709, |
|
"objective/scores": 12.112289428710938, |
|
"policy/approxkl_avg": 0.0019982215017080307, |
|
"policy/clipfrac_avg": 0.010970347560942173, |
|
"policy/entropy_avg": 1.1672537326812744, |
|
"step": 180, |
|
"val/clipfrac_avg": 0.0016274080844596028, |
|
"val/num_eos_tokens": 24106, |
|
"val/ratio": 1.0000462532043457, |
|
"val/ratio_var": 4.52870244771475e-06 |
|
}, |
|
{ |
|
"episode": 92672, |
|
"epoch": 0.7939684715558601, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004074078518897295, |
|
"loss/value_avg": 0.16225843131542206, |
|
"lr": 8.90625e-07, |
|
"objective/entropy": -46.57657241821289, |
|
"objective/kl": 35.93509292602539, |
|
"objective/non_score_reward": -1.7967547178268433, |
|
"objective/rlhf_reward": 10.253158569335938, |
|
"objective/scores": 12.04991340637207, |
|
"policy/approxkl_avg": 0.002360533457249403, |
|
"policy/clipfrac_avg": 0.01273175049573183, |
|
"policy/entropy_avg": 1.1883782148361206, |
|
"step": 181, |
|
"val/clipfrac_avg": 0.00262850197032094, |
|
"val/num_eos_tokens": 24088, |
|
"val/ratio": 0.999874472618103, |
|
"val/ratio_var": 4.06332810598542e-06 |
|
}, |
|
{ |
|
"episode": 93184, |
|
"epoch": 0.7983550376970527, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0031516384333372116, |
|
"loss/value_avg": 0.16150668263435364, |
|
"lr": 8.7890625e-07, |
|
"objective/entropy": -47.01606750488281, |
|
"objective/kl": 34.775672912597656, |
|
"objective/non_score_reward": -1.738783597946167, |
|
"objective/rlhf_reward": 10.400968551635742, |
|
"objective/scores": 12.139752388000488, |
|
"policy/approxkl_avg": 0.003061380237340927, |
|
"policy/clipfrac_avg": 0.011249782517552376, |
|
"policy/entropy_avg": 1.189601182937622, |
|
"step": 182, |
|
"val/clipfrac_avg": 0.0020699123851954937, |
|
"val/num_eos_tokens": 24774, |
|
"val/ratio": 0.9997990727424622, |
|
"val/ratio_var": 3.931295395886991e-06 |
|
}, |
|
{ |
|
"episode": 93696, |
|
"epoch": 0.8027416038382453, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0011372501030564308, |
|
"loss/value_avg": 0.1965566873550415, |
|
"lr": 8.671875e-07, |
|
"objective/entropy": -46.06599044799805, |
|
"objective/kl": 36.448936462402344, |
|
"objective/non_score_reward": -1.8224468231201172, |
|
"objective/rlhf_reward": 10.114370346069336, |
|
"objective/scores": 11.936817169189453, |
|
"policy/approxkl_avg": 0.0025729872286319733, |
|
"policy/clipfrac_avg": 0.01264517568051815, |
|
"policy/entropy_avg": 1.194599986076355, |
|
"step": 183, |
|
"val/clipfrac_avg": 0.0033960985019803047, |
|
"val/num_eos_tokens": 24209, |
|
"val/ratio": 1.0000592470169067, |
|
"val/ratio_var": 4.049191829835763e-06 |
|
}, |
|
{ |
|
"episode": 94208, |
|
"epoch": 0.8071281699794379, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007237900979816914, |
|
"loss/value_avg": 0.15372735261917114, |
|
"lr": 8.5546875e-07, |
|
"objective/entropy": -45.80794143676758, |
|
"objective/kl": 35.47758483886719, |
|
"objective/non_score_reward": -1.7738792896270752, |
|
"objective/rlhf_reward": 10.278848648071289, |
|
"objective/scores": 12.052727699279785, |
|
"policy/approxkl_avg": 0.002283816458657384, |
|
"policy/clipfrac_avg": 0.010822740383446217, |
|
"policy/entropy_avg": 1.1835873126983643, |
|
"step": 184, |
|
"val/clipfrac_avg": 0.0018642449285835028, |
|
"val/num_eos_tokens": 24856, |
|
"val/ratio": 0.9998406171798706, |
|
"val/ratio_var": 3.7159979910939e-06 |
|
}, |
|
{ |
|
"episode": 94720, |
|
"epoch": 0.8115147361206305, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0026482678949832916, |
|
"loss/value_avg": 0.18347406387329102, |
|
"lr": 8.4375e-07, |
|
"objective/entropy": -46.348575592041016, |
|
"objective/kl": 35.41945266723633, |
|
"objective/non_score_reward": -1.770972728729248, |
|
"objective/rlhf_reward": 10.092222213745117, |
|
"objective/scores": 11.863194465637207, |
|
"policy/approxkl_avg": 0.0023487925063818693, |
|
"policy/clipfrac_avg": 0.011329904198646545, |
|
"policy/entropy_avg": 1.1949329376220703, |
|
"step": 185, |
|
"val/clipfrac_avg": 0.0018796215299516916, |
|
"val/num_eos_tokens": 24285, |
|
"val/ratio": 1.0002095699310303, |
|
"val/ratio_var": 8.477360097458586e-06 |
|
}, |
|
{ |
|
"episode": 95232, |
|
"epoch": 0.8159013022618231, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004165485501289368, |
|
"loss/value_avg": 0.1504068225622177, |
|
"lr": 8.3203125e-07, |
|
"objective/entropy": -47.73908233642578, |
|
"objective/kl": 35.27336883544922, |
|
"objective/non_score_reward": -1.763668417930603, |
|
"objective/rlhf_reward": 10.285353660583496, |
|
"objective/scores": 12.04902172088623, |
|
"policy/approxkl_avg": 0.0021917533595114946, |
|
"policy/clipfrac_avg": 0.011097628623247147, |
|
"policy/entropy_avg": 1.2006309032440186, |
|
"step": 186, |
|
"val/clipfrac_avg": 0.0022048484534025192, |
|
"val/num_eos_tokens": 24147, |
|
"val/ratio": 1.0000174045562744, |
|
"val/ratio_var": 4.247439846949419e-06 |
|
}, |
|
{ |
|
"episode": 95744, |
|
"epoch": 0.8202878684030158, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004456181079149246, |
|
"loss/value_avg": 0.1416192352771759, |
|
"lr": 8.203125e-07, |
|
"objective/entropy": -49.53144836425781, |
|
"objective/kl": 34.152732849121094, |
|
"objective/non_score_reward": -1.7076367139816284, |
|
"objective/rlhf_reward": 10.372452735900879, |
|
"objective/scores": 12.080089569091797, |
|
"policy/approxkl_avg": 0.002187924925237894, |
|
"policy/clipfrac_avg": 0.010536652989685535, |
|
"policy/entropy_avg": 1.236476182937622, |
|
"step": 187, |
|
"val/clipfrac_avg": 0.00176910194568336, |
|
"val/num_eos_tokens": 24341, |
|
"val/ratio": 1.0000340938568115, |
|
"val/ratio_var": 3.6587950944522163e-06 |
|
}, |
|
{ |
|
"episode": 96256, |
|
"epoch": 0.8246744345442084, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006817615125328302, |
|
"loss/value_avg": 0.1469859927892685, |
|
"lr": 8.085937500000001e-07, |
|
"objective/entropy": -49.844322204589844, |
|
"objective/kl": 34.35259246826172, |
|
"objective/non_score_reward": -1.7176295518875122, |
|
"objective/rlhf_reward": 10.423017501831055, |
|
"objective/scores": 12.140646934509277, |
|
"policy/approxkl_avg": 0.0020276098512113094, |
|
"policy/clipfrac_avg": 0.011348921805620193, |
|
"policy/entropy_avg": 1.2337639331817627, |
|
"step": 188, |
|
"val/clipfrac_avg": 0.0017726544756442308, |
|
"val/num_eos_tokens": 22883, |
|
"val/ratio": 0.9999902248382568, |
|
"val/ratio_var": 2.778531097646919e-06 |
|
}, |
|
{ |
|
"episode": 96768, |
|
"epoch": 0.829061000685401, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0022352226078510284, |
|
"loss/value_avg": 0.14938578009605408, |
|
"lr": 7.96875e-07, |
|
"objective/entropy": -49.13550567626953, |
|
"objective/kl": 34.316654205322266, |
|
"objective/non_score_reward": -1.7158327102661133, |
|
"objective/rlhf_reward": 10.387197494506836, |
|
"objective/scores": 12.10303020477295, |
|
"policy/approxkl_avg": 0.0019436124712228775, |
|
"policy/clipfrac_avg": 0.010562841780483723, |
|
"policy/entropy_avg": 1.2204303741455078, |
|
"step": 189, |
|
"val/clipfrac_avg": 0.0015331670874729753, |
|
"val/num_eos_tokens": 25402, |
|
"val/ratio": 0.9999203681945801, |
|
"val/ratio_var": 3.020122221641941e-06 |
|
}, |
|
{ |
|
"episode": 97280, |
|
"epoch": 0.8334475668265936, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0039822859689593315, |
|
"loss/value_avg": 0.15465494990348816, |
|
"lr": 7.8515625e-07, |
|
"objective/entropy": -47.03410339355469, |
|
"objective/kl": 35.7232666015625, |
|
"objective/non_score_reward": -1.786163330078125, |
|
"objective/rlhf_reward": 10.189088821411133, |
|
"objective/scores": 11.975252151489258, |
|
"policy/approxkl_avg": 0.002349921502172947, |
|
"policy/clipfrac_avg": 0.011656483635306358, |
|
"policy/entropy_avg": 1.1775662899017334, |
|
"step": 190, |
|
"val/clipfrac_avg": 0.002678380813449621, |
|
"val/num_eos_tokens": 23956, |
|
"val/ratio": 1.0002427101135254, |
|
"val/ratio_var": 7.052185537759215e-06 |
|
}, |
|
{ |
|
"episode": 97792, |
|
"epoch": 0.8378341329677862, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0036186622455716133, |
|
"loss/value_avg": 0.14314797520637512, |
|
"lr": 7.734375000000001e-07, |
|
"objective/entropy": -48.098785400390625, |
|
"objective/kl": 35.30805969238281, |
|
"objective/non_score_reward": -1.765402913093567, |
|
"objective/rlhf_reward": 10.44815731048584, |
|
"objective/scores": 12.213560104370117, |
|
"policy/approxkl_avg": 0.001903231255710125, |
|
"policy/clipfrac_avg": 0.010514364577829838, |
|
"policy/entropy_avg": 1.1749199628829956, |
|
"step": 191, |
|
"val/clipfrac_avg": 0.002496888395398855, |
|
"val/num_eos_tokens": 23098, |
|
"val/ratio": 1.0006399154663086, |
|
"val/ratio_var": 1.3019835023442283e-05 |
|
}, |
|
{ |
|
"episode": 98304, |
|
"epoch": 0.8422206991089788, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0033899955451488495, |
|
"loss/value_avg": 0.1446259617805481, |
|
"lr": 7.6171875e-07, |
|
"objective/entropy": -48.63313293457031, |
|
"objective/kl": 35.44336700439453, |
|
"objective/non_score_reward": -1.7721683979034424, |
|
"objective/rlhf_reward": 10.299942970275879, |
|
"objective/scores": 12.072111129760742, |
|
"policy/approxkl_avg": 0.0017086728475987911, |
|
"policy/clipfrac_avg": 0.00998393353074789, |
|
"policy/entropy_avg": 1.199033498764038, |
|
"step": 192, |
|
"val/clipfrac_avg": 0.001696545397862792, |
|
"val/num_eos_tokens": 25110, |
|
"val/ratio": 0.9996849298477173, |
|
"val/ratio_var": 3.5158464015694335e-06 |
|
}, |
|
{ |
|
"episode": 98816, |
|
"epoch": 0.8466072652501714, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004922365769743919, |
|
"loss/value_avg": 0.15805533528327942, |
|
"lr": 7.5e-07, |
|
"objective/entropy": -48.80876541137695, |
|
"objective/kl": 34.96177291870117, |
|
"objective/non_score_reward": -1.7480887174606323, |
|
"objective/rlhf_reward": 10.34687614440918, |
|
"objective/scores": 12.094964981079102, |
|
"policy/approxkl_avg": 0.002475408371537924, |
|
"policy/clipfrac_avg": 0.011553257703781128, |
|
"policy/entropy_avg": 1.202178955078125, |
|
"step": 193, |
|
"val/clipfrac_avg": 0.0017627595225349069, |
|
"val/num_eos_tokens": 25458, |
|
"val/ratio": 0.999587893486023, |
|
"val/ratio_var": 2.171113692384097e-06 |
|
}, |
|
{ |
|
"episode": 99328, |
|
"epoch": 0.850993831391364, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007751693949103355, |
|
"loss/value_avg": 0.15220610797405243, |
|
"lr": 7.3828125e-07, |
|
"objective/entropy": -47.91047668457031, |
|
"objective/kl": 35.784759521484375, |
|
"objective/non_score_reward": -1.7892380952835083, |
|
"objective/rlhf_reward": 10.237627029418945, |
|
"objective/scores": 12.026865005493164, |
|
"policy/approxkl_avg": 0.002570272423326969, |
|
"policy/clipfrac_avg": 0.011220266111195087, |
|
"policy/entropy_avg": 1.1578489542007446, |
|
"step": 194, |
|
"val/clipfrac_avg": 0.0022793509997427464, |
|
"val/num_eos_tokens": 23474, |
|
"val/ratio": 1.0003504753112793, |
|
"val/ratio_var": 1.2456192962417845e-05 |
|
}, |
|
{ |
|
"episode": 99840, |
|
"epoch": 0.8553803975325566, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0006990758702158928, |
|
"loss/value_avg": 0.15493687987327576, |
|
"lr": 7.265625e-07, |
|
"objective/entropy": -47.84299850463867, |
|
"objective/kl": 35.89054870605469, |
|
"objective/non_score_reward": -1.794527530670166, |
|
"objective/rlhf_reward": 10.264701843261719, |
|
"objective/scores": 12.059228897094727, |
|
"policy/approxkl_avg": 0.001959962071850896, |
|
"policy/clipfrac_avg": 0.01036808267235756, |
|
"policy/entropy_avg": 1.162892460823059, |
|
"step": 195, |
|
"val/clipfrac_avg": 0.0018003088189288974, |
|
"val/num_eos_tokens": 23847, |
|
"val/ratio": 0.9998455047607422, |
|
"val/ratio_var": 3.4515687730163336e-06 |
|
}, |
|
{ |
|
"episode": 100352, |
|
"epoch": 0.8597669636737492, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004795054439455271, |
|
"loss/value_avg": 0.16058918833732605, |
|
"lr": 7.1484375e-07, |
|
"objective/entropy": -47.96992492675781, |
|
"objective/kl": 35.145347595214844, |
|
"objective/non_score_reward": -1.7572674751281738, |
|
"objective/rlhf_reward": 10.19577407836914, |
|
"objective/scores": 11.953042030334473, |
|
"policy/approxkl_avg": 0.0025707499589771032, |
|
"policy/clipfrac_avg": 0.01175383199006319, |
|
"policy/entropy_avg": 1.185746192932129, |
|
"step": 196, |
|
"val/clipfrac_avg": 0.002270677126944065, |
|
"val/num_eos_tokens": 24550, |
|
"val/ratio": 0.99980628490448, |
|
"val/ratio_var": 4.205757250019815e-06 |
|
}, |
|
{ |
|
"episode": 100864, |
|
"epoch": 0.8641535298149418, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005628856830298901, |
|
"loss/value_avg": 0.14903391897678375, |
|
"lr": 7.03125e-07, |
|
"objective/entropy": -45.81394958496094, |
|
"objective/kl": 35.49262619018555, |
|
"objective/non_score_reward": -1.7746312618255615, |
|
"objective/rlhf_reward": 10.463909149169922, |
|
"objective/scores": 12.238540649414062, |
|
"policy/approxkl_avg": 0.0024222065694630146, |
|
"policy/clipfrac_avg": 0.010860033333301544, |
|
"policy/entropy_avg": 1.1221274137496948, |
|
"step": 197, |
|
"val/clipfrac_avg": 0.0026403269730508327, |
|
"val/num_eos_tokens": 24000, |
|
"val/ratio": 0.9999848008155823, |
|
"val/ratio_var": 3.7348772821133025e-06 |
|
}, |
|
{ |
|
"episode": 101376, |
|
"epoch": 0.8685400959561344, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0002153497189283371, |
|
"loss/value_avg": 0.16071295738220215, |
|
"lr": 6.9140625e-07, |
|
"objective/entropy": -47.071693420410156, |
|
"objective/kl": 35.359169006347656, |
|
"objective/non_score_reward": -1.767958402633667, |
|
"objective/rlhf_reward": 10.270247459411621, |
|
"objective/scores": 12.038206100463867, |
|
"policy/approxkl_avg": 0.002538530621677637, |
|
"policy/clipfrac_avg": 0.010970663279294968, |
|
"policy/entropy_avg": 1.1485331058502197, |
|
"step": 198, |
|
"val/clipfrac_avg": 0.0015574777498841286, |
|
"val/num_eos_tokens": 24137, |
|
"val/ratio": 0.9999282360076904, |
|
"val/ratio_var": 4.009260919701774e-06 |
|
}, |
|
{ |
|
"episode": 101888, |
|
"epoch": 0.8729266620973269, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0026365332305431366, |
|
"loss/value_avg": 0.12809592485427856, |
|
"lr": 6.796875e-07, |
|
"objective/entropy": -46.45412826538086, |
|
"objective/kl": 35.395294189453125, |
|
"objective/non_score_reward": -1.7697646617889404, |
|
"objective/rlhf_reward": 10.501349449157715, |
|
"objective/scores": 12.271114349365234, |
|
"policy/approxkl_avg": 0.0024031256325542927, |
|
"policy/clipfrac_avg": 0.010782474651932716, |
|
"policy/entropy_avg": 1.1210780143737793, |
|
"step": 199, |
|
"val/clipfrac_avg": 0.001396391773596406, |
|
"val/num_eos_tokens": 21839, |
|
"val/ratio": 1.0000112056732178, |
|
"val/ratio_var": 4.58755812360323e-06 |
|
}, |
|
{ |
|
"episode": 102400, |
|
"epoch": 0.8773132282385195, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004148578271269798, |
|
"loss/value_avg": 0.1365332007408142, |
|
"lr": 6.6796875e-07, |
|
"objective/entropy": -48.283546447753906, |
|
"objective/kl": 34.94371032714844, |
|
"objective/non_score_reward": -1.7471855878829956, |
|
"objective/rlhf_reward": 10.321309089660645, |
|
"objective/scores": 12.06849479675293, |
|
"policy/approxkl_avg": 0.002053589327260852, |
|
"policy/clipfrac_avg": 0.010953281074762344, |
|
"policy/entropy_avg": 1.1579573154449463, |
|
"step": 200, |
|
"val/clipfrac_avg": 0.001975214807316661, |
|
"val/num_eos_tokens": 23925, |
|
"val/ratio": 0.9997996091842651, |
|
"val/ratio_var": 4.9919021876121406e-06 |
|
}, |
|
{ |
|
"episode": 102912, |
|
"epoch": 0.8816997943797121, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0071252393536269665, |
|
"loss/value_avg": 0.15587176382541656, |
|
"lr": 6.562500000000001e-07, |
|
"objective/entropy": -48.567020416259766, |
|
"objective/kl": 34.97012710571289, |
|
"objective/non_score_reward": -1.7485063076019287, |
|
"objective/rlhf_reward": 10.221208572387695, |
|
"objective/scores": 11.969715118408203, |
|
"policy/approxkl_avg": 0.00224270299077034, |
|
"policy/clipfrac_avg": 0.010910441167652607, |
|
"policy/entropy_avg": 1.1749496459960938, |
|
"step": 201, |
|
"val/clipfrac_avg": 0.0020675300620496273, |
|
"val/num_eos_tokens": 22747, |
|
"val/ratio": 0.9998396635055542, |
|
"val/ratio_var": 3.2513178211956983e-06 |
|
}, |
|
{ |
|
"episode": 103424, |
|
"epoch": 0.8860863605209047, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002688491716980934, |
|
"loss/value_avg": 0.14362746477127075, |
|
"lr": 6.4453125e-07, |
|
"objective/entropy": -44.80986022949219, |
|
"objective/kl": 36.38422775268555, |
|
"objective/non_score_reward": -1.819211483001709, |
|
"objective/rlhf_reward": 10.282768249511719, |
|
"objective/scores": 12.101980209350586, |
|
"policy/approxkl_avg": 0.0024515336845070124, |
|
"policy/clipfrac_avg": 0.011868854984641075, |
|
"policy/entropy_avg": 1.0818434953689575, |
|
"step": 202, |
|
"val/clipfrac_avg": 0.002219648100435734, |
|
"val/num_eos_tokens": 21456, |
|
"val/ratio": 1.0001237392425537, |
|
"val/ratio_var": 6.081787887524115e-06 |
|
}, |
|
{ |
|
"episode": 103936, |
|
"epoch": 0.8904729266620973, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0033247251994907856, |
|
"loss/value_avg": 0.14334289729595184, |
|
"lr": 6.328125e-07, |
|
"objective/entropy": -46.834434509277344, |
|
"objective/kl": 35.398094177246094, |
|
"objective/non_score_reward": -1.769904613494873, |
|
"objective/rlhf_reward": 10.301237106323242, |
|
"objective/scores": 12.071142196655273, |
|
"policy/approxkl_avg": 0.0021492401137948036, |
|
"policy/clipfrac_avg": 0.012414924800395966, |
|
"policy/entropy_avg": 1.1050291061401367, |
|
"step": 203, |
|
"val/clipfrac_avg": 0.0018738624639809132, |
|
"val/num_eos_tokens": 22219, |
|
"val/ratio": 0.9999430179595947, |
|
"val/ratio_var": 4.555758096103091e-06 |
|
}, |
|
{ |
|
"episode": 104448, |
|
"epoch": 0.8948594928032899, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006169524043798447, |
|
"loss/value_avg": 0.14988917112350464, |
|
"lr": 6.210937500000001e-07, |
|
"objective/entropy": -45.63399124145508, |
|
"objective/kl": 36.69015884399414, |
|
"objective/non_score_reward": -1.834507942199707, |
|
"objective/rlhf_reward": 10.198760032653809, |
|
"objective/scores": 12.033267974853516, |
|
"policy/approxkl_avg": 0.002657091710716486, |
|
"policy/clipfrac_avg": 0.011476716957986355, |
|
"policy/entropy_avg": 1.1221380233764648, |
|
"step": 204, |
|
"val/clipfrac_avg": 0.002059329068288207, |
|
"val/num_eos_tokens": 23233, |
|
"val/ratio": 1.0008026361465454, |
|
"val/ratio_var": 3.8777681766077876e-05 |
|
}, |
|
{ |
|
"episode": 104960, |
|
"epoch": 0.8992460589444825, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0033749323338270187, |
|
"loss/value_avg": 0.13801273703575134, |
|
"lr": 6.09375e-07, |
|
"objective/entropy": -48.137725830078125, |
|
"objective/kl": 35.192413330078125, |
|
"objective/non_score_reward": -1.7596206665039062, |
|
"objective/rlhf_reward": 10.384641647338867, |
|
"objective/scores": 12.144262313842773, |
|
"policy/approxkl_avg": 0.002190415980294347, |
|
"policy/clipfrac_avg": 0.011061472818255424, |
|
"policy/entropy_avg": 1.1410572528839111, |
|
"step": 205, |
|
"val/clipfrac_avg": 0.0016738364938646555, |
|
"val/num_eos_tokens": 23974, |
|
"val/ratio": 1.0002024173736572, |
|
"val/ratio_var": 4.371653631096706e-06 |
|
}, |
|
{ |
|
"episode": 105472, |
|
"epoch": 0.9036326250856751, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00836427416652441, |
|
"loss/value_avg": 0.13613036274909973, |
|
"lr": 5.9765625e-07, |
|
"objective/entropy": -46.69685745239258, |
|
"objective/kl": 35.53401184082031, |
|
"objective/non_score_reward": -1.7767008543014526, |
|
"objective/rlhf_reward": 10.326574325561523, |
|
"objective/scores": 12.103275299072266, |
|
"policy/approxkl_avg": 0.002368978690356016, |
|
"policy/clipfrac_avg": 0.011249538511037827, |
|
"policy/entropy_avg": 1.1251120567321777, |
|
"step": 206, |
|
"val/clipfrac_avg": 0.0025745341554284096, |
|
"val/num_eos_tokens": 22087, |
|
"val/ratio": 0.999896228313446, |
|
"val/ratio_var": 5.420577963377582e-06 |
|
}, |
|
{ |
|
"episode": 105984, |
|
"epoch": 0.9080191912268677, |
|
"eps": 4, |
|
"loss/policy_avg": 0.008257454261183739, |
|
"loss/value_avg": 0.13564562797546387, |
|
"lr": 5.859375000000001e-07, |
|
"objective/entropy": -45.498809814453125, |
|
"objective/kl": 36.362369537353516, |
|
"objective/non_score_reward": -1.8181185722351074, |
|
"objective/rlhf_reward": 10.476442337036133, |
|
"objective/scores": 12.294561386108398, |
|
"policy/approxkl_avg": 0.0021072309464216232, |
|
"policy/clipfrac_avg": 0.010421659797430038, |
|
"policy/entropy_avg": 1.0942033529281616, |
|
"step": 207, |
|
"val/clipfrac_avg": 0.002059993799775839, |
|
"val/num_eos_tokens": 23421, |
|
"val/ratio": 1.0002628564834595, |
|
"val/ratio_var": 7.209391242213314e-06 |
|
}, |
|
{ |
|
"episode": 106496, |
|
"epoch": 0.9124057573680603, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001601784024387598, |
|
"loss/value_avg": 0.16962260007858276, |
|
"lr": 5.7421875e-07, |
|
"objective/entropy": -46.86063766479492, |
|
"objective/kl": 35.87854766845703, |
|
"objective/non_score_reward": -1.7939273118972778, |
|
"objective/rlhf_reward": 10.281155586242676, |
|
"objective/scores": 12.075082778930664, |
|
"policy/approxkl_avg": 0.0026243766769766808, |
|
"policy/clipfrac_avg": 0.011249695904552937, |
|
"policy/entropy_avg": 1.1037336587905884, |
|
"step": 208, |
|
"val/clipfrac_avg": 0.0016680224798619747, |
|
"val/num_eos_tokens": 22760, |
|
"val/ratio": 0.9998372197151184, |
|
"val/ratio_var": 4.619852916221134e-06 |
|
}, |
|
{ |
|
"episode": 107008, |
|
"epoch": 0.9167923235092529, |
|
"eps": 4, |
|
"loss/policy_avg": 0.003604589030146599, |
|
"loss/value_avg": 0.13913950324058533, |
|
"lr": 5.625e-07, |
|
"objective/entropy": -47.09626388549805, |
|
"objective/kl": 36.34856414794922, |
|
"objective/non_score_reward": -1.8174282312393188, |
|
"objective/rlhf_reward": 10.236056327819824, |
|
"objective/scores": 12.053484916687012, |
|
"policy/approxkl_avg": 0.0020685973577201366, |
|
"policy/clipfrac_avg": 0.010848737321794033, |
|
"policy/entropy_avg": 1.1276905536651611, |
|
"step": 209, |
|
"val/clipfrac_avg": 0.0022783444728702307, |
|
"val/num_eos_tokens": 22894, |
|
"val/ratio": 0.9998518228530884, |
|
"val/ratio_var": 2.5297301817772677e-06 |
|
}, |
|
{ |
|
"episode": 107520, |
|
"epoch": 0.9211788896504455, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004152492620050907, |
|
"loss/value_avg": 0.1309879571199417, |
|
"lr": 5.5078125e-07, |
|
"objective/entropy": -46.065147399902344, |
|
"objective/kl": 35.999778747558594, |
|
"objective/non_score_reward": -1.799989104270935, |
|
"objective/rlhf_reward": 10.247265815734863, |
|
"objective/scores": 12.04725456237793, |
|
"policy/approxkl_avg": 0.0024839870166033506, |
|
"policy/clipfrac_avg": 0.01161247305572033, |
|
"policy/entropy_avg": 1.1072354316711426, |
|
"step": 210, |
|
"val/clipfrac_avg": 0.002184495097026229, |
|
"val/num_eos_tokens": 22270, |
|
"val/ratio": 0.9998332858085632, |
|
"val/ratio_var": 4.201321189611917e-06 |
|
}, |
|
{ |
|
"episode": 108032, |
|
"epoch": 0.9255654557916381, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004213474225252867, |
|
"loss/value_avg": 0.1342218518257141, |
|
"lr": 5.390625e-07, |
|
"objective/entropy": -45.06540298461914, |
|
"objective/kl": 36.81282424926758, |
|
"objective/non_score_reward": -1.8406412601470947, |
|
"objective/rlhf_reward": 10.265863418579102, |
|
"objective/scores": 12.106504440307617, |
|
"policy/approxkl_avg": 0.0019367935601621866, |
|
"policy/clipfrac_avg": 0.011182930320501328, |
|
"policy/entropy_avg": 1.0617645978927612, |
|
"step": 211, |
|
"val/clipfrac_avg": 0.0014739616308361292, |
|
"val/num_eos_tokens": 23334, |
|
"val/ratio": 0.9999167323112488, |
|
"val/ratio_var": 3.8635080272797495e-06 |
|
}, |
|
{ |
|
"episode": 108544, |
|
"epoch": 0.9299520219328307, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002486391458660364, |
|
"loss/value_avg": 0.13077521324157715, |
|
"lr": 5.2734375e-07, |
|
"objective/entropy": -47.85795974731445, |
|
"objective/kl": 35.55070877075195, |
|
"objective/non_score_reward": -1.7775355577468872, |
|
"objective/rlhf_reward": 10.375813484191895, |
|
"objective/scores": 12.153348922729492, |
|
"policy/approxkl_avg": 0.002080064732581377, |
|
"policy/clipfrac_avg": 0.009690655395388603, |
|
"policy/entropy_avg": 1.1195100545883179, |
|
"step": 212, |
|
"val/clipfrac_avg": 0.0017993149813264608, |
|
"val/num_eos_tokens": 22692, |
|
"val/ratio": 0.9999110698699951, |
|
"val/ratio_var": 5.023774974688422e-06 |
|
}, |
|
{ |
|
"episode": 109056, |
|
"epoch": 0.9343385880740233, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0036895014345645905, |
|
"loss/value_avg": 0.132409006357193, |
|
"lr": 5.15625e-07, |
|
"objective/entropy": -47.587425231933594, |
|
"objective/kl": 35.982322692871094, |
|
"objective/non_score_reward": -1.7991161346435547, |
|
"objective/rlhf_reward": 10.343294143676758, |
|
"objective/scores": 12.142410278320312, |
|
"policy/approxkl_avg": 0.0029481318779289722, |
|
"policy/clipfrac_avg": 0.010319727472960949, |
|
"policy/entropy_avg": 1.1227898597717285, |
|
"step": 213, |
|
"val/clipfrac_avg": 0.0008735989686101675, |
|
"val/num_eos_tokens": 22491, |
|
"val/ratio": 0.9999507069587708, |
|
"val/ratio_var": 6.943369953660294e-06 |
|
}, |
|
{ |
|
"episode": 109568, |
|
"epoch": 0.9387251542152159, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001358766108751297, |
|
"loss/value_avg": 0.155538409948349, |
|
"lr": 5.0390625e-07, |
|
"objective/entropy": -47.061283111572266, |
|
"objective/kl": 35.63045120239258, |
|
"objective/non_score_reward": -1.781522512435913, |
|
"objective/rlhf_reward": 10.259801864624023, |
|
"objective/scores": 12.041324615478516, |
|
"policy/approxkl_avg": 0.0025432356633245945, |
|
"policy/clipfrac_avg": 0.011627338826656342, |
|
"policy/entropy_avg": 1.122612476348877, |
|
"step": 214, |
|
"val/clipfrac_avg": 0.0017226223135367036, |
|
"val/num_eos_tokens": 23435, |
|
"val/ratio": 0.9998894929885864, |
|
"val/ratio_var": 2.8884762741654413e-06 |
|
}, |
|
{ |
|
"episode": 110080, |
|
"epoch": 0.9431117203564084, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0025219814851880074, |
|
"loss/value_avg": 0.15307217836380005, |
|
"lr": 4.921875e-07, |
|
"objective/entropy": -47.419456481933594, |
|
"objective/kl": 35.47161102294922, |
|
"objective/non_score_reward": -1.773580551147461, |
|
"objective/rlhf_reward": 10.303951263427734, |
|
"objective/scores": 12.077531814575195, |
|
"policy/approxkl_avg": 0.002379181096330285, |
|
"policy/clipfrac_avg": 0.010707897134125233, |
|
"policy/entropy_avg": 1.1141124963760376, |
|
"step": 215, |
|
"val/clipfrac_avg": 0.002012796001508832, |
|
"val/num_eos_tokens": 22478, |
|
"val/ratio": 1.0004652738571167, |
|
"val/ratio_var": 8.836183951643761e-06 |
|
}, |
|
{ |
|
"episode": 110592, |
|
"epoch": 0.9474982864976011, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0005590170621871948, |
|
"loss/value_avg": 0.13267451524734497, |
|
"lr": 4.8046875e-07, |
|
"objective/entropy": -46.87737274169922, |
|
"objective/kl": 36.430267333984375, |
|
"objective/non_score_reward": -1.8215134143829346, |
|
"objective/rlhf_reward": 10.235672950744629, |
|
"objective/scores": 12.057186126708984, |
|
"policy/approxkl_avg": 0.0019323707092553377, |
|
"policy/clipfrac_avg": 0.010391879826784134, |
|
"policy/entropy_avg": 1.084083080291748, |
|
"step": 216, |
|
"val/clipfrac_avg": 0.0023698105942457914, |
|
"val/num_eos_tokens": 21561, |
|
"val/ratio": 1.0000874996185303, |
|
"val/ratio_var": 3.9766900954418816e-06 |
|
}, |
|
{ |
|
"episode": 111104, |
|
"epoch": 0.9518848526387937, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002204008400440216, |
|
"loss/value_avg": 0.12931928038597107, |
|
"lr": 4.6875e-07, |
|
"objective/entropy": -47.961090087890625, |
|
"objective/kl": 34.691619873046875, |
|
"objective/non_score_reward": -1.7345812320709229, |
|
"objective/rlhf_reward": 10.36070442199707, |
|
"objective/scores": 12.095285415649414, |
|
"policy/approxkl_avg": 0.002234948333352804, |
|
"policy/clipfrac_avg": 0.010548978112637997, |
|
"policy/entropy_avg": 1.117429494857788, |
|
"step": 217, |
|
"val/clipfrac_avg": 0.0020855211187154055, |
|
"val/num_eos_tokens": 23147, |
|
"val/ratio": 0.9999337792396545, |
|
"val/ratio_var": 3.004956170116202e-06 |
|
}, |
|
{ |
|
"episode": 111616, |
|
"epoch": 0.9562714187799863, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0032723252661526203, |
|
"loss/value_avg": 0.13847306370735168, |
|
"lr": 4.5703125e-07, |
|
"objective/entropy": -46.945213317871094, |
|
"objective/kl": 36.218875885009766, |
|
"objective/non_score_reward": -1.8109439611434937, |
|
"objective/rlhf_reward": 10.1881103515625, |
|
"objective/scores": 11.999053955078125, |
|
"policy/approxkl_avg": 0.0020719291642308235, |
|
"policy/clipfrac_avg": 0.010710496455430984, |
|
"policy/entropy_avg": 1.1050124168395996, |
|
"step": 218, |
|
"val/clipfrac_avg": 0.0017812025034800172, |
|
"val/num_eos_tokens": 22247, |
|
"val/ratio": 0.9995983839035034, |
|
"val/ratio_var": 3.6509884466795484e-06 |
|
}, |
|
{ |
|
"episode": 112128, |
|
"epoch": 0.9606579849211789, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0027086399495601654, |
|
"loss/value_avg": 0.13496465981006622, |
|
"lr": 4.453125e-07, |
|
"objective/entropy": -46.59452819824219, |
|
"objective/kl": 36.066009521484375, |
|
"objective/non_score_reward": -1.803300380706787, |
|
"objective/rlhf_reward": 10.2237548828125, |
|
"objective/scores": 12.027054786682129, |
|
"policy/approxkl_avg": 0.0025894979480654, |
|
"policy/clipfrac_avg": 0.010560642927885056, |
|
"policy/entropy_avg": 1.0761702060699463, |
|
"step": 219, |
|
"val/clipfrac_avg": 0.0019034635042771697, |
|
"val/num_eos_tokens": 23244, |
|
"val/ratio": 0.9996868968009949, |
|
"val/ratio_var": 2.6758764306578087e-06 |
|
}, |
|
{ |
|
"episode": 112640, |
|
"epoch": 0.9650445510623715, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0027356455102562904, |
|
"loss/value_avg": 0.1426704227924347, |
|
"lr": 4.3359375e-07, |
|
"objective/entropy": -48.531700134277344, |
|
"objective/kl": 36.28923034667969, |
|
"objective/non_score_reward": -1.8144614696502686, |
|
"objective/rlhf_reward": 10.108583450317383, |
|
"objective/scores": 11.92304515838623, |
|
"policy/approxkl_avg": 0.0024003470316529274, |
|
"policy/clipfrac_avg": 0.011623581871390343, |
|
"policy/entropy_avg": 1.1353890895843506, |
|
"step": 220, |
|
"val/clipfrac_avg": 0.0018647483084350824, |
|
"val/num_eos_tokens": 24624, |
|
"val/ratio": 0.9997994899749756, |
|
"val/ratio_var": 4.277234438632149e-06 |
|
}, |
|
{ |
|
"episode": 113152, |
|
"epoch": 0.9694311172035641, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007160656154155731, |
|
"loss/value_avg": 0.1320515125989914, |
|
"lr": 4.21875e-07, |
|
"objective/entropy": -46.6253662109375, |
|
"objective/kl": 35.65117645263672, |
|
"objective/non_score_reward": -1.7825589179992676, |
|
"objective/rlhf_reward": 10.30819320678711, |
|
"objective/scores": 12.090752601623535, |
|
"policy/approxkl_avg": 0.0027557946741580963, |
|
"policy/clipfrac_avg": 0.011182291433215141, |
|
"policy/entropy_avg": 1.102109432220459, |
|
"step": 221, |
|
"val/clipfrac_avg": 0.0012917916756123304, |
|
"val/num_eos_tokens": 24368, |
|
"val/ratio": 0.9995945692062378, |
|
"val/ratio_var": 2.0290351585572353e-06 |
|
}, |
|
{ |
|
"episode": 113664, |
|
"epoch": 0.9738176833447567, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0003069937229156494, |
|
"loss/value_avg": 0.15999022126197815, |
|
"lr": 4.1015625e-07, |
|
"objective/entropy": -48.70159149169922, |
|
"objective/kl": 35.106689453125, |
|
"objective/non_score_reward": -1.7553346157073975, |
|
"objective/rlhf_reward": 10.207782745361328, |
|
"objective/scores": 11.963117599487305, |
|
"policy/approxkl_avg": 0.0020455708727240562, |
|
"policy/clipfrac_avg": 0.010850891470909119, |
|
"policy/entropy_avg": 1.1390907764434814, |
|
"step": 222, |
|
"val/clipfrac_avg": 0.0014877349603921175, |
|
"val/num_eos_tokens": 24290, |
|
"val/ratio": 0.9998856782913208, |
|
"val/ratio_var": 3.1563197353534633e-06 |
|
}, |
|
{ |
|
"episode": 114176, |
|
"epoch": 0.9782042494859493, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002647135406732559, |
|
"loss/value_avg": 0.15004819631576538, |
|
"lr": 3.984375e-07, |
|
"objective/entropy": -48.554466247558594, |
|
"objective/kl": 35.3194580078125, |
|
"objective/non_score_reward": -1.7659728527069092, |
|
"objective/rlhf_reward": 10.268902778625488, |
|
"objective/scores": 12.034875869750977, |
|
"policy/approxkl_avg": 0.0024222272913903, |
|
"policy/clipfrac_avg": 0.012112165801227093, |
|
"policy/entropy_avg": 1.1444189548492432, |
|
"step": 223, |
|
"val/clipfrac_avg": 0.0013970729196444154, |
|
"val/num_eos_tokens": 24690, |
|
"val/ratio": 1.0001357793807983, |
|
"val/ratio_var": 5.459288331621792e-06 |
|
}, |
|
{ |
|
"episode": 114688, |
|
"epoch": 0.9825908156271419, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005581329111009836, |
|
"loss/value_avg": 0.13726581633090973, |
|
"lr": 3.8671875000000003e-07, |
|
"objective/entropy": -47.08404541015625, |
|
"objective/kl": 35.355735778808594, |
|
"objective/non_score_reward": -1.7677868604660034, |
|
"objective/rlhf_reward": 10.195508003234863, |
|
"objective/scores": 11.963294982910156, |
|
"policy/approxkl_avg": 0.00215457403101027, |
|
"policy/clipfrac_avg": 0.01111712958663702, |
|
"policy/entropy_avg": 1.0979572534561157, |
|
"step": 224, |
|
"val/clipfrac_avg": 0.0013495224993675947, |
|
"val/num_eos_tokens": 24061, |
|
"val/ratio": 0.9996528625488281, |
|
"val/ratio_var": 4.114271177968476e-06 |
|
}, |
|
{ |
|
"episode": 115200, |
|
"epoch": 0.9869773817683345, |
|
"eps": 4, |
|
"loss/policy_avg": 0.007961828261613846, |
|
"loss/value_avg": 0.14187262952327728, |
|
"lr": 3.75e-07, |
|
"objective/entropy": -48.313079833984375, |
|
"objective/kl": 35.43455505371094, |
|
"objective/non_score_reward": -1.7717278003692627, |
|
"objective/rlhf_reward": 10.25037670135498, |
|
"objective/scores": 12.022104263305664, |
|
"policy/approxkl_avg": 0.002115798881277442, |
|
"policy/clipfrac_avg": 0.010869062505662441, |
|
"policy/entropy_avg": 1.1199058294296265, |
|
"step": 225, |
|
"val/clipfrac_avg": 0.0015077003045007586, |
|
"val/num_eos_tokens": 24011, |
|
"val/ratio": 1.0002377033233643, |
|
"val/ratio_var": 8.308678843604866e-06 |
|
}, |
|
{ |
|
"episode": 115712, |
|
"epoch": 0.9913639479095271, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005859264172613621, |
|
"loss/value_avg": 0.14565059542655945, |
|
"lr": 3.6328125e-07, |
|
"objective/entropy": -45.33184051513672, |
|
"objective/kl": 36.10816955566406, |
|
"objective/non_score_reward": -1.8054085969924927, |
|
"objective/rlhf_reward": 10.235603332519531, |
|
"objective/scores": 12.041011810302734, |
|
"policy/approxkl_avg": 0.00207823165692389, |
|
"policy/clipfrac_avg": 0.01044812798500061, |
|
"policy/entropy_avg": 1.0732884407043457, |
|
"step": 226, |
|
"val/clipfrac_avg": 0.002065906533971429, |
|
"val/num_eos_tokens": 22806, |
|
"val/ratio": 1.0006978511810303, |
|
"val/ratio_var": 5.532293926080456e-06 |
|
}, |
|
{ |
|
"episode": 116224, |
|
"epoch": 0.9957505140507197, |
|
"eps": 4, |
|
"loss/policy_avg": 0.003005429171025753, |
|
"loss/value_avg": 0.14912931621074677, |
|
"lr": 3.515625e-07, |
|
"objective/entropy": -45.97642135620117, |
|
"objective/kl": 35.881935119628906, |
|
"objective/non_score_reward": -1.7940969467163086, |
|
"objective/rlhf_reward": 10.282238006591797, |
|
"objective/scores": 12.076334953308105, |
|
"policy/approxkl_avg": 0.002462461357936263, |
|
"policy/clipfrac_avg": 0.010989394038915634, |
|
"policy/entropy_avg": 1.076866626739502, |
|
"step": 227, |
|
"val/clipfrac_avg": 0.0014049881137907505, |
|
"val/num_eos_tokens": 22413, |
|
"val/ratio": 0.9997053146362305, |
|
"val/ratio_var": 6.198345545271877e-06 |
|
}, |
|
{ |
|
"episode": 116736, |
|
"epoch": 1.0001370801919123, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006791248917579651, |
|
"loss/value_avg": 0.13567443192005157, |
|
"lr": 3.3984375e-07, |
|
"objective/entropy": -47.08728790283203, |
|
"objective/kl": 35.24159622192383, |
|
"objective/non_score_reward": -1.7620798349380493, |
|
"objective/rlhf_reward": 10.378169059753418, |
|
"objective/scores": 12.140249252319336, |
|
"policy/approxkl_avg": 0.0016869257669895887, |
|
"policy/clipfrac_avg": 0.009463367983698845, |
|
"policy/entropy_avg": 1.0775138139724731, |
|
"step": 228, |
|
"val/clipfrac_avg": 0.0023141992278397083, |
|
"val/num_eos_tokens": 23909, |
|
"val/ratio": 0.9998793601989746, |
|
"val/ratio_var": 4.120348421565723e-06 |
|
}, |
|
{ |
|
"episode": 117248, |
|
"epoch": 1.0045236463331049, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0007241852581501007, |
|
"loss/value_avg": 0.14128395915031433, |
|
"lr": 3.2812500000000003e-07, |
|
"objective/entropy": -46.4316291809082, |
|
"objective/kl": 35.44841384887695, |
|
"objective/non_score_reward": -1.7724206447601318, |
|
"objective/rlhf_reward": 10.439834594726562, |
|
"objective/scores": 12.212255477905273, |
|
"policy/approxkl_avg": 0.0024355538189411163, |
|
"policy/clipfrac_avg": 0.010117866098880768, |
|
"policy/entropy_avg": 1.0649350881576538, |
|
"step": 229, |
|
"val/clipfrac_avg": 0.0018390328623354435, |
|
"val/num_eos_tokens": 23709, |
|
"val/ratio": 1.0000131130218506, |
|
"val/ratio_var": 4.8111301111930516e-06 |
|
}, |
|
{ |
|
"episode": 117760, |
|
"epoch": 1.0089102124742975, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006022402085363865, |
|
"loss/value_avg": 0.12594205141067505, |
|
"lr": 3.1640625e-07, |
|
"objective/entropy": -47.71858215332031, |
|
"objective/kl": 34.64881896972656, |
|
"objective/non_score_reward": -1.7324409484863281, |
|
"objective/rlhf_reward": 10.422674179077148, |
|
"objective/scores": 12.155115127563477, |
|
"policy/approxkl_avg": 0.002340013859793544, |
|
"policy/clipfrac_avg": 0.009795863181352615, |
|
"policy/entropy_avg": 1.0929601192474365, |
|
"step": 230, |
|
"val/clipfrac_avg": 0.0014780564233660698, |
|
"val/num_eos_tokens": 22730, |
|
"val/ratio": 0.9997072815895081, |
|
"val/ratio_var": 4.112517217436107e-06 |
|
}, |
|
{ |
|
"episode": 118272, |
|
"epoch": 1.01329677861549, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0035825008526444435, |
|
"loss/value_avg": 0.1515202820301056, |
|
"lr": 3.046875e-07, |
|
"objective/entropy": -47.19580078125, |
|
"objective/kl": 34.86238098144531, |
|
"objective/non_score_reward": -1.7431188821792603, |
|
"objective/rlhf_reward": 10.363493919372559, |
|
"objective/scores": 12.106613159179688, |
|
"policy/approxkl_avg": 0.0021840957924723625, |
|
"policy/clipfrac_avg": 0.010647189803421497, |
|
"policy/entropy_avg": 1.0970317125320435, |
|
"step": 231, |
|
"val/clipfrac_avg": 0.0015769560122862458, |
|
"val/num_eos_tokens": 23717, |
|
"val/ratio": 1.0001730918884277, |
|
"val/ratio_var": 6.174034751893487e-06 |
|
}, |
|
{ |
|
"episode": 118784, |
|
"epoch": 1.0176833447566827, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001027967780828476, |
|
"loss/value_avg": 0.12807387113571167, |
|
"lr": 2.9296875000000003e-07, |
|
"objective/entropy": -46.97780227661133, |
|
"objective/kl": 36.12638854980469, |
|
"objective/non_score_reward": -1.8063193559646606, |
|
"objective/rlhf_reward": 10.298711776733398, |
|
"objective/scores": 12.10503101348877, |
|
"policy/approxkl_avg": 0.0020326317753642797, |
|
"policy/clipfrac_avg": 0.010506751015782356, |
|
"policy/entropy_avg": 1.0705652236938477, |
|
"step": 232, |
|
"val/clipfrac_avg": 0.002296661026775837, |
|
"val/num_eos_tokens": 23196, |
|
"val/ratio": 0.9996442794799805, |
|
"val/ratio_var": 2.570702235971112e-06 |
|
}, |
|
{ |
|
"episode": 119296, |
|
"epoch": 1.0220699108978752, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006970872171223164, |
|
"loss/value_avg": 0.12893542647361755, |
|
"lr": 2.8125e-07, |
|
"objective/entropy": -47.03704071044922, |
|
"objective/kl": 35.707496643066406, |
|
"objective/non_score_reward": -1.7853751182556152, |
|
"objective/rlhf_reward": 10.371620178222656, |
|
"objective/scores": 12.15699577331543, |
|
"policy/approxkl_avg": 0.0026198499836027622, |
|
"policy/clipfrac_avg": 0.01155446469783783, |
|
"policy/entropy_avg": 1.0714943408966064, |
|
"step": 233, |
|
"val/clipfrac_avg": 0.0014992888318374753, |
|
"val/num_eos_tokens": 21060, |
|
"val/ratio": 0.9997165203094482, |
|
"val/ratio_var": 3.354538421262987e-06 |
|
}, |
|
{ |
|
"episode": 119808, |
|
"epoch": 1.0264564770390678, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0018435269594192505, |
|
"loss/value_avg": 0.1402617245912552, |
|
"lr": 2.6953125e-07, |
|
"objective/entropy": -47.30288314819336, |
|
"objective/kl": 35.687347412109375, |
|
"objective/non_score_reward": -1.784367322921753, |
|
"objective/rlhf_reward": 10.228802680969238, |
|
"objective/scores": 12.01317024230957, |
|
"policy/approxkl_avg": 0.0018239500932395458, |
|
"policy/clipfrac_avg": 0.010121040977537632, |
|
"policy/entropy_avg": 1.0836563110351562, |
|
"step": 234, |
|
"val/clipfrac_avg": 0.0013189124874770641, |
|
"val/num_eos_tokens": 22280, |
|
"val/ratio": 0.9998680353164673, |
|
"val/ratio_var": 2.612711568872328e-06 |
|
}, |
|
{ |
|
"episode": 120320, |
|
"epoch": 1.0308430431802604, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0069306232035160065, |
|
"loss/value_avg": 0.1296517699956894, |
|
"lr": 2.578125e-07, |
|
"objective/entropy": -46.29168701171875, |
|
"objective/kl": 35.88372802734375, |
|
"objective/non_score_reward": -1.7941863536834717, |
|
"objective/rlhf_reward": 10.230424880981445, |
|
"objective/scores": 12.024611473083496, |
|
"policy/approxkl_avg": 0.0034364780876785517, |
|
"policy/clipfrac_avg": 0.011023009195923805, |
|
"policy/entropy_avg": 1.062299132347107, |
|
"step": 235, |
|
"val/clipfrac_avg": 0.0019735856913030148, |
|
"val/num_eos_tokens": 22020, |
|
"val/ratio": 0.9997754096984863, |
|
"val/ratio_var": 5.2869540922984015e-06 |
|
}, |
|
{ |
|
"episode": 120832, |
|
"epoch": 1.035229609321453, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004135345108807087, |
|
"loss/value_avg": 0.14806649088859558, |
|
"lr": 2.4609375e-07, |
|
"objective/entropy": -46.469215393066406, |
|
"objective/kl": 36.00146484375, |
|
"objective/non_score_reward": -1.8000733852386475, |
|
"objective/rlhf_reward": 10.298752784729004, |
|
"objective/scores": 12.09882640838623, |
|
"policy/approxkl_avg": 0.0021121413446962833, |
|
"policy/clipfrac_avg": 0.011154447682201862, |
|
"policy/entropy_avg": 1.0819756984710693, |
|
"step": 236, |
|
"val/clipfrac_avg": 0.0024919421412050724, |
|
"val/num_eos_tokens": 22175, |
|
"val/ratio": 1.000499963760376, |
|
"val/ratio_var": 1.280141714232741e-05 |
|
}, |
|
{ |
|
"episode": 121344, |
|
"epoch": 1.0396161754626456, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0052656615152955055, |
|
"loss/value_avg": 0.1300714910030365, |
|
"lr": 2.34375e-07, |
|
"objective/entropy": -44.959388732910156, |
|
"objective/kl": 35.47280502319336, |
|
"objective/non_score_reward": -1.7736401557922363, |
|
"objective/rlhf_reward": 10.349384307861328, |
|
"objective/scores": 12.123023986816406, |
|
"policy/approxkl_avg": 0.0020022448152303696, |
|
"policy/clipfrac_avg": 0.01132948324084282, |
|
"policy/entropy_avg": 1.073385238647461, |
|
"step": 237, |
|
"val/clipfrac_avg": 0.0013668034225702286, |
|
"val/num_eos_tokens": 24303, |
|
"val/ratio": 1.0004512071609497, |
|
"val/ratio_var": 6.641071649937658e-06 |
|
}, |
|
{ |
|
"episode": 121856, |
|
"epoch": 1.0440027416038382, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0022417306900024414, |
|
"loss/value_avg": 0.1331208050251007, |
|
"lr": 2.2265625e-07, |
|
"objective/entropy": -46.06434631347656, |
|
"objective/kl": 35.35786437988281, |
|
"objective/non_score_reward": -1.7678931951522827, |
|
"objective/rlhf_reward": 10.399948120117188, |
|
"objective/scores": 12.167840957641602, |
|
"policy/approxkl_avg": 0.0024931542575359344, |
|
"policy/clipfrac_avg": 0.011008227244019508, |
|
"policy/entropy_avg": 1.0718121528625488, |
|
"step": 238, |
|
"val/clipfrac_avg": 0.0017535353545099497, |
|
"val/num_eos_tokens": 24789, |
|
"val/ratio": 0.9997462034225464, |
|
"val/ratio_var": 4.035917754663387e-06 |
|
}, |
|
{ |
|
"episode": 122368, |
|
"epoch": 1.0483893077450308, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006296452134847641, |
|
"loss/value_avg": 0.13648873567581177, |
|
"lr": 2.109375e-07, |
|
"objective/entropy": -47.731388092041016, |
|
"objective/kl": 35.815277099609375, |
|
"objective/non_score_reward": -1.7907638549804688, |
|
"objective/rlhf_reward": 10.327536582946777, |
|
"objective/scores": 12.118300437927246, |
|
"policy/approxkl_avg": 0.0019831331446766853, |
|
"policy/clipfrac_avg": 0.0104678338393569, |
|
"policy/entropy_avg": 1.0991512537002563, |
|
"step": 239, |
|
"val/clipfrac_avg": 0.0010415198048576713, |
|
"val/num_eos_tokens": 22777, |
|
"val/ratio": 1.0002403259277344, |
|
"val/ratio_var": 4.645487933885306e-06 |
|
}, |
|
{ |
|
"episode": 122880, |
|
"epoch": 1.0527758738862234, |
|
"eps": 4, |
|
"loss/policy_avg": 0.005139458924531937, |
|
"loss/value_avg": 0.1517227441072464, |
|
"lr": 1.9921875e-07, |
|
"objective/entropy": -46.621482849121094, |
|
"objective/kl": 34.758949279785156, |
|
"objective/non_score_reward": -1.7379475831985474, |
|
"objective/rlhf_reward": 10.491247177124023, |
|
"objective/scores": 12.229194641113281, |
|
"policy/approxkl_avg": 0.0020756043959409, |
|
"policy/clipfrac_avg": 0.01085658185184002, |
|
"policy/entropy_avg": 1.0900070667266846, |
|
"step": 240, |
|
"val/clipfrac_avg": 0.0016429282259196043, |
|
"val/num_eos_tokens": 24132, |
|
"val/ratio": 1.0001587867736816, |
|
"val/ratio_var": 4.0278300730278715e-06 |
|
}, |
|
{ |
|
"episode": 123392, |
|
"epoch": 1.057162440027416, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0028034071438014507, |
|
"loss/value_avg": 0.16330860555171967, |
|
"lr": 1.875e-07, |
|
"objective/entropy": -46.73743438720703, |
|
"objective/kl": 35.08587646484375, |
|
"objective/non_score_reward": -1.7542940378189087, |
|
"objective/rlhf_reward": 10.32256031036377, |
|
"objective/scores": 12.076854705810547, |
|
"policy/approxkl_avg": 0.0024638704489916563, |
|
"policy/clipfrac_avg": 0.010213883593678474, |
|
"policy/entropy_avg": 1.0772242546081543, |
|
"step": 241, |
|
"val/clipfrac_avg": 0.0014605314936488867, |
|
"val/num_eos_tokens": 23469, |
|
"val/ratio": 0.9998643398284912, |
|
"val/ratio_var": 3.752075599550153e-06 |
|
}, |
|
{ |
|
"episode": 123904, |
|
"epoch": 1.0615490061686086, |
|
"eps": 4, |
|
"loss/policy_avg": 0.004879022017121315, |
|
"loss/value_avg": 0.14888252317905426, |
|
"lr": 1.7578125e-07, |
|
"objective/entropy": -47.842430114746094, |
|
"objective/kl": 35.32581329345703, |
|
"objective/non_score_reward": -1.7662907838821411, |
|
"objective/rlhf_reward": 10.462313652038574, |
|
"objective/scores": 12.228604316711426, |
|
"policy/approxkl_avg": 0.0017638729186728597, |
|
"policy/clipfrac_avg": 0.011290816590189934, |
|
"policy/entropy_avg": 1.0770862102508545, |
|
"step": 242, |
|
"val/clipfrac_avg": 0.0015572316478937864, |
|
"val/num_eos_tokens": 22615, |
|
"val/ratio": 1.0004723072052002, |
|
"val/ratio_var": 6.782573564123595e-06 |
|
}, |
|
{ |
|
"episode": 124416, |
|
"epoch": 1.0659355723098012, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0035593826323747635, |
|
"loss/value_avg": 0.1295519769191742, |
|
"lr": 1.6406250000000002e-07, |
|
"objective/entropy": -45.786006927490234, |
|
"objective/kl": 35.52326202392578, |
|
"objective/non_score_reward": -1.7761629819869995, |
|
"objective/rlhf_reward": 10.437166213989258, |
|
"objective/scores": 12.213329315185547, |
|
"policy/approxkl_avg": 0.0021176172886043787, |
|
"policy/clipfrac_avg": 0.010787077248096466, |
|
"policy/entropy_avg": 1.096700668334961, |
|
"step": 243, |
|
"val/clipfrac_avg": 0.0014366672839969397, |
|
"val/num_eos_tokens": 22355, |
|
"val/ratio": 1.0001835823059082, |
|
"val/ratio_var": 6.8035760705242865e-06 |
|
}, |
|
{ |
|
"episode": 124928, |
|
"epoch": 1.0703221384509938, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001890458632260561, |
|
"loss/value_avg": 0.13558140397071838, |
|
"lr": 1.5234375e-07, |
|
"objective/entropy": -46.62615203857422, |
|
"objective/kl": 35.85779571533203, |
|
"objective/non_score_reward": -1.7928898334503174, |
|
"objective/rlhf_reward": 10.345151901245117, |
|
"objective/scores": 12.138041496276855, |
|
"policy/approxkl_avg": 0.002403097692877054, |
|
"policy/clipfrac_avg": 0.011035319417715073, |
|
"policy/entropy_avg": 1.0733857154846191, |
|
"step": 244, |
|
"val/clipfrac_avg": 0.001738998107612133, |
|
"val/num_eos_tokens": 21746, |
|
"val/ratio": 1.0001354217529297, |
|
"val/ratio_var": 2.896329078794224e-06 |
|
}, |
|
{ |
|
"episode": 125440, |
|
"epoch": 1.0747087045921864, |
|
"eps": 4, |
|
"loss/policy_avg": 0.002462470903992653, |
|
"loss/value_avg": 0.13766974210739136, |
|
"lr": 1.40625e-07, |
|
"objective/entropy": -45.997737884521484, |
|
"objective/kl": 35.553016662597656, |
|
"objective/non_score_reward": -1.7776508331298828, |
|
"objective/rlhf_reward": 10.40100383758545, |
|
"objective/scores": 12.178654670715332, |
|
"policy/approxkl_avg": 0.003136041574180126, |
|
"policy/clipfrac_avg": 0.010911045596003532, |
|
"policy/entropy_avg": 1.0624918937683105, |
|
"step": 245, |
|
"val/clipfrac_avg": 0.0014719897881150246, |
|
"val/num_eos_tokens": 22770, |
|
"val/ratio": 1.0002856254577637, |
|
"val/ratio_var": 9.461476111027878e-06 |
|
}, |
|
{ |
|
"episode": 125952, |
|
"epoch": 1.079095270733379, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00028504710644483566, |
|
"loss/value_avg": 0.14360447227954865, |
|
"lr": 1.2890625e-07, |
|
"objective/entropy": -46.9267463684082, |
|
"objective/kl": 35.44313049316406, |
|
"objective/non_score_reward": -1.7721564769744873, |
|
"objective/rlhf_reward": 10.315094947814941, |
|
"objective/scores": 12.087251663208008, |
|
"policy/approxkl_avg": 0.002328254049643874, |
|
"policy/clipfrac_avg": 0.009981741197407246, |
|
"policy/entropy_avg": 1.0975849628448486, |
|
"step": 246, |
|
"val/clipfrac_avg": 0.0021861500572413206, |
|
"val/num_eos_tokens": 24914, |
|
"val/ratio": 1.0002243518829346, |
|
"val/ratio_var": 5.957234407105716e-06 |
|
}, |
|
{ |
|
"episode": 126464, |
|
"epoch": 1.0834818368745716, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0003530774265527725, |
|
"loss/value_avg": 0.13446325063705444, |
|
"lr": 1.171875e-07, |
|
"objective/entropy": -46.55187225341797, |
|
"objective/kl": 35.19268798828125, |
|
"objective/non_score_reward": -1.7596344947814941, |
|
"objective/rlhf_reward": 10.49343490600586, |
|
"objective/scores": 12.253069877624512, |
|
"policy/approxkl_avg": 0.00239111902192235, |
|
"policy/clipfrac_avg": 0.010795432142913342, |
|
"policy/entropy_avg": 1.0839779376983643, |
|
"step": 247, |
|
"val/clipfrac_avg": 0.0015913444804027677, |
|
"val/num_eos_tokens": 23642, |
|
"val/ratio": 1.000253438949585, |
|
"val/ratio_var": 5.258910732663935e-06 |
|
}, |
|
{ |
|
"episode": 126976, |
|
"epoch": 1.0878684030157642, |
|
"eps": 4, |
|
"loss/policy_avg": -0.0018387939780950546, |
|
"loss/value_avg": 0.15468762814998627, |
|
"lr": 1.0546875e-07, |
|
"objective/entropy": -47.14482498168945, |
|
"objective/kl": 35.40525436401367, |
|
"objective/non_score_reward": -1.7702628374099731, |
|
"objective/rlhf_reward": 10.223344802856445, |
|
"objective/scores": 11.993607521057129, |
|
"policy/approxkl_avg": 0.0021429010666906834, |
|
"policy/clipfrac_avg": 0.010971945710480213, |
|
"policy/entropy_avg": 1.1065943241119385, |
|
"step": 248, |
|
"val/clipfrac_avg": 0.0013208456803113222, |
|
"val/num_eos_tokens": 22990, |
|
"val/ratio": 0.9996612071990967, |
|
"val/ratio_var": 4.5201718421594705e-06 |
|
}, |
|
{ |
|
"episode": 127488, |
|
"epoch": 1.0922549691569567, |
|
"eps": 4, |
|
"loss/policy_avg": 0.001225670799612999, |
|
"loss/value_avg": 0.1466779112815857, |
|
"lr": 9.375e-08, |
|
"objective/entropy": -45.93384552001953, |
|
"objective/kl": 35.7270622253418, |
|
"objective/non_score_reward": -1.7863531112670898, |
|
"objective/rlhf_reward": 10.394562721252441, |
|
"objective/scores": 12.180915832519531, |
|
"policy/approxkl_avg": 0.002070215530693531, |
|
"policy/clipfrac_avg": 0.010630708187818527, |
|
"policy/entropy_avg": 1.0799144506454468, |
|
"step": 249, |
|
"val/clipfrac_avg": 0.0013516065664589405, |
|
"val/num_eos_tokens": 23734, |
|
"val/ratio": 1.0008833408355713, |
|
"val/ratio_var": 2.696674710023217e-05 |
|
}, |
|
{ |
|
"episode": 128000, |
|
"epoch": 1.0966415352981493, |
|
"eps": 4, |
|
"loss/policy_avg": 0.006959846243262291, |
|
"loss/value_avg": 0.134637713432312, |
|
"lr": 8.203125000000001e-08, |
|
"objective/entropy": -45.95152282714844, |
|
"objective/kl": 35.22590637207031, |
|
"objective/non_score_reward": -1.7612950801849365, |
|
"objective/rlhf_reward": 10.304990768432617, |
|
"objective/scores": 12.066286087036133, |
|
"policy/approxkl_avg": 0.0024010255001485348, |
|
"policy/clipfrac_avg": 0.011238099075853825, |
|
"policy/entropy_avg": 1.0736041069030762, |
|
"step": 250, |
|
"val/clipfrac_avg": 0.0017618590500205755, |
|
"val/num_eos_tokens": 22063, |
|
"val/ratio": 0.9997179508209229, |
|
"val/ratio_var": 2.0375098301883554e-06 |
|
}, |
|
{ |
|
"episode": 128512, |
|
"epoch": 1.101028101439342, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0056634098291397095, |
|
"loss/value_avg": 0.14280955493450165, |
|
"lr": 7.03125e-08, |
|
"objective/entropy": -47.430274963378906, |
|
"objective/kl": 35.55634307861328, |
|
"objective/non_score_reward": -1.7778171300888062, |
|
"objective/rlhf_reward": 10.224863052368164, |
|
"objective/scores": 12.002679824829102, |
|
"policy/approxkl_avg": 0.0028075524605810642, |
|
"policy/clipfrac_avg": 0.010713065043091774, |
|
"policy/entropy_avg": 1.1045918464660645, |
|
"step": 251, |
|
"val/clipfrac_avg": 0.0016868215752765536, |
|
"val/num_eos_tokens": 23025, |
|
"val/ratio": 0.999710202217102, |
|
"val/ratio_var": 7.573556104034651e-06 |
|
}, |
|
{ |
|
"episode": 129024, |
|
"epoch": 1.1054146675805345, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0015937616117298603, |
|
"loss/value_avg": 0.1460372358560562, |
|
"lr": 5.859375e-08, |
|
"objective/entropy": -47.605384826660156, |
|
"objective/kl": 35.3289794921875, |
|
"objective/non_score_reward": -1.766448974609375, |
|
"objective/rlhf_reward": 10.445650100708008, |
|
"objective/scores": 12.212099075317383, |
|
"policy/approxkl_avg": 0.0021890033967792988, |
|
"policy/clipfrac_avg": 0.009831791743636131, |
|
"policy/entropy_avg": 1.072847604751587, |
|
"step": 252, |
|
"val/clipfrac_avg": 0.0019376241834834218, |
|
"val/num_eos_tokens": 20707, |
|
"val/ratio": 1.0002546310424805, |
|
"val/ratio_var": 5.4702413763152435e-06 |
|
}, |
|
{ |
|
"episode": 129536, |
|
"epoch": 1.1098012337217271, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0011267205700278282, |
|
"loss/value_avg": 0.13995370268821716, |
|
"lr": 4.6875e-08, |
|
"objective/entropy": -47.466609954833984, |
|
"objective/kl": 35.21324920654297, |
|
"objective/non_score_reward": -1.76066255569458, |
|
"objective/rlhf_reward": 10.34493637084961, |
|
"objective/scores": 12.105599403381348, |
|
"policy/approxkl_avg": 0.002287943847477436, |
|
"policy/clipfrac_avg": 0.011924582533538342, |
|
"policy/entropy_avg": 1.0961564779281616, |
|
"step": 253, |
|
"val/clipfrac_avg": 0.0030874176882207394, |
|
"val/num_eos_tokens": 23391, |
|
"val/ratio": 0.9999744296073914, |
|
"val/ratio_var": 4.732663910544943e-06 |
|
}, |
|
{ |
|
"episode": 130048, |
|
"epoch": 1.1141877998629197, |
|
"eps": 4, |
|
"loss/policy_avg": 0.00187746062874794, |
|
"loss/value_avg": 0.16102451086044312, |
|
"lr": 3.515625e-08, |
|
"objective/entropy": -46.87982940673828, |
|
"objective/kl": 35.481868743896484, |
|
"objective/non_score_reward": -1.774093508720398, |
|
"objective/rlhf_reward": 10.216583251953125, |
|
"objective/scores": 11.990676879882812, |
|
"policy/approxkl_avg": 0.002129252767190337, |
|
"policy/clipfrac_avg": 0.011353913694620132, |
|
"policy/entropy_avg": 1.085520625114441, |
|
"step": 254, |
|
"val/clipfrac_avg": 0.002106403699144721, |
|
"val/num_eos_tokens": 24079, |
|
"val/ratio": 1.0003963708877563, |
|
"val/ratio_var": 6.27804001851473e-06 |
|
}, |
|
{ |
|
"episode": 130560, |
|
"epoch": 1.1185743660041123, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0023504868149757385, |
|
"loss/value_avg": 0.1395518034696579, |
|
"lr": 2.34375e-08, |
|
"objective/entropy": -47.189300537109375, |
|
"objective/kl": 35.595664978027344, |
|
"objective/non_score_reward": -1.7797832489013672, |
|
"objective/rlhf_reward": 10.323637008666992, |
|
"objective/scores": 12.10342025756836, |
|
"policy/approxkl_avg": 0.0020733638666570187, |
|
"policy/clipfrac_avg": 0.01057223230600357, |
|
"policy/entropy_avg": 1.1228654384613037, |
|
"step": 255, |
|
"val/clipfrac_avg": 0.0016230044420808554, |
|
"val/num_eos_tokens": 24372, |
|
"val/ratio": 0.9999302625656128, |
|
"val/ratio_var": 5.258788405626547e-06 |
|
}, |
|
{ |
|
"episode": 131072, |
|
"epoch": 1.122960932145305, |
|
"eps": 4, |
|
"loss/policy_avg": 0.0048648901283741, |
|
"loss/value_avg": 0.15009374916553497, |
|
"lr": 1.171875e-08, |
|
"objective/entropy": -45.09252166748047, |
|
"objective/kl": 35.825538635253906, |
|
"objective/non_score_reward": -1.7912769317626953, |
|
"objective/rlhf_reward": 10.314563751220703, |
|
"objective/scores": 12.105840682983398, |
|
"policy/approxkl_avg": 0.0024984171614050865, |
|
"policy/clipfrac_avg": 0.010642223991453648, |
|
"policy/entropy_avg": 1.051030158996582, |
|
"step": 256, |
|
"val/clipfrac_avg": 0.0016988374991342425, |
|
"val/num_eos_tokens": 22804, |
|
"val/ratio": 1.0000357627868652, |
|
"val/ratio_var": 4.6805571400909685e-06 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 256, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1.122960932145305, |
|
"save_steps": 52, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0, |
|
"train_batch_size": null, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|