Muqeeth's picture
Upload files
35d6da8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"episode": 131072,
"epoch": 1.122960932145305,
"eval_steps": 500,
"global_step": 256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 512,
"epoch": 0.004386566141192597,
"eps": 5,
"loss/policy_avg": -0.0013769641518592834,
"loss/value_avg": 1.9423742294311523,
"lr": 3e-06,
"objective/entropy": -56.19151306152344,
"objective/kl": 2.391636371612549e-06,
"objective/non_score_reward": -1.1958181289628556e-07,
"objective/rlhf_reward": 4.956933975219727,
"objective/scores": 4.956933975219727,
"policy/approxkl_avg": 0.001525376457720995,
"policy/clipfrac_avg": 0.022334059700369835,
"policy/entropy_avg": 1.2976276874542236,
"step": 1,
"val/clipfrac_avg": 0.006576753221452236,
"val/num_eos_tokens": 13845,
"val/ratio": 0.9997893571853638,
"val/ratio_var": 4.300739419704769e-06
},
{
"episode": 1024,
"epoch": 0.008773132282385195,
"eps": 6,
"loss/policy_avg": 0.0031125713139772415,
"loss/value_avg": 1.3287988901138306,
"lr": 2.9882812500000002e-06,
"objective/entropy": 7.951052665710449,
"objective/kl": 0.7057143449783325,
"objective/non_score_reward": -0.035285718739032745,
"objective/rlhf_reward": 5.132782936096191,
"objective/scores": 5.168068885803223,
"policy/approxkl_avg": 0.0028971810825169086,
"policy/clipfrac_avg": 0.02630750462412834,
"policy/entropy_avg": 0.9777142405509949,
"step": 2,
"val/clipfrac_avg": 0.017064230516552925,
"val/num_eos_tokens": 8612,
"val/ratio": 1.000025749206543,
"val/ratio_var": 1.2729425179713871e-05
},
{
"episode": 1536,
"epoch": 0.013159698423577794,
"eps": 6,
"loss/policy_avg": 0.004569530487060547,
"loss/value_avg": 1.8551630973815918,
"lr": 2.9765625e-06,
"objective/entropy": 6.086846351623535,
"objective/kl": 1.7537505626678467,
"objective/non_score_reward": -0.08768752217292786,
"objective/rlhf_reward": 5.270642280578613,
"objective/scores": 5.358329772949219,
"policy/approxkl_avg": 0.0035523450933396816,
"policy/clipfrac_avg": 0.02881922945380211,
"policy/entropy_avg": 0.99476557970047,
"step": 3,
"val/clipfrac_avg": 0.023893361911177635,
"val/num_eos_tokens": 8606,
"val/ratio": 0.999504804611206,
"val/ratio_var": 2.043731728917919e-05
},
{
"episode": 2048,
"epoch": 0.01754626456477039,
"eps": 6,
"loss/policy_avg": 0.00686279870569706,
"loss/value_avg": 2.0762438774108887,
"lr": 2.96484375e-06,
"objective/entropy": -72.34222412109375,
"objective/kl": 2.7035207748413086,
"objective/non_score_reward": -0.13517603278160095,
"objective/rlhf_reward": 5.987885475158691,
"objective/scores": 6.123061656951904,
"policy/approxkl_avg": 0.0016497070901095867,
"policy/clipfrac_avg": 0.02908516302704811,
"policy/entropy_avg": 1.7031397819519043,
"step": 4,
"val/clipfrac_avg": 0.027677347883582115,
"val/num_eos_tokens": 14870,
"val/ratio": 1.000091552734375,
"val/ratio_var": 2.1516761989914812e-06
},
{
"episode": 2560,
"epoch": 0.02193283070596299,
"eps": 6,
"loss/policy_avg": 0.00263179000467062,
"loss/value_avg": 1.606541633605957,
"lr": 2.953125e-06,
"objective/entropy": -65.28475952148438,
"objective/kl": 4.556520462036133,
"objective/non_score_reward": -0.22782602906227112,
"objective/rlhf_reward": 6.3742218017578125,
"objective/scores": 6.602047920227051,
"policy/approxkl_avg": 0.0017431321321055293,
"policy/clipfrac_avg": 0.02620251104235649,
"policy/entropy_avg": 2.0425772666931152,
"step": 5,
"val/clipfrac_avg": 0.023111186921596527,
"val/num_eos_tokens": 13325,
"val/ratio": 0.9996423721313477,
"val/ratio_var": 2.722375029406976e-06
},
{
"episode": 3072,
"epoch": 0.026319396847155587,
"eps": 6,
"loss/policy_avg": 0.0040515996515750885,
"loss/value_avg": 1.1282833814620972,
"lr": 2.94140625e-06,
"objective/entropy": -56.7620964050293,
"objective/kl": 10.2549467086792,
"objective/non_score_reward": -0.5127473473548889,
"objective/rlhf_reward": 7.096510887145996,
"objective/scores": 7.60925817489624,
"policy/approxkl_avg": 0.002651135204359889,
"policy/clipfrac_avg": 0.02610427513718605,
"policy/entropy_avg": 2.47774600982666,
"step": 6,
"val/clipfrac_avg": 0.01816035807132721,
"val/num_eos_tokens": 20916,
"val/ratio": 0.9997857809066772,
"val/ratio_var": 3.7805486954312073e-06
},
{
"episode": 3584,
"epoch": 0.030705962988348184,
"eps": 6,
"loss/policy_avg": -0.0022932137362658978,
"loss/value_avg": 0.9223657846450806,
"lr": 2.9296875e-06,
"objective/entropy": -51.8396110534668,
"objective/kl": 15.692605972290039,
"objective/non_score_reward": -0.784630298614502,
"objective/rlhf_reward": 7.749014377593994,
"objective/scores": 8.533644676208496,
"policy/approxkl_avg": 0.002274099038913846,
"policy/clipfrac_avg": 0.025012236088514328,
"policy/entropy_avg": 2.5470399856567383,
"step": 7,
"val/clipfrac_avg": 0.018111344426870346,
"val/num_eos_tokens": 27788,
"val/ratio": 1.0004068613052368,
"val/ratio_var": 3.3165326840389753e-06
},
{
"episode": 4096,
"epoch": 0.03509252912954078,
"eps": 5,
"loss/policy_avg": -0.02864566072821617,
"loss/value_avg": 0.8586666584014893,
"lr": 2.91796875e-06,
"objective/entropy": -44.25556564331055,
"objective/kl": 18.9859619140625,
"objective/non_score_reward": -0.9492980241775513,
"objective/rlhf_reward": 7.566014289855957,
"objective/scores": 8.515312194824219,
"policy/approxkl_avg": 0.0024859202094376087,
"policy/clipfrac_avg": 0.02374047227203846,
"policy/entropy_avg": 2.4501914978027344,
"step": 8,
"val/clipfrac_avg": 0.012094689533114433,
"val/num_eos_tokens": 39697,
"val/ratio": 0.9997825622558594,
"val/ratio_var": 2.589023324617301e-06
},
{
"episode": 4608,
"epoch": 0.03947909527073338,
"eps": 5,
"loss/policy_avg": -0.03993895649909973,
"loss/value_avg": 0.8039427995681763,
"lr": 2.90625e-06,
"objective/entropy": -41.25428771972656,
"objective/kl": 21.300134658813477,
"objective/non_score_reward": -1.0650067329406738,
"objective/rlhf_reward": 7.658048152923584,
"objective/scores": 8.723054885864258,
"policy/approxkl_avg": 0.002434398978948593,
"policy/clipfrac_avg": 0.020647358149290085,
"policy/entropy_avg": 2.318600654602051,
"step": 9,
"val/clipfrac_avg": 0.012456808239221573,
"val/num_eos_tokens": 38279,
"val/ratio": 0.9994508028030396,
"val/ratio_var": 2.497590912753367e-06
},
{
"episode": 5120,
"epoch": 0.04386566141192598,
"eps": 5,
"loss/policy_avg": -0.038270700722932816,
"loss/value_avg": 0.7831208109855652,
"lr": 2.89453125e-06,
"objective/entropy": -40.375099182128906,
"objective/kl": 23.66585922241211,
"objective/non_score_reward": -1.1832929849624634,
"objective/rlhf_reward": 7.1149516105651855,
"objective/scores": 8.29824447631836,
"policy/approxkl_avg": 0.0023879953660070896,
"policy/clipfrac_avg": 0.02045682817697525,
"policy/entropy_avg": 2.171109199523926,
"step": 10,
"val/clipfrac_avg": 0.01572496071457863,
"val/num_eos_tokens": 36087,
"val/ratio": 0.9995651245117188,
"val/ratio_var": 4.97030487167649e-06
},
{
"episode": 5632,
"epoch": 0.048252227553118573,
"eps": 5,
"loss/policy_avg": -0.03474128991365433,
"loss/value_avg": 0.7656794786453247,
"lr": 2.8828125e-06,
"objective/entropy": -43.84745788574219,
"objective/kl": 24.729175567626953,
"objective/non_score_reward": -1.2364587783813477,
"objective/rlhf_reward": 7.341504096984863,
"objective/scores": 8.577962875366211,
"policy/approxkl_avg": 0.0027273856103420258,
"policy/clipfrac_avg": 0.022427301853895187,
"policy/entropy_avg": 2.208101272583008,
"step": 11,
"val/clipfrac_avg": 0.016871843487024307,
"val/num_eos_tokens": 36666,
"val/ratio": 0.9991950988769531,
"val/ratio_var": 4.898628048977116e-06
},
{
"episode": 6144,
"epoch": 0.052638793694311174,
"eps": 5,
"loss/policy_avg": -0.0249569620937109,
"loss/value_avg": 0.7158781290054321,
"lr": 2.87109375e-06,
"objective/entropy": -48.97076416015625,
"objective/kl": 25.502344131469727,
"objective/non_score_reward": -1.27511727809906,
"objective/rlhf_reward": 7.928722858428955,
"objective/scores": 9.203840255737305,
"policy/approxkl_avg": 0.0023700755555182695,
"policy/clipfrac_avg": 0.02416691742837429,
"policy/entropy_avg": 2.2255916595458984,
"step": 12,
"val/clipfrac_avg": 0.014078151434659958,
"val/num_eos_tokens": 37746,
"val/ratio": 0.9998907446861267,
"val/ratio_var": 5.1026213441218715e-06
},
{
"episode": 6656,
"epoch": 0.05702535983550377,
"eps": 5,
"loss/policy_avg": -0.018949007615447044,
"loss/value_avg": 0.7686400413513184,
"lr": 2.859375e-06,
"objective/entropy": -47.11943817138672,
"objective/kl": 27.999988555908203,
"objective/non_score_reward": -1.3999994993209839,
"objective/rlhf_reward": 7.812624931335449,
"objective/scores": 9.212624549865723,
"policy/approxkl_avg": 0.002762872725725174,
"policy/clipfrac_avg": 0.024748779833316803,
"policy/entropy_avg": 2.155306339263916,
"step": 13,
"val/clipfrac_avg": 0.015288694761693478,
"val/num_eos_tokens": 36580,
"val/ratio": 1.000169277191162,
"val/ratio_var": 5.652353593177395e-06
},
{
"episode": 7168,
"epoch": 0.06141192597669637,
"eps": 5,
"loss/policy_avg": -0.009994697757065296,
"loss/value_avg": 0.7283180952072144,
"lr": 2.84765625e-06,
"objective/entropy": -53.18806457519531,
"objective/kl": 27.922462463378906,
"objective/non_score_reward": -1.3961231708526611,
"objective/rlhf_reward": 8.238632202148438,
"objective/scores": 9.63475513458252,
"policy/approxkl_avg": 0.002478944603353739,
"policy/clipfrac_avg": 0.021354785189032555,
"policy/entropy_avg": 2.150880813598633,
"step": 14,
"val/clipfrac_avg": 0.012931845150887966,
"val/num_eos_tokens": 36004,
"val/ratio": 1.0006251335144043,
"val/ratio_var": 7.0911496550252195e-06
},
{
"episode": 7680,
"epoch": 0.06579849211788896,
"eps": 5,
"loss/policy_avg": -0.006545604206621647,
"loss/value_avg": 0.6837283968925476,
"lr": 2.8359375e-06,
"objective/entropy": -51.048362731933594,
"objective/kl": 29.153331756591797,
"objective/non_score_reward": -1.4576666355133057,
"objective/rlhf_reward": 8.508513450622559,
"objective/scores": 9.966179847717285,
"policy/approxkl_avg": 0.003793728072196245,
"policy/clipfrac_avg": 0.023298773914575577,
"policy/entropy_avg": 2.1069443225860596,
"step": 15,
"val/clipfrac_avg": 0.016515308991074562,
"val/num_eos_tokens": 33784,
"val/ratio": 1.0005180835723877,
"val/ratio_var": 1.2400751074892469e-05
},
{
"episode": 8192,
"epoch": 0.07018505825908156,
"eps": 5,
"loss/policy_avg": -0.010740559548139572,
"loss/value_avg": 0.6928367614746094,
"lr": 2.82421875e-06,
"objective/entropy": -50.633888244628906,
"objective/kl": 29.69721794128418,
"objective/non_score_reward": -1.484860897064209,
"objective/rlhf_reward": 8.630350112915039,
"objective/scores": 10.115211486816406,
"policy/approxkl_avg": 0.002925678389146924,
"policy/clipfrac_avg": 0.022546332329511642,
"policy/entropy_avg": 2.0426688194274902,
"step": 16,
"val/clipfrac_avg": 0.01313821505755186,
"val/num_eos_tokens": 34515,
"val/ratio": 0.9999200105667114,
"val/ratio_var": 3.5139350984536577e-06
},
{
"episode": 8704,
"epoch": 0.07457162440027416,
"eps": 5,
"loss/policy_avg": -0.0003042006865143776,
"loss/value_avg": 0.630737841129303,
"lr": 2.8125e-06,
"objective/entropy": -54.703208923339844,
"objective/kl": 29.418441772460938,
"objective/non_score_reward": -1.4709219932556152,
"objective/rlhf_reward": 8.782114028930664,
"objective/scores": 10.253036499023438,
"policy/approxkl_avg": 0.0034269755706191063,
"policy/clipfrac_avg": 0.021969493478536606,
"policy/entropy_avg": 2.0263655185699463,
"step": 17,
"val/clipfrac_avg": 0.013666579499840736,
"val/num_eos_tokens": 32440,
"val/ratio": 0.9990756511688232,
"val/ratio_var": 5.783027063444024e-06
},
{
"episode": 9216,
"epoch": 0.07895819054146676,
"eps": 5,
"loss/policy_avg": -0.0013809381052851677,
"loss/value_avg": 0.625673234462738,
"lr": 2.80078125e-06,
"objective/entropy": -50.05335235595703,
"objective/kl": 32.24420166015625,
"objective/non_score_reward": -1.6122100353240967,
"objective/rlhf_reward": 8.869917869567871,
"objective/scores": 10.482128143310547,
"policy/approxkl_avg": 0.0037200015503913164,
"policy/clipfrac_avg": 0.023221854120492935,
"policy/entropy_avg": 1.9334304332733154,
"step": 18,
"val/clipfrac_avg": 0.012253889814019203,
"val/num_eos_tokens": 32755,
"val/ratio": 1.0010600090026855,
"val/ratio_var": 1.8864358935388736e-05
},
{
"episode": 9728,
"epoch": 0.08334475668265935,
"eps": 5,
"loss/policy_avg": -0.001600255724042654,
"loss/value_avg": 0.5548046231269836,
"lr": 2.7890625e-06,
"objective/entropy": -52.448543548583984,
"objective/kl": 31.141429901123047,
"objective/non_score_reward": -1.5570714473724365,
"objective/rlhf_reward": 9.025203704833984,
"objective/scores": 10.582275390625,
"policy/approxkl_avg": 0.00342507753521204,
"policy/clipfrac_avg": 0.022851863875985146,
"policy/entropy_avg": 1.9168764352798462,
"step": 19,
"val/clipfrac_avg": 0.011491503566503525,
"val/num_eos_tokens": 33170,
"val/ratio": 0.999735951423645,
"val/ratio_var": 5.808557489217492e-06
},
{
"episode": 10240,
"epoch": 0.08773132282385196,
"eps": 5,
"loss/policy_avg": 0.00035352353006601334,
"loss/value_avg": 0.5776740312576294,
"lr": 2.77734375e-06,
"objective/entropy": -49.64569854736328,
"objective/kl": 32.26124572753906,
"objective/non_score_reward": -1.6130623817443848,
"objective/rlhf_reward": 9.020620346069336,
"objective/scores": 10.633682250976562,
"policy/approxkl_avg": 0.003732402576133609,
"policy/clipfrac_avg": 0.02131691202521324,
"policy/entropy_avg": 1.8495761156082153,
"step": 20,
"val/clipfrac_avg": 0.014484411105513573,
"val/num_eos_tokens": 32124,
"val/ratio": 1.000256061553955,
"val/ratio_var": 1.3850245522917248e-05
},
{
"episode": 10752,
"epoch": 0.09211788896504455,
"eps": 5,
"loss/policy_avg": 0.004326590336859226,
"loss/value_avg": 0.5091855525970459,
"lr": 2.765625e-06,
"objective/entropy": -52.526283264160156,
"objective/kl": 32.356727600097656,
"objective/non_score_reward": -1.6178364753723145,
"objective/rlhf_reward": 8.98788070678711,
"objective/scores": 10.605716705322266,
"policy/approxkl_avg": 0.002898063976317644,
"policy/clipfrac_avg": 0.02005818486213684,
"policy/entropy_avg": 1.8272948265075684,
"step": 21,
"val/clipfrac_avg": 0.010025454685091972,
"val/num_eos_tokens": 31636,
"val/ratio": 0.9998326301574707,
"val/ratio_var": 4.743439149024198e-06
},
{
"episode": 11264,
"epoch": 0.09650445510623715,
"eps": 5,
"loss/policy_avg": 0.001326502999290824,
"loss/value_avg": 0.5260515213012695,
"lr": 2.75390625e-06,
"objective/entropy": -50.08943557739258,
"objective/kl": 32.325721740722656,
"objective/non_score_reward": -1.6162861585617065,
"objective/rlhf_reward": 9.08689022064209,
"objective/scores": 10.703176498413086,
"policy/approxkl_avg": 0.004229161888360977,
"policy/clipfrac_avg": 0.019012872129678726,
"policy/entropy_avg": 1.7281861305236816,
"step": 22,
"val/clipfrac_avg": 0.012183602899312973,
"val/num_eos_tokens": 30727,
"val/ratio": 0.9997547268867493,
"val/ratio_var": 5.818676982016768e-06
},
{
"episode": 11776,
"epoch": 0.10089102124742974,
"eps": 5,
"loss/policy_avg": 0.0023662205785512924,
"loss/value_avg": 0.4883805811405182,
"lr": 2.7421875e-06,
"objective/entropy": -53.46156692504883,
"objective/kl": 31.849475860595703,
"objective/non_score_reward": -1.5924739837646484,
"objective/rlhf_reward": 9.090110778808594,
"objective/scores": 10.682584762573242,
"policy/approxkl_avg": 0.003692931029945612,
"policy/clipfrac_avg": 0.01948227360844612,
"policy/entropy_avg": 1.7976675033569336,
"step": 23,
"val/clipfrac_avg": 0.012527575716376305,
"val/num_eos_tokens": 31978,
"val/ratio": 1.0005892515182495,
"val/ratio_var": 5.0292437663301826e-05
},
{
"episode": 12288,
"epoch": 0.10527758738862235,
"eps": 5,
"loss/policy_avg": -0.002815414220094681,
"loss/value_avg": 0.47801172733306885,
"lr": 2.73046875e-06,
"objective/entropy": -54.20009231567383,
"objective/kl": 30.75657081604004,
"objective/non_score_reward": -1.5378286838531494,
"objective/rlhf_reward": 9.155134201049805,
"objective/scores": 10.692962646484375,
"policy/approxkl_avg": 0.0031356574036180973,
"policy/clipfrac_avg": 0.01814776286482811,
"policy/entropy_avg": 1.7910568714141846,
"step": 24,
"val/clipfrac_avg": 0.0124925896525383,
"val/num_eos_tokens": 30355,
"val/ratio": 0.9999936819076538,
"val/ratio_var": 3.8795255932200234e-06
},
{
"episode": 12800,
"epoch": 0.10966415352981494,
"eps": 5,
"loss/policy_avg": 0.004107598215341568,
"loss/value_avg": 0.42799514532089233,
"lr": 2.71875e-06,
"objective/entropy": -56.50166320800781,
"objective/kl": 29.329524993896484,
"objective/non_score_reward": -1.466476321220398,
"objective/rlhf_reward": 9.282110214233398,
"objective/scores": 10.748586654663086,
"policy/approxkl_avg": 0.0033666701056063175,
"policy/clipfrac_avg": 0.018708810210227966,
"policy/entropy_avg": 1.836845874786377,
"step": 25,
"val/clipfrac_avg": 0.007121690083295107,
"val/num_eos_tokens": 29066,
"val/ratio": 0.9996716976165771,
"val/ratio_var": 5.8772338888957165e-06
},
{
"episode": 13312,
"epoch": 0.11405071967100754,
"eps": 5,
"loss/policy_avg": 0.00938927847892046,
"loss/value_avg": 0.3949218690395355,
"lr": 2.70703125e-06,
"objective/entropy": -60.591705322265625,
"objective/kl": 28.337303161621094,
"objective/non_score_reward": -1.4168651103973389,
"objective/rlhf_reward": 9.06982707977295,
"objective/scores": 10.486692428588867,
"policy/approxkl_avg": 0.0028319661505520344,
"policy/clipfrac_avg": 0.021056555211544037,
"policy/entropy_avg": 1.9272569417953491,
"step": 26,
"val/clipfrac_avg": 0.00876462459564209,
"val/num_eos_tokens": 28183,
"val/ratio": 0.9999039173126221,
"val/ratio_var": 4.642893145501148e-06
},
{
"episode": 13824,
"epoch": 0.11843728581220014,
"eps": 5,
"loss/policy_avg": 0.006120752543210983,
"loss/value_avg": 0.3891026973724365,
"lr": 2.6953125e-06,
"objective/entropy": -60.93333435058594,
"objective/kl": 28.110050201416016,
"objective/non_score_reward": -1.4055025577545166,
"objective/rlhf_reward": 9.255789756774902,
"objective/scores": 10.66129207611084,
"policy/approxkl_avg": 0.0032745348289608955,
"policy/clipfrac_avg": 0.0199459008872509,
"policy/entropy_avg": 1.9059574604034424,
"step": 27,
"val/clipfrac_avg": 0.007393369916826487,
"val/num_eos_tokens": 29010,
"val/ratio": 0.999842643737793,
"val/ratio_var": 8.019253073143773e-06
},
{
"episode": 14336,
"epoch": 0.12282385195339274,
"eps": 5,
"loss/policy_avg": 0.008166640996932983,
"loss/value_avg": 0.33365607261657715,
"lr": 2.68359375e-06,
"objective/entropy": -61.89513397216797,
"objective/kl": 27.650426864624023,
"objective/non_score_reward": -1.382521390914917,
"objective/rlhf_reward": 9.297162055969238,
"objective/scores": 10.679683685302734,
"policy/approxkl_avg": 0.0037113018333911896,
"policy/clipfrac_avg": 0.020008713006973267,
"policy/entropy_avg": 1.9356969594955444,
"step": 28,
"val/clipfrac_avg": 0.008162135258316994,
"val/num_eos_tokens": 25073,
"val/ratio": 1.0000048875808716,
"val/ratio_var": 6.840070909674978e-06
},
{
"episode": 14848,
"epoch": 0.12721041809458533,
"eps": 5,
"loss/policy_avg": 0.01543242298066616,
"loss/value_avg": 0.3317902088165283,
"lr": 2.671875e-06,
"objective/entropy": -53.287864685058594,
"objective/kl": 28.098434448242188,
"objective/non_score_reward": -1.4049216508865356,
"objective/rlhf_reward": 9.339518547058105,
"objective/scores": 10.744440078735352,
"policy/approxkl_avg": 0.00311127002350986,
"policy/clipfrac_avg": 0.02064812183380127,
"policy/entropy_avg": 1.7959502935409546,
"step": 29,
"val/clipfrac_avg": 0.00857294537127018,
"val/num_eos_tokens": 24235,
"val/ratio": 1.000222086906433,
"val/ratio_var": 5.494624929269776e-06
},
{
"episode": 15360,
"epoch": 0.13159698423577793,
"eps": 5,
"loss/policy_avg": 0.010417070239782333,
"loss/value_avg": 0.34546566009521484,
"lr": 2.66015625e-06,
"objective/entropy": -58.94676971435547,
"objective/kl": 28.113197326660156,
"objective/non_score_reward": -1.4056599140167236,
"objective/rlhf_reward": 9.112338066101074,
"objective/scores": 10.517997741699219,
"policy/approxkl_avg": 0.003508294001221657,
"policy/clipfrac_avg": 0.021732624620199203,
"policy/entropy_avg": 1.8900206089019775,
"step": 30,
"val/clipfrac_avg": 0.006745144259184599,
"val/num_eos_tokens": 26990,
"val/ratio": 0.9994415044784546,
"val/ratio_var": 4.539244855550351e-06
},
{
"episode": 15872,
"epoch": 0.13598355037697052,
"eps": 5,
"loss/policy_avg": 0.011747404932975769,
"loss/value_avg": 0.32529520988464355,
"lr": 2.6484375e-06,
"objective/entropy": -50.62657165527344,
"objective/kl": 28.064453125,
"objective/non_score_reward": -1.403222680091858,
"objective/rlhf_reward": 9.3327054977417,
"objective/scores": 10.735928535461426,
"policy/approxkl_avg": 0.0032155239023268223,
"policy/clipfrac_avg": 0.02317969501018524,
"policy/entropy_avg": 1.7806997299194336,
"step": 31,
"val/clipfrac_avg": 0.006999637931585312,
"val/num_eos_tokens": 23939,
"val/ratio": 1.0002076625823975,
"val/ratio_var": 2.5230063329217955e-05
},
{
"episode": 16384,
"epoch": 0.1403701165181631,
"eps": 5,
"loss/policy_avg": 0.005137978587299585,
"loss/value_avg": 0.32520678639411926,
"lr": 2.63671875e-06,
"objective/entropy": -57.196502685546875,
"objective/kl": 28.533008575439453,
"objective/non_score_reward": -1.4266504049301147,
"objective/rlhf_reward": 9.323891639709473,
"objective/scores": 10.750541687011719,
"policy/approxkl_avg": 0.003107226686552167,
"policy/clipfrac_avg": 0.02019701898097992,
"policy/entropy_avg": 1.8475944995880127,
"step": 32,
"val/clipfrac_avg": 0.006430475041270256,
"val/num_eos_tokens": 25239,
"val/ratio": 1.0004234313964844,
"val/ratio_var": 4.5365977712208405e-06
},
{
"episode": 16896,
"epoch": 0.14475668265935573,
"eps": 5,
"loss/policy_avg": 0.01028348132967949,
"loss/value_avg": 0.35146623849868774,
"lr": 2.6250000000000003e-06,
"objective/entropy": -54.473472595214844,
"objective/kl": 30.472434997558594,
"objective/non_score_reward": -1.523621678352356,
"objective/rlhf_reward": 9.17609977722168,
"objective/scores": 10.699721336364746,
"policy/approxkl_avg": 0.003178014885634184,
"policy/clipfrac_avg": 0.0210000891238451,
"policy/entropy_avg": 1.8384833335876465,
"step": 33,
"val/clipfrac_avg": 0.005109312012791634,
"val/num_eos_tokens": 24597,
"val/ratio": 1.000359058380127,
"val/ratio_var": 6.385233064065687e-06
},
{
"episode": 17408,
"epoch": 0.14914324880054833,
"eps": 5,
"loss/policy_avg": 0.004060306120663881,
"loss/value_avg": 0.3234812617301941,
"lr": 2.61328125e-06,
"objective/entropy": -51.67649841308594,
"objective/kl": 31.204999923706055,
"objective/non_score_reward": -1.5602500438690186,
"objective/rlhf_reward": 9.3241605758667,
"objective/scores": 10.884410858154297,
"policy/approxkl_avg": 0.0030823112465441227,
"policy/clipfrac_avg": 0.02090715430676937,
"policy/entropy_avg": 1.7866475582122803,
"step": 34,
"val/clipfrac_avg": 0.006479810923337936,
"val/num_eos_tokens": 26150,
"val/ratio": 0.9998917579650879,
"val/ratio_var": 4.709030235972023e-06
},
{
"episode": 17920,
"epoch": 0.15352981494174092,
"eps": 5,
"loss/policy_avg": -0.0002043084241449833,
"loss/value_avg": 0.331778347492218,
"lr": 2.6015625e-06,
"objective/entropy": -50.791839599609375,
"objective/kl": 32.52722930908203,
"objective/non_score_reward": -1.62636137008667,
"objective/rlhf_reward": 9.229442596435547,
"objective/scores": 10.855804443359375,
"policy/approxkl_avg": 0.0038988732267171144,
"policy/clipfrac_avg": 0.02275794744491577,
"policy/entropy_avg": 1.7861764430999756,
"step": 35,
"val/clipfrac_avg": 0.0063492972403764725,
"val/num_eos_tokens": 24757,
"val/ratio": 0.999879002571106,
"val/ratio_var": 6.5008221099560615e-06
},
{
"episode": 18432,
"epoch": 0.15791638108293352,
"eps": 5,
"loss/policy_avg": 0.0009975926950573921,
"loss/value_avg": 0.3189573884010315,
"lr": 2.5898437500000003e-06,
"objective/entropy": -48.66301727294922,
"objective/kl": 33.829776763916016,
"objective/non_score_reward": -1.6914888620376587,
"objective/rlhf_reward": 9.39903736114502,
"objective/scores": 11.090526580810547,
"policy/approxkl_avg": 0.0024533928371965885,
"policy/clipfrac_avg": 0.020511234179139137,
"policy/entropy_avg": 1.7290668487548828,
"step": 36,
"val/clipfrac_avg": 0.007418747525662184,
"val/num_eos_tokens": 25660,
"val/ratio": 1.0001412630081177,
"val/ratio_var": 4.908704795525409e-06
},
{
"episode": 18944,
"epoch": 0.1623029472241261,
"eps": 5,
"loss/policy_avg": -0.00018343282863497734,
"loss/value_avg": 0.3374910354614258,
"lr": 2.578125e-06,
"objective/entropy": -48.08941650390625,
"objective/kl": 34.32440948486328,
"objective/non_score_reward": -1.716220498085022,
"objective/rlhf_reward": 9.393345832824707,
"objective/scores": 11.109566688537598,
"policy/approxkl_avg": 0.002999143209308386,
"policy/clipfrac_avg": 0.02006850577890873,
"policy/entropy_avg": 1.6796194314956665,
"step": 37,
"val/clipfrac_avg": 0.006178020033985376,
"val/num_eos_tokens": 23563,
"val/ratio": 1.0002793073654175,
"val/ratio_var": 4.9771865633374546e-06
},
{
"episode": 19456,
"epoch": 0.1666895133653187,
"eps": 5,
"loss/policy_avg": 0.006254453212022781,
"loss/value_avg": 0.35177081823349,
"lr": 2.56640625e-06,
"objective/entropy": -48.374507904052734,
"objective/kl": 34.51807403564453,
"objective/non_score_reward": -1.725903868675232,
"objective/rlhf_reward": 9.309269905090332,
"objective/scores": 11.035173416137695,
"policy/approxkl_avg": 0.0031045477371662855,
"policy/clipfrac_avg": 0.020648740231990814,
"policy/entropy_avg": 1.7128504514694214,
"step": 38,
"val/clipfrac_avg": 0.008920802734792233,
"val/num_eos_tokens": 25658,
"val/ratio": 0.9995635747909546,
"val/ratio_var": 6.544411462527933e-06
},
{
"episode": 19968,
"epoch": 0.1710760795065113,
"eps": 5,
"loss/policy_avg": -0.0010195476934313774,
"loss/value_avg": 0.3460981845855713,
"lr": 2.5546875000000003e-06,
"objective/entropy": -51.683372497558594,
"objective/kl": 32.773372650146484,
"objective/non_score_reward": -1.6386685371398926,
"objective/rlhf_reward": 9.41845703125,
"objective/scores": 11.057125091552734,
"policy/approxkl_avg": 0.0031053770799189806,
"policy/clipfrac_avg": 0.02040484920144081,
"policy/entropy_avg": 1.729933261871338,
"step": 39,
"val/clipfrac_avg": 0.006355272606015205,
"val/num_eos_tokens": 24834,
"val/ratio": 0.9998283982276917,
"val/ratio_var": 4.5907336243544705e-06
},
{
"episode": 20480,
"epoch": 0.17546264564770392,
"eps": 5,
"loss/policy_avg": 0.0011649285443127155,
"loss/value_avg": 0.32062214612960815,
"lr": 2.54296875e-06,
"objective/entropy": -53.09362030029297,
"objective/kl": 33.102630615234375,
"objective/non_score_reward": -1.6551315784454346,
"objective/rlhf_reward": 9.409041404724121,
"objective/scores": 11.064172744750977,
"policy/approxkl_avg": 0.0030981849413365126,
"policy/clipfrac_avg": 0.01863136701285839,
"policy/entropy_avg": 1.7126240730285645,
"step": 40,
"val/clipfrac_avg": 0.007702820934355259,
"val/num_eos_tokens": 24091,
"val/ratio": 0.9998593330383301,
"val/ratio_var": 5.052448614151217e-06
},
{
"episode": 20992,
"epoch": 0.1798492117888965,
"eps": 5,
"loss/policy_avg": 0.0019053546711802483,
"loss/value_avg": 0.3188174068927765,
"lr": 2.53125e-06,
"objective/entropy": -54.4024543762207,
"objective/kl": 32.57151794433594,
"objective/non_score_reward": -1.6285758018493652,
"objective/rlhf_reward": 9.431205749511719,
"objective/scores": 11.059782028198242,
"policy/approxkl_avg": 0.0028727450408041477,
"policy/clipfrac_avg": 0.01854875311255455,
"policy/entropy_avg": 1.7159607410430908,
"step": 41,
"val/clipfrac_avg": 0.008048100396990776,
"val/num_eos_tokens": 24981,
"val/ratio": 0.9997231960296631,
"val/ratio_var": 4.052901658724295e-06
},
{
"episode": 21504,
"epoch": 0.1842357779300891,
"eps": 5,
"loss/policy_avg": 0.001745171844959259,
"loss/value_avg": 0.35853347182273865,
"lr": 2.5195312500000003e-06,
"objective/entropy": -54.93397521972656,
"objective/kl": 32.59296417236328,
"objective/non_score_reward": -1.6296483278274536,
"objective/rlhf_reward": 9.271963119506836,
"objective/scores": 10.901611328125,
"policy/approxkl_avg": 0.002658254001289606,
"policy/clipfrac_avg": 0.01734079420566559,
"policy/entropy_avg": 1.7013740539550781,
"step": 42,
"val/clipfrac_avg": 0.005504803732037544,
"val/num_eos_tokens": 27864,
"val/ratio": 1.0002515316009521,
"val/ratio_var": 4.718350737675792e-06
},
{
"episode": 22016,
"epoch": 0.1886223440712817,
"eps": 5,
"loss/policy_avg": 0.0031983088701963425,
"loss/value_avg": 0.32714200019836426,
"lr": 2.5078125e-06,
"objective/entropy": -56.087486267089844,
"objective/kl": 31.602561950683594,
"objective/non_score_reward": -1.5801280736923218,
"objective/rlhf_reward": 9.48602294921875,
"objective/scores": 11.066150665283203,
"policy/approxkl_avg": 0.0033945119939744473,
"policy/clipfrac_avg": 0.017319316044449806,
"policy/entropy_avg": 1.7106884717941284,
"step": 43,
"val/clipfrac_avg": 0.00773418415337801,
"val/num_eos_tokens": 27303,
"val/ratio": 0.9996844530105591,
"val/ratio_var": 4.000376975454856e-06
},
{
"episode": 22528,
"epoch": 0.1930089102124743,
"eps": 5,
"loss/policy_avg": 0.0024800747632980347,
"loss/value_avg": 0.3157750368118286,
"lr": 2.49609375e-06,
"objective/entropy": -58.142826080322266,
"objective/kl": 31.376550674438477,
"objective/non_score_reward": -1.5688276290893555,
"objective/rlhf_reward": 9.588839530944824,
"objective/scores": 11.15766716003418,
"policy/approxkl_avg": 0.0025360295549035072,
"policy/clipfrac_avg": 0.015292399562895298,
"policy/entropy_avg": 1.6972548961639404,
"step": 44,
"val/clipfrac_avg": 0.007428675889968872,
"val/num_eos_tokens": 26608,
"val/ratio": 1.0000505447387695,
"val/ratio_var": 4.718193395092385e-06
},
{
"episode": 23040,
"epoch": 0.1973954763536669,
"eps": 5,
"loss/policy_avg": 0.001205185428261757,
"loss/value_avg": 0.3156132996082306,
"lr": 2.4843750000000002e-06,
"objective/entropy": -58.6962890625,
"objective/kl": 30.90795135498047,
"objective/non_score_reward": -1.5453976392745972,
"objective/rlhf_reward": 9.614949226379395,
"objective/scores": 11.160346984863281,
"policy/approxkl_avg": 0.0026144087314605713,
"policy/clipfrac_avg": 0.016292206943035126,
"policy/entropy_avg": 1.7031623125076294,
"step": 45,
"val/clipfrac_avg": 0.0070708440616726875,
"val/num_eos_tokens": 26980,
"val/ratio": 0.9996762275695801,
"val/ratio_var": 4.505399829213275e-06
},
{
"episode": 23552,
"epoch": 0.20178204249485948,
"eps": 5,
"loss/policy_avg": 0.004359962418675423,
"loss/value_avg": 0.32259702682495117,
"lr": 2.47265625e-06,
"objective/entropy": -61.484619140625,
"objective/kl": 29.10392189025879,
"objective/non_score_reward": -1.4551961421966553,
"objective/rlhf_reward": 9.753060340881348,
"objective/scores": 11.208256721496582,
"policy/approxkl_avg": 0.002397476462647319,
"policy/clipfrac_avg": 0.013912687078118324,
"policy/entropy_avg": 1.727178931236267,
"step": 46,
"val/clipfrac_avg": 0.00816885195672512,
"val/num_eos_tokens": 28750,
"val/ratio": 0.999366283416748,
"val/ratio_var": 4.691919002652867e-06
},
{
"episode": 24064,
"epoch": 0.2061686086360521,
"eps": 5,
"loss/policy_avg": 0.001963222399353981,
"loss/value_avg": 0.31864023208618164,
"lr": 2.4609375e-06,
"objective/entropy": -61.265846252441406,
"objective/kl": 29.692306518554688,
"objective/non_score_reward": -1.4846153259277344,
"objective/rlhf_reward": 9.666860580444336,
"objective/scores": 11.15147590637207,
"policy/approxkl_avg": 0.002565302886068821,
"policy/clipfrac_avg": 0.013390684500336647,
"policy/entropy_avg": 1.7250237464904785,
"step": 47,
"val/clipfrac_avg": 0.007603655569255352,
"val/num_eos_tokens": 28052,
"val/ratio": 1.0003018379211426,
"val/ratio_var": 5.1418255679891445e-06
},
{
"episode": 24576,
"epoch": 0.2105551747772447,
"eps": 5,
"loss/policy_avg": 0.007894441485404968,
"loss/value_avg": 0.32393255829811096,
"lr": 2.4492187500000002e-06,
"objective/entropy": -61.45293426513672,
"objective/kl": 29.473636627197266,
"objective/non_score_reward": -1.4736818075180054,
"objective/rlhf_reward": 9.70655345916748,
"objective/scores": 11.180234909057617,
"policy/approxkl_avg": 0.0018561662873253226,
"policy/clipfrac_avg": 0.012259826064109802,
"policy/entropy_avg": 1.7109990119934082,
"step": 48,
"val/clipfrac_avg": 0.007823488675057888,
"val/num_eos_tokens": 30985,
"val/ratio": 1.0001144409179688,
"val/ratio_var": 3.26874760503415e-06
},
{
"episode": 25088,
"epoch": 0.2149417409184373,
"eps": 5,
"loss/policy_avg": 0.00292217917740345,
"loss/value_avg": 0.3294008672237396,
"lr": 2.4375e-06,
"objective/entropy": -60.33291244506836,
"objective/kl": 30.310314178466797,
"objective/non_score_reward": -1.5155158042907715,
"objective/rlhf_reward": 9.636428833007812,
"objective/scores": 11.151945114135742,
"policy/approxkl_avg": 0.0021009996999055147,
"policy/clipfrac_avg": 0.012688988819718361,
"policy/entropy_avg": 1.6571977138519287,
"step": 49,
"val/clipfrac_avg": 0.0057515716180205345,
"val/num_eos_tokens": 27460,
"val/ratio": 1.0001912117004395,
"val/ratio_var": 3.742007493201527e-06
},
{
"episode": 25600,
"epoch": 0.21932830705962988,
"eps": 5,
"loss/policy_avg": 0.003821900114417076,
"loss/value_avg": 0.30453431606292725,
"lr": 2.42578125e-06,
"objective/entropy": -59.68878173828125,
"objective/kl": 30.681987762451172,
"objective/non_score_reward": -1.5340994596481323,
"objective/rlhf_reward": 9.784141540527344,
"objective/scores": 11.318241119384766,
"policy/approxkl_avg": 0.0016853193519636989,
"policy/clipfrac_avg": 0.011813260614871979,
"policy/entropy_avg": 1.642488956451416,
"step": 50,
"val/clipfrac_avg": 0.00796700082719326,
"val/num_eos_tokens": 28255,
"val/ratio": 1.0001150369644165,
"val/ratio_var": 3.880189069604967e-06
},
{
"episode": 26112,
"epoch": 0.22371487320082248,
"eps": 5,
"loss/policy_avg": 0.0011352086439728737,
"loss/value_avg": 0.30040693283081055,
"lr": 2.4140625000000002e-06,
"objective/entropy": -55.53452682495117,
"objective/kl": 33.050025939941406,
"objective/non_score_reward": -1.6525013446807861,
"objective/rlhf_reward": 9.85799503326416,
"objective/scores": 11.510496139526367,
"policy/approxkl_avg": 0.0019366666674613953,
"policy/clipfrac_avg": 0.011372741311788559,
"policy/entropy_avg": 1.5299839973449707,
"step": 51,
"val/clipfrac_avg": 0.008255399763584137,
"val/num_eos_tokens": 27884,
"val/ratio": 0.9998023509979248,
"val/ratio_var": 4.162790446571307e-06
},
{
"episode": 26624,
"epoch": 0.22810143934201507,
"eps": 5,
"loss/policy_avg": 0.008498594164848328,
"loss/value_avg": 0.2648542523384094,
"lr": 2.40234375e-06,
"objective/entropy": -56.34510803222656,
"objective/kl": 32.5233154296875,
"objective/non_score_reward": -1.626165747642517,
"objective/rlhf_reward": 9.983355522155762,
"objective/scores": 11.60952091217041,
"policy/approxkl_avg": 0.0027015511877834797,
"policy/clipfrac_avg": 0.011898016557097435,
"policy/entropy_avg": 1.5591082572937012,
"step": 52,
"val/clipfrac_avg": 0.006908833980560303,
"val/num_eos_tokens": 27153,
"val/ratio": 0.9998370409011841,
"val/ratio_var": 2.8140475478721783e-06
},
{
"episode": 27136,
"epoch": 0.23248800548320767,
"eps": 5,
"loss/policy_avg": -0.0007923748344182968,
"loss/value_avg": 0.2664443850517273,
"lr": 2.390625e-06,
"objective/entropy": -52.48090362548828,
"objective/kl": 35.079097747802734,
"objective/non_score_reward": -1.7539548873901367,
"objective/rlhf_reward": 9.7537202835083,
"objective/scores": 11.507675170898438,
"policy/approxkl_avg": 0.002516430802643299,
"policy/clipfrac_avg": 0.012202695943415165,
"policy/entropy_avg": 1.4821527004241943,
"step": 53,
"val/clipfrac_avg": 0.005511893425136805,
"val/num_eos_tokens": 27268,
"val/ratio": 1.0001299381256104,
"val/ratio_var": 2.9662376164196758e-06
},
{
"episode": 27648,
"epoch": 0.2368745716244003,
"eps": 5,
"loss/policy_avg": 0.00248061865568161,
"loss/value_avg": 0.29047083854675293,
"lr": 2.3789062500000002e-06,
"objective/entropy": -50.3958740234375,
"objective/kl": 35.103553771972656,
"objective/non_score_reward": -1.755177617073059,
"objective/rlhf_reward": 9.844841003417969,
"objective/scores": 11.600018501281738,
"policy/approxkl_avg": 0.002680136589333415,
"policy/clipfrac_avg": 0.010980302467942238,
"policy/entropy_avg": 1.4363842010498047,
"step": 54,
"val/clipfrac_avg": 0.006265181582421064,
"val/num_eos_tokens": 26313,
"val/ratio": 1.0002726316452026,
"val/ratio_var": 3.265275154262781e-06
},
{
"episode": 28160,
"epoch": 0.24126113776559288,
"eps": 5,
"loss/policy_avg": 0.00040969252586364746,
"loss/value_avg": 0.27804386615753174,
"lr": 2.3671875e-06,
"objective/entropy": -50.88187789916992,
"objective/kl": 34.17361068725586,
"objective/non_score_reward": -1.7086806297302246,
"objective/rlhf_reward": 9.788152694702148,
"objective/scores": 11.496833801269531,
"policy/approxkl_avg": 0.00247185374610126,
"policy/clipfrac_avg": 0.011061925441026688,
"policy/entropy_avg": 1.4272186756134033,
"step": 55,
"val/clipfrac_avg": 0.004378362558782101,
"val/num_eos_tokens": 28037,
"val/ratio": 0.9997042417526245,
"val/ratio_var": 3.098047955063521e-06
},
{
"episode": 28672,
"epoch": 0.24564770390678548,
"eps": 5,
"loss/policy_avg": 0.001188849564641714,
"loss/value_avg": 0.2825721502304077,
"lr": 2.35546875e-06,
"objective/entropy": -52.136688232421875,
"objective/kl": 33.818946838378906,
"objective/non_score_reward": -1.690947413444519,
"objective/rlhf_reward": 9.844747543334961,
"objective/scores": 11.53569507598877,
"policy/approxkl_avg": 0.0022125309333205223,
"policy/clipfrac_avg": 0.011263608932495117,
"policy/entropy_avg": 1.4427647590637207,
"step": 56,
"val/clipfrac_avg": 0.004626412410289049,
"val/num_eos_tokens": 27546,
"val/ratio": 0.999881386756897,
"val/ratio_var": 4.78684432891896e-06
},
{
"episode": 29184,
"epoch": 0.25003427004797807,
"eps": 4,
"loss/policy_avg": 0.009166279807686806,
"loss/value_avg": 0.2479139119386673,
"lr": 2.3437500000000002e-06,
"objective/entropy": -54.860206604003906,
"objective/kl": 32.90901184082031,
"objective/non_score_reward": -1.6454508304595947,
"objective/rlhf_reward": 9.943949699401855,
"objective/scores": 11.589400291442871,
"policy/approxkl_avg": 0.0023442034143954515,
"policy/clipfrac_avg": 0.011214188300073147,
"policy/entropy_avg": 1.4958158731460571,
"step": 57,
"val/clipfrac_avg": 0.005244936794042587,
"val/num_eos_tokens": 26152,
"val/ratio": 1.0008132457733154,
"val/ratio_var": 4.664201696868986e-06
},
{
"episode": 29696,
"epoch": 0.25442083618917066,
"eps": 4,
"loss/policy_avg": 0.004646984860301018,
"loss/value_avg": 0.2771589457988739,
"lr": 2.33203125e-06,
"objective/entropy": -53.552146911621094,
"objective/kl": 33.505615234375,
"objective/non_score_reward": -1.6752808094024658,
"objective/rlhf_reward": 9.639711380004883,
"objective/scores": 11.31499195098877,
"policy/approxkl_avg": 0.0028129604179412127,
"policy/clipfrac_avg": 0.013663064688444138,
"policy/entropy_avg": 1.483880639076233,
"step": 58,
"val/clipfrac_avg": 0.004003824666142464,
"val/num_eos_tokens": 26445,
"val/ratio": 0.9997650980949402,
"val/ratio_var": 2.8647132239711937e-06
},
{
"episode": 30208,
"epoch": 0.25880740233036326,
"eps": 4,
"loss/policy_avg": 0.006868647411465645,
"loss/value_avg": 0.25104641914367676,
"lr": 2.3203125e-06,
"objective/entropy": -57.64221954345703,
"objective/kl": 30.967973709106445,
"objective/non_score_reward": -1.548398733139038,
"objective/rlhf_reward": 9.892714500427246,
"objective/scores": 11.441113471984863,
"policy/approxkl_avg": 0.001667265547439456,
"policy/clipfrac_avg": 0.010815152898430824,
"policy/entropy_avg": 1.515237808227539,
"step": 59,
"val/clipfrac_avg": 0.004705238156020641,
"val/num_eos_tokens": 25173,
"val/ratio": 0.999830424785614,
"val/ratio_var": 3.4093393423972884e-06
},
{
"episode": 30720,
"epoch": 0.26319396847155585,
"eps": 4,
"loss/policy_avg": 0.011243993416428566,
"loss/value_avg": 0.2495395541191101,
"lr": 2.30859375e-06,
"objective/entropy": -60.84779357910156,
"objective/kl": 31.15279769897461,
"objective/non_score_reward": -1.5576398372650146,
"objective/rlhf_reward": 9.788341522216797,
"objective/scores": 11.34598159790039,
"policy/approxkl_avg": 0.002522544004023075,
"policy/clipfrac_avg": 0.01168464682996273,
"policy/entropy_avg": 1.5892558097839355,
"step": 60,
"val/clipfrac_avg": 0.0036916325334459543,
"val/num_eos_tokens": 25030,
"val/ratio": 1.000048041343689,
"val/ratio_var": 5.555901225307025e-06
},
{
"episode": 31232,
"epoch": 0.26758053461274844,
"eps": 4,
"loss/policy_avg": 0.005039035342633724,
"loss/value_avg": 0.23521627485752106,
"lr": 2.296875e-06,
"objective/entropy": -52.48558807373047,
"objective/kl": 29.97555923461914,
"objective/non_score_reward": -1.498777985572815,
"objective/rlhf_reward": 9.8745756149292,
"objective/scores": 11.373353958129883,
"policy/approxkl_avg": 0.0022459528408944607,
"policy/clipfrac_avg": 0.011031204834580421,
"policy/entropy_avg": 1.452695608139038,
"step": 61,
"val/clipfrac_avg": 0.0035397186875343323,
"val/num_eos_tokens": 22719,
"val/ratio": 1.0000181198120117,
"val/ratio_var": 7.56223380449228e-06
},
{
"episode": 31744,
"epoch": 0.27196710075394104,
"eps": 5,
"loss/policy_avg": 0.003956328146159649,
"loss/value_avg": 0.2286529839038849,
"lr": 2.28515625e-06,
"objective/entropy": -62.272193908691406,
"objective/kl": 30.68024444580078,
"objective/non_score_reward": -1.5340123176574707,
"objective/rlhf_reward": 9.862707138061523,
"objective/scores": 11.396718978881836,
"policy/approxkl_avg": 0.0018089789664372802,
"policy/clipfrac_avg": 0.01095383707433939,
"policy/entropy_avg": 1.5233193635940552,
"step": 62,
"val/clipfrac_avg": 0.0031363165471702814,
"val/num_eos_tokens": 21682,
"val/ratio": 1.0003973245620728,
"val/ratio_var": 3.887281764036743e-06
},
{
"episode": 32256,
"epoch": 0.27635366689513363,
"eps": 5,
"loss/policy_avg": 0.006541299633681774,
"loss/value_avg": 0.2510289251804352,
"lr": 2.2734375e-06,
"objective/entropy": -63.541847229003906,
"objective/kl": 29.277267456054688,
"objective/non_score_reward": -1.4638633728027344,
"objective/rlhf_reward": 9.708379745483398,
"objective/scores": 11.172243118286133,
"policy/approxkl_avg": 0.0018989848904311657,
"policy/clipfrac_avg": 0.01218925230205059,
"policy/entropy_avg": 1.6170835494995117,
"step": 63,
"val/clipfrac_avg": 0.0036716386675834656,
"val/num_eos_tokens": 23597,
"val/ratio": 0.999724268913269,
"val/ratio_var": 3.221219003535225e-06
},
{
"episode": 32768,
"epoch": 0.2807402330363262,
"eps": 5,
"loss/policy_avg": 0.00918935239315033,
"loss/value_avg": 0.2206374704837799,
"lr": 2.26171875e-06,
"objective/entropy": -62.05379867553711,
"objective/kl": 29.984851837158203,
"objective/non_score_reward": -1.4992427825927734,
"objective/rlhf_reward": 9.81045913696289,
"objective/scores": 11.309701919555664,
"policy/approxkl_avg": 0.0021062542218714952,
"policy/clipfrac_avg": 0.010909339413046837,
"policy/entropy_avg": 1.581649661064148,
"step": 64,
"val/clipfrac_avg": 0.0033828443847596645,
"val/num_eos_tokens": 25070,
"val/ratio": 0.9998538494110107,
"val/ratio_var": 2.1343398657336365e-06
},
{
"episode": 33280,
"epoch": 0.2851267991775189,
"eps": 5,
"loss/policy_avg": 0.0031117182224988937,
"loss/value_avg": 0.2214275449514389,
"lr": 2.25e-06,
"objective/entropy": -61.279544830322266,
"objective/kl": 30.740419387817383,
"objective/non_score_reward": -1.5370210409164429,
"objective/rlhf_reward": 9.933812141418457,
"objective/scores": 11.470832824707031,
"policy/approxkl_avg": 0.003304574405774474,
"policy/clipfrac_avg": 0.011563178151845932,
"policy/entropy_avg": 1.5370697975158691,
"step": 65,
"val/clipfrac_avg": 0.0039400579407811165,
"val/num_eos_tokens": 23425,
"val/ratio": 1.0000369548797607,
"val/ratio_var": 2.214629830632475e-06
},
{
"episode": 33792,
"epoch": 0.28951336531871147,
"eps": 5,
"loss/policy_avg": 0.006491166073828936,
"loss/value_avg": 0.22057856619358063,
"lr": 2.23828125e-06,
"objective/entropy": -61.981842041015625,
"objective/kl": 30.143644332885742,
"objective/non_score_reward": -1.5071823596954346,
"objective/rlhf_reward": 9.982759475708008,
"objective/scores": 11.489941596984863,
"policy/approxkl_avg": 0.0016894883010536432,
"policy/clipfrac_avg": 0.012134966440498829,
"policy/entropy_avg": 1.5720046758651733,
"step": 66,
"val/clipfrac_avg": 0.0033682635985314846,
"val/num_eos_tokens": 22234,
"val/ratio": 1.0001835823059082,
"val/ratio_var": 5.471536951517919e-06
},
{
"episode": 34304,
"epoch": 0.29389993145990406,
"eps": 5,
"loss/policy_avg": 0.00841301865875721,
"loss/value_avg": 0.2318386435508728,
"lr": 2.2265625e-06,
"objective/entropy": -60.53614044189453,
"objective/kl": 31.55874252319336,
"objective/non_score_reward": -1.577937126159668,
"objective/rlhf_reward": 9.732769012451172,
"objective/scores": 11.31070613861084,
"policy/approxkl_avg": 0.0018887519836425781,
"policy/clipfrac_avg": 0.011964268051087856,
"policy/entropy_avg": 1.5244885683059692,
"step": 67,
"val/clipfrac_avg": 0.0025918553583323956,
"val/num_eos_tokens": 22458,
"val/ratio": 1.0001020431518555,
"val/ratio_var": 1.6006388250389136e-06
},
{
"episode": 34816,
"epoch": 0.29828649760109666,
"eps": 5,
"loss/policy_avg": 0.004491077736020088,
"loss/value_avg": 0.21774470806121826,
"lr": 2.21484375e-06,
"objective/entropy": -61.31328582763672,
"objective/kl": 30.977596282958984,
"objective/non_score_reward": -1.548879861831665,
"objective/rlhf_reward": 9.94593620300293,
"objective/scores": 11.494815826416016,
"policy/approxkl_avg": 0.002278459956869483,
"policy/clipfrac_avg": 0.01156328059732914,
"policy/entropy_avg": 1.5633031129837036,
"step": 68,
"val/clipfrac_avg": 0.004319990985095501,
"val/num_eos_tokens": 22420,
"val/ratio": 0.9998349547386169,
"val/ratio_var": 2.804338009809726e-06
},
{
"episode": 35328,
"epoch": 0.30267306374228925,
"eps": 5,
"loss/policy_avg": 0.012953916564583778,
"loss/value_avg": 0.22159670293331146,
"lr": 2.203125e-06,
"objective/entropy": -60.40982437133789,
"objective/kl": 31.161331176757812,
"objective/non_score_reward": -1.5580666065216064,
"objective/rlhf_reward": 9.979460716247559,
"objective/scores": 11.537527084350586,
"policy/approxkl_avg": 0.002316855126991868,
"policy/clipfrac_avg": 0.012466225773096085,
"policy/entropy_avg": 1.5732831954956055,
"step": 69,
"val/clipfrac_avg": 0.004431965295225382,
"val/num_eos_tokens": 23475,
"val/ratio": 0.9999319314956665,
"val/ratio_var": 2.884213245124556e-06
},
{
"episode": 35840,
"epoch": 0.30705962988348184,
"eps": 5,
"loss/policy_avg": 0.0026891864836215973,
"loss/value_avg": 0.22345781326293945,
"lr": 2.19140625e-06,
"objective/entropy": -59.21501541137695,
"objective/kl": 31.3795166015625,
"objective/non_score_reward": -1.568975806236267,
"objective/rlhf_reward": 10.109546661376953,
"objective/scores": 11.678522109985352,
"policy/approxkl_avg": 0.0033681951463222504,
"policy/clipfrac_avg": 0.01226731576025486,
"policy/entropy_avg": 1.559905767440796,
"step": 70,
"val/clipfrac_avg": 0.004147649277001619,
"val/num_eos_tokens": 26871,
"val/ratio": 0.9998610615730286,
"val/ratio_var": 3.8059629332565237e-06
},
{
"episode": 36352,
"epoch": 0.31144619602467444,
"eps": 5,
"loss/policy_avg": 0.010949812829494476,
"loss/value_avg": 0.21867918968200684,
"lr": 2.1796875e-06,
"objective/entropy": -55.472938537597656,
"objective/kl": 33.11960220336914,
"objective/non_score_reward": -1.655980110168457,
"objective/rlhf_reward": 9.923175811767578,
"objective/scores": 11.579155921936035,
"policy/approxkl_avg": 0.0021974374540150166,
"policy/clipfrac_avg": 0.013162683695554733,
"policy/entropy_avg": 1.446916103363037,
"step": 71,
"val/clipfrac_avg": 0.004135113209486008,
"val/num_eos_tokens": 25201,
"val/ratio": 1.0002843141555786,
"val/ratio_var": 5.781992513220757e-06
},
{
"episode": 36864,
"epoch": 0.31583276216586703,
"eps": 5,
"loss/policy_avg": 0.008956819772720337,
"loss/value_avg": 0.22117221355438232,
"lr": 2.16796875e-06,
"objective/entropy": -57.94511413574219,
"objective/kl": 32.885406494140625,
"objective/non_score_reward": -1.6442703008651733,
"objective/rlhf_reward": 9.936177253723145,
"objective/scores": 11.58044719696045,
"policy/approxkl_avg": 0.0023873518221080303,
"policy/clipfrac_avg": 0.011646779254078865,
"policy/entropy_avg": 1.5099616050720215,
"step": 72,
"val/clipfrac_avg": 0.003031653817743063,
"val/num_eos_tokens": 23619,
"val/ratio": 0.9997458457946777,
"val/ratio_var": 3.188845766999293e-06
},
{
"episode": 37376,
"epoch": 0.3202193283070596,
"eps": 5,
"loss/policy_avg": 0.006841774098575115,
"loss/value_avg": 0.22015002369880676,
"lr": 2.15625e-06,
"objective/entropy": -53.92641830444336,
"objective/kl": 34.366966247558594,
"objective/non_score_reward": -1.7183483839035034,
"objective/rlhf_reward": 10.033120155334473,
"objective/scores": 11.751468658447266,
"policy/approxkl_avg": 0.003308027284219861,
"policy/clipfrac_avg": 0.011116940528154373,
"policy/entropy_avg": 1.41293466091156,
"step": 73,
"val/clipfrac_avg": 0.004825121723115444,
"val/num_eos_tokens": 23706,
"val/ratio": 1.0000604391098022,
"val/ratio_var": 3.1772717647982063e-06
},
{
"episode": 37888,
"epoch": 0.3246058944482522,
"eps": 5,
"loss/policy_avg": 8.680112659931183e-05,
"loss/value_avg": 0.22518639266490936,
"lr": 2.14453125e-06,
"objective/entropy": -51.67547607421875,
"objective/kl": 33.763038635253906,
"objective/non_score_reward": -1.6881520748138428,
"objective/rlhf_reward": 10.125733375549316,
"objective/scores": 11.813885688781738,
"policy/approxkl_avg": 0.002383920131251216,
"policy/clipfrac_avg": 0.012023954652249813,
"policy/entropy_avg": 1.3859004974365234,
"step": 74,
"val/clipfrac_avg": 0.0035750826355069876,
"val/num_eos_tokens": 27803,
"val/ratio": 0.9996781945228577,
"val/ratio_var": 4.62918205812457e-06
},
{
"episode": 38400,
"epoch": 0.3289924605894448,
"eps": 5,
"loss/policy_avg": 0.00683591328561306,
"loss/value_avg": 0.19435633718967438,
"lr": 2.1328125e-06,
"objective/entropy": -55.22687911987305,
"objective/kl": 33.985172271728516,
"objective/non_score_reward": -1.69925856590271,
"objective/rlhf_reward": 10.095266342163086,
"objective/scores": 11.794525146484375,
"policy/approxkl_avg": 0.0020253900438547134,
"policy/clipfrac_avg": 0.011549219489097595,
"policy/entropy_avg": 1.4170671701431274,
"step": 75,
"val/clipfrac_avg": 0.005189975723624229,
"val/num_eos_tokens": 23555,
"val/ratio": 1.0000282526016235,
"val/ratio_var": 2.6403824904264184e-06
},
{
"episode": 38912,
"epoch": 0.3333790267306374,
"eps": 5,
"loss/policy_avg": 0.0018177703022956848,
"loss/value_avg": 0.2192225456237793,
"lr": 2.12109375e-06,
"objective/entropy": -52.719966888427734,
"objective/kl": 34.939414978027344,
"objective/non_score_reward": -1.746970772743225,
"objective/rlhf_reward": 9.940811157226562,
"objective/scores": 11.687782287597656,
"policy/approxkl_avg": 0.003144835354760289,
"policy/clipfrac_avg": 0.011414816603064537,
"policy/entropy_avg": 1.3688035011291504,
"step": 76,
"val/clipfrac_avg": 0.00428430363535881,
"val/num_eos_tokens": 25195,
"val/ratio": 0.9994131326675415,
"val/ratio_var": 3.1442646104551386e-06
},
{
"episode": 39424,
"epoch": 0.33776559287183,
"eps": 5,
"loss/policy_avg": 0.002414613962173462,
"loss/value_avg": 0.22705943882465363,
"lr": 2.109375e-06,
"objective/entropy": -52.9849853515625,
"objective/kl": 35.76004409790039,
"objective/non_score_reward": -1.7880022525787354,
"objective/rlhf_reward": 10.003686904907227,
"objective/scores": 11.791688919067383,
"policy/approxkl_avg": 0.0024730274453759193,
"policy/clipfrac_avg": 0.012745586223900318,
"policy/entropy_avg": 1.3729814291000366,
"step": 77,
"val/clipfrac_avg": 0.004229060374200344,
"val/num_eos_tokens": 26826,
"val/ratio": 0.9999415278434753,
"val/ratio_var": 5.440687800728483e-06
},
{
"episode": 39936,
"epoch": 0.3421521590130226,
"eps": 4,
"loss/policy_avg": 0.007603425532579422,
"loss/value_avg": 0.20695388317108154,
"lr": 2.09765625e-06,
"objective/entropy": -52.369571685791016,
"objective/kl": 35.140933990478516,
"objective/non_score_reward": -1.7570466995239258,
"objective/rlhf_reward": 10.074074745178223,
"objective/scores": 11.831121444702148,
"policy/approxkl_avg": 0.002142505254596472,
"policy/clipfrac_avg": 0.010965963825583458,
"policy/entropy_avg": 1.3412797451019287,
"step": 78,
"val/clipfrac_avg": 0.0038626650348305702,
"val/num_eos_tokens": 23611,
"val/ratio": 0.9996312856674194,
"val/ratio_var": 4.994772552890936e-06
},
{
"episode": 40448,
"epoch": 0.34653872515421524,
"eps": 4,
"loss/policy_avg": 0.0018220320343971252,
"loss/value_avg": 0.21862611174583435,
"lr": 2.0859375e-06,
"objective/entropy": -51.15296936035156,
"objective/kl": 35.11440658569336,
"objective/non_score_reward": -1.7557203769683838,
"objective/rlhf_reward": 10.102788925170898,
"objective/scores": 11.858509063720703,
"policy/approxkl_avg": 0.0024529770016670227,
"policy/clipfrac_avg": 0.012614256702363491,
"policy/entropy_avg": 1.3262717723846436,
"step": 79,
"val/clipfrac_avg": 0.003596197348088026,
"val/num_eos_tokens": 24645,
"val/ratio": 1.000266194343567,
"val/ratio_var": 6.255781045183539e-06
},
{
"episode": 40960,
"epoch": 0.35092529129540784,
"eps": 4,
"loss/policy_avg": 0.0023819338530302048,
"loss/value_avg": 0.22472721338272095,
"lr": 2.07421875e-06,
"objective/entropy": -50.92372512817383,
"objective/kl": 36.36195373535156,
"objective/non_score_reward": -1.8180978298187256,
"objective/rlhf_reward": 10.099588394165039,
"objective/scores": 11.917686462402344,
"policy/approxkl_avg": 0.002738222246989608,
"policy/clipfrac_avg": 0.01162832509726286,
"policy/entropy_avg": 1.3125125169754028,
"step": 80,
"val/clipfrac_avg": 0.005824130028486252,
"val/num_eos_tokens": 22979,
"val/ratio": 0.9995706677436829,
"val/ratio_var": 3.720711902133189e-06
},
{
"episode": 41472,
"epoch": 0.35531185743660043,
"eps": 4,
"loss/policy_avg": 0.004819877445697784,
"loss/value_avg": 0.22295227646827698,
"lr": 2.0625e-06,
"objective/entropy": -50.77427291870117,
"objective/kl": 35.64800262451172,
"objective/non_score_reward": -1.782400131225586,
"objective/rlhf_reward": 9.891586303710938,
"objective/scores": 11.673986434936523,
"policy/approxkl_avg": 0.0026750012766569853,
"policy/clipfrac_avg": 0.012281844392418861,
"policy/entropy_avg": 1.3098926544189453,
"step": 81,
"val/clipfrac_avg": 0.003562133526429534,
"val/num_eos_tokens": 22481,
"val/ratio": 1.0002925395965576,
"val/ratio_var": 3.08896505885059e-06
},
{
"episode": 41984,
"epoch": 0.359698423577793,
"eps": 4,
"loss/policy_avg": 0.003094850108027458,
"loss/value_avg": 0.20467893779277802,
"lr": 2.05078125e-06,
"objective/entropy": -49.50064468383789,
"objective/kl": 35.082889556884766,
"objective/non_score_reward": -1.7541444301605225,
"objective/rlhf_reward": 10.12310791015625,
"objective/scores": 11.877252578735352,
"policy/approxkl_avg": 0.0031247385777533054,
"policy/clipfrac_avg": 0.010663332417607307,
"policy/entropy_avg": 1.2899810075759888,
"step": 82,
"val/clipfrac_avg": 0.003626835998147726,
"val/num_eos_tokens": 23688,
"val/ratio": 0.9997555613517761,
"val/ratio_var": 4.246959178999532e-06
},
{
"episode": 42496,
"epoch": 0.3640849897189856,
"eps": 4,
"loss/policy_avg": 0.006415246054530144,
"loss/value_avg": 0.2166299670934677,
"lr": 2.0390625e-06,
"objective/entropy": -49.80467987060547,
"objective/kl": 35.149269104003906,
"objective/non_score_reward": -1.7574634552001953,
"objective/rlhf_reward": 9.97119140625,
"objective/scores": 11.728654861450195,
"policy/approxkl_avg": 0.002845556242391467,
"policy/clipfrac_avg": 0.013346903957426548,
"policy/entropy_avg": 1.293116569519043,
"step": 83,
"val/clipfrac_avg": 0.004545033443719149,
"val/num_eos_tokens": 24437,
"val/ratio": 0.9999039173126221,
"val/ratio_var": 6.112253231549403e-06
},
{
"episode": 43008,
"epoch": 0.3684715558601782,
"eps": 4,
"loss/policy_avg": 0.004797011613845825,
"loss/value_avg": 0.2133270502090454,
"lr": 2.02734375e-06,
"objective/entropy": -51.009857177734375,
"objective/kl": 35.441497802734375,
"objective/non_score_reward": -1.7720749378204346,
"objective/rlhf_reward": 10.048232078552246,
"objective/scores": 11.820306777954102,
"policy/approxkl_avg": 0.002602731343358755,
"policy/clipfrac_avg": 0.012205180712044239,
"policy/entropy_avg": 1.3177757263183594,
"step": 84,
"val/clipfrac_avg": 0.004008126445114613,
"val/num_eos_tokens": 24990,
"val/ratio": 1.0002726316452026,
"val/ratio_var": 5.879127456864808e-06
},
{
"episode": 43520,
"epoch": 0.3728581220013708,
"eps": 4,
"loss/policy_avg": 0.004787225276231766,
"loss/value_avg": 0.213937908411026,
"lr": 2.015625e-06,
"objective/entropy": -48.71440124511719,
"objective/kl": 35.934600830078125,
"objective/non_score_reward": -1.7967300415039062,
"objective/rlhf_reward": 10.027629852294922,
"objective/scores": 11.824359893798828,
"policy/approxkl_avg": 0.0032878173515200615,
"policy/clipfrac_avg": 0.01084982417523861,
"policy/entropy_avg": 1.26298189163208,
"step": 85,
"val/clipfrac_avg": 0.004145544022321701,
"val/num_eos_tokens": 25423,
"val/ratio": 0.9995351433753967,
"val/ratio_var": 4.676774096878944e-06
},
{
"episode": 44032,
"epoch": 0.3772446881425634,
"eps": 4,
"loss/policy_avg": 0.009194480255246162,
"loss/value_avg": 0.19612735509872437,
"lr": 2.00390625e-06,
"objective/entropy": -50.50733947753906,
"objective/kl": 35.42319869995117,
"objective/non_score_reward": -1.7711601257324219,
"objective/rlhf_reward": 10.026796340942383,
"objective/scores": 11.797956466674805,
"policy/approxkl_avg": 0.0025596285704523325,
"policy/clipfrac_avg": 0.011088266968727112,
"policy/entropy_avg": 1.2955846786499023,
"step": 86,
"val/clipfrac_avg": 0.002618049271404743,
"val/num_eos_tokens": 24385,
"val/ratio": 1.0004734992980957,
"val/ratio_var": 7.4206191129633226e-06
},
{
"episode": 44544,
"epoch": 0.381631254283756,
"eps": 4,
"loss/policy_avg": 0.0011375932954251766,
"loss/value_avg": 0.19349417090415955,
"lr": 1.9921875e-06,
"objective/entropy": -48.215614318847656,
"objective/kl": 36.09561538696289,
"objective/non_score_reward": -1.8047807216644287,
"objective/rlhf_reward": 10.043390274047852,
"objective/scores": 11.84817123413086,
"policy/approxkl_avg": 0.002257507061585784,
"policy/clipfrac_avg": 0.011135649867355824,
"policy/entropy_avg": 1.2575373649597168,
"step": 87,
"val/clipfrac_avg": 0.0035062048118561506,
"val/num_eos_tokens": 26536,
"val/ratio": 1.0005232095718384,
"val/ratio_var": 7.836673830752261e-06
},
{
"episode": 45056,
"epoch": 0.3860178204249486,
"eps": 4,
"loss/policy_avg": 0.004499746486544609,
"loss/value_avg": 0.20375367999076843,
"lr": 1.98046875e-06,
"objective/entropy": -48.469757080078125,
"objective/kl": 35.357276916503906,
"objective/non_score_reward": -1.7678639888763428,
"objective/rlhf_reward": 10.101089477539062,
"objective/scores": 11.868953704833984,
"policy/approxkl_avg": 0.0027951907832175493,
"policy/clipfrac_avg": 0.011439654044806957,
"policy/entropy_avg": 1.2559094429016113,
"step": 88,
"val/clipfrac_avg": 0.0032751811668276787,
"val/num_eos_tokens": 24904,
"val/ratio": 0.999579668045044,
"val/ratio_var": 3.818234290520195e-06
},
{
"episode": 45568,
"epoch": 0.3904043865661412,
"eps": 4,
"loss/policy_avg": 0.004391288384795189,
"loss/value_avg": 0.19215956330299377,
"lr": 1.96875e-06,
"objective/entropy": -47.49878692626953,
"objective/kl": 35.51283645629883,
"objective/non_score_reward": -1.775641918182373,
"objective/rlhf_reward": 10.171991348266602,
"objective/scores": 11.947633743286133,
"policy/approxkl_avg": 0.003075700718909502,
"policy/clipfrac_avg": 0.011072011664509773,
"policy/entropy_avg": 1.2150685787200928,
"step": 89,
"val/clipfrac_avg": 0.0027455922681838274,
"val/num_eos_tokens": 23593,
"val/ratio": 0.9998563528060913,
"val/ratio_var": 3.37912251779926e-06
},
{
"episode": 46080,
"epoch": 0.3947909527073338,
"eps": 4,
"loss/policy_avg": 0.0004821214824914932,
"loss/value_avg": 0.21164780855178833,
"lr": 1.95703125e-06,
"objective/entropy": -49.131282806396484,
"objective/kl": 35.567405700683594,
"objective/non_score_reward": -1.7783703804016113,
"objective/rlhf_reward": 10.056915283203125,
"objective/scores": 11.835285186767578,
"policy/approxkl_avg": 0.0025346523616462946,
"policy/clipfrac_avg": 0.01132851280272007,
"policy/entropy_avg": 1.2548737525939941,
"step": 90,
"val/clipfrac_avg": 0.0035444346722215414,
"val/num_eos_tokens": 23956,
"val/ratio": 1.0001569986343384,
"val/ratio_var": 4.048593382321997e-06
},
{
"episode": 46592,
"epoch": 0.39917751884852637,
"eps": 4,
"loss/policy_avg": 0.002457218011841178,
"loss/value_avg": 0.20488248765468597,
"lr": 1.9453125e-06,
"objective/entropy": -50.223350524902344,
"objective/kl": 34.380897521972656,
"objective/non_score_reward": -1.719044804573059,
"objective/rlhf_reward": 10.15352725982666,
"objective/scores": 11.87257194519043,
"policy/approxkl_avg": 0.0025883247144520283,
"policy/clipfrac_avg": 0.01095154695212841,
"policy/entropy_avg": 1.2806235551834106,
"step": 91,
"val/clipfrac_avg": 0.0030915343668311834,
"val/num_eos_tokens": 24619,
"val/ratio": 1.000389814376831,
"val/ratio_var": 5.21131823916221e-06
},
{
"episode": 47104,
"epoch": 0.40356408498971896,
"eps": 4,
"loss/policy_avg": 0.009746008552610874,
"loss/value_avg": 0.19489187002182007,
"lr": 1.93359375e-06,
"objective/entropy": -49.78590774536133,
"objective/kl": 34.79002380371094,
"objective/non_score_reward": -1.7395012378692627,
"objective/rlhf_reward": 10.149117469787598,
"objective/scores": 11.888618469238281,
"policy/approxkl_avg": 0.0028350851498544216,
"policy/clipfrac_avg": 0.012067590840160847,
"policy/entropy_avg": 1.321395993232727,
"step": 92,
"val/clipfrac_avg": 0.0034278968814760447,
"val/num_eos_tokens": 24707,
"val/ratio": 1.000132441520691,
"val/ratio_var": 3.817275683104526e-06
},
{
"episode": 47616,
"epoch": 0.40795065113091156,
"eps": 4,
"loss/policy_avg": 0.007891923189163208,
"loss/value_avg": 0.18601296842098236,
"lr": 1.921875e-06,
"objective/entropy": -51.31802749633789,
"objective/kl": 33.889923095703125,
"objective/non_score_reward": -1.6944962739944458,
"objective/rlhf_reward": 10.027242660522461,
"objective/scores": 11.721738815307617,
"policy/approxkl_avg": 0.0023833350278437138,
"policy/clipfrac_avg": 0.011782050132751465,
"policy/entropy_avg": 1.3522666692733765,
"step": 93,
"val/clipfrac_avg": 0.002397140022367239,
"val/num_eos_tokens": 23485,
"val/ratio": 0.9999991655349731,
"val/ratio_var": 4.472914497455349e-06
},
{
"episode": 48128,
"epoch": 0.4123372172721042,
"eps": 4,
"loss/policy_avg": 0.008123669773340225,
"loss/value_avg": 0.19486477971076965,
"lr": 1.91015625e-06,
"objective/entropy": -51.119022369384766,
"objective/kl": 33.58214569091797,
"objective/non_score_reward": -1.679107427597046,
"objective/rlhf_reward": 10.173272132873535,
"objective/scores": 11.85237979888916,
"policy/approxkl_avg": 0.002184953773394227,
"policy/clipfrac_avg": 0.010806472972035408,
"policy/entropy_avg": 1.3613462448120117,
"step": 94,
"val/clipfrac_avg": 0.0037152436561882496,
"val/num_eos_tokens": 24130,
"val/ratio": 1.0001533031463623,
"val/ratio_var": 7.5301450124243274e-06
},
{
"episode": 48640,
"epoch": 0.4167237834132968,
"eps": 4,
"loss/policy_avg": 0.006111526861786842,
"loss/value_avg": 0.18791520595550537,
"lr": 1.8984375e-06,
"objective/entropy": -50.709373474121094,
"objective/kl": 34.28173065185547,
"objective/non_score_reward": -1.7140867710113525,
"objective/rlhf_reward": 10.056640625,
"objective/scores": 11.770727157592773,
"policy/approxkl_avg": 0.0028409743681550026,
"policy/clipfrac_avg": 0.010896073654294014,
"policy/entropy_avg": 1.3718466758728027,
"step": 95,
"val/clipfrac_avg": 0.0028323421720415354,
"val/num_eos_tokens": 24401,
"val/ratio": 1.0004030466079712,
"val/ratio_var": 5.722152764064958e-06
},
{
"episode": 49152,
"epoch": 0.4211103495544894,
"eps": 4,
"loss/policy_avg": 0.006566773168742657,
"loss/value_avg": 0.21990007162094116,
"lr": 1.8867187500000001e-06,
"objective/entropy": -51.272518157958984,
"objective/kl": 33.29633712768555,
"objective/non_score_reward": -1.6648168563842773,
"objective/rlhf_reward": 10.015409469604492,
"objective/scores": 11.68022632598877,
"policy/approxkl_avg": 0.002582971705123782,
"policy/clipfrac_avg": 0.012000022456049919,
"policy/entropy_avg": 1.3877381086349487,
"step": 96,
"val/clipfrac_avg": 0.00376589922234416,
"val/num_eos_tokens": 26887,
"val/ratio": 0.9997319579124451,
"val/ratio_var": 3.3458989037171705e-06
},
{
"episode": 49664,
"epoch": 0.425496915695682,
"eps": 4,
"loss/policy_avg": 0.00574074499309063,
"loss/value_avg": 0.20088031888008118,
"lr": 1.875e-06,
"objective/entropy": -50.57700729370117,
"objective/kl": 33.54800033569336,
"objective/non_score_reward": -1.67739999294281,
"objective/rlhf_reward": 10.143403053283691,
"objective/scores": 11.820802688598633,
"policy/approxkl_avg": 0.00242623221129179,
"policy/clipfrac_avg": 0.011794717982411385,
"policy/entropy_avg": 1.3870760202407837,
"step": 97,
"val/clipfrac_avg": 0.002550810342654586,
"val/num_eos_tokens": 26478,
"val/ratio": 1.0001178979873657,
"val/ratio_var": 3.899105195159791e-06
},
{
"episode": 50176,
"epoch": 0.4298834818368746,
"eps": 4,
"loss/policy_avg": 0.002134094014763832,
"loss/value_avg": 0.2105971872806549,
"lr": 1.86328125e-06,
"objective/entropy": -50.969947814941406,
"objective/kl": 32.82494354248047,
"objective/non_score_reward": -1.6412471532821655,
"objective/rlhf_reward": 10.093246459960938,
"objective/scores": 11.734493255615234,
"policy/approxkl_avg": 0.002924936590716243,
"policy/clipfrac_avg": 0.012574190273880959,
"policy/entropy_avg": 1.4206920862197876,
"step": 98,
"val/clipfrac_avg": 0.0022323690354824066,
"val/num_eos_tokens": 25788,
"val/ratio": 1.0000821352005005,
"val/ratio_var": 4.282260761101497e-06
},
{
"episode": 50688,
"epoch": 0.4342700479780672,
"eps": 4,
"loss/policy_avg": 0.002779986709356308,
"loss/value_avg": 0.21304886043071747,
"lr": 1.8515625000000001e-06,
"objective/entropy": -48.35157012939453,
"objective/kl": 34.29575729370117,
"objective/non_score_reward": -1.7147879600524902,
"objective/rlhf_reward": 10.013525009155273,
"objective/scores": 11.728313446044922,
"policy/approxkl_avg": 0.0031486451625823975,
"policy/clipfrac_avg": 0.011947648599743843,
"policy/entropy_avg": 1.3698291778564453,
"step": 99,
"val/clipfrac_avg": 0.003086227923631668,
"val/num_eos_tokens": 27226,
"val/ratio": 1.0002012252807617,
"val/ratio_var": 3.848301275866106e-06
},
{
"episode": 51200,
"epoch": 0.43865661411925977,
"eps": 4,
"loss/policy_avg": 0.002698383294045925,
"loss/value_avg": 0.1958826333284378,
"lr": 1.83984375e-06,
"objective/entropy": -51.114051818847656,
"objective/kl": 33.804664611816406,
"objective/non_score_reward": -1.6902332305908203,
"objective/rlhf_reward": 10.000195503234863,
"objective/scores": 11.690428733825684,
"policy/approxkl_avg": 0.002505134791135788,
"policy/clipfrac_avg": 0.013553831726312637,
"policy/entropy_avg": 1.435497522354126,
"step": 100,
"val/clipfrac_avg": 0.0029125860892236233,
"val/num_eos_tokens": 27134,
"val/ratio": 0.9996867179870605,
"val/ratio_var": 4.967731001670472e-06
},
{
"episode": 51712,
"epoch": 0.44304318026045236,
"eps": 4,
"loss/policy_avg": 0.00523423682898283,
"loss/value_avg": 0.20203697681427002,
"lr": 1.828125e-06,
"objective/entropy": -53.56639862060547,
"objective/kl": 32.214115142822266,
"objective/non_score_reward": -1.610705852508545,
"objective/rlhf_reward": 10.233478546142578,
"objective/scores": 11.844184875488281,
"policy/approxkl_avg": 0.003430293407291174,
"policy/clipfrac_avg": 0.013302096165716648,
"policy/entropy_avg": 1.4647541046142578,
"step": 101,
"val/clipfrac_avg": 0.0026878612115979195,
"val/num_eos_tokens": 24773,
"val/ratio": 1.0002996921539307,
"val/ratio_var": 1.0531030056881718e-05
},
{
"episode": 52224,
"epoch": 0.44742974640164496,
"eps": 4,
"loss/policy_avg": 0.007096525281667709,
"loss/value_avg": 0.2158614844083786,
"lr": 1.81640625e-06,
"objective/entropy": -51.106422424316406,
"objective/kl": 33.817195892333984,
"objective/non_score_reward": -1.6908597946166992,
"objective/rlhf_reward": 10.014439582824707,
"objective/scores": 11.705299377441406,
"policy/approxkl_avg": 0.0026541282422840595,
"policy/clipfrac_avg": 0.012738144025206566,
"policy/entropy_avg": 1.4463238716125488,
"step": 102,
"val/clipfrac_avg": 0.0025819321162998676,
"val/num_eos_tokens": 26794,
"val/ratio": 1.0001683235168457,
"val/ratio_var": 4.307699327910086e-06
},
{
"episode": 52736,
"epoch": 0.45181631254283755,
"eps": 4,
"loss/policy_avg": 0.0076528494246304035,
"loss/value_avg": 0.22683024406433105,
"lr": 1.8046875e-06,
"objective/entropy": -51.95180130004883,
"objective/kl": 32.943721771240234,
"objective/non_score_reward": -1.647186040878296,
"objective/rlhf_reward": 9.92637825012207,
"objective/scores": 11.573564529418945,
"policy/approxkl_avg": 0.002516075037419796,
"policy/clipfrac_avg": 0.012894165702164173,
"policy/entropy_avg": 1.4716830253601074,
"step": 103,
"val/clipfrac_avg": 0.003462804015725851,
"val/num_eos_tokens": 27258,
"val/ratio": 0.9998781681060791,
"val/ratio_var": 3.6184294458507793e-06
},
{
"episode": 53248,
"epoch": 0.45620287868403014,
"eps": 4,
"loss/policy_avg": 0.0059026554226875305,
"loss/value_avg": 0.18866774439811707,
"lr": 1.79296875e-06,
"objective/entropy": -51.065303802490234,
"objective/kl": 33.99561309814453,
"objective/non_score_reward": -1.699780821800232,
"objective/rlhf_reward": 10.005899429321289,
"objective/scores": 11.705679893493652,
"policy/approxkl_avg": 0.003799548838287592,
"policy/clipfrac_avg": 0.011900994926691055,
"policy/entropy_avg": 1.453720211982727,
"step": 104,
"val/clipfrac_avg": 0.002593266312032938,
"val/num_eos_tokens": 25703,
"val/ratio": 1.0001416206359863,
"val/ratio_var": 5.38453923581983e-06
},
{
"episode": 53760,
"epoch": 0.46058944482522274,
"eps": 4,
"loss/policy_avg": -0.00120542012155056,
"loss/value_avg": 0.20909513533115387,
"lr": 1.78125e-06,
"objective/entropy": -51.853965759277344,
"objective/kl": 33.504188537597656,
"objective/non_score_reward": -1.6752095222473145,
"objective/rlhf_reward": 9.993560791015625,
"objective/scores": 11.668770790100098,
"policy/approxkl_avg": 0.0022638142108917236,
"policy/clipfrac_avg": 0.012438900768756866,
"policy/entropy_avg": 1.4844257831573486,
"step": 105,
"val/clipfrac_avg": 0.0025224490091204643,
"val/num_eos_tokens": 28217,
"val/ratio": 0.9998506307601929,
"val/ratio_var": 4.211783107166411e-06
},
{
"episode": 54272,
"epoch": 0.46497601096641533,
"eps": 4,
"loss/policy_avg": 0.0032410603016614914,
"loss/value_avg": 0.19734174013137817,
"lr": 1.76953125e-06,
"objective/entropy": -49.862060546875,
"objective/kl": 34.40680694580078,
"objective/non_score_reward": -1.7203404903411865,
"objective/rlhf_reward": 10.194497108459473,
"objective/scores": 11.914837837219238,
"policy/approxkl_avg": 0.00206130463629961,
"policy/clipfrac_avg": 0.01056149136275053,
"policy/entropy_avg": 1.3804571628570557,
"step": 106,
"val/clipfrac_avg": 0.0024648173712193966,
"val/num_eos_tokens": 25190,
"val/ratio": 1.000199794769287,
"val/ratio_var": 4.576507308229338e-06
},
{
"episode": 54784,
"epoch": 0.4693625771076079,
"eps": 4,
"loss/policy_avg": 0.0007011368870735168,
"loss/value_avg": 0.21626059710979462,
"lr": 1.7578125e-06,
"objective/entropy": -49.26763916015625,
"objective/kl": 35.990623474121094,
"objective/non_score_reward": -1.7995312213897705,
"objective/rlhf_reward": 10.014336585998535,
"objective/scores": 11.813867568969727,
"policy/approxkl_avg": 0.002608464565128088,
"policy/clipfrac_avg": 0.011158171109855175,
"policy/entropy_avg": 1.4025707244873047,
"step": 107,
"val/clipfrac_avg": 0.0035355808213353157,
"val/num_eos_tokens": 26415,
"val/ratio": 0.999910831451416,
"val/ratio_var": 3.4601300740177976e-06
},
{
"episode": 55296,
"epoch": 0.4737491432488006,
"eps": 4,
"loss/policy_avg": 0.010623332113027573,
"loss/value_avg": 0.22851604223251343,
"lr": 1.74609375e-06,
"objective/entropy": -48.97447967529297,
"objective/kl": 35.410743713378906,
"objective/non_score_reward": -1.7705371379852295,
"objective/rlhf_reward": 9.869864463806152,
"objective/scores": 11.640401840209961,
"policy/approxkl_avg": 0.0030459933914244175,
"policy/clipfrac_avg": 0.010695299133658409,
"policy/entropy_avg": 1.3910768032073975,
"step": 108,
"val/clipfrac_avg": 0.003275398164987564,
"val/num_eos_tokens": 27971,
"val/ratio": 0.9999659061431885,
"val/ratio_var": 4.657621047954308e-06
},
{
"episode": 55808,
"epoch": 0.47813570938999317,
"eps": 4,
"loss/policy_avg": -0.0018949531950056553,
"loss/value_avg": 0.2573157548904419,
"lr": 1.734375e-06,
"objective/entropy": -47.27846145629883,
"objective/kl": 36.20323181152344,
"objective/non_score_reward": -1.8101614713668823,
"objective/rlhf_reward": 9.8946533203125,
"objective/scores": 11.704814910888672,
"policy/approxkl_avg": 0.0025291882921010256,
"policy/clipfrac_avg": 0.010728440247476101,
"policy/entropy_avg": 1.3530826568603516,
"step": 109,
"val/clipfrac_avg": 0.003135326784104109,
"val/num_eos_tokens": 28041,
"val/ratio": 1.000365972518921,
"val/ratio_var": 1.0167857908527367e-05
},
{
"episode": 56320,
"epoch": 0.48252227553118576,
"eps": 4,
"loss/policy_avg": -0.000798303633928299,
"loss/value_avg": 0.24519430100917816,
"lr": 1.72265625e-06,
"objective/entropy": -46.93909454345703,
"objective/kl": 35.551170349121094,
"objective/non_score_reward": -1.7775585651397705,
"objective/rlhf_reward": 10.142352104187012,
"objective/scores": 11.919910430908203,
"policy/approxkl_avg": 0.003023324767127633,
"policy/clipfrac_avg": 0.01054347399622202,
"policy/entropy_avg": 1.3338515758514404,
"step": 110,
"val/clipfrac_avg": 0.00204864121042192,
"val/num_eos_tokens": 27167,
"val/ratio": 0.9999716281890869,
"val/ratio_var": 3.4549011616036296e-06
},
{
"episode": 56832,
"epoch": 0.48690884167237836,
"eps": 4,
"loss/policy_avg": -0.0019484013319015503,
"loss/value_avg": 0.23634591698646545,
"lr": 1.7109375e-06,
"objective/entropy": -48.95277404785156,
"objective/kl": 35.153472900390625,
"objective/non_score_reward": -1.7576735019683838,
"objective/rlhf_reward": 10.170576095581055,
"objective/scores": 11.92824935913086,
"policy/approxkl_avg": 0.0029592744540423155,
"policy/clipfrac_avg": 0.010565382428467274,
"policy/entropy_avg": 1.3721097707748413,
"step": 111,
"val/clipfrac_avg": 0.002308458089828491,
"val/num_eos_tokens": 26862,
"val/ratio": 1.0011966228485107,
"val/ratio_var": 8.257182344095781e-05
},
{
"episode": 57344,
"epoch": 0.49129540781357095,
"eps": 4,
"loss/policy_avg": 0.00030158646404743195,
"loss/value_avg": 0.22391614317893982,
"lr": 1.69921875e-06,
"objective/entropy": -49.59033966064453,
"objective/kl": 35.19733428955078,
"objective/non_score_reward": -1.759866714477539,
"objective/rlhf_reward": 10.102191925048828,
"objective/scores": 11.862058639526367,
"policy/approxkl_avg": 0.0021578953601419926,
"policy/clipfrac_avg": 0.009181549772620201,
"policy/entropy_avg": 1.3748302459716797,
"step": 112,
"val/clipfrac_avg": 0.002646082080900669,
"val/num_eos_tokens": 26964,
"val/ratio": 1.0000518560409546,
"val/ratio_var": 3.8748953556932975e-06
},
{
"episode": 57856,
"epoch": 0.49568197395476354,
"eps": 4,
"loss/policy_avg": -0.00021987548097968102,
"loss/value_avg": 0.22045229375362396,
"lr": 1.6875e-06,
"objective/entropy": -45.642669677734375,
"objective/kl": 36.55120849609375,
"objective/non_score_reward": -1.827560544013977,
"objective/rlhf_reward": 10.042106628417969,
"objective/scores": 11.869667053222656,
"policy/approxkl_avg": 0.0021457457914948463,
"policy/clipfrac_avg": 0.010356370359659195,
"policy/entropy_avg": 1.2991323471069336,
"step": 113,
"val/clipfrac_avg": 0.003576356451958418,
"val/num_eos_tokens": 26990,
"val/ratio": 1.0002024173736572,
"val/ratio_var": 7.810693205101416e-06
},
{
"episode": 58368,
"epoch": 0.5000685400959561,
"eps": 4,
"loss/policy_avg": 0.0011025657877326012,
"loss/value_avg": 0.20879928767681122,
"lr": 1.67578125e-06,
"objective/entropy": -46.918296813964844,
"objective/kl": 36.73741912841797,
"objective/non_score_reward": -1.8368710279464722,
"objective/rlhf_reward": 10.020646095275879,
"objective/scores": 11.85751724243164,
"policy/approxkl_avg": 0.0025296411477029324,
"policy/clipfrac_avg": 0.009327422827482224,
"policy/entropy_avg": 1.3181732892990112,
"step": 114,
"val/clipfrac_avg": 0.0026597436517477036,
"val/num_eos_tokens": 26972,
"val/ratio": 1.0004398822784424,
"val/ratio_var": 1.119426087825559e-05
},
{
"episode": 58880,
"epoch": 0.5044551062371487,
"eps": 4,
"loss/policy_avg": -0.00237057963386178,
"loss/value_avg": 0.2178419530391693,
"lr": 1.6640625e-06,
"objective/entropy": -46.66753387451172,
"objective/kl": 36.253822326660156,
"objective/non_score_reward": -1.8126912117004395,
"objective/rlhf_reward": 10.09880256652832,
"objective/scores": 11.911494255065918,
"policy/approxkl_avg": 0.002091196598485112,
"policy/clipfrac_avg": 0.01094783004373312,
"policy/entropy_avg": 1.3274283409118652,
"step": 115,
"val/clipfrac_avg": 0.004409347660839558,
"val/num_eos_tokens": 27424,
"val/ratio": 0.9999520778656006,
"val/ratio_var": 4.443759280547965e-06
},
{
"episode": 59392,
"epoch": 0.5088416723783413,
"eps": 4,
"loss/policy_avg": -0.005248534493148327,
"loss/value_avg": 0.2638506293296814,
"lr": 1.6523437500000001e-06,
"objective/entropy": -47.24037170410156,
"objective/kl": 36.8753776550293,
"objective/non_score_reward": -1.8437689542770386,
"objective/rlhf_reward": 9.983234405517578,
"objective/scores": 11.827003479003906,
"policy/approxkl_avg": 0.0032648907508701086,
"policy/clipfrac_avg": 0.011387192644178867,
"policy/entropy_avg": 1.3188259601593018,
"step": 116,
"val/clipfrac_avg": 0.004241817630827427,
"val/num_eos_tokens": 25302,
"val/ratio": 0.999230146408081,
"val/ratio_var": 4.158954197919229e-06
},
{
"episode": 59904,
"epoch": 0.5132282385195339,
"eps": 4,
"loss/policy_avg": -0.0031682229600846767,
"loss/value_avg": 0.23800501227378845,
"lr": 1.640625e-06,
"objective/entropy": -47.64235305786133,
"objective/kl": 35.08871078491211,
"objective/non_score_reward": -1.7544355392456055,
"objective/rlhf_reward": 10.015641212463379,
"objective/scores": 11.770076751708984,
"policy/approxkl_avg": 0.002452462911605835,
"policy/clipfrac_avg": 0.011082207784056664,
"policy/entropy_avg": 1.320603609085083,
"step": 117,
"val/clipfrac_avg": 0.0038690143264830112,
"val/num_eos_tokens": 26752,
"val/ratio": 0.9997324347496033,
"val/ratio_var": 5.267690085020149e-06
},
{
"episode": 60416,
"epoch": 0.5176148046607265,
"eps": 4,
"loss/policy_avg": 0.00654706871137023,
"loss/value_avg": 0.18882016837596893,
"lr": 1.62890625e-06,
"objective/entropy": -49.72483825683594,
"objective/kl": 34.498836517333984,
"objective/non_score_reward": -1.7249417304992676,
"objective/rlhf_reward": 10.259115219116211,
"objective/scores": 11.98405647277832,
"policy/approxkl_avg": 0.0024695878382772207,
"policy/clipfrac_avg": 0.009761758148670197,
"policy/entropy_avg": 1.3732898235321045,
"step": 118,
"val/clipfrac_avg": 0.0033802662510424852,
"val/num_eos_tokens": 26374,
"val/ratio": 1.0005288124084473,
"val/ratio_var": 1.0841821676876862e-05
},
{
"episode": 60928,
"epoch": 0.5220013708019191,
"eps": 4,
"loss/policy_avg": 0.0007685907185077667,
"loss/value_avg": 0.21882620453834534,
"lr": 1.6171875000000001e-06,
"objective/entropy": -49.3972053527832,
"objective/kl": 35.464508056640625,
"objective/non_score_reward": -1.7732254266738892,
"objective/rlhf_reward": 9.918214797973633,
"objective/scores": 11.69144058227539,
"policy/approxkl_avg": 0.002548103453591466,
"policy/clipfrac_avg": 0.010540506802499294,
"policy/entropy_avg": 1.3686912059783936,
"step": 119,
"val/clipfrac_avg": 0.002275804989039898,
"val/num_eos_tokens": 25251,
"val/ratio": 0.9998582005500793,
"val/ratio_var": 4.747326329379575e-06
},
{
"episode": 61440,
"epoch": 0.5263879369431117,
"eps": 4,
"loss/policy_avg": 0.004093550145626068,
"loss/value_avg": 0.19169041514396667,
"lr": 1.60546875e-06,
"objective/entropy": -50.880088806152344,
"objective/kl": 34.82711410522461,
"objective/non_score_reward": -1.7413556575775146,
"objective/rlhf_reward": 10.035165786743164,
"objective/scores": 11.776521682739258,
"policy/approxkl_avg": 0.002437584102153778,
"policy/clipfrac_avg": 0.010569003410637379,
"policy/entropy_avg": 1.3898723125457764,
"step": 120,
"val/clipfrac_avg": 0.003308035433292389,
"val/num_eos_tokens": 25212,
"val/ratio": 1.0002849102020264,
"val/ratio_var": 3.5936700442107394e-05
},
{
"episode": 61952,
"epoch": 0.5307745030843043,
"eps": 4,
"loss/policy_avg": 0.003542997408658266,
"loss/value_avg": 0.18405793607234955,
"lr": 1.59375e-06,
"objective/entropy": -52.583984375,
"objective/kl": 34.274208068847656,
"objective/non_score_reward": -1.7137104272842407,
"objective/rlhf_reward": 10.16425895690918,
"objective/scores": 11.877969741821289,
"policy/approxkl_avg": 0.0019055928569287062,
"policy/clipfrac_avg": 0.010142171755433083,
"policy/entropy_avg": 1.3990683555603027,
"step": 121,
"val/clipfrac_avg": 0.003069917904213071,
"val/num_eos_tokens": 24678,
"val/ratio": 0.9998403787612915,
"val/ratio_var": 2.1590326468867715e-06
},
{
"episode": 62464,
"epoch": 0.5351610692254969,
"eps": 4,
"loss/policy_avg": 0.0034955721348524094,
"loss/value_avg": 0.19773662090301514,
"lr": 1.5820312500000001e-06,
"objective/entropy": -52.74757766723633,
"objective/kl": 33.07254409790039,
"objective/non_score_reward": -1.6536272764205933,
"objective/rlhf_reward": 10.184989929199219,
"objective/scores": 11.838617324829102,
"policy/approxkl_avg": 0.0024753790348768234,
"policy/clipfrac_avg": 0.010373384691774845,
"policy/entropy_avg": 1.4190127849578857,
"step": 122,
"val/clipfrac_avg": 0.0018084857147186995,
"val/num_eos_tokens": 27268,
"val/ratio": 1.00020432472229,
"val/ratio_var": 4.94942651130259e-06
},
{
"episode": 62976,
"epoch": 0.5395476353666895,
"eps": 4,
"loss/policy_avg": 0.0025432901456952095,
"loss/value_avg": 0.15946492552757263,
"lr": 1.5703125e-06,
"objective/entropy": -54.125823974609375,
"objective/kl": 33.593475341796875,
"objective/non_score_reward": -1.679673671722412,
"objective/rlhf_reward": 10.296588897705078,
"objective/scores": 11.976263046264648,
"policy/approxkl_avg": 0.003026704303920269,
"policy/clipfrac_avg": 0.0093125831335783,
"policy/entropy_avg": 1.4351716041564941,
"step": 123,
"val/clipfrac_avg": 0.002426933031529188,
"val/num_eos_tokens": 26522,
"val/ratio": 0.9995898604393005,
"val/ratio_var": 4.661137154471362e-06
},
{
"episode": 63488,
"epoch": 0.5439342015078821,
"eps": 4,
"loss/policy_avg": 0.0016291006468236446,
"loss/value_avg": 0.1786973923444748,
"lr": 1.55859375e-06,
"objective/entropy": -55.55234909057617,
"objective/kl": 33.02119827270508,
"objective/non_score_reward": -1.651059865951538,
"objective/rlhf_reward": 10.201143264770508,
"objective/scores": 11.852203369140625,
"policy/approxkl_avg": 0.0020746339578181505,
"policy/clipfrac_avg": 0.010415926575660706,
"policy/entropy_avg": 1.4947787523269653,
"step": 124,
"val/clipfrac_avg": 0.003645282005891204,
"val/num_eos_tokens": 25188,
"val/ratio": 1.00013267993927,
"val/ratio_var": 3.663065626824391e-06
},
{
"episode": 64000,
"epoch": 0.5483207676490747,
"eps": 4,
"loss/policy_avg": 0.006848334334790707,
"loss/value_avg": 0.17819851636886597,
"lr": 1.5468750000000001e-06,
"objective/entropy": -56.54252624511719,
"objective/kl": 32.34886932373047,
"objective/non_score_reward": -1.617443561553955,
"objective/rlhf_reward": 10.11642074584961,
"objective/scores": 11.733863830566406,
"policy/approxkl_avg": 0.002013370394706726,
"policy/clipfrac_avg": 0.011284420266747475,
"policy/entropy_avg": 1.5066087245941162,
"step": 125,
"val/clipfrac_avg": 0.003249499946832657,
"val/num_eos_tokens": 26126,
"val/ratio": 0.9999648928642273,
"val/ratio_var": 4.107211225345964e-06
},
{
"episode": 64512,
"epoch": 0.5527073337902673,
"eps": 4,
"loss/policy_avg": 0.005298146046698093,
"loss/value_avg": 0.19392237067222595,
"lr": 1.53515625e-06,
"objective/entropy": -57.55706024169922,
"objective/kl": 32.114959716796875,
"objective/non_score_reward": -1.605747938156128,
"objective/rlhf_reward": 10.174212455749512,
"objective/scores": 11.779960632324219,
"policy/approxkl_avg": 0.0027915926184505224,
"policy/clipfrac_avg": 0.010949358344078064,
"policy/entropy_avg": 1.5326869487762451,
"step": 126,
"val/clipfrac_avg": 0.002645657164976001,
"val/num_eos_tokens": 24475,
"val/ratio": 1.000093698501587,
"val/ratio_var": 3.1418253456649836e-06
},
{
"episode": 65024,
"epoch": 0.5570938999314599,
"eps": 4,
"loss/policy_avg": 0.007277632597833872,
"loss/value_avg": 0.17173150181770325,
"lr": 1.5234375e-06,
"objective/entropy": -58.10830307006836,
"objective/kl": 32.09362030029297,
"objective/non_score_reward": -1.6046810150146484,
"objective/rlhf_reward": 10.152120590209961,
"objective/scores": 11.75680160522461,
"policy/approxkl_avg": 0.0019359358120709658,
"policy/clipfrac_avg": 0.012156989425420761,
"policy/entropy_avg": 1.539255142211914,
"step": 127,
"val/clipfrac_avg": 0.0033526804763823748,
"val/num_eos_tokens": 24531,
"val/ratio": 1.0003376007080078,
"val/ratio_var": 5.461680757434806e-06
},
{
"episode": 65536,
"epoch": 0.5614804660726525,
"eps": 4,
"loss/policy_avg": 0.005196425132453442,
"loss/value_avg": 0.17728489637374878,
"lr": 1.5117187500000001e-06,
"objective/entropy": -57.962921142578125,
"objective/kl": 31.692747116088867,
"objective/non_score_reward": -1.5846374034881592,
"objective/rlhf_reward": 9.997382164001465,
"objective/scores": 11.582019805908203,
"policy/approxkl_avg": 0.0022988603450357914,
"policy/clipfrac_avg": 0.012291998602449894,
"policy/entropy_avg": 1.5566446781158447,
"step": 128,
"val/clipfrac_avg": 0.003016907721757889,
"val/num_eos_tokens": 24547,
"val/ratio": 1.0000189542770386,
"val/ratio_var": 4.059977982251439e-06
},
{
"episode": 66048,
"epoch": 0.565867032213845,
"eps": 4,
"loss/policy_avg": 0.00474149826914072,
"loss/value_avg": 0.16934943199157715,
"lr": 1.5e-06,
"objective/entropy": -60.81864929199219,
"objective/kl": 30.907930374145508,
"objective/non_score_reward": -1.5453965663909912,
"objective/rlhf_reward": 10.061457633972168,
"objective/scores": 11.606854438781738,
"policy/approxkl_avg": 0.0017903585685417056,
"policy/clipfrac_avg": 0.011585026048123837,
"policy/entropy_avg": 1.6131141185760498,
"step": 129,
"val/clipfrac_avg": 0.0023407491389662027,
"val/num_eos_tokens": 24418,
"val/ratio": 1.0001299381256104,
"val/ratio_var": 4.394762527226703e-06
},
{
"episode": 66560,
"epoch": 0.5702535983550377,
"eps": 4,
"loss/policy_avg": 0.002869675401598215,
"loss/value_avg": 0.17518723011016846,
"lr": 1.48828125e-06,
"objective/entropy": -60.761024475097656,
"objective/kl": 30.685794830322266,
"objective/non_score_reward": -1.534289836883545,
"objective/rlhf_reward": 10.12808609008789,
"objective/scores": 11.662375450134277,
"policy/approxkl_avg": 0.002911779098212719,
"policy/clipfrac_avg": 0.01113644428551197,
"policy/entropy_avg": 1.5905413627624512,
"step": 130,
"val/clipfrac_avg": 0.0030311732552945614,
"val/num_eos_tokens": 23255,
"val/ratio": 0.9994995594024658,
"val/ratio_var": 4.020673713966971e-06
},
{
"episode": 67072,
"epoch": 0.5746401644962303,
"eps": 4,
"loss/policy_avg": 0.005674917250871658,
"loss/value_avg": 0.1688387095928192,
"lr": 1.4765625e-06,
"objective/entropy": -62.38867950439453,
"objective/kl": 30.885807037353516,
"objective/non_score_reward": -1.5442904233932495,
"objective/rlhf_reward": 9.997440338134766,
"objective/scores": 11.541730880737305,
"policy/approxkl_avg": 0.0019931201823055744,
"policy/clipfrac_avg": 0.01189956534653902,
"policy/entropy_avg": 1.6408116817474365,
"step": 131,
"val/clipfrac_avg": 0.0037230595480650663,
"val/num_eos_tokens": 22733,
"val/ratio": 0.9997772574424744,
"val/ratio_var": 4.117905518796761e-06
},
{
"episode": 67584,
"epoch": 0.5790267306374229,
"eps": 4,
"loss/policy_avg": -0.0006350036710500717,
"loss/value_avg": 0.18697667121887207,
"lr": 1.46484375e-06,
"objective/entropy": -59.36350631713867,
"objective/kl": 31.597064971923828,
"objective/non_score_reward": -1.5798532962799072,
"objective/rlhf_reward": 10.01123046875,
"objective/scores": 11.591083526611328,
"policy/approxkl_avg": 0.0020872685126960278,
"policy/clipfrac_avg": 0.010825317353010178,
"policy/entropy_avg": 1.5486820936203003,
"step": 132,
"val/clipfrac_avg": 0.0028165532276034355,
"val/num_eos_tokens": 24545,
"val/ratio": 1.0000473260879517,
"val/ratio_var": 4.7966873353288975e-06
},
{
"episode": 68096,
"epoch": 0.5834132967786155,
"eps": 4,
"loss/policy_avg": 0.004328216426074505,
"loss/value_avg": 0.16442391276359558,
"lr": 1.453125e-06,
"objective/entropy": -59.58063507080078,
"objective/kl": 31.654491424560547,
"objective/non_score_reward": -1.5827245712280273,
"objective/rlhf_reward": 10.193343162536621,
"objective/scores": 11.776067733764648,
"policy/approxkl_avg": 0.0024430665653198957,
"policy/clipfrac_avg": 0.01117191556841135,
"policy/entropy_avg": 1.5335578918457031,
"step": 133,
"val/clipfrac_avg": 0.00234953872859478,
"val/num_eos_tokens": 22553,
"val/ratio": 0.9998908042907715,
"val/ratio_var": 3.7154718484089244e-06
},
{
"episode": 68608,
"epoch": 0.5877998629198081,
"eps": 4,
"loss/policy_avg": 0.00497487373650074,
"loss/value_avg": 0.16096718609333038,
"lr": 1.44140625e-06,
"objective/entropy": -56.16926574707031,
"objective/kl": 32.786376953125,
"objective/non_score_reward": -1.639318823814392,
"objective/rlhf_reward": 10.189998626708984,
"objective/scores": 11.829317092895508,
"policy/approxkl_avg": 0.0026530069299042225,
"policy/clipfrac_avg": 0.011592323891818523,
"policy/entropy_avg": 1.4631075859069824,
"step": 134,
"val/clipfrac_avg": 0.0031634648330509663,
"val/num_eos_tokens": 23151,
"val/ratio": 0.9998040795326233,
"val/ratio_var": 4.819019522983581e-06
},
{
"episode": 69120,
"epoch": 0.5921864290610007,
"eps": 4,
"loss/policy_avg": 0.013725158758461475,
"loss/value_avg": 0.16442811489105225,
"lr": 1.4296875e-06,
"objective/entropy": -58.683013916015625,
"objective/kl": 31.694040298461914,
"objective/non_score_reward": -1.5847020149230957,
"objective/rlhf_reward": 10.19643783569336,
"objective/scores": 11.781139373779297,
"policy/approxkl_avg": 0.0022610300220549107,
"policy/clipfrac_avg": 0.010460296645760536,
"policy/entropy_avg": 1.5235949754714966,
"step": 135,
"val/clipfrac_avg": 0.002330533927306533,
"val/num_eos_tokens": 25330,
"val/ratio": 1.0005124807357788,
"val/ratio_var": 1.425193840987049e-05
},
{
"episode": 69632,
"epoch": 0.5965729952021933,
"eps": 4,
"loss/policy_avg": 0.007436072453856468,
"loss/value_avg": 0.174909770488739,
"lr": 1.41796875e-06,
"objective/entropy": -59.78578186035156,
"objective/kl": 32.64409637451172,
"objective/non_score_reward": -1.6322047710418701,
"objective/rlhf_reward": 10.095362663269043,
"objective/scores": 11.727567672729492,
"policy/approxkl_avg": 0.00211188942193985,
"policy/clipfrac_avg": 0.01192308496683836,
"policy/entropy_avg": 1.5522668361663818,
"step": 136,
"val/clipfrac_avg": 0.002892076037824154,
"val/num_eos_tokens": 24760,
"val/ratio": 0.9996564984321594,
"val/ratio_var": 5.224440883466741e-06
},
{
"episode": 70144,
"epoch": 0.6009595613433859,
"eps": 4,
"loss/policy_avg": 0.009540164843201637,
"loss/value_avg": 0.178801029920578,
"lr": 1.40625e-06,
"objective/entropy": -56.64319610595703,
"objective/kl": 33.44987869262695,
"objective/non_score_reward": -1.6724939346313477,
"objective/rlhf_reward": 10.061247825622559,
"objective/scores": 11.733741760253906,
"policy/approxkl_avg": 0.002490841317921877,
"policy/clipfrac_avg": 0.012106543406844139,
"policy/entropy_avg": 1.4971638917922974,
"step": 137,
"val/clipfrac_avg": 0.0025153912138193846,
"val/num_eos_tokens": 25677,
"val/ratio": 1.0003026723861694,
"val/ratio_var": 4.707488187705167e-06
},
{
"episode": 70656,
"epoch": 0.6053461274845785,
"eps": 4,
"loss/policy_avg": 0.010735518299043179,
"loss/value_avg": 0.17893172800540924,
"lr": 1.39453125e-06,
"objective/entropy": -58.26459503173828,
"objective/kl": 32.324424743652344,
"objective/non_score_reward": -1.6162214279174805,
"objective/rlhf_reward": 10.123574256896973,
"objective/scores": 11.739795684814453,
"policy/approxkl_avg": 0.002283570822328329,
"policy/clipfrac_avg": 0.010066288523375988,
"policy/entropy_avg": 1.4838917255401611,
"step": 138,
"val/clipfrac_avg": 0.0031754274386912584,
"val/num_eos_tokens": 21488,
"val/ratio": 0.9996895790100098,
"val/ratio_var": 3.484929720798391e-06
},
{
"episode": 71168,
"epoch": 0.6097326936257711,
"eps": 4,
"loss/policy_avg": 0.004294028505682945,
"loss/value_avg": 0.1728200614452362,
"lr": 1.3828125e-06,
"objective/entropy": -58.1663818359375,
"objective/kl": 33.08858871459961,
"objective/non_score_reward": -1.6544294357299805,
"objective/rlhf_reward": 10.134689331054688,
"objective/scores": 11.789118766784668,
"policy/approxkl_avg": 0.003025288227945566,
"policy/clipfrac_avg": 0.010935579426586628,
"policy/entropy_avg": 1.4885742664337158,
"step": 139,
"val/clipfrac_avg": 0.002140995115041733,
"val/num_eos_tokens": 24079,
"val/ratio": 0.9999958276748657,
"val/ratio_var": 3.371385446371278e-06
},
{
"episode": 71680,
"epoch": 0.6141192597669637,
"eps": 4,
"loss/policy_avg": 0.012265619821846485,
"loss/value_avg": 0.15826916694641113,
"lr": 1.37109375e-06,
"objective/entropy": -53.690528869628906,
"objective/kl": 33.63441848754883,
"objective/non_score_reward": -1.6817208528518677,
"objective/rlhf_reward": 10.200325965881348,
"objective/scores": 11.882046699523926,
"policy/approxkl_avg": 0.0019431847613304853,
"policy/clipfrac_avg": 0.010160792618989944,
"policy/entropy_avg": 1.4183900356292725,
"step": 140,
"val/clipfrac_avg": 0.0023755324073135853,
"val/num_eos_tokens": 22014,
"val/ratio": 1.0000615119934082,
"val/ratio_var": 2.799153207888594e-06
},
{
"episode": 72192,
"epoch": 0.6185058259081563,
"eps": 4,
"loss/policy_avg": 0.007404782343655825,
"loss/value_avg": 0.17414774000644684,
"lr": 1.359375e-06,
"objective/entropy": -56.323875427246094,
"objective/kl": 34.29399490356445,
"objective/non_score_reward": -1.7146997451782227,
"objective/rlhf_reward": 10.027562141418457,
"objective/scores": 11.74226188659668,
"policy/approxkl_avg": 0.0020079202950000763,
"policy/clipfrac_avg": 0.011949660256505013,
"policy/entropy_avg": 1.4914710521697998,
"step": 141,
"val/clipfrac_avg": 0.0026488695293664932,
"val/num_eos_tokens": 26006,
"val/ratio": 0.9997506141662598,
"val/ratio_var": 3.4570130083011463e-06
},
{
"episode": 72704,
"epoch": 0.6228923920493489,
"eps": 4,
"loss/policy_avg": 0.009324302896857262,
"loss/value_avg": 0.17025059461593628,
"lr": 1.34765625e-06,
"objective/entropy": -56.78107452392578,
"objective/kl": 33.23863220214844,
"objective/non_score_reward": -1.6619315147399902,
"objective/rlhf_reward": 10.156038284301758,
"objective/scores": 11.817970275878906,
"policy/approxkl_avg": 0.0020708302035927773,
"policy/clipfrac_avg": 0.012341851368546486,
"policy/entropy_avg": 1.486853837966919,
"step": 142,
"val/clipfrac_avg": 0.002559303306043148,
"val/num_eos_tokens": 22403,
"val/ratio": 1.0002156496047974,
"val/ratio_var": 4.204774540994549e-06
},
{
"episode": 73216,
"epoch": 0.6272789581905415,
"eps": 4,
"loss/policy_avg": 0.004553023725748062,
"loss/value_avg": 0.1775711476802826,
"lr": 1.3359375e-06,
"objective/entropy": -53.44776153564453,
"objective/kl": 34.82638931274414,
"objective/non_score_reward": -1.7413194179534912,
"objective/rlhf_reward": 10.207488059997559,
"objective/scores": 11.948807716369629,
"policy/approxkl_avg": 0.0024012078065425158,
"policy/clipfrac_avg": 0.012094511650502682,
"policy/entropy_avg": 1.4035837650299072,
"step": 143,
"val/clipfrac_avg": 0.0028213425539433956,
"val/num_eos_tokens": 25504,
"val/ratio": 0.999752402305603,
"val/ratio_var": 3.635649363786797e-06
},
{
"episode": 73728,
"epoch": 0.6316655243317341,
"eps": 4,
"loss/policy_avg": 0.00827038660645485,
"loss/value_avg": 0.15853461623191833,
"lr": 1.32421875e-06,
"objective/entropy": -55.35993194580078,
"objective/kl": 34.32511520385742,
"objective/non_score_reward": -1.716255784034729,
"objective/rlhf_reward": 10.100241661071777,
"objective/scores": 11.816497802734375,
"policy/approxkl_avg": 0.0027391049079596996,
"policy/clipfrac_avg": 0.01181262731552124,
"policy/entropy_avg": 1.4438374042510986,
"step": 144,
"val/clipfrac_avg": 0.0024406672455370426,
"val/num_eos_tokens": 22691,
"val/ratio": 1.0007078647613525,
"val/ratio_var": 1.7917764125741087e-05
},
{
"episode": 74240,
"epoch": 0.6360520904729267,
"eps": 4,
"loss/policy_avg": 0.009457225911319256,
"loss/value_avg": 0.157231405377388,
"lr": 1.3125000000000001e-06,
"objective/entropy": -54.185794830322266,
"objective/kl": 34.9151496887207,
"objective/non_score_reward": -1.7457575798034668,
"objective/rlhf_reward": 10.300085067749023,
"objective/scores": 12.045843124389648,
"policy/approxkl_avg": 0.0025434326380491257,
"policy/clipfrac_avg": 0.012178627774119377,
"policy/entropy_avg": 1.4010084867477417,
"step": 145,
"val/clipfrac_avg": 0.0034856563434004784,
"val/num_eos_tokens": 23737,
"val/ratio": 1.0001429319381714,
"val/ratio_var": 4.342706688476028e-06
},
{
"episode": 74752,
"epoch": 0.6404386566141193,
"eps": 4,
"loss/policy_avg": 0.005285304039716721,
"loss/value_avg": 0.17335200309753418,
"lr": 1.30078125e-06,
"objective/entropy": -53.034061431884766,
"objective/kl": 34.402244567871094,
"objective/non_score_reward": -1.7201124429702759,
"objective/rlhf_reward": 10.186295509338379,
"objective/scores": 11.906408309936523,
"policy/approxkl_avg": 0.0023625961039215326,
"policy/clipfrac_avg": 0.011160846799612045,
"policy/entropy_avg": 1.373847484588623,
"step": 146,
"val/clipfrac_avg": 0.002591262571513653,
"val/num_eos_tokens": 23870,
"val/ratio": 0.9997913837432861,
"val/ratio_var": 3.1251217933458975e-06
},
{
"episode": 75264,
"epoch": 0.6448252227553118,
"eps": 4,
"loss/policy_avg": 0.0033188331872224808,
"loss/value_avg": 0.178251251578331,
"lr": 1.2890625e-06,
"objective/entropy": -52.30128479003906,
"objective/kl": 35.336875915527344,
"objective/non_score_reward": -1.7668437957763672,
"objective/rlhf_reward": 10.239588737487793,
"objective/scores": 12.00643253326416,
"policy/approxkl_avg": 0.002232671482488513,
"policy/clipfrac_avg": 0.01211271807551384,
"policy/entropy_avg": 1.3423898220062256,
"step": 147,
"val/clipfrac_avg": 0.0049108765088021755,
"val/num_eos_tokens": 24415,
"val/ratio": 0.999698281288147,
"val/ratio_var": 4.744644684251398e-06
},
{
"episode": 75776,
"epoch": 0.6492117888965044,
"eps": 4,
"loss/policy_avg": 0.0045684343203902245,
"loss/value_avg": 0.1551057994365692,
"lr": 1.2773437500000001e-06,
"objective/entropy": -42.2551383972168,
"objective/kl": 36.38901901245117,
"objective/non_score_reward": -1.8194509744644165,
"objective/rlhf_reward": 10.190171241760254,
"objective/scores": 12.009622573852539,
"policy/approxkl_avg": 0.0027887800242751837,
"policy/clipfrac_avg": 0.01179465465247631,
"policy/entropy_avg": 1.1810457706451416,
"step": 148,
"val/clipfrac_avg": 0.002859487198293209,
"val/num_eos_tokens": 21310,
"val/ratio": 0.9999205470085144,
"val/ratio_var": 4.294446171115851e-06
},
{
"episode": 76288,
"epoch": 0.653598355037697,
"eps": 4,
"loss/policy_avg": 0.0034446939826011658,
"loss/value_avg": 0.1887798309326172,
"lr": 1.265625e-06,
"objective/entropy": -48.354408264160156,
"objective/kl": 36.67412185668945,
"objective/non_score_reward": -1.833706259727478,
"objective/rlhf_reward": 10.170305252075195,
"objective/scores": 12.004011154174805,
"policy/approxkl_avg": 0.0017848997376859188,
"policy/clipfrac_avg": 0.011014001443982124,
"policy/entropy_avg": 1.2652117013931274,
"step": 149,
"val/clipfrac_avg": 0.0032921340316534042,
"val/num_eos_tokens": 23606,
"val/ratio": 0.9999603033065796,
"val/ratio_var": 2.988560936501017e-06
},
{
"episode": 76800,
"epoch": 0.6579849211788896,
"eps": 4,
"loss/policy_avg": -0.0008725142106413841,
"loss/value_avg": 0.1778767853975296,
"lr": 1.25390625e-06,
"objective/entropy": -48.64875793457031,
"objective/kl": 35.84824752807617,
"objective/non_score_reward": -1.792412519454956,
"objective/rlhf_reward": 10.273049354553223,
"objective/scores": 12.065462112426758,
"policy/approxkl_avg": 0.002263781614601612,
"policy/clipfrac_avg": 0.010905838571488857,
"policy/entropy_avg": 1.2818143367767334,
"step": 150,
"val/clipfrac_avg": 0.003908317536115646,
"val/num_eos_tokens": 25626,
"val/ratio": 0.999974250793457,
"val/ratio_var": 3.1995639346860116e-06
},
{
"episode": 77312,
"epoch": 0.6623714873200822,
"eps": 4,
"loss/policy_avg": 0.004413328133523464,
"loss/value_avg": 0.1777411848306656,
"lr": 1.2421875000000001e-06,
"objective/entropy": -46.26213836669922,
"objective/kl": 36.80657196044922,
"objective/non_score_reward": -1.8403284549713135,
"objective/rlhf_reward": 10.09887409210205,
"objective/scores": 11.939202308654785,
"policy/approxkl_avg": 0.0024394330102950335,
"policy/clipfrac_avg": 0.012261416763067245,
"policy/entropy_avg": 1.2491027116775513,
"step": 151,
"val/clipfrac_avg": 0.002783268690109253,
"val/num_eos_tokens": 23946,
"val/ratio": 0.9999766945838928,
"val/ratio_var": 3.928019395971205e-06
},
{
"episode": 77824,
"epoch": 0.6667580534612748,
"eps": 4,
"loss/policy_avg": 0.0010167919099330902,
"loss/value_avg": 0.17760787904262543,
"lr": 1.23046875e-06,
"objective/entropy": -45.87760925292969,
"objective/kl": 36.983638763427734,
"objective/non_score_reward": -1.849181890487671,
"objective/rlhf_reward": 10.17978572845459,
"objective/scores": 12.02896785736084,
"policy/approxkl_avg": 0.002851827535778284,
"policy/clipfrac_avg": 0.01199465710669756,
"policy/entropy_avg": 1.2081284523010254,
"step": 152,
"val/clipfrac_avg": 0.0026916628703475,
"val/num_eos_tokens": 24294,
"val/ratio": 1.000422716140747,
"val/ratio_var": 6.217844656930538e-06
},
{
"episode": 78336,
"epoch": 0.6711446196024674,
"eps": 4,
"loss/policy_avg": 0.002063746564090252,
"loss/value_avg": 0.1728123426437378,
"lr": 1.21875e-06,
"objective/entropy": -44.998619079589844,
"objective/kl": 37.763389587402344,
"objective/non_score_reward": -1.888169765472412,
"objective/rlhf_reward": 10.178922653198242,
"objective/scores": 12.067092895507812,
"policy/approxkl_avg": 0.002793453633785248,
"policy/clipfrac_avg": 0.012047767639160156,
"policy/entropy_avg": 1.2028660774230957,
"step": 153,
"val/clipfrac_avg": 0.0025438859593123198,
"val/num_eos_tokens": 23546,
"val/ratio": 1.0003751516342163,
"val/ratio_var": 1.4593482774216682e-05
},
{
"episode": 78848,
"epoch": 0.67553118574366,
"eps": 4,
"loss/policy_avg": 0.0007819309830665588,
"loss/value_avg": 0.1686643660068512,
"lr": 1.2070312500000001e-06,
"objective/entropy": -45.0665283203125,
"objective/kl": 36.91722869873047,
"objective/non_score_reward": -1.8458614349365234,
"objective/rlhf_reward": 10.167716026306152,
"objective/scores": 12.013577461242676,
"policy/approxkl_avg": 0.002328127156943083,
"policy/clipfrac_avg": 0.01130404882133007,
"policy/entropy_avg": 1.207916498184204,
"step": 154,
"val/clipfrac_avg": 0.002523067407310009,
"val/num_eos_tokens": 23131,
"val/ratio": 0.9997349977493286,
"val/ratio_var": 2.9115451525285607e-06
},
{
"episode": 79360,
"epoch": 0.6799177518848526,
"eps": 4,
"loss/policy_avg": -0.0004192842170596123,
"loss/value_avg": 0.1749456375837326,
"lr": 1.1953125e-06,
"objective/entropy": -44.818763732910156,
"objective/kl": 37.74750518798828,
"objective/non_score_reward": -1.8873754739761353,
"objective/rlhf_reward": 10.325039863586426,
"objective/scores": 12.21241569519043,
"policy/approxkl_avg": 0.00260849017649889,
"policy/clipfrac_avg": 0.011585136875510216,
"policy/entropy_avg": 1.2088732719421387,
"step": 155,
"val/clipfrac_avg": 0.001912396401166916,
"val/num_eos_tokens": 23505,
"val/ratio": 0.999869704246521,
"val/ratio_var": 3.1894157928036293e-06
},
{
"episode": 79872,
"epoch": 0.6843043180260452,
"eps": 4,
"loss/policy_avg": -0.0022659683600068092,
"loss/value_avg": 0.17857833206653595,
"lr": 1.18359375e-06,
"objective/entropy": -44.684295654296875,
"objective/kl": 37.606056213378906,
"objective/non_score_reward": -1.880302906036377,
"objective/rlhf_reward": 10.136545181274414,
"objective/scores": 12.01684856414795,
"policy/approxkl_avg": 0.002529420889914036,
"policy/clipfrac_avg": 0.010681129060685635,
"policy/entropy_avg": 1.2111512422561646,
"step": 156,
"val/clipfrac_avg": 0.0028712116181850433,
"val/num_eos_tokens": 23424,
"val/ratio": 0.9998527765274048,
"val/ratio_var": 3.897734586644219e-06
},
{
"episode": 80384,
"epoch": 0.6886908841672378,
"eps": 4,
"loss/policy_avg": 0.007091144565492868,
"loss/value_avg": 0.17015790939331055,
"lr": 1.1718750000000001e-06,
"objective/entropy": -45.67634582519531,
"objective/kl": 37.143402099609375,
"objective/non_score_reward": -1.8571701049804688,
"objective/rlhf_reward": 10.208694458007812,
"objective/scores": 12.065864562988281,
"policy/approxkl_avg": 0.0023084133863449097,
"policy/clipfrac_avg": 0.011631271801888943,
"policy/entropy_avg": 1.2184652090072632,
"step": 157,
"val/clipfrac_avg": 0.0025410668458789587,
"val/num_eos_tokens": 23929,
"val/ratio": 1.0000377893447876,
"val/ratio_var": 4.961769263900351e-06
},
{
"episode": 80896,
"epoch": 0.6930774503084305,
"eps": 4,
"loss/policy_avg": 0.0037001436576247215,
"loss/value_avg": 0.16299593448638916,
"lr": 1.16015625e-06,
"objective/entropy": -44.94884490966797,
"objective/kl": 36.945613861083984,
"objective/non_score_reward": -1.847280502319336,
"objective/rlhf_reward": 10.337121963500977,
"objective/scores": 12.184402465820312,
"policy/approxkl_avg": 0.002592534990981221,
"policy/clipfrac_avg": 0.010881590656936169,
"policy/entropy_avg": 1.1857258081436157,
"step": 158,
"val/clipfrac_avg": 0.002265874994918704,
"val/num_eos_tokens": 24409,
"val/ratio": 0.9998010993003845,
"val/ratio_var": 4.288477157388115e-06
},
{
"episode": 81408,
"epoch": 0.6974640164496231,
"eps": 4,
"loss/policy_avg": 0.0009241420775651932,
"loss/value_avg": 0.1695103943347931,
"lr": 1.1484375e-06,
"objective/entropy": -45.550392150878906,
"objective/kl": 36.468170166015625,
"objective/non_score_reward": -1.8234084844589233,
"objective/rlhf_reward": 10.236851692199707,
"objective/scores": 12.060259819030762,
"policy/approxkl_avg": 0.002230257960036397,
"policy/clipfrac_avg": 0.010667338967323303,
"policy/entropy_avg": 1.2040106058120728,
"step": 159,
"val/clipfrac_avg": 0.00268998509272933,
"val/num_eos_tokens": 23921,
"val/ratio": 1.0000641345977783,
"val/ratio_var": 2.7710232188837836e-06
},
{
"episode": 81920,
"epoch": 0.7018505825908157,
"eps": 4,
"loss/policy_avg": -0.00038310326635837555,
"loss/value_avg": 0.18818724155426025,
"lr": 1.13671875e-06,
"objective/entropy": -45.712406158447266,
"objective/kl": 35.81027603149414,
"objective/non_score_reward": -1.7905137538909912,
"objective/rlhf_reward": 10.11474609375,
"objective/scores": 11.90526008605957,
"policy/approxkl_avg": 0.002487615682184696,
"policy/clipfrac_avg": 0.01108371652662754,
"policy/entropy_avg": 1.212557315826416,
"step": 160,
"val/clipfrac_avg": 0.003145547118037939,
"val/num_eos_tokens": 25204,
"val/ratio": 0.999794065952301,
"val/ratio_var": 4.027490376756759e-06
},
{
"episode": 82432,
"epoch": 0.7062371487320083,
"eps": 4,
"loss/policy_avg": 0.006643541157245636,
"loss/value_avg": 0.18656548857688904,
"lr": 1.125e-06,
"objective/entropy": -47.61738586425781,
"objective/kl": 35.84968566894531,
"objective/non_score_reward": -1.7924842834472656,
"objective/rlhf_reward": 10.187285423278809,
"objective/scores": 11.979769706726074,
"policy/approxkl_avg": 0.0026129959151148796,
"policy/clipfrac_avg": 0.011652868241071701,
"policy/entropy_avg": 1.2548754215240479,
"step": 161,
"val/clipfrac_avg": 0.0028428449295461178,
"val/num_eos_tokens": 24230,
"val/ratio": 0.9996559619903564,
"val/ratio_var": 5.3404851314553525e-06
},
{
"episode": 82944,
"epoch": 0.7106237148732009,
"eps": 4,
"loss/policy_avg": 0.006890101823955774,
"loss/value_avg": 0.16082976758480072,
"lr": 1.11328125e-06,
"objective/entropy": -50.946983337402344,
"objective/kl": 35.273223876953125,
"objective/non_score_reward": -1.7636611461639404,
"objective/rlhf_reward": 10.322636604309082,
"objective/scores": 12.086297988891602,
"policy/approxkl_avg": 0.001959030982106924,
"policy/clipfrac_avg": 0.012122605927288532,
"policy/entropy_avg": 1.3099424839019775,
"step": 162,
"val/clipfrac_avg": 0.002845605369657278,
"val/num_eos_tokens": 23063,
"val/ratio": 0.9995248317718506,
"val/ratio_var": 2.941842012660345e-06
},
{
"episode": 83456,
"epoch": 0.7150102810143935,
"eps": 4,
"loss/policy_avg": 0.0033896267414093018,
"loss/value_avg": 0.16125231981277466,
"lr": 1.1015625e-06,
"objective/entropy": -50.64572525024414,
"objective/kl": 34.75807189941406,
"objective/non_score_reward": -1.7379035949707031,
"objective/rlhf_reward": 10.104735374450684,
"objective/scores": 11.842638969421387,
"policy/approxkl_avg": 0.0020693184342235327,
"policy/clipfrac_avg": 0.011038804426789284,
"policy/entropy_avg": 1.307016134262085,
"step": 163,
"val/clipfrac_avg": 0.0025604660622775555,
"val/num_eos_tokens": 22017,
"val/ratio": 0.9999017119407654,
"val/ratio_var": 2.949772351712454e-06
},
{
"episode": 83968,
"epoch": 0.719396847155586,
"eps": 4,
"loss/policy_avg": 0.0018947692587971687,
"loss/value_avg": 0.16481269896030426,
"lr": 1.08984375e-06,
"objective/entropy": -50.58066940307617,
"objective/kl": 34.62723159790039,
"objective/non_score_reward": -1.7313616275787354,
"objective/rlhf_reward": 10.169866561889648,
"objective/scores": 11.901227951049805,
"policy/approxkl_avg": 0.002324402565136552,
"policy/clipfrac_avg": 0.011050723493099213,
"policy/entropy_avg": 1.3137667179107666,
"step": 164,
"val/clipfrac_avg": 0.003440006636083126,
"val/num_eos_tokens": 23923,
"val/ratio": 0.9997102618217468,
"val/ratio_var": 5.237521236267639e-06
},
{
"episode": 84480,
"epoch": 0.7237834132967786,
"eps": 4,
"loss/policy_avg": 0.00799738522619009,
"loss/value_avg": 0.16653640568256378,
"lr": 1.078125e-06,
"objective/entropy": -52.23271179199219,
"objective/kl": 34.09307098388672,
"objective/non_score_reward": -1.7046536207199097,
"objective/rlhf_reward": 10.162192344665527,
"objective/scores": 11.866846084594727,
"policy/approxkl_avg": 0.001924938871525228,
"policy/clipfrac_avg": 0.010750483721494675,
"policy/entropy_avg": 1.3296667337417603,
"step": 165,
"val/clipfrac_avg": 0.003241895930841565,
"val/num_eos_tokens": 24023,
"val/ratio": 0.9998387098312378,
"val/ratio_var": 6.901913366164081e-06
},
{
"episode": 84992,
"epoch": 0.7281699794379712,
"eps": 4,
"loss/policy_avg": 0.008404719643294811,
"loss/value_avg": 0.1683107316493988,
"lr": 1.06640625e-06,
"objective/entropy": -52.128395080566406,
"objective/kl": 33.078575134277344,
"objective/non_score_reward": -1.6539287567138672,
"objective/rlhf_reward": 10.260198593139648,
"objective/scores": 11.914127349853516,
"policy/approxkl_avg": 0.002755087101832032,
"policy/clipfrac_avg": 0.010908817872405052,
"policy/entropy_avg": 1.3239178657531738,
"step": 166,
"val/clipfrac_avg": 0.002500710543245077,
"val/num_eos_tokens": 25626,
"val/ratio": 1.0002832412719727,
"val/ratio_var": 3.7921972761978395e-06
},
{
"episode": 85504,
"epoch": 0.7325565455791638,
"eps": 4,
"loss/policy_avg": 0.005474764853715897,
"loss/value_avg": 0.15045103430747986,
"lr": 1.0546875e-06,
"objective/entropy": -51.094482421875,
"objective/kl": 33.334014892578125,
"objective/non_score_reward": -1.666700839996338,
"objective/rlhf_reward": 10.26999282836914,
"objective/scores": 11.93669319152832,
"policy/approxkl_avg": 0.0023759384639561176,
"policy/clipfrac_avg": 0.011528071947395802,
"policy/entropy_avg": 1.3099522590637207,
"step": 167,
"val/clipfrac_avg": 0.0018291361629962921,
"val/num_eos_tokens": 22563,
"val/ratio": 1.0007615089416504,
"val/ratio_var": 6.840497462690109e-06
},
{
"episode": 86016,
"epoch": 0.7369431117203564,
"eps": 4,
"loss/policy_avg": 0.00405261293053627,
"loss/value_avg": 0.12342571467161179,
"lr": 1.04296875e-06,
"objective/entropy": -48.19590759277344,
"objective/kl": 33.93254089355469,
"objective/non_score_reward": -1.696627140045166,
"objective/rlhf_reward": 10.41012954711914,
"objective/scores": 12.106756210327148,
"policy/approxkl_avg": 0.002511480124667287,
"policy/clipfrac_avg": 0.01289713941514492,
"policy/entropy_avg": 1.2494816780090332,
"step": 168,
"val/clipfrac_avg": 0.0024599945172667503,
"val/num_eos_tokens": 23483,
"val/ratio": 1.0000154972076416,
"val/ratio_var": 1.347678426100174e-05
},
{
"episode": 86528,
"epoch": 0.741329677861549,
"eps": 4,
"loss/policy_avg": 0.004242723807692528,
"loss/value_avg": 0.15402936935424805,
"lr": 1.03125e-06,
"objective/entropy": -51.71031188964844,
"objective/kl": 34.08534240722656,
"objective/non_score_reward": -1.7042670249938965,
"objective/rlhf_reward": 10.246797561645508,
"objective/scores": 11.951065063476562,
"policy/approxkl_avg": 0.0018741288222372532,
"policy/clipfrac_avg": 0.011544827371835709,
"policy/entropy_avg": 1.304296851158142,
"step": 169,
"val/clipfrac_avg": 0.002608256647363305,
"val/num_eos_tokens": 23311,
"val/ratio": 1.0001602172851562,
"val/ratio_var": 5.876947398064658e-06
},
{
"episode": 87040,
"epoch": 0.7457162440027416,
"eps": 4,
"loss/policy_avg": 0.006287221796810627,
"loss/value_avg": 0.15575343370437622,
"lr": 1.01953125e-06,
"objective/entropy": -50.226593017578125,
"objective/kl": 34.273902893066406,
"objective/non_score_reward": -1.7136950492858887,
"objective/rlhf_reward": 10.271360397338867,
"objective/scores": 11.985054969787598,
"policy/approxkl_avg": 0.002464515157043934,
"policy/clipfrac_avg": 0.012612780556082726,
"policy/entropy_avg": 1.2805957794189453,
"step": 170,
"val/clipfrac_avg": 0.0021345310378819704,
"val/num_eos_tokens": 24292,
"val/ratio": 1.0004091262817383,
"val/ratio_var": 5.745941962231882e-06
},
{
"episode": 87552,
"epoch": 0.7501028101439342,
"eps": 4,
"loss/policy_avg": 0.003506988286972046,
"loss/value_avg": 0.1364804208278656,
"lr": 1.0078125e-06,
"objective/entropy": -51.53871154785156,
"objective/kl": 33.873558044433594,
"objective/non_score_reward": -1.6936776638031006,
"objective/rlhf_reward": 10.34697151184082,
"objective/scores": 12.0406494140625,
"policy/approxkl_avg": 0.00181733223143965,
"policy/clipfrac_avg": 0.011671137996017933,
"policy/entropy_avg": 1.3100472688674927,
"step": 171,
"val/clipfrac_avg": 0.002693683374673128,
"val/num_eos_tokens": 25970,
"val/ratio": 1.0000426769256592,
"val/ratio_var": 3.860561264446005e-06
},
{
"episode": 88064,
"epoch": 0.7544893762851268,
"eps": 4,
"loss/policy_avg": 0.009347271174192429,
"loss/value_avg": 0.1529439389705658,
"lr": 9.9609375e-07,
"objective/entropy": -50.7225456237793,
"objective/kl": 33.92055130004883,
"objective/non_score_reward": -1.6960275173187256,
"objective/rlhf_reward": 10.297569274902344,
"objective/scores": 11.993597030639648,
"policy/approxkl_avg": 0.0020273206755518913,
"policy/clipfrac_avg": 0.012135770171880722,
"policy/entropy_avg": 1.3071811199188232,
"step": 172,
"val/clipfrac_avg": 0.002652028575539589,
"val/num_eos_tokens": 25641,
"val/ratio": 1.0000050067901611,
"val/ratio_var": 3.2831298995006364e-06
},
{
"episode": 88576,
"epoch": 0.7588759424263194,
"eps": 4,
"loss/policy_avg": 0.006738151423633099,
"loss/value_avg": 0.15998482704162598,
"lr": 9.84375e-07,
"objective/entropy": -49.821937561035156,
"objective/kl": 34.85704040527344,
"objective/non_score_reward": -1.7428522109985352,
"objective/rlhf_reward": 10.277626037597656,
"objective/scores": 12.020478248596191,
"policy/approxkl_avg": 0.0023002480156719685,
"policy/clipfrac_avg": 0.010766479186713696,
"policy/entropy_avg": 1.2641386985778809,
"step": 173,
"val/clipfrac_avg": 0.0034392056986689568,
"val/num_eos_tokens": 24298,
"val/ratio": 1.0001941919326782,
"val/ratio_var": 7.138915862014983e-06
},
{
"episode": 89088,
"epoch": 0.763262508567512,
"eps": 4,
"loss/policy_avg": 0.0019265972077846527,
"loss/value_avg": 0.16067391633987427,
"lr": 9.7265625e-07,
"objective/entropy": -48.73914337158203,
"objective/kl": 34.770301818847656,
"objective/non_score_reward": -1.7385151386260986,
"objective/rlhf_reward": 10.19167709350586,
"objective/scores": 11.930191993713379,
"policy/approxkl_avg": 0.0023067870642989874,
"policy/clipfrac_avg": 0.011778725311160088,
"policy/entropy_avg": 1.2403905391693115,
"step": 174,
"val/clipfrac_avg": 0.002369035966694355,
"val/num_eos_tokens": 24369,
"val/ratio": 1.0003368854522705,
"val/ratio_var": 9.785385373106692e-06
},
{
"episode": 89600,
"epoch": 0.7676490747087046,
"eps": 4,
"loss/policy_avg": 0.009770754724740982,
"loss/value_avg": 0.18742433190345764,
"lr": 9.609375e-07,
"objective/entropy": -48.57078552246094,
"objective/kl": 34.64019012451172,
"objective/non_score_reward": -1.7320095300674438,
"objective/rlhf_reward": 10.104455947875977,
"objective/scores": 11.836465835571289,
"policy/approxkl_avg": 0.002243851777166128,
"policy/clipfrac_avg": 0.011341418139636517,
"policy/entropy_avg": 1.2522807121276855,
"step": 175,
"val/clipfrac_avg": 0.0024377312511205673,
"val/num_eos_tokens": 25931,
"val/ratio": 1.0000677108764648,
"val/ratio_var": 2.8293659397604642e-06
},
{
"episode": 90112,
"epoch": 0.7720356408498972,
"eps": 4,
"loss/policy_avg": 0.0009953724220395088,
"loss/value_avg": 0.16015103459358215,
"lr": 9.4921875e-07,
"objective/entropy": -47.74214172363281,
"objective/kl": 35.312191009521484,
"objective/non_score_reward": -1.7656095027923584,
"objective/rlhf_reward": 10.254161834716797,
"objective/scores": 12.019771575927734,
"policy/approxkl_avg": 0.002973256167024374,
"policy/clipfrac_avg": 0.012829918414354324,
"policy/entropy_avg": 1.2157658338546753,
"step": 176,
"val/clipfrac_avg": 0.0022666973527520895,
"val/num_eos_tokens": 25047,
"val/ratio": 0.9998872876167297,
"val/ratio_var": 1.183138920168858e-05
},
{
"episode": 90624,
"epoch": 0.7764222069910898,
"eps": 4,
"loss/policy_avg": 0.001389509066939354,
"loss/value_avg": 0.17181482911109924,
"lr": 9.375e-07,
"objective/entropy": -46.99293518066406,
"objective/kl": 35.26284408569336,
"objective/non_score_reward": -1.7631421089172363,
"objective/rlhf_reward": 10.230562210083008,
"objective/scores": 11.993703842163086,
"policy/approxkl_avg": 0.002044677734375,
"policy/clipfrac_avg": 0.010602492839097977,
"policy/entropy_avg": 1.1966073513031006,
"step": 177,
"val/clipfrac_avg": 0.002445896854624152,
"val/num_eos_tokens": 24650,
"val/ratio": 1.000166893005371,
"val/ratio_var": 3.4964016322192037e-06
},
{
"episode": 91136,
"epoch": 0.7808087731322824,
"eps": 4,
"loss/policy_avg": 0.0006192019209265709,
"loss/value_avg": 0.18841782212257385,
"lr": 9.257812500000001e-07,
"objective/entropy": -45.32579040527344,
"objective/kl": 35.63456726074219,
"objective/non_score_reward": -1.7817286252975464,
"objective/rlhf_reward": 10.122578620910645,
"objective/scores": 11.90430736541748,
"policy/approxkl_avg": 0.0024878934491425753,
"policy/clipfrac_avg": 0.011196051724255085,
"policy/entropy_avg": 1.2084518671035767,
"step": 178,
"val/clipfrac_avg": 0.002982205478474498,
"val/num_eos_tokens": 25471,
"val/ratio": 0.9997743368148804,
"val/ratio_var": 5.401115686254343e-06
},
{
"episode": 91648,
"epoch": 0.785195339273475,
"eps": 4,
"loss/policy_avg": 0.0005537644028663635,
"loss/value_avg": 0.17692416906356812,
"lr": 9.140625e-07,
"objective/entropy": -44.86452102661133,
"objective/kl": 35.01612091064453,
"objective/non_score_reward": -1.7508059740066528,
"objective/rlhf_reward": 10.318327903747559,
"objective/scores": 12.069133758544922,
"policy/approxkl_avg": 0.0021444151643663645,
"policy/clipfrac_avg": 0.012152086943387985,
"policy/entropy_avg": 1.1637636423110962,
"step": 179,
"val/clipfrac_avg": 0.0027516535483300686,
"val/num_eos_tokens": 25029,
"val/ratio": 0.9998598694801331,
"val/ratio_var": 3.8798657442384865e-06
},
{
"episode": 92160,
"epoch": 0.7895819054146676,
"eps": 4,
"loss/policy_avg": 0.001009856816381216,
"loss/value_avg": 0.16210728883743286,
"lr": 9.0234375e-07,
"objective/entropy": -44.95887756347656,
"objective/kl": 35.57145309448242,
"objective/non_score_reward": -1.7785727977752686,
"objective/rlhf_reward": 10.33371639251709,
"objective/scores": 12.112289428710938,
"policy/approxkl_avg": 0.0019982215017080307,
"policy/clipfrac_avg": 0.010970347560942173,
"policy/entropy_avg": 1.1672537326812744,
"step": 180,
"val/clipfrac_avg": 0.0016274080844596028,
"val/num_eos_tokens": 24106,
"val/ratio": 1.0000462532043457,
"val/ratio_var": 4.52870244771475e-06
},
{
"episode": 92672,
"epoch": 0.7939684715558601,
"eps": 4,
"loss/policy_avg": 0.004074078518897295,
"loss/value_avg": 0.16225843131542206,
"lr": 8.90625e-07,
"objective/entropy": -46.57657241821289,
"objective/kl": 35.93509292602539,
"objective/non_score_reward": -1.7967547178268433,
"objective/rlhf_reward": 10.253158569335938,
"objective/scores": 12.04991340637207,
"policy/approxkl_avg": 0.002360533457249403,
"policy/clipfrac_avg": 0.01273175049573183,
"policy/entropy_avg": 1.1883782148361206,
"step": 181,
"val/clipfrac_avg": 0.00262850197032094,
"val/num_eos_tokens": 24088,
"val/ratio": 0.999874472618103,
"val/ratio_var": 4.06332810598542e-06
},
{
"episode": 93184,
"epoch": 0.7983550376970527,
"eps": 4,
"loss/policy_avg": 0.0031516384333372116,
"loss/value_avg": 0.16150668263435364,
"lr": 8.7890625e-07,
"objective/entropy": -47.01606750488281,
"objective/kl": 34.775672912597656,
"objective/non_score_reward": -1.738783597946167,
"objective/rlhf_reward": 10.400968551635742,
"objective/scores": 12.139752388000488,
"policy/approxkl_avg": 0.003061380237340927,
"policy/clipfrac_avg": 0.011249782517552376,
"policy/entropy_avg": 1.189601182937622,
"step": 182,
"val/clipfrac_avg": 0.0020699123851954937,
"val/num_eos_tokens": 24774,
"val/ratio": 0.9997990727424622,
"val/ratio_var": 3.931295395886991e-06
},
{
"episode": 93696,
"epoch": 0.8027416038382453,
"eps": 4,
"loss/policy_avg": 0.0011372501030564308,
"loss/value_avg": 0.1965566873550415,
"lr": 8.671875e-07,
"objective/entropy": -46.06599044799805,
"objective/kl": 36.448936462402344,
"objective/non_score_reward": -1.8224468231201172,
"objective/rlhf_reward": 10.114370346069336,
"objective/scores": 11.936817169189453,
"policy/approxkl_avg": 0.0025729872286319733,
"policy/clipfrac_avg": 0.01264517568051815,
"policy/entropy_avg": 1.194599986076355,
"step": 183,
"val/clipfrac_avg": 0.0033960985019803047,
"val/num_eos_tokens": 24209,
"val/ratio": 1.0000592470169067,
"val/ratio_var": 4.049191829835763e-06
},
{
"episode": 94208,
"epoch": 0.8071281699794379,
"eps": 4,
"loss/policy_avg": 0.007237900979816914,
"loss/value_avg": 0.15372735261917114,
"lr": 8.5546875e-07,
"objective/entropy": -45.80794143676758,
"objective/kl": 35.47758483886719,
"objective/non_score_reward": -1.7738792896270752,
"objective/rlhf_reward": 10.278848648071289,
"objective/scores": 12.052727699279785,
"policy/approxkl_avg": 0.002283816458657384,
"policy/clipfrac_avg": 0.010822740383446217,
"policy/entropy_avg": 1.1835873126983643,
"step": 184,
"val/clipfrac_avg": 0.0018642449285835028,
"val/num_eos_tokens": 24856,
"val/ratio": 0.9998406171798706,
"val/ratio_var": 3.7159979910939e-06
},
{
"episode": 94720,
"epoch": 0.8115147361206305,
"eps": 4,
"loss/policy_avg": 0.0026482678949832916,
"loss/value_avg": 0.18347406387329102,
"lr": 8.4375e-07,
"objective/entropy": -46.348575592041016,
"objective/kl": 35.41945266723633,
"objective/non_score_reward": -1.770972728729248,
"objective/rlhf_reward": 10.092222213745117,
"objective/scores": 11.863194465637207,
"policy/approxkl_avg": 0.0023487925063818693,
"policy/clipfrac_avg": 0.011329904198646545,
"policy/entropy_avg": 1.1949329376220703,
"step": 185,
"val/clipfrac_avg": 0.0018796215299516916,
"val/num_eos_tokens": 24285,
"val/ratio": 1.0002095699310303,
"val/ratio_var": 8.477360097458586e-06
},
{
"episode": 95232,
"epoch": 0.8159013022618231,
"eps": 4,
"loss/policy_avg": 0.004165485501289368,
"loss/value_avg": 0.1504068225622177,
"lr": 8.3203125e-07,
"objective/entropy": -47.73908233642578,
"objective/kl": 35.27336883544922,
"objective/non_score_reward": -1.763668417930603,
"objective/rlhf_reward": 10.285353660583496,
"objective/scores": 12.04902172088623,
"policy/approxkl_avg": 0.0021917533595114946,
"policy/clipfrac_avg": 0.011097628623247147,
"policy/entropy_avg": 1.2006309032440186,
"step": 186,
"val/clipfrac_avg": 0.0022048484534025192,
"val/num_eos_tokens": 24147,
"val/ratio": 1.0000174045562744,
"val/ratio_var": 4.247439846949419e-06
},
{
"episode": 95744,
"epoch": 0.8202878684030158,
"eps": 4,
"loss/policy_avg": 0.004456181079149246,
"loss/value_avg": 0.1416192352771759,
"lr": 8.203125e-07,
"objective/entropy": -49.53144836425781,
"objective/kl": 34.152732849121094,
"objective/non_score_reward": -1.7076367139816284,
"objective/rlhf_reward": 10.372452735900879,
"objective/scores": 12.080089569091797,
"policy/approxkl_avg": 0.002187924925237894,
"policy/clipfrac_avg": 0.010536652989685535,
"policy/entropy_avg": 1.236476182937622,
"step": 187,
"val/clipfrac_avg": 0.00176910194568336,
"val/num_eos_tokens": 24341,
"val/ratio": 1.0000340938568115,
"val/ratio_var": 3.6587950944522163e-06
},
{
"episode": 96256,
"epoch": 0.8246744345442084,
"eps": 4,
"loss/policy_avg": 0.006817615125328302,
"loss/value_avg": 0.1469859927892685,
"lr": 8.085937500000001e-07,
"objective/entropy": -49.844322204589844,
"objective/kl": 34.35259246826172,
"objective/non_score_reward": -1.7176295518875122,
"objective/rlhf_reward": 10.423017501831055,
"objective/scores": 12.140646934509277,
"policy/approxkl_avg": 0.0020276098512113094,
"policy/clipfrac_avg": 0.011348921805620193,
"policy/entropy_avg": 1.2337639331817627,
"step": 188,
"val/clipfrac_avg": 0.0017726544756442308,
"val/num_eos_tokens": 22883,
"val/ratio": 0.9999902248382568,
"val/ratio_var": 2.778531097646919e-06
},
{
"episode": 96768,
"epoch": 0.829061000685401,
"eps": 4,
"loss/policy_avg": 0.0022352226078510284,
"loss/value_avg": 0.14938578009605408,
"lr": 7.96875e-07,
"objective/entropy": -49.13550567626953,
"objective/kl": 34.316654205322266,
"objective/non_score_reward": -1.7158327102661133,
"objective/rlhf_reward": 10.387197494506836,
"objective/scores": 12.10303020477295,
"policy/approxkl_avg": 0.0019436124712228775,
"policy/clipfrac_avg": 0.010562841780483723,
"policy/entropy_avg": 1.2204303741455078,
"step": 189,
"val/clipfrac_avg": 0.0015331670874729753,
"val/num_eos_tokens": 25402,
"val/ratio": 0.9999203681945801,
"val/ratio_var": 3.020122221641941e-06
},
{
"episode": 97280,
"epoch": 0.8334475668265936,
"eps": 4,
"loss/policy_avg": 0.0039822859689593315,
"loss/value_avg": 0.15465494990348816,
"lr": 7.8515625e-07,
"objective/entropy": -47.03410339355469,
"objective/kl": 35.7232666015625,
"objective/non_score_reward": -1.786163330078125,
"objective/rlhf_reward": 10.189088821411133,
"objective/scores": 11.975252151489258,
"policy/approxkl_avg": 0.002349921502172947,
"policy/clipfrac_avg": 0.011656483635306358,
"policy/entropy_avg": 1.1775662899017334,
"step": 190,
"val/clipfrac_avg": 0.002678380813449621,
"val/num_eos_tokens": 23956,
"val/ratio": 1.0002427101135254,
"val/ratio_var": 7.052185537759215e-06
},
{
"episode": 97792,
"epoch": 0.8378341329677862,
"eps": 4,
"loss/policy_avg": 0.0036186622455716133,
"loss/value_avg": 0.14314797520637512,
"lr": 7.734375000000001e-07,
"objective/entropy": -48.098785400390625,
"objective/kl": 35.30805969238281,
"objective/non_score_reward": -1.765402913093567,
"objective/rlhf_reward": 10.44815731048584,
"objective/scores": 12.213560104370117,
"policy/approxkl_avg": 0.001903231255710125,
"policy/clipfrac_avg": 0.010514364577829838,
"policy/entropy_avg": 1.1749199628829956,
"step": 191,
"val/clipfrac_avg": 0.002496888395398855,
"val/num_eos_tokens": 23098,
"val/ratio": 1.0006399154663086,
"val/ratio_var": 1.3019835023442283e-05
},
{
"episode": 98304,
"epoch": 0.8422206991089788,
"eps": 4,
"loss/policy_avg": 0.0033899955451488495,
"loss/value_avg": 0.1446259617805481,
"lr": 7.6171875e-07,
"objective/entropy": -48.63313293457031,
"objective/kl": 35.44336700439453,
"objective/non_score_reward": -1.7721683979034424,
"objective/rlhf_reward": 10.299942970275879,
"objective/scores": 12.072111129760742,
"policy/approxkl_avg": 0.0017086728475987911,
"policy/clipfrac_avg": 0.00998393353074789,
"policy/entropy_avg": 1.199033498764038,
"step": 192,
"val/clipfrac_avg": 0.001696545397862792,
"val/num_eos_tokens": 25110,
"val/ratio": 0.9996849298477173,
"val/ratio_var": 3.5158464015694335e-06
},
{
"episode": 98816,
"epoch": 0.8466072652501714,
"eps": 4,
"loss/policy_avg": 0.004922365769743919,
"loss/value_avg": 0.15805533528327942,
"lr": 7.5e-07,
"objective/entropy": -48.80876541137695,
"objective/kl": 34.96177291870117,
"objective/non_score_reward": -1.7480887174606323,
"objective/rlhf_reward": 10.34687614440918,
"objective/scores": 12.094964981079102,
"policy/approxkl_avg": 0.002475408371537924,
"policy/clipfrac_avg": 0.011553257703781128,
"policy/entropy_avg": 1.202178955078125,
"step": 193,
"val/clipfrac_avg": 0.0017627595225349069,
"val/num_eos_tokens": 25458,
"val/ratio": 0.999587893486023,
"val/ratio_var": 2.171113692384097e-06
},
{
"episode": 99328,
"epoch": 0.850993831391364,
"eps": 4,
"loss/policy_avg": 0.007751693949103355,
"loss/value_avg": 0.15220610797405243,
"lr": 7.3828125e-07,
"objective/entropy": -47.91047668457031,
"objective/kl": 35.784759521484375,
"objective/non_score_reward": -1.7892380952835083,
"objective/rlhf_reward": 10.237627029418945,
"objective/scores": 12.026865005493164,
"policy/approxkl_avg": 0.002570272423326969,
"policy/clipfrac_avg": 0.011220266111195087,
"policy/entropy_avg": 1.1578489542007446,
"step": 194,
"val/clipfrac_avg": 0.0022793509997427464,
"val/num_eos_tokens": 23474,
"val/ratio": 1.0003504753112793,
"val/ratio_var": 1.2456192962417845e-05
},
{
"episode": 99840,
"epoch": 0.8553803975325566,
"eps": 4,
"loss/policy_avg": 0.0006990758702158928,
"loss/value_avg": 0.15493687987327576,
"lr": 7.265625e-07,
"objective/entropy": -47.84299850463867,
"objective/kl": 35.89054870605469,
"objective/non_score_reward": -1.794527530670166,
"objective/rlhf_reward": 10.264701843261719,
"objective/scores": 12.059228897094727,
"policy/approxkl_avg": 0.001959962071850896,
"policy/clipfrac_avg": 0.01036808267235756,
"policy/entropy_avg": 1.162892460823059,
"step": 195,
"val/clipfrac_avg": 0.0018003088189288974,
"val/num_eos_tokens": 23847,
"val/ratio": 0.9998455047607422,
"val/ratio_var": 3.4515687730163336e-06
},
{
"episode": 100352,
"epoch": 0.8597669636737492,
"eps": 4,
"loss/policy_avg": 0.004795054439455271,
"loss/value_avg": 0.16058918833732605,
"lr": 7.1484375e-07,
"objective/entropy": -47.96992492675781,
"objective/kl": 35.145347595214844,
"objective/non_score_reward": -1.7572674751281738,
"objective/rlhf_reward": 10.19577407836914,
"objective/scores": 11.953042030334473,
"policy/approxkl_avg": 0.0025707499589771032,
"policy/clipfrac_avg": 0.01175383199006319,
"policy/entropy_avg": 1.185746192932129,
"step": 196,
"val/clipfrac_avg": 0.002270677126944065,
"val/num_eos_tokens": 24550,
"val/ratio": 0.99980628490448,
"val/ratio_var": 4.205757250019815e-06
},
{
"episode": 100864,
"epoch": 0.8641535298149418,
"eps": 4,
"loss/policy_avg": 0.005628856830298901,
"loss/value_avg": 0.14903391897678375,
"lr": 7.03125e-07,
"objective/entropy": -45.81394958496094,
"objective/kl": 35.49262619018555,
"objective/non_score_reward": -1.7746312618255615,
"objective/rlhf_reward": 10.463909149169922,
"objective/scores": 12.238540649414062,
"policy/approxkl_avg": 0.0024222065694630146,
"policy/clipfrac_avg": 0.010860033333301544,
"policy/entropy_avg": 1.1221274137496948,
"step": 197,
"val/clipfrac_avg": 0.0026403269730508327,
"val/num_eos_tokens": 24000,
"val/ratio": 0.9999848008155823,
"val/ratio_var": 3.7348772821133025e-06
},
{
"episode": 101376,
"epoch": 0.8685400959561344,
"eps": 4,
"loss/policy_avg": -0.0002153497189283371,
"loss/value_avg": 0.16071295738220215,
"lr": 6.9140625e-07,
"objective/entropy": -47.071693420410156,
"objective/kl": 35.359169006347656,
"objective/non_score_reward": -1.767958402633667,
"objective/rlhf_reward": 10.270247459411621,
"objective/scores": 12.038206100463867,
"policy/approxkl_avg": 0.002538530621677637,
"policy/clipfrac_avg": 0.010970663279294968,
"policy/entropy_avg": 1.1485331058502197,
"step": 198,
"val/clipfrac_avg": 0.0015574777498841286,
"val/num_eos_tokens": 24137,
"val/ratio": 0.9999282360076904,
"val/ratio_var": 4.009260919701774e-06
},
{
"episode": 101888,
"epoch": 0.8729266620973269,
"eps": 4,
"loss/policy_avg": 0.0026365332305431366,
"loss/value_avg": 0.12809592485427856,
"lr": 6.796875e-07,
"objective/entropy": -46.45412826538086,
"objective/kl": 35.395294189453125,
"objective/non_score_reward": -1.7697646617889404,
"objective/rlhf_reward": 10.501349449157715,
"objective/scores": 12.271114349365234,
"policy/approxkl_avg": 0.0024031256325542927,
"policy/clipfrac_avg": 0.010782474651932716,
"policy/entropy_avg": 1.1210780143737793,
"step": 199,
"val/clipfrac_avg": 0.001396391773596406,
"val/num_eos_tokens": 21839,
"val/ratio": 1.0000112056732178,
"val/ratio_var": 4.58755812360323e-06
},
{
"episode": 102400,
"epoch": 0.8773132282385195,
"eps": 4,
"loss/policy_avg": 0.004148578271269798,
"loss/value_avg": 0.1365332007408142,
"lr": 6.6796875e-07,
"objective/entropy": -48.283546447753906,
"objective/kl": 34.94371032714844,
"objective/non_score_reward": -1.7471855878829956,
"objective/rlhf_reward": 10.321309089660645,
"objective/scores": 12.06849479675293,
"policy/approxkl_avg": 0.002053589327260852,
"policy/clipfrac_avg": 0.010953281074762344,
"policy/entropy_avg": 1.1579573154449463,
"step": 200,
"val/clipfrac_avg": 0.001975214807316661,
"val/num_eos_tokens": 23925,
"val/ratio": 0.9997996091842651,
"val/ratio_var": 4.9919021876121406e-06
},
{
"episode": 102912,
"epoch": 0.8816997943797121,
"eps": 4,
"loss/policy_avg": 0.0071252393536269665,
"loss/value_avg": 0.15587176382541656,
"lr": 6.562500000000001e-07,
"objective/entropy": -48.567020416259766,
"objective/kl": 34.97012710571289,
"objective/non_score_reward": -1.7485063076019287,
"objective/rlhf_reward": 10.221208572387695,
"objective/scores": 11.969715118408203,
"policy/approxkl_avg": 0.00224270299077034,
"policy/clipfrac_avg": 0.010910441167652607,
"policy/entropy_avg": 1.1749496459960938,
"step": 201,
"val/clipfrac_avg": 0.0020675300620496273,
"val/num_eos_tokens": 22747,
"val/ratio": 0.9998396635055542,
"val/ratio_var": 3.2513178211956983e-06
},
{
"episode": 103424,
"epoch": 0.8860863605209047,
"eps": 4,
"loss/policy_avg": 0.002688491716980934,
"loss/value_avg": 0.14362746477127075,
"lr": 6.4453125e-07,
"objective/entropy": -44.80986022949219,
"objective/kl": 36.38422775268555,
"objective/non_score_reward": -1.819211483001709,
"objective/rlhf_reward": 10.282768249511719,
"objective/scores": 12.101980209350586,
"policy/approxkl_avg": 0.0024515336845070124,
"policy/clipfrac_avg": 0.011868854984641075,
"policy/entropy_avg": 1.0818434953689575,
"step": 202,
"val/clipfrac_avg": 0.002219648100435734,
"val/num_eos_tokens": 21456,
"val/ratio": 1.0001237392425537,
"val/ratio_var": 6.081787887524115e-06
},
{
"episode": 103936,
"epoch": 0.8904729266620973,
"eps": 4,
"loss/policy_avg": 0.0033247251994907856,
"loss/value_avg": 0.14334289729595184,
"lr": 6.328125e-07,
"objective/entropy": -46.834434509277344,
"objective/kl": 35.398094177246094,
"objective/non_score_reward": -1.769904613494873,
"objective/rlhf_reward": 10.301237106323242,
"objective/scores": 12.071142196655273,
"policy/approxkl_avg": 0.0021492401137948036,
"policy/clipfrac_avg": 0.012414924800395966,
"policy/entropy_avg": 1.1050291061401367,
"step": 203,
"val/clipfrac_avg": 0.0018738624639809132,
"val/num_eos_tokens": 22219,
"val/ratio": 0.9999430179595947,
"val/ratio_var": 4.555758096103091e-06
},
{
"episode": 104448,
"epoch": 0.8948594928032899,
"eps": 4,
"loss/policy_avg": 0.006169524043798447,
"loss/value_avg": 0.14988917112350464,
"lr": 6.210937500000001e-07,
"objective/entropy": -45.63399124145508,
"objective/kl": 36.69015884399414,
"objective/non_score_reward": -1.834507942199707,
"objective/rlhf_reward": 10.198760032653809,
"objective/scores": 12.033267974853516,
"policy/approxkl_avg": 0.002657091710716486,
"policy/clipfrac_avg": 0.011476716957986355,
"policy/entropy_avg": 1.1221380233764648,
"step": 204,
"val/clipfrac_avg": 0.002059329068288207,
"val/num_eos_tokens": 23233,
"val/ratio": 1.0008026361465454,
"val/ratio_var": 3.8777681766077876e-05
},
{
"episode": 104960,
"epoch": 0.8992460589444825,
"eps": 4,
"loss/policy_avg": 0.0033749323338270187,
"loss/value_avg": 0.13801273703575134,
"lr": 6.09375e-07,
"objective/entropy": -48.137725830078125,
"objective/kl": 35.192413330078125,
"objective/non_score_reward": -1.7596206665039062,
"objective/rlhf_reward": 10.384641647338867,
"objective/scores": 12.144262313842773,
"policy/approxkl_avg": 0.002190415980294347,
"policy/clipfrac_avg": 0.011061472818255424,
"policy/entropy_avg": 1.1410572528839111,
"step": 205,
"val/clipfrac_avg": 0.0016738364938646555,
"val/num_eos_tokens": 23974,
"val/ratio": 1.0002024173736572,
"val/ratio_var": 4.371653631096706e-06
},
{
"episode": 105472,
"epoch": 0.9036326250856751,
"eps": 4,
"loss/policy_avg": 0.00836427416652441,
"loss/value_avg": 0.13613036274909973,
"lr": 5.9765625e-07,
"objective/entropy": -46.69685745239258,
"objective/kl": 35.53401184082031,
"objective/non_score_reward": -1.7767008543014526,
"objective/rlhf_reward": 10.326574325561523,
"objective/scores": 12.103275299072266,
"policy/approxkl_avg": 0.002368978690356016,
"policy/clipfrac_avg": 0.011249538511037827,
"policy/entropy_avg": 1.1251120567321777,
"step": 206,
"val/clipfrac_avg": 0.0025745341554284096,
"val/num_eos_tokens": 22087,
"val/ratio": 0.999896228313446,
"val/ratio_var": 5.420577963377582e-06
},
{
"episode": 105984,
"epoch": 0.9080191912268677,
"eps": 4,
"loss/policy_avg": 0.008257454261183739,
"loss/value_avg": 0.13564562797546387,
"lr": 5.859375000000001e-07,
"objective/entropy": -45.498809814453125,
"objective/kl": 36.362369537353516,
"objective/non_score_reward": -1.8181185722351074,
"objective/rlhf_reward": 10.476442337036133,
"objective/scores": 12.294561386108398,
"policy/approxkl_avg": 0.0021072309464216232,
"policy/clipfrac_avg": 0.010421659797430038,
"policy/entropy_avg": 1.0942033529281616,
"step": 207,
"val/clipfrac_avg": 0.002059993799775839,
"val/num_eos_tokens": 23421,
"val/ratio": 1.0002628564834595,
"val/ratio_var": 7.209391242213314e-06
},
{
"episode": 106496,
"epoch": 0.9124057573680603,
"eps": 4,
"loss/policy_avg": 0.001601784024387598,
"loss/value_avg": 0.16962260007858276,
"lr": 5.7421875e-07,
"objective/entropy": -46.86063766479492,
"objective/kl": 35.87854766845703,
"objective/non_score_reward": -1.7939273118972778,
"objective/rlhf_reward": 10.281155586242676,
"objective/scores": 12.075082778930664,
"policy/approxkl_avg": 0.0026243766769766808,
"policy/clipfrac_avg": 0.011249695904552937,
"policy/entropy_avg": 1.1037336587905884,
"step": 208,
"val/clipfrac_avg": 0.0016680224798619747,
"val/num_eos_tokens": 22760,
"val/ratio": 0.9998372197151184,
"val/ratio_var": 4.619852916221134e-06
},
{
"episode": 107008,
"epoch": 0.9167923235092529,
"eps": 4,
"loss/policy_avg": 0.003604589030146599,
"loss/value_avg": 0.13913950324058533,
"lr": 5.625e-07,
"objective/entropy": -47.09626388549805,
"objective/kl": 36.34856414794922,
"objective/non_score_reward": -1.8174282312393188,
"objective/rlhf_reward": 10.236056327819824,
"objective/scores": 12.053484916687012,
"policy/approxkl_avg": 0.0020685973577201366,
"policy/clipfrac_avg": 0.010848737321794033,
"policy/entropy_avg": 1.1276905536651611,
"step": 209,
"val/clipfrac_avg": 0.0022783444728702307,
"val/num_eos_tokens": 22894,
"val/ratio": 0.9998518228530884,
"val/ratio_var": 2.5297301817772677e-06
},
{
"episode": 107520,
"epoch": 0.9211788896504455,
"eps": 4,
"loss/policy_avg": 0.004152492620050907,
"loss/value_avg": 0.1309879571199417,
"lr": 5.5078125e-07,
"objective/entropy": -46.065147399902344,
"objective/kl": 35.999778747558594,
"objective/non_score_reward": -1.799989104270935,
"objective/rlhf_reward": 10.247265815734863,
"objective/scores": 12.04725456237793,
"policy/approxkl_avg": 0.0024839870166033506,
"policy/clipfrac_avg": 0.01161247305572033,
"policy/entropy_avg": 1.1072354316711426,
"step": 210,
"val/clipfrac_avg": 0.002184495097026229,
"val/num_eos_tokens": 22270,
"val/ratio": 0.9998332858085632,
"val/ratio_var": 4.201321189611917e-06
},
{
"episode": 108032,
"epoch": 0.9255654557916381,
"eps": 4,
"loss/policy_avg": 0.004213474225252867,
"loss/value_avg": 0.1342218518257141,
"lr": 5.390625e-07,
"objective/entropy": -45.06540298461914,
"objective/kl": 36.81282424926758,
"objective/non_score_reward": -1.8406412601470947,
"objective/rlhf_reward": 10.265863418579102,
"objective/scores": 12.106504440307617,
"policy/approxkl_avg": 0.0019367935601621866,
"policy/clipfrac_avg": 0.011182930320501328,
"policy/entropy_avg": 1.0617645978927612,
"step": 211,
"val/clipfrac_avg": 0.0014739616308361292,
"val/num_eos_tokens": 23334,
"val/ratio": 0.9999167323112488,
"val/ratio_var": 3.8635080272797495e-06
},
{
"episode": 108544,
"epoch": 0.9299520219328307,
"eps": 4,
"loss/policy_avg": 0.002486391458660364,
"loss/value_avg": 0.13077521324157715,
"lr": 5.2734375e-07,
"objective/entropy": -47.85795974731445,
"objective/kl": 35.55070877075195,
"objective/non_score_reward": -1.7775355577468872,
"objective/rlhf_reward": 10.375813484191895,
"objective/scores": 12.153348922729492,
"policy/approxkl_avg": 0.002080064732581377,
"policy/clipfrac_avg": 0.009690655395388603,
"policy/entropy_avg": 1.1195100545883179,
"step": 212,
"val/clipfrac_avg": 0.0017993149813264608,
"val/num_eos_tokens": 22692,
"val/ratio": 0.9999110698699951,
"val/ratio_var": 5.023774974688422e-06
},
{
"episode": 109056,
"epoch": 0.9343385880740233,
"eps": 4,
"loss/policy_avg": 0.0036895014345645905,
"loss/value_avg": 0.132409006357193,
"lr": 5.15625e-07,
"objective/entropy": -47.587425231933594,
"objective/kl": 35.982322692871094,
"objective/non_score_reward": -1.7991161346435547,
"objective/rlhf_reward": 10.343294143676758,
"objective/scores": 12.142410278320312,
"policy/approxkl_avg": 0.0029481318779289722,
"policy/clipfrac_avg": 0.010319727472960949,
"policy/entropy_avg": 1.1227898597717285,
"step": 213,
"val/clipfrac_avg": 0.0008735989686101675,
"val/num_eos_tokens": 22491,
"val/ratio": 0.9999507069587708,
"val/ratio_var": 6.943369953660294e-06
},
{
"episode": 109568,
"epoch": 0.9387251542152159,
"eps": 4,
"loss/policy_avg": 0.001358766108751297,
"loss/value_avg": 0.155538409948349,
"lr": 5.0390625e-07,
"objective/entropy": -47.061283111572266,
"objective/kl": 35.63045120239258,
"objective/non_score_reward": -1.781522512435913,
"objective/rlhf_reward": 10.259801864624023,
"objective/scores": 12.041324615478516,
"policy/approxkl_avg": 0.0025432356633245945,
"policy/clipfrac_avg": 0.011627338826656342,
"policy/entropy_avg": 1.122612476348877,
"step": 214,
"val/clipfrac_avg": 0.0017226223135367036,
"val/num_eos_tokens": 23435,
"val/ratio": 0.9998894929885864,
"val/ratio_var": 2.8884762741654413e-06
},
{
"episode": 110080,
"epoch": 0.9431117203564084,
"eps": 4,
"loss/policy_avg": 0.0025219814851880074,
"loss/value_avg": 0.15307217836380005,
"lr": 4.921875e-07,
"objective/entropy": -47.419456481933594,
"objective/kl": 35.47161102294922,
"objective/non_score_reward": -1.773580551147461,
"objective/rlhf_reward": 10.303951263427734,
"objective/scores": 12.077531814575195,
"policy/approxkl_avg": 0.002379181096330285,
"policy/clipfrac_avg": 0.010707897134125233,
"policy/entropy_avg": 1.1141124963760376,
"step": 215,
"val/clipfrac_avg": 0.002012796001508832,
"val/num_eos_tokens": 22478,
"val/ratio": 1.0004652738571167,
"val/ratio_var": 8.836183951643761e-06
},
{
"episode": 110592,
"epoch": 0.9474982864976011,
"eps": 4,
"loss/policy_avg": 0.0005590170621871948,
"loss/value_avg": 0.13267451524734497,
"lr": 4.8046875e-07,
"objective/entropy": -46.87737274169922,
"objective/kl": 36.430267333984375,
"objective/non_score_reward": -1.8215134143829346,
"objective/rlhf_reward": 10.235672950744629,
"objective/scores": 12.057186126708984,
"policy/approxkl_avg": 0.0019323707092553377,
"policy/clipfrac_avg": 0.010391879826784134,
"policy/entropy_avg": 1.084083080291748,
"step": 216,
"val/clipfrac_avg": 0.0023698105942457914,
"val/num_eos_tokens": 21561,
"val/ratio": 1.0000874996185303,
"val/ratio_var": 3.9766900954418816e-06
},
{
"episode": 111104,
"epoch": 0.9518848526387937,
"eps": 4,
"loss/policy_avg": 0.002204008400440216,
"loss/value_avg": 0.12931928038597107,
"lr": 4.6875e-07,
"objective/entropy": -47.961090087890625,
"objective/kl": 34.691619873046875,
"objective/non_score_reward": -1.7345812320709229,
"objective/rlhf_reward": 10.36070442199707,
"objective/scores": 12.095285415649414,
"policy/approxkl_avg": 0.002234948333352804,
"policy/clipfrac_avg": 0.010548978112637997,
"policy/entropy_avg": 1.117429494857788,
"step": 217,
"val/clipfrac_avg": 0.0020855211187154055,
"val/num_eos_tokens": 23147,
"val/ratio": 0.9999337792396545,
"val/ratio_var": 3.004956170116202e-06
},
{
"episode": 111616,
"epoch": 0.9562714187799863,
"eps": 4,
"loss/policy_avg": 0.0032723252661526203,
"loss/value_avg": 0.13847306370735168,
"lr": 4.5703125e-07,
"objective/entropy": -46.945213317871094,
"objective/kl": 36.218875885009766,
"objective/non_score_reward": -1.8109439611434937,
"objective/rlhf_reward": 10.1881103515625,
"objective/scores": 11.999053955078125,
"policy/approxkl_avg": 0.0020719291642308235,
"policy/clipfrac_avg": 0.010710496455430984,
"policy/entropy_avg": 1.1050124168395996,
"step": 218,
"val/clipfrac_avg": 0.0017812025034800172,
"val/num_eos_tokens": 22247,
"val/ratio": 0.9995983839035034,
"val/ratio_var": 3.6509884466795484e-06
},
{
"episode": 112128,
"epoch": 0.9606579849211789,
"eps": 4,
"loss/policy_avg": 0.0027086399495601654,
"loss/value_avg": 0.13496465981006622,
"lr": 4.453125e-07,
"objective/entropy": -46.59452819824219,
"objective/kl": 36.066009521484375,
"objective/non_score_reward": -1.803300380706787,
"objective/rlhf_reward": 10.2237548828125,
"objective/scores": 12.027054786682129,
"policy/approxkl_avg": 0.0025894979480654,
"policy/clipfrac_avg": 0.010560642927885056,
"policy/entropy_avg": 1.0761702060699463,
"step": 219,
"val/clipfrac_avg": 0.0019034635042771697,
"val/num_eos_tokens": 23244,
"val/ratio": 0.9996868968009949,
"val/ratio_var": 2.6758764306578087e-06
},
{
"episode": 112640,
"epoch": 0.9650445510623715,
"eps": 4,
"loss/policy_avg": 0.0027356455102562904,
"loss/value_avg": 0.1426704227924347,
"lr": 4.3359375e-07,
"objective/entropy": -48.531700134277344,
"objective/kl": 36.28923034667969,
"objective/non_score_reward": -1.8144614696502686,
"objective/rlhf_reward": 10.108583450317383,
"objective/scores": 11.92304515838623,
"policy/approxkl_avg": 0.0024003470316529274,
"policy/clipfrac_avg": 0.011623581871390343,
"policy/entropy_avg": 1.1353890895843506,
"step": 220,
"val/clipfrac_avg": 0.0018647483084350824,
"val/num_eos_tokens": 24624,
"val/ratio": 0.9997994899749756,
"val/ratio_var": 4.277234438632149e-06
},
{
"episode": 113152,
"epoch": 0.9694311172035641,
"eps": 4,
"loss/policy_avg": 0.007160656154155731,
"loss/value_avg": 0.1320515125989914,
"lr": 4.21875e-07,
"objective/entropy": -46.6253662109375,
"objective/kl": 35.65117645263672,
"objective/non_score_reward": -1.7825589179992676,
"objective/rlhf_reward": 10.30819320678711,
"objective/scores": 12.090752601623535,
"policy/approxkl_avg": 0.0027557946741580963,
"policy/clipfrac_avg": 0.011182291433215141,
"policy/entropy_avg": 1.102109432220459,
"step": 221,
"val/clipfrac_avg": 0.0012917916756123304,
"val/num_eos_tokens": 24368,
"val/ratio": 0.9995945692062378,
"val/ratio_var": 2.0290351585572353e-06
},
{
"episode": 113664,
"epoch": 0.9738176833447567,
"eps": 4,
"loss/policy_avg": 0.0003069937229156494,
"loss/value_avg": 0.15999022126197815,
"lr": 4.1015625e-07,
"objective/entropy": -48.70159149169922,
"objective/kl": 35.106689453125,
"objective/non_score_reward": -1.7553346157073975,
"objective/rlhf_reward": 10.207782745361328,
"objective/scores": 11.963117599487305,
"policy/approxkl_avg": 0.0020455708727240562,
"policy/clipfrac_avg": 0.010850891470909119,
"policy/entropy_avg": 1.1390907764434814,
"step": 222,
"val/clipfrac_avg": 0.0014877349603921175,
"val/num_eos_tokens": 24290,
"val/ratio": 0.9998856782913208,
"val/ratio_var": 3.1563197353534633e-06
},
{
"episode": 114176,
"epoch": 0.9782042494859493,
"eps": 4,
"loss/policy_avg": 0.002647135406732559,
"loss/value_avg": 0.15004819631576538,
"lr": 3.984375e-07,
"objective/entropy": -48.554466247558594,
"objective/kl": 35.3194580078125,
"objective/non_score_reward": -1.7659728527069092,
"objective/rlhf_reward": 10.268902778625488,
"objective/scores": 12.034875869750977,
"policy/approxkl_avg": 0.0024222272913903,
"policy/clipfrac_avg": 0.012112165801227093,
"policy/entropy_avg": 1.1444189548492432,
"step": 223,
"val/clipfrac_avg": 0.0013970729196444154,
"val/num_eos_tokens": 24690,
"val/ratio": 1.0001357793807983,
"val/ratio_var": 5.459288331621792e-06
},
{
"episode": 114688,
"epoch": 0.9825908156271419,
"eps": 4,
"loss/policy_avg": 0.005581329111009836,
"loss/value_avg": 0.13726581633090973,
"lr": 3.8671875000000003e-07,
"objective/entropy": -47.08404541015625,
"objective/kl": 35.355735778808594,
"objective/non_score_reward": -1.7677868604660034,
"objective/rlhf_reward": 10.195508003234863,
"objective/scores": 11.963294982910156,
"policy/approxkl_avg": 0.00215457403101027,
"policy/clipfrac_avg": 0.01111712958663702,
"policy/entropy_avg": 1.0979572534561157,
"step": 224,
"val/clipfrac_avg": 0.0013495224993675947,
"val/num_eos_tokens": 24061,
"val/ratio": 0.9996528625488281,
"val/ratio_var": 4.114271177968476e-06
},
{
"episode": 115200,
"epoch": 0.9869773817683345,
"eps": 4,
"loss/policy_avg": 0.007961828261613846,
"loss/value_avg": 0.14187262952327728,
"lr": 3.75e-07,
"objective/entropy": -48.313079833984375,
"objective/kl": 35.43455505371094,
"objective/non_score_reward": -1.7717278003692627,
"objective/rlhf_reward": 10.25037670135498,
"objective/scores": 12.022104263305664,
"policy/approxkl_avg": 0.002115798881277442,
"policy/clipfrac_avg": 0.010869062505662441,
"policy/entropy_avg": 1.1199058294296265,
"step": 225,
"val/clipfrac_avg": 0.0015077003045007586,
"val/num_eos_tokens": 24011,
"val/ratio": 1.0002377033233643,
"val/ratio_var": 8.308678843604866e-06
},
{
"episode": 115712,
"epoch": 0.9913639479095271,
"eps": 4,
"loss/policy_avg": 0.005859264172613621,
"loss/value_avg": 0.14565059542655945,
"lr": 3.6328125e-07,
"objective/entropy": -45.33184051513672,
"objective/kl": 36.10816955566406,
"objective/non_score_reward": -1.8054085969924927,
"objective/rlhf_reward": 10.235603332519531,
"objective/scores": 12.041011810302734,
"policy/approxkl_avg": 0.00207823165692389,
"policy/clipfrac_avg": 0.01044812798500061,
"policy/entropy_avg": 1.0732884407043457,
"step": 226,
"val/clipfrac_avg": 0.002065906533971429,
"val/num_eos_tokens": 22806,
"val/ratio": 1.0006978511810303,
"val/ratio_var": 5.532293926080456e-06
},
{
"episode": 116224,
"epoch": 0.9957505140507197,
"eps": 4,
"loss/policy_avg": 0.003005429171025753,
"loss/value_avg": 0.14912931621074677,
"lr": 3.515625e-07,
"objective/entropy": -45.97642135620117,
"objective/kl": 35.881935119628906,
"objective/non_score_reward": -1.7940969467163086,
"objective/rlhf_reward": 10.282238006591797,
"objective/scores": 12.076334953308105,
"policy/approxkl_avg": 0.002462461357936263,
"policy/clipfrac_avg": 0.010989394038915634,
"policy/entropy_avg": 1.076866626739502,
"step": 227,
"val/clipfrac_avg": 0.0014049881137907505,
"val/num_eos_tokens": 22413,
"val/ratio": 0.9997053146362305,
"val/ratio_var": 6.198345545271877e-06
},
{
"episode": 116736,
"epoch": 1.0001370801919123,
"eps": 4,
"loss/policy_avg": 0.006791248917579651,
"loss/value_avg": 0.13567443192005157,
"lr": 3.3984375e-07,
"objective/entropy": -47.08728790283203,
"objective/kl": 35.24159622192383,
"objective/non_score_reward": -1.7620798349380493,
"objective/rlhf_reward": 10.378169059753418,
"objective/scores": 12.140249252319336,
"policy/approxkl_avg": 0.0016869257669895887,
"policy/clipfrac_avg": 0.009463367983698845,
"policy/entropy_avg": 1.0775138139724731,
"step": 228,
"val/clipfrac_avg": 0.0023141992278397083,
"val/num_eos_tokens": 23909,
"val/ratio": 0.9998793601989746,
"val/ratio_var": 4.120348421565723e-06
},
{
"episode": 117248,
"epoch": 1.0045236463331049,
"eps": 4,
"loss/policy_avg": 0.0007241852581501007,
"loss/value_avg": 0.14128395915031433,
"lr": 3.2812500000000003e-07,
"objective/entropy": -46.4316291809082,
"objective/kl": 35.44841384887695,
"objective/non_score_reward": -1.7724206447601318,
"objective/rlhf_reward": 10.439834594726562,
"objective/scores": 12.212255477905273,
"policy/approxkl_avg": 0.0024355538189411163,
"policy/clipfrac_avg": 0.010117866098880768,
"policy/entropy_avg": 1.0649350881576538,
"step": 229,
"val/clipfrac_avg": 0.0018390328623354435,
"val/num_eos_tokens": 23709,
"val/ratio": 1.0000131130218506,
"val/ratio_var": 4.8111301111930516e-06
},
{
"episode": 117760,
"epoch": 1.0089102124742975,
"eps": 4,
"loss/policy_avg": 0.006022402085363865,
"loss/value_avg": 0.12594205141067505,
"lr": 3.1640625e-07,
"objective/entropy": -47.71858215332031,
"objective/kl": 34.64881896972656,
"objective/non_score_reward": -1.7324409484863281,
"objective/rlhf_reward": 10.422674179077148,
"objective/scores": 12.155115127563477,
"policy/approxkl_avg": 0.002340013859793544,
"policy/clipfrac_avg": 0.009795863181352615,
"policy/entropy_avg": 1.0929601192474365,
"step": 230,
"val/clipfrac_avg": 0.0014780564233660698,
"val/num_eos_tokens": 22730,
"val/ratio": 0.9997072815895081,
"val/ratio_var": 4.112517217436107e-06
},
{
"episode": 118272,
"epoch": 1.01329677861549,
"eps": 4,
"loss/policy_avg": 0.0035825008526444435,
"loss/value_avg": 0.1515202820301056,
"lr": 3.046875e-07,
"objective/entropy": -47.19580078125,
"objective/kl": 34.86238098144531,
"objective/non_score_reward": -1.7431188821792603,
"objective/rlhf_reward": 10.363493919372559,
"objective/scores": 12.106613159179688,
"policy/approxkl_avg": 0.0021840957924723625,
"policy/clipfrac_avg": 0.010647189803421497,
"policy/entropy_avg": 1.0970317125320435,
"step": 231,
"val/clipfrac_avg": 0.0015769560122862458,
"val/num_eos_tokens": 23717,
"val/ratio": 1.0001730918884277,
"val/ratio_var": 6.174034751893487e-06
},
{
"episode": 118784,
"epoch": 1.0176833447566827,
"eps": 4,
"loss/policy_avg": 0.001027967780828476,
"loss/value_avg": 0.12807387113571167,
"lr": 2.9296875000000003e-07,
"objective/entropy": -46.97780227661133,
"objective/kl": 36.12638854980469,
"objective/non_score_reward": -1.8063193559646606,
"objective/rlhf_reward": 10.298711776733398,
"objective/scores": 12.10503101348877,
"policy/approxkl_avg": 0.0020326317753642797,
"policy/clipfrac_avg": 0.010506751015782356,
"policy/entropy_avg": 1.0705652236938477,
"step": 232,
"val/clipfrac_avg": 0.002296661026775837,
"val/num_eos_tokens": 23196,
"val/ratio": 0.9996442794799805,
"val/ratio_var": 2.570702235971112e-06
},
{
"episode": 119296,
"epoch": 1.0220699108978752,
"eps": 4,
"loss/policy_avg": 0.006970872171223164,
"loss/value_avg": 0.12893542647361755,
"lr": 2.8125e-07,
"objective/entropy": -47.03704071044922,
"objective/kl": 35.707496643066406,
"objective/non_score_reward": -1.7853751182556152,
"objective/rlhf_reward": 10.371620178222656,
"objective/scores": 12.15699577331543,
"policy/approxkl_avg": 0.0026198499836027622,
"policy/clipfrac_avg": 0.01155446469783783,
"policy/entropy_avg": 1.0714943408966064,
"step": 233,
"val/clipfrac_avg": 0.0014992888318374753,
"val/num_eos_tokens": 21060,
"val/ratio": 0.9997165203094482,
"val/ratio_var": 3.354538421262987e-06
},
{
"episode": 119808,
"epoch": 1.0264564770390678,
"eps": 4,
"loss/policy_avg": 0.0018435269594192505,
"loss/value_avg": 0.1402617245912552,
"lr": 2.6953125e-07,
"objective/entropy": -47.30288314819336,
"objective/kl": 35.687347412109375,
"objective/non_score_reward": -1.784367322921753,
"objective/rlhf_reward": 10.228802680969238,
"objective/scores": 12.01317024230957,
"policy/approxkl_avg": 0.0018239500932395458,
"policy/clipfrac_avg": 0.010121040977537632,
"policy/entropy_avg": 1.0836563110351562,
"step": 234,
"val/clipfrac_avg": 0.0013189124874770641,
"val/num_eos_tokens": 22280,
"val/ratio": 0.9998680353164673,
"val/ratio_var": 2.612711568872328e-06
},
{
"episode": 120320,
"epoch": 1.0308430431802604,
"eps": 4,
"loss/policy_avg": 0.0069306232035160065,
"loss/value_avg": 0.1296517699956894,
"lr": 2.578125e-07,
"objective/entropy": -46.29168701171875,
"objective/kl": 35.88372802734375,
"objective/non_score_reward": -1.7941863536834717,
"objective/rlhf_reward": 10.230424880981445,
"objective/scores": 12.024611473083496,
"policy/approxkl_avg": 0.0034364780876785517,
"policy/clipfrac_avg": 0.011023009195923805,
"policy/entropy_avg": 1.062299132347107,
"step": 235,
"val/clipfrac_avg": 0.0019735856913030148,
"val/num_eos_tokens": 22020,
"val/ratio": 0.9997754096984863,
"val/ratio_var": 5.2869540922984015e-06
},
{
"episode": 120832,
"epoch": 1.035229609321453,
"eps": 4,
"loss/policy_avg": 0.004135345108807087,
"loss/value_avg": 0.14806649088859558,
"lr": 2.4609375e-07,
"objective/entropy": -46.469215393066406,
"objective/kl": 36.00146484375,
"objective/non_score_reward": -1.8000733852386475,
"objective/rlhf_reward": 10.298752784729004,
"objective/scores": 12.09882640838623,
"policy/approxkl_avg": 0.0021121413446962833,
"policy/clipfrac_avg": 0.011154447682201862,
"policy/entropy_avg": 1.0819756984710693,
"step": 236,
"val/clipfrac_avg": 0.0024919421412050724,
"val/num_eos_tokens": 22175,
"val/ratio": 1.000499963760376,
"val/ratio_var": 1.280141714232741e-05
},
{
"episode": 121344,
"epoch": 1.0396161754626456,
"eps": 4,
"loss/policy_avg": 0.0052656615152955055,
"loss/value_avg": 0.1300714910030365,
"lr": 2.34375e-07,
"objective/entropy": -44.959388732910156,
"objective/kl": 35.47280502319336,
"objective/non_score_reward": -1.7736401557922363,
"objective/rlhf_reward": 10.349384307861328,
"objective/scores": 12.123023986816406,
"policy/approxkl_avg": 0.0020022448152303696,
"policy/clipfrac_avg": 0.01132948324084282,
"policy/entropy_avg": 1.073385238647461,
"step": 237,
"val/clipfrac_avg": 0.0013668034225702286,
"val/num_eos_tokens": 24303,
"val/ratio": 1.0004512071609497,
"val/ratio_var": 6.641071649937658e-06
},
{
"episode": 121856,
"epoch": 1.0440027416038382,
"eps": 4,
"loss/policy_avg": 0.0022417306900024414,
"loss/value_avg": 0.1331208050251007,
"lr": 2.2265625e-07,
"objective/entropy": -46.06434631347656,
"objective/kl": 35.35786437988281,
"objective/non_score_reward": -1.7678931951522827,
"objective/rlhf_reward": 10.399948120117188,
"objective/scores": 12.167840957641602,
"policy/approxkl_avg": 0.0024931542575359344,
"policy/clipfrac_avg": 0.011008227244019508,
"policy/entropy_avg": 1.0718121528625488,
"step": 238,
"val/clipfrac_avg": 0.0017535353545099497,
"val/num_eos_tokens": 24789,
"val/ratio": 0.9997462034225464,
"val/ratio_var": 4.035917754663387e-06
},
{
"episode": 122368,
"epoch": 1.0483893077450308,
"eps": 4,
"loss/policy_avg": 0.006296452134847641,
"loss/value_avg": 0.13648873567581177,
"lr": 2.109375e-07,
"objective/entropy": -47.731388092041016,
"objective/kl": 35.815277099609375,
"objective/non_score_reward": -1.7907638549804688,
"objective/rlhf_reward": 10.327536582946777,
"objective/scores": 12.118300437927246,
"policy/approxkl_avg": 0.0019831331446766853,
"policy/clipfrac_avg": 0.0104678338393569,
"policy/entropy_avg": 1.0991512537002563,
"step": 239,
"val/clipfrac_avg": 0.0010415198048576713,
"val/num_eos_tokens": 22777,
"val/ratio": 1.0002403259277344,
"val/ratio_var": 4.645487933885306e-06
},
{
"episode": 122880,
"epoch": 1.0527758738862234,
"eps": 4,
"loss/policy_avg": 0.005139458924531937,
"loss/value_avg": 0.1517227441072464,
"lr": 1.9921875e-07,
"objective/entropy": -46.621482849121094,
"objective/kl": 34.758949279785156,
"objective/non_score_reward": -1.7379475831985474,
"objective/rlhf_reward": 10.491247177124023,
"objective/scores": 12.229194641113281,
"policy/approxkl_avg": 0.0020756043959409,
"policy/clipfrac_avg": 0.01085658185184002,
"policy/entropy_avg": 1.0900070667266846,
"step": 240,
"val/clipfrac_avg": 0.0016429282259196043,
"val/num_eos_tokens": 24132,
"val/ratio": 1.0001587867736816,
"val/ratio_var": 4.0278300730278715e-06
},
{
"episode": 123392,
"epoch": 1.057162440027416,
"eps": 4,
"loss/policy_avg": 0.0028034071438014507,
"loss/value_avg": 0.16330860555171967,
"lr": 1.875e-07,
"objective/entropy": -46.73743438720703,
"objective/kl": 35.08587646484375,
"objective/non_score_reward": -1.7542940378189087,
"objective/rlhf_reward": 10.32256031036377,
"objective/scores": 12.076854705810547,
"policy/approxkl_avg": 0.0024638704489916563,
"policy/clipfrac_avg": 0.010213883593678474,
"policy/entropy_avg": 1.0772242546081543,
"step": 241,
"val/clipfrac_avg": 0.0014605314936488867,
"val/num_eos_tokens": 23469,
"val/ratio": 0.9998643398284912,
"val/ratio_var": 3.752075599550153e-06
},
{
"episode": 123904,
"epoch": 1.0615490061686086,
"eps": 4,
"loss/policy_avg": 0.004879022017121315,
"loss/value_avg": 0.14888252317905426,
"lr": 1.7578125e-07,
"objective/entropy": -47.842430114746094,
"objective/kl": 35.32581329345703,
"objective/non_score_reward": -1.7662907838821411,
"objective/rlhf_reward": 10.462313652038574,
"objective/scores": 12.228604316711426,
"policy/approxkl_avg": 0.0017638729186728597,
"policy/clipfrac_avg": 0.011290816590189934,
"policy/entropy_avg": 1.0770862102508545,
"step": 242,
"val/clipfrac_avg": 0.0015572316478937864,
"val/num_eos_tokens": 22615,
"val/ratio": 1.0004723072052002,
"val/ratio_var": 6.782573564123595e-06
},
{
"episode": 124416,
"epoch": 1.0659355723098012,
"eps": 4,
"loss/policy_avg": 0.0035593826323747635,
"loss/value_avg": 0.1295519769191742,
"lr": 1.6406250000000002e-07,
"objective/entropy": -45.786006927490234,
"objective/kl": 35.52326202392578,
"objective/non_score_reward": -1.7761629819869995,
"objective/rlhf_reward": 10.437166213989258,
"objective/scores": 12.213329315185547,
"policy/approxkl_avg": 0.0021176172886043787,
"policy/clipfrac_avg": 0.010787077248096466,
"policy/entropy_avg": 1.096700668334961,
"step": 243,
"val/clipfrac_avg": 0.0014366672839969397,
"val/num_eos_tokens": 22355,
"val/ratio": 1.0001835823059082,
"val/ratio_var": 6.8035760705242865e-06
},
{
"episode": 124928,
"epoch": 1.0703221384509938,
"eps": 4,
"loss/policy_avg": 0.001890458632260561,
"loss/value_avg": 0.13558140397071838,
"lr": 1.5234375e-07,
"objective/entropy": -46.62615203857422,
"objective/kl": 35.85779571533203,
"objective/non_score_reward": -1.7928898334503174,
"objective/rlhf_reward": 10.345151901245117,
"objective/scores": 12.138041496276855,
"policy/approxkl_avg": 0.002403097692877054,
"policy/clipfrac_avg": 0.011035319417715073,
"policy/entropy_avg": 1.0733857154846191,
"step": 244,
"val/clipfrac_avg": 0.001738998107612133,
"val/num_eos_tokens": 21746,
"val/ratio": 1.0001354217529297,
"val/ratio_var": 2.896329078794224e-06
},
{
"episode": 125440,
"epoch": 1.0747087045921864,
"eps": 4,
"loss/policy_avg": 0.002462470903992653,
"loss/value_avg": 0.13766974210739136,
"lr": 1.40625e-07,
"objective/entropy": -45.997737884521484,
"objective/kl": 35.553016662597656,
"objective/non_score_reward": -1.7776508331298828,
"objective/rlhf_reward": 10.40100383758545,
"objective/scores": 12.178654670715332,
"policy/approxkl_avg": 0.003136041574180126,
"policy/clipfrac_avg": 0.010911045596003532,
"policy/entropy_avg": 1.0624918937683105,
"step": 245,
"val/clipfrac_avg": 0.0014719897881150246,
"val/num_eos_tokens": 22770,
"val/ratio": 1.0002856254577637,
"val/ratio_var": 9.461476111027878e-06
},
{
"episode": 125952,
"epoch": 1.079095270733379,
"eps": 4,
"loss/policy_avg": 0.00028504710644483566,
"loss/value_avg": 0.14360447227954865,
"lr": 1.2890625e-07,
"objective/entropy": -46.9267463684082,
"objective/kl": 35.44313049316406,
"objective/non_score_reward": -1.7721564769744873,
"objective/rlhf_reward": 10.315094947814941,
"objective/scores": 12.087251663208008,
"policy/approxkl_avg": 0.002328254049643874,
"policy/clipfrac_avg": 0.009981741197407246,
"policy/entropy_avg": 1.0975849628448486,
"step": 246,
"val/clipfrac_avg": 0.0021861500572413206,
"val/num_eos_tokens": 24914,
"val/ratio": 1.0002243518829346,
"val/ratio_var": 5.957234407105716e-06
},
{
"episode": 126464,
"epoch": 1.0834818368745716,
"eps": 4,
"loss/policy_avg": 0.0003530774265527725,
"loss/value_avg": 0.13446325063705444,
"lr": 1.171875e-07,
"objective/entropy": -46.55187225341797,
"objective/kl": 35.19268798828125,
"objective/non_score_reward": -1.7596344947814941,
"objective/rlhf_reward": 10.49343490600586,
"objective/scores": 12.253069877624512,
"policy/approxkl_avg": 0.00239111902192235,
"policy/clipfrac_avg": 0.010795432142913342,
"policy/entropy_avg": 1.0839779376983643,
"step": 247,
"val/clipfrac_avg": 0.0015913444804027677,
"val/num_eos_tokens": 23642,
"val/ratio": 1.000253438949585,
"val/ratio_var": 5.258910732663935e-06
},
{
"episode": 126976,
"epoch": 1.0878684030157642,
"eps": 4,
"loss/policy_avg": -0.0018387939780950546,
"loss/value_avg": 0.15468762814998627,
"lr": 1.0546875e-07,
"objective/entropy": -47.14482498168945,
"objective/kl": 35.40525436401367,
"objective/non_score_reward": -1.7702628374099731,
"objective/rlhf_reward": 10.223344802856445,
"objective/scores": 11.993607521057129,
"policy/approxkl_avg": 0.0021429010666906834,
"policy/clipfrac_avg": 0.010971945710480213,
"policy/entropy_avg": 1.1065943241119385,
"step": 248,
"val/clipfrac_avg": 0.0013208456803113222,
"val/num_eos_tokens": 22990,
"val/ratio": 0.9996612071990967,
"val/ratio_var": 4.5201718421594705e-06
},
{
"episode": 127488,
"epoch": 1.0922549691569567,
"eps": 4,
"loss/policy_avg": 0.001225670799612999,
"loss/value_avg": 0.1466779112815857,
"lr": 9.375e-08,
"objective/entropy": -45.93384552001953,
"objective/kl": 35.7270622253418,
"objective/non_score_reward": -1.7863531112670898,
"objective/rlhf_reward": 10.394562721252441,
"objective/scores": 12.180915832519531,
"policy/approxkl_avg": 0.002070215530693531,
"policy/clipfrac_avg": 0.010630708187818527,
"policy/entropy_avg": 1.0799144506454468,
"step": 249,
"val/clipfrac_avg": 0.0013516065664589405,
"val/num_eos_tokens": 23734,
"val/ratio": 1.0008833408355713,
"val/ratio_var": 2.696674710023217e-05
},
{
"episode": 128000,
"epoch": 1.0966415352981493,
"eps": 4,
"loss/policy_avg": 0.006959846243262291,
"loss/value_avg": 0.134637713432312,
"lr": 8.203125000000001e-08,
"objective/entropy": -45.95152282714844,
"objective/kl": 35.22590637207031,
"objective/non_score_reward": -1.7612950801849365,
"objective/rlhf_reward": 10.304990768432617,
"objective/scores": 12.066286087036133,
"policy/approxkl_avg": 0.0024010255001485348,
"policy/clipfrac_avg": 0.011238099075853825,
"policy/entropy_avg": 1.0736041069030762,
"step": 250,
"val/clipfrac_avg": 0.0017618590500205755,
"val/num_eos_tokens": 22063,
"val/ratio": 0.9997179508209229,
"val/ratio_var": 2.0375098301883554e-06
},
{
"episode": 128512,
"epoch": 1.101028101439342,
"eps": 4,
"loss/policy_avg": 0.0056634098291397095,
"loss/value_avg": 0.14280955493450165,
"lr": 7.03125e-08,
"objective/entropy": -47.430274963378906,
"objective/kl": 35.55634307861328,
"objective/non_score_reward": -1.7778171300888062,
"objective/rlhf_reward": 10.224863052368164,
"objective/scores": 12.002679824829102,
"policy/approxkl_avg": 0.0028075524605810642,
"policy/clipfrac_avg": 0.010713065043091774,
"policy/entropy_avg": 1.1045918464660645,
"step": 251,
"val/clipfrac_avg": 0.0016868215752765536,
"val/num_eos_tokens": 23025,
"val/ratio": 0.999710202217102,
"val/ratio_var": 7.573556104034651e-06
},
{
"episode": 129024,
"epoch": 1.1054146675805345,
"eps": 4,
"loss/policy_avg": 0.0015937616117298603,
"loss/value_avg": 0.1460372358560562,
"lr": 5.859375e-08,
"objective/entropy": -47.605384826660156,
"objective/kl": 35.3289794921875,
"objective/non_score_reward": -1.766448974609375,
"objective/rlhf_reward": 10.445650100708008,
"objective/scores": 12.212099075317383,
"policy/approxkl_avg": 0.0021890033967792988,
"policy/clipfrac_avg": 0.009831791743636131,
"policy/entropy_avg": 1.072847604751587,
"step": 252,
"val/clipfrac_avg": 0.0019376241834834218,
"val/num_eos_tokens": 20707,
"val/ratio": 1.0002546310424805,
"val/ratio_var": 5.4702413763152435e-06
},
{
"episode": 129536,
"epoch": 1.1098012337217271,
"eps": 4,
"loss/policy_avg": 0.0011267205700278282,
"loss/value_avg": 0.13995370268821716,
"lr": 4.6875e-08,
"objective/entropy": -47.466609954833984,
"objective/kl": 35.21324920654297,
"objective/non_score_reward": -1.76066255569458,
"objective/rlhf_reward": 10.34493637084961,
"objective/scores": 12.105599403381348,
"policy/approxkl_avg": 0.002287943847477436,
"policy/clipfrac_avg": 0.011924582533538342,
"policy/entropy_avg": 1.0961564779281616,
"step": 253,
"val/clipfrac_avg": 0.0030874176882207394,
"val/num_eos_tokens": 23391,
"val/ratio": 0.9999744296073914,
"val/ratio_var": 4.732663910544943e-06
},
{
"episode": 130048,
"epoch": 1.1141877998629197,
"eps": 4,
"loss/policy_avg": 0.00187746062874794,
"loss/value_avg": 0.16102451086044312,
"lr": 3.515625e-08,
"objective/entropy": -46.87982940673828,
"objective/kl": 35.481868743896484,
"objective/non_score_reward": -1.774093508720398,
"objective/rlhf_reward": 10.216583251953125,
"objective/scores": 11.990676879882812,
"policy/approxkl_avg": 0.002129252767190337,
"policy/clipfrac_avg": 0.011353913694620132,
"policy/entropy_avg": 1.085520625114441,
"step": 254,
"val/clipfrac_avg": 0.002106403699144721,
"val/num_eos_tokens": 24079,
"val/ratio": 1.0003963708877563,
"val/ratio_var": 6.27804001851473e-06
},
{
"episode": 130560,
"epoch": 1.1185743660041123,
"eps": 4,
"loss/policy_avg": 0.0023504868149757385,
"loss/value_avg": 0.1395518034696579,
"lr": 2.34375e-08,
"objective/entropy": -47.189300537109375,
"objective/kl": 35.595664978027344,
"objective/non_score_reward": -1.7797832489013672,
"objective/rlhf_reward": 10.323637008666992,
"objective/scores": 12.10342025756836,
"policy/approxkl_avg": 0.0020733638666570187,
"policy/clipfrac_avg": 0.01057223230600357,
"policy/entropy_avg": 1.1228654384613037,
"step": 255,
"val/clipfrac_avg": 0.0016230044420808554,
"val/num_eos_tokens": 24372,
"val/ratio": 0.9999302625656128,
"val/ratio_var": 5.258788405626547e-06
},
{
"episode": 131072,
"epoch": 1.122960932145305,
"eps": 4,
"loss/policy_avg": 0.0048648901283741,
"loss/value_avg": 0.15009374916553497,
"lr": 1.171875e-08,
"objective/entropy": -45.09252166748047,
"objective/kl": 35.825538635253906,
"objective/non_score_reward": -1.7912769317626953,
"objective/rlhf_reward": 10.314563751220703,
"objective/scores": 12.105840682983398,
"policy/approxkl_avg": 0.0024984171614050865,
"policy/clipfrac_avg": 0.010642223991453648,
"policy/entropy_avg": 1.051030158996582,
"step": 256,
"val/clipfrac_avg": 0.0016988374991342425,
"val/num_eos_tokens": 22804,
"val/ratio": 1.0000357627868652,
"val/ratio_var": 4.6805571400909685e-06
}
],
"logging_steps": 100,
"max_steps": 256,
"num_input_tokens_seen": 0,
"num_train_epochs": 1.122960932145305,
"save_steps": 52,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}