{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8211143695014663,
  "eval_steps": 500,
  "global_step": 3500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.2,
      "completions/mean_length": 254.4375,
      "completions/mean_terminated_length": 1.2,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 1.2,
      "epoch": 0.0011730205278592375,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.765625,
      "kl": 0.0012255147332325578,
      "learning_rate": 9.047771428571428e-07,
      "loss": -0.0012,
      "num_tokens": 54676.0,
      "reward": 7.508432197570801,
      "reward_std": 7.795059728622436,
      "rewards/wrapper/mean": 3.7542161136865615,
      "rewards/wrapper/std": 11.808628790080547,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.8,
      "completions/mean_length": 254.4875,
      "completions/mean_terminated_length": 2.8,
      "completions/min_length": 207.6,
      "completions/min_terminated_length": 2.8,
      "epoch": 0.002346041055718475,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.1875,
      "kl": 0.002615465858252719,
      "learning_rate": 2.035748571428571e-06,
      "loss": -0.0035,
      "num_tokens": 113354.0,
      "reward": 6.667659282684326,
      "reward_std": 8.151447796821595,
      "rewards/wrapper/mean": 3.333829724043608,
      "rewards/wrapper/std": 12.321574296057225,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.0,
      "completions/mean_length": 250.7875,
      "completions/mean_terminated_length": 9.5,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.0035190615835777126,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.9375,
      "kl": 0.005194541340461001,
      "learning_rate": 3.1667200000000002e-06,
      "loss": -0.0003,
      "num_tokens": 169594.0,
      "reward": 11.774119424819947,
      "reward_std": 13.959662055969238,
      "rewards/wrapper/mean": 5.887059649825096,
      "rewards/wrapper/std": 18.77371552735567,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 253.2875,
      "completions/mean_terminated_length": 15.6,
      "completions/min_length": 169.2,
      "completions/min_terminated_length": 15.6,
      "epoch": 0.00469208211143695,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.71875,
      "kl": 0.007731236255494878,
      "learning_rate": 4.297691428571428e-06,
      "loss": -0.0052,
      "num_tokens": 222882.0,
      "reward": 12.04462718963623,
      "reward_std": 15.45330753326416,
      "rewards/wrapper/mean": 6.022313681989909,
      "rewards/wrapper/std": 18.897284054756163,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 255.3375,
      "completions/mean_terminated_length": 30.0,
      "completions/min_length": 234.8,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.005865102639296188,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.484375,
      "kl": 0.005328995548188687,
      "learning_rate": 5.428662857142858e-06,
      "loss": 0.0021,
      "num_tokens": 278444.0,
      "reward": 13.085074043273925,
      "reward_std": 16.36840648651123,
      "rewards/wrapper/mean": 6.542536787688732,
      "rewards/wrapper/std": 17.1350009560585,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 52.0,
      "completions/mean_length": 251.225,
      "completions/mean_terminated_length": 52.0,
      "completions/min_length": 103.2,
      "completions/min_terminated_length": 52.0,
      "epoch": 0.007038123167155425,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.421875,
      "kl": 0.0037205405707936732,
      "learning_rate": 6.559634285714286e-06,
      "loss": -0.0098,
      "num_tokens": 332848.0,
      "reward": 9.3982759475708,
      "reward_std": 10.730877304077149,
      "rewards/wrapper/mean": 4.6991379834711555,
      "rewards/wrapper/std": 15.566500315070153,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 58.6,
      "completions/mean_length": 254.63125,
      "completions/mean_terminated_length": 58.6,
      "completions/min_length": 212.2,
      "completions/min_terminated_length": 58.6,
      "epoch": 0.008211143695014663,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 2.578125,
      "kl": 0.010630438197404146,
      "learning_rate": 7.690605714285714e-06,
      "loss": 0.0029,
      "num_tokens": 388563.0,
      "reward": 10.763528490066529,
      "reward_std": 7.830057907104492,
      "rewards/wrapper/mean": 5.381764186918735,
      "rewards/wrapper/std": 17.07995459139347,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.8,
      "completions/mean_length": 250.03125,
      "completions/mean_terminated_length": 25.53333435058594,
      "completions/min_length": 108.4,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.0093841642228739,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.1640625,
      "kl": 0.009318914514733479,
      "learning_rate": 7.916796747198757e-06,
      "loss": -0.012,
      "num_tokens": 443458.0,
      "reward": 7.394418716430664,
      "reward_std": 9.918637371063232,
      "rewards/wrapper/mean": 3.697209335118532,
      "rewards/wrapper/std": 11.686485758423805,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.8,
      "completions/mean_length": 253.04375,
      "completions/mean_terminated_length": 7.8,
      "completions/min_length": 161.4,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.010557184750733138,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 7.25,
      "kl": 0.009933865355560557,
      "learning_rate": 7.916783532705924e-06,
      "loss": -0.0065,
      "num_tokens": 500731.0,
      "reward": 6.865958595275879,
      "reward_std": 9.22301788330078,
      "rewards/wrapper/mean": 3.4329791098833082,
      "rewards/wrapper/std": 11.259305146336555,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 59.2,
      "completions/mean_length": 253.05,
      "completions/mean_terminated_length": 59.2,
      "completions/min_length": 161.6,
      "completions/min_terminated_length": 59.2,
      "epoch": 0.011730205278592375,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.625,
      "kl": 0.005990624788682908,
      "learning_rate": 7.916760153266633e-06,
      "loss": -0.0025,
      "num_tokens": 551543.0,
      "reward": 6.809884262084961,
      "reward_std": 8.566570162773132,
      "rewards/wrapper/mean": 3.4049422472715376,
      "rewards/wrapper/std": 8.249704784154892,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 93.2,
      "completions/mean_length": 252.5125,
      "completions/mean_terminated_length": 93.2,
      "completions/min_length": 144.4,
      "completions/min_terminated_length": 93.2,
      "epoch": 0.012903225806451613,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 2.265625,
      "kl": 0.007241980719845742,
      "learning_rate": 7.916726608960937e-06,
      "loss": 0.0009,
      "num_tokens": 606755.0,
      "reward": 14.835551500320435,
      "reward_std": 17.018505144119263,
      "rewards/wrapper/mean": 7.417775437235832,
      "rewards/wrapper/std": 18.011443032324316,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.6,
      "completions/mean_length": 254.575,
      "completions/mean_terminated_length": 5.6,
      "completions/min_length": 210.4,
      "completions/min_terminated_length": 5.6,
      "epoch": 0.01407624633431085,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.328125,
      "kl": 0.02387903115595691,
      "learning_rate": 7.916682899903684e-06,
      "loss": -0.004,
      "num_tokens": 658779.0,
      "reward": 11.199076652526855,
      "reward_std": 12.978521537780761,
      "rewards/wrapper/mean": 5.5995381936430935,
      "rewards/wrapper/std": 18.202550745010377,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.015249266862170088,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.90625,
      "kl": 0.006103656144114211,
      "learning_rate": 7.916629026244537e-06,
      "loss": -0.0012,
      "num_tokens": 715315.0,
      "reward": 6.255694437026977,
      "reward_std": 8.289126348495483,
      "rewards/wrapper/mean": 3.1278470791876316,
      "rewards/wrapper/std": 10.367880092561245,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.016422287390029325,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.9375,
      "kl": 0.006190836068708449,
      "learning_rate": 7.916564988167955e-06,
      "loss": -0.0098,
      "num_tokens": 769328.0,
      "reward": 9.829441356658936,
      "reward_std": 13.315379619598389,
      "rewards/wrapper/mean": 4.914720744639635,
      "rewards/wrapper/std": 14.687225022912026,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 20.8,
      "completions/mean_length": 255.05,
      "completions/mean_terminated_length": 20.8,
      "completions/min_length": 225.6,
      "completions/min_terminated_length": 20.8,
      "epoch": 0.017595307917888565,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.6796875,
      "kl": 0.010496543836779892,
      "learning_rate": 7.916490785893198e-06,
      "loss": -0.003,
      "num_tokens": 826122.0,
      "reward": 10.434588527679443,
      "reward_std": 12.015387630462646,
      "rewards/wrapper/mean": 5.217294257879257,
      "rewards/wrapper/std": 14.79298051893711,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.4,
      "completions/mean_length": 252.96875,
      "completions/mean_terminated_length": 5.4,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.0187683284457478,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.8984375,
      "kl": 0.16615661919931882,
      "learning_rate": 7.916406419674335e-06,
      "loss": 0.0112,
      "num_tokens": 881035.0,
      "reward": 9.748047590255737,
      "reward_std": 13.241822493076324,
      "rewards/wrapper/mean": 4.874023604393005,
      "rewards/wrapper/std": 16.71337246745825,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.01994134897360704,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.484375,
      "kl": 0.005029451113659889,
      "learning_rate": 7.916311889800224e-06,
      "loss": 0.0002,
      "num_tokens": 935581.0,
      "reward": 10.057518577575683,
      "reward_std": 12.031916904449464,
      "rewards/wrapper/mean": 5.0287592664361,
      "rewards/wrapper/std": 14.356791715323926,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.2,
      "completions/mean_length": 252.96875,
      "completions/mean_terminated_length": 2.7,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.021114369501466276,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.171875,
      "kl": 0.0057046718808123845,
      "learning_rate": 7.916207196594537e-06,
      "loss": -0.0058,
      "num_tokens": 990930.0,
      "reward": 9.220219135284424,
      "reward_std": 11.743634796142578,
      "rewards/wrapper/mean": 4.610109446942806,
      "rewards/wrapper/std": 13.207088494300843,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.4,
      "completions/mean_length": 252.49375,
      "completions/mean_terminated_length": 41.4,
      "completions/min_length": 143.8,
      "completions/min_terminated_length": 41.4,
      "epoch": 0.022287390029325515,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.921875,
      "kl": 0.007934682531049475,
      "learning_rate": 7.916092340415737e-06,
      "loss": -0.008,
      "num_tokens": 1044417.0,
      "reward": 12.452170944213867,
      "reward_std": 11.926630926132201,
      "rewards/wrapper/mean": 6.226085089147091,
      "rewards/wrapper/std": 18.230350717902184,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.6,
      "completions/mean_length": 250.93125,
      "completions/mean_terminated_length": 21.5,
      "completions/min_length": 104.8,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.02346041055718475,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.71875,
      "kl": 0.13625025742221625,
      "learning_rate": 7.915967321657082e-06,
      "loss": -0.0075,
      "num_tokens": 1099482.0,
      "reward": 10.583320891857147,
      "reward_std": 13.258717286586762,
      "rewards/wrapper/mean": 5.291660659015179,
      "rewards/wrapper/std": 15.585099121928215,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.2,
      "completions/mean_length": 249.2375,
      "completions/mean_terminated_length": 19.9,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.02463343108504399,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.359375,
      "kl": 0.12125548404292204,
      "learning_rate": 7.915832140746629e-06,
      "loss": -0.0172,
      "num_tokens": 1152528.0,
      "reward": 10.50744104385376,
      "reward_std": 12.436288833618164,
      "rewards/wrapper/mean": 5.253720180690289,
      "rewards/wrapper/std": 16.398430436849594,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.4,
      "completions/mean_length": 248.775,
      "completions/mean_terminated_length": 12.5,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.025806451612903226,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.5,
      "kl": 0.00848652045824565,
      "learning_rate": 7.915686798147231e-06,
      "loss": -0.012,
      "num_tokens": 1203452.0,
      "reward": 8.89383053779602,
      "reward_std": 10.21728515625,
      "rewards/wrapper/mean": 4.446915102005005,
      "rewards/wrapper/std": 11.703204187750817,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.6,
      "completions/mean_length": 255.95,
      "completions/mean_terminated_length": 49.6,
      "completions/min_length": 254.4,
      "completions/min_terminated_length": 49.6,
      "epoch": 0.026979472140762465,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.875,
      "kl": 0.009458682424155995,
      "learning_rate": 7.915531294356533e-06,
      "loss": 0.0005,
      "num_tokens": 1257228.0,
      "reward": 12.8822021484375,
      "reward_std": 13.596014976501465,
      "rewards/wrapper/mean": 6.44110068231821,
      "rewards/wrapper/std": 20.19829418361187,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 252.89375,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 156.6,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.0281524926686217,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.5625,
      "kl": 0.007531364588066936,
      "learning_rate": 7.915365629906973e-06,
      "loss": -0.0119,
      "num_tokens": 1311561.0,
      "reward": 8.984685134887695,
      "reward_std": 11.791397857666016,
      "rewards/wrapper/mean": 4.492342208325863,
      "rewards/wrapper/std": 14.827356532216072,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 94.2,
      "completions/mean_length": 251.0625,
      "completions/mean_terminated_length": 74.6,
      "completions/min_length": 157.4,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.02932551319648094,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 7.125,
      "kl": 0.006311549601377919,
      "learning_rate": 7.915189805365772e-06,
      "loss": -0.0016,
      "num_tokens": 1365499.0,
      "reward": 9.319353520870209,
      "reward_std": 12.09698166847229,
      "rewards/wrapper/mean": 4.6596766747534275,
      "rewards/wrapper/std": 11.877293466031551,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.030498533724340176,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.84375,
      "kl": 0.005467874178430065,
      "learning_rate": 7.915003821334948e-06,
      "loss": -0.0086,
      "num_tokens": 1417623.0,
      "reward": 9.443776416778565,
      "reward_std": 11.862878894805908,
      "rewards/wrapper/mean": 4.72188790589571,
      "rewards/wrapper/std": 15.555906724929809,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.4,
      "completions/mean_length": 250.7125,
      "completions/mean_terminated_length": 31.1,
      "completions/min_length": 129.2,
      "completions/min_terminated_length": 26.8,
      "epoch": 0.03167155425219941,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.078125,
      "kl": 0.007808281725738197,
      "learning_rate": 7.914807678451295e-06,
      "loss": -0.0047,
      "num_tokens": 1472927.0,
      "reward": 11.669925975799561,
      "reward_std": 15.550298118591309,
      "rewards/wrapper/mean": 5.834962645173073,
      "rewards/wrapper/std": 17.61610255539417,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 76.6,
      "completions/mean_length": 252.0,
      "completions/mean_terminated_length": 71.5,
      "completions/min_length": 168.8,
      "completions/min_terminated_length": 66.4,
      "epoch": 0.03284457478005865,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.625,
      "kl": 0.0043662395240971815,
      "learning_rate": 7.9146013773864e-06,
      "loss": 0.0015,
      "num_tokens": 1526883.0,
      "reward": 7.352222728729248,
      "reward_std": 9.59028902053833,
      "rewards/wrapper/mean": 3.6761114027351143,
      "rewards/wrapper/std": 13.92982299849391,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.4,
      "completions/mean_length": 251.1125,
      "completions/mean_terminated_length": 48.4,
      "completions/min_length": 99.6,
      "completions/min_terminated_length": 48.4,
      "epoch": 0.03401759530791789,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.6640625,
      "kl": 0.018718967185122892,
      "learning_rate": 7.914384918846623e-06,
      "loss": -0.0152,
      "num_tokens": 1579857.0,
      "reward": 10.049270629882812,
      "reward_std": 11.676184892654419,
      "rewards/wrapper/mean": 5.024635132402182,
      "rewards/wrapper/std": 16.037650553882123,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.03519061583577713,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.0,
      "kl": 0.004746401822194457,
      "learning_rate": 7.914158303573106e-06,
      "loss": -0.0036,
      "num_tokens": 1636140.0,
      "reward": 9.5964071393013,
      "reward_std": 12.794659090042114,
      "rewards/wrapper/mean": 4.79820346981287,
      "rewards/wrapper/std": 13.207125359773636,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.03636363636363636,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.15625,
      "kl": 0.005534662073478103,
      "learning_rate": 7.91392153234177e-06,
      "loss": -0.0149,
      "num_tokens": 1687599.0,
      "reward": 9.0966365814209,
      "reward_std": 11.15857810974121,
      "rewards/wrapper/mean": 4.5483182817697525,
      "rewards/wrapper/std": 14.566433542966843,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.0375366568914956,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 0.72265625,
      "kl": 0.0071763014071621,
      "learning_rate": 7.913674605963302e-06,
      "loss": 0.0028,
      "num_tokens": 1740307.0,
      "reward": 18.722765350341795,
      "reward_std": 25.852748107910156,
      "rewards/wrapper/mean": 9.361382472515107,
      "rewards/wrapper/std": 24.016877111792564,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 61.0,
      "completions/mean_length": 254.70625,
      "completions/mean_terminated_length": 61.0,
      "completions/min_length": 214.6,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.03870967741935484,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2265625,
      "kl": 0.007732412667246535,
      "learning_rate": 7.913417525283167e-06,
      "loss": -0.0006,
      "num_tokens": 1797808.0,
      "reward": 12.486311435699463,
      "reward_std": 15.336132717132568,
      "rewards/wrapper/mean": 6.24315589889884,
      "rewards/wrapper/std": 18.720538268983365,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 103.2,
      "completions/mean_length": 248.0875,
      "completions/mean_terminated_length": 81.03333435058593,
      "completions/min_length": 44.2,
      "completions/min_terminated_length": 44.2,
      "epoch": 0.03988269794721408,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 5.5625,
      "kl": 0.00886870157555677,
      "learning_rate": 7.913150291181591e-06,
      "loss": -0.0148,
      "num_tokens": 1850610.0,
      "reward": 11.940378427505493,
      "reward_std": 16.308357906341552,
      "rewards/wrapper/mean": 5.970189142972231,
      "rewards/wrapper/std": 17.508427077531813,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 82.8,
      "completions/mean_length": 250.6,
      "completions/mean_terminated_length": 68.66666717529297,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 61.6,
      "epoch": 0.04105571847507331,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.546875,
      "kl": 0.00591239562490955,
      "learning_rate": 7.912872904573574e-06,
      "loss": -0.0091,
      "num_tokens": 1906648.0,
      "reward": 10.338704681396484,
      "reward_std": 14.060916805267334,
      "rewards/wrapper/mean": 5.169352217018604,
      "rewards/wrapper/std": 15.273518617451192,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.6,
      "completions/mean_length": 254.5125,
      "completions/mean_terminated_length": 3.6,
      "completions/min_length": 208.4,
      "completions/min_terminated_length": 3.6,
      "epoch": 0.04222873900293255,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 0.85546875,
      "kl": 0.008821232226910069,
      "learning_rate": 7.912585366408867e-06,
      "loss": -0.002,
      "num_tokens": 1958564.0,
      "reward": 10.528805828094482,
      "reward_std": 11.93729567527771,
      "rewards/wrapper/mean": 5.264402637630701,
      "rewards/wrapper/std": 16.117270739376544,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.2,
      "completions/mean_length": 253.15,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 164.8,
      "completions/min_terminated_length": 11.2,
      "epoch": 0.04340175953079179,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.1796875,
      "kl": 0.09926816275110469,
      "learning_rate": 7.912287677671986e-06,
      "loss": -0.0039,
      "num_tokens": 2012928.0,
      "reward": 11.147413158416748,
      "reward_std": 12.735891819000244,
      "rewards/wrapper/mean": 5.573706501722336,
      "rewards/wrapper/std": 15.083421854674816,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.6,
      "completions/mean_length": 251.5625,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 11.6,
      "epoch": 0.04457478005865103,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.4765625,
      "kl": 0.04385784288460855,
      "learning_rate": 7.911979839382199e-06,
      "loss": 0.0034,
      "num_tokens": 2067150.0,
      "reward": 7.201774215698242,
      "reward_std": 8.909545421600342,
      "rewards/wrapper/mean": 3.60088697001338,
      "rewards/wrapper/std": 11.909615388512611,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.2,
      "completions/mean_length": 254.30625,
      "completions/mean_terminated_length": 48.2,
      "completions/min_length": 201.8,
      "completions/min_terminated_length": 48.2,
      "epoch": 0.04574780058651026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.03125,
      "kl": 0.016907342718332073,
      "learning_rate": 7.911661852593531e-06,
      "loss": -0.0034,
      "num_tokens": 2120981.0,
      "reward": 10.34285626411438,
      "reward_std": 11.99207363128662,
      "rewards/wrapper/mean": 5.171428105235099,
      "rewards/wrapper/std": 15.60823110193014,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.2,
      "completions/mean_length": 248.83125,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.0469208211143695,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.140625,
      "kl": 0.029390834498917683,
      "learning_rate": 7.911333718394748e-06,
      "loss": -0.0187,
      "num_tokens": 2178772.0,
      "reward": 10.648024845123292,
      "reward_std": 14.51836280822754,
      "rewards/wrapper/mean": 5.324012476205826,
      "rewards/wrapper/std": 16.51776341497898,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.2,
      "completions/mean_length": 249.89375,
      "completions/mean_terminated_length": 8.5,
      "completions/min_length": 110.2,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.04809384164222874,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.0625,
      "kl": 0.006215279077878222,
      "learning_rate": 7.910995437909363e-06,
      "loss": 0.0135,
      "num_tokens": 2234131.0,
      "reward": 7.719930791854859,
      "reward_std": 9.464711093902588,
      "rewards/wrapper/mean": 3.8599654987454413,
      "rewards/wrapper/std": 12.777549323439597,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 18.0,
      "completions/mean_length": 253.3625,
      "completions/mean_terminated_length": 18.0,
      "completions/min_length": 171.6,
      "completions/min_terminated_length": 18.0,
      "epoch": 0.04926686217008798,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.546875,
      "kl": 0.24979069257969969,
      "learning_rate": 7.910647012295629e-06,
      "loss": 0.0059,
      "num_tokens": 2286511.0,
      "reward": 9.189921045303345,
      "reward_std": 10.320459508895874,
      "rewards/wrapper/mean": 4.594960490614175,
      "rewards/wrapper/std": 15.06650394052267,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.0,
      "completions/mean_length": 253.1125,
      "completions/mean_terminated_length": 10.0,
      "completions/min_length": 163.6,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.05043988269794721,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 1.3203125,
      "kl": 0.8644496379303745,
      "learning_rate": 7.910288442746534e-06,
      "loss": 0.029,
      "num_tokens": 2344661.0,
      "reward": 10.684868431091308,
      "reward_std": 14.451215171813965,
      "rewards/wrapper/mean": 5.342434239387512,
      "rewards/wrapper/std": 17.4603207424283,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 51.2,
      "completions/mean_length": 252.8,
      "completions/mean_terminated_length": 51.2,
      "completions/min_length": 153.6,
      "completions/min_terminated_length": 51.2,
      "epoch": 0.05161290322580645,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 0.921875,
      "kl": 0.004707379778847098,
      "learning_rate": 7.909919730489803e-06,
      "loss": -0.0092,
      "num_tokens": 2397565.0,
      "reward": 10.836441993713379,
      "reward_std": 14.642048645019532,
      "rewards/wrapper/mean": 5.418220953643322,
      "rewards/wrapper/std": 15.194202147424221,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 112.2,
      "completions/mean_length": 248.325,
      "completions/mean_terminated_length": 56.53333435058594,
      "completions/min_length": 67.8,
      "completions/min_terminated_length": 16.6,
      "epoch": 0.05278592375366569,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.3828125,
      "kl": 0.025090236740652472,
      "learning_rate": 7.909540876787885e-06,
      "loss": 0.0044,
      "num_tokens": 2453233.0,
      "reward": 11.105024528503417,
      "reward_std": 14.836073446273804,
      "rewards/wrapper/mean": 5.552512162923813,
      "rewards/wrapper/std": 18.147161222994328,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 243.8,
      "completions/mean_terminated_length": 8.266666793823243,
      "completions/min_length": 54.4,
      "completions/min_terminated_length": 3.2,
      "epoch": 0.05395894428152493,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.046875,
      "kl": 0.008077077101916075,
      "learning_rate": 7.909151882937952e-06,
      "loss": -0.0209,
      "num_tokens": 2505285.0,
      "reward": 8.832015323638917,
      "reward_std": 9.847680950164795,
      "rewards/wrapper/mean": 4.416007310897112,
      "rewards/wrapper/std": 12.637867455184459,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 61.6,
      "completions/mean_length": 251.5375,
      "completions/mean_terminated_length": 43.6,
      "completions/min_length": 188.2,
      "completions/min_terminated_length": 34.6,
      "epoch": 0.05513196480938416,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.09375,
      "kl": 0.005555787222692743,
      "learning_rate": 7.9087527502719e-06,
      "loss": -0.007,
      "num_tokens": 2561367.0,
      "reward": 5.861324524879455,
      "reward_std": 7.810394310951233,
      "rewards/wrapper/mean": 2.930662341415882,
      "rewards/wrapper/std": 9.822330982983113,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.0563049853372434,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.96875,
      "kl": 0.011543343169614672,
      "learning_rate": 7.908343480156331e-06,
      "loss": 0.0005,
      "num_tokens": 2617321.0,
      "reward": 8.890991735458375,
      "reward_std": 11.866082763671875,
      "rewards/wrapper/mean": 4.445495916903019,
      "rewards/wrapper/std": 16.05286959260702,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.8,
      "completions/mean_length": 252.00625,
      "completions/mean_terminated_length": 25.8,
      "completions/min_length": 128.2,
      "completions/min_terminated_length": 25.8,
      "epoch": 0.05747800586510264,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.6875,
      "kl": 0.024089473049389198,
      "learning_rate": 7.907924073992568e-06,
      "loss": -0.0029,
      "num_tokens": 2671834.0,
      "reward": 10.378494834899902,
      "reward_std": 12.050719356536865,
      "rewards/wrapper/mean": 5.189247503876686,
      "rewards/wrapper/std": 17.33666645437479,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.05865102639296188,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.625,
      "kl": 0.022702468262286855,
      "learning_rate": 7.907494533216633e-06,
      "loss": -0.0048,
      "num_tokens": 2726430.0,
      "reward": 7.797803008556366,
      "reward_std": 10.616388177871704,
      "rewards/wrapper/mean": 3.8989015720784663,
      "rewards/wrapper/std": 13.724219344556332,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.6,
      "completions/mean_length": 252.6875,
      "completions/mean_terminated_length": 47.6,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 47.6,
      "epoch": 0.05982404692082111,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.234375,
      "kl": 0.007992285175714643,
      "learning_rate": 7.907054859299246e-06,
      "loss": -0.0079,
      "num_tokens": 2780486.0,
      "reward": 11.148936986923218,
      "reward_std": 15.111816167831421,
      "rewards/wrapper/mean": 5.574468304216862,
      "rewards/wrapper/std": 17.840139020979404,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.0,
      "completions/mean_length": 252.6875,
      "completions/mean_terminated_length": 26.1,
      "completions/min_length": 177.8,
      "completions/min_terminated_length": 24.2,
      "epoch": 0.06099706744868035,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.53125,
      "kl": 0.009301980794407427,
      "learning_rate": 7.90660505374583e-06,
      "loss": -0.0008,
      "num_tokens": 2834880.0,
      "reward": 7.71119909286499,
      "reward_std": 10.172386932373048,
      "rewards/wrapper/mean": 3.8555997565388678,
      "rewards/wrapper/std": 12.606358571350574,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 249.625,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.06217008797653959,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.78125,
      "kl": 0.005297352850902826,
      "learning_rate": 7.906145118096491e-06,
      "loss": -0.0149,
      "num_tokens": 2888866.0,
      "reward": 9.901272583007813,
      "reward_std": 11.926094055175781,
      "rewards/wrapper/mean": 4.9506362281739715,
      "rewards/wrapper/std": 13.668652732670306,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.4,
      "completions/mean_length": 253.04375,
      "completions/mean_terminated_length": 29.7,
      "completions/min_length": 174.6,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.06334310850439882,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.6328125,
      "kl": 0.003792907050228678,
      "learning_rate": 7.905675053926023e-06,
      "loss": -0.0024,
      "num_tokens": 2941299.0,
      "reward": 6.089825701713562,
      "reward_std": 8.013871765136718,
      "rewards/wrapper/mean": 3.0449130102992057,
      "rewards/wrapper/std": 10.364787791669368,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.4,
      "completions/mean_length": 249.7375,
      "completions/mean_terminated_length": 4.4,
      "completions/min_length": 55.6,
      "completions/min_terminated_length": 4.4,
      "epoch": 0.06451612903225806,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.59375,
      "kl": 0.12846245223772712,
      "learning_rate": 7.905194862843898e-06,
      "loss": 0.0065,
      "num_tokens": 2993889.0,
      "reward": 11.451298046112061,
      "reward_std": 14.382197093963622,
      "rewards/wrapper/mean": 5.7256487928330895,
      "rewards/wrapper/std": 14.491848316788673,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.6,
      "completions/mean_length": 252.15625,
      "completions/mean_terminated_length": 30.6,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 30.6,
      "epoch": 0.0656891495601173,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.8125,
      "kl": 0.005170016887132078,
      "learning_rate": 7.904704546494267e-06,
      "loss": -0.0124,
      "num_tokens": 3047526.0,
      "reward": 13.99454574584961,
      "reward_std": 19.274290084838867,
      "rewards/wrapper/mean": 6.997273133695126,
      "rewards/wrapper/std": 22.01692125797272,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 89.6,
      "completions/mean_length": 254.0,
      "completions/mean_terminated_length": 89.6,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 89.6,
      "epoch": 0.06686217008797654,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.4375,
      "kl": 0.009661910351132974,
      "learning_rate": 7.90420410655594e-06,
      "loss": -0.0058,
      "num_tokens": 3104106.0,
      "reward": 10.678810977935791,
      "reward_std": 11.440056610107423,
      "rewards/wrapper/mean": 5.3394052013754845,
      "rewards/wrapper/std": 17.356336744129656,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.8,
      "completions/mean_length": 252.44375,
      "completions/mean_terminated_length": 39.8,
      "completions/min_length": 142.2,
      "completions/min_terminated_length": 39.8,
      "epoch": 0.06803519061583578,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.671875,
      "kl": 0.0087807220290415,
      "learning_rate": 7.9036935447424e-06,
      "loss": -0.0017,
      "num_tokens": 3158783.0,
      "reward": 6.231310081481934,
      "reward_std": 8.16063642501831,
      "rewards/wrapper/mean": 3.115654981136322,
      "rewards/wrapper/std": 9.851075352728367,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 96.8,
      "completions/mean_length": 252.625,
      "completions/mean_terminated_length": 96.8,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 96.8,
      "epoch": 0.06920821114369502,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 8.25,
      "kl": 0.004456112009938807,
      "learning_rate": 7.903172862801778e-06,
      "loss": -0.0015,
      "num_tokens": 3212223.0,
      "reward": 11.368969821929932,
      "reward_std": 15.1733154296875,
      "rewards/wrapper/mean": 5.68448467105627,
      "rewards/wrapper/std": 17.372170712053776,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.2,
      "completions/mean_length": 251.6125,
      "completions/mean_terminated_length": 13.2,
      "completions/min_length": 115.6,
      "completions/min_terminated_length": 13.2,
      "epoch": 0.07038123167155426,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.0,
      "kl": 0.0058444807189516725,
      "learning_rate": 7.902642062516862e-06,
      "loss": -0.0076,
      "num_tokens": 3266219.0,
      "reward": 9.240291213989257,
      "reward_std": 12.255160140991212,
      "rewards/wrapper/mean": 4.620145709067583,
      "rewards/wrapper/std": 15.276380948722363,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.0,
      "completions/mean_length": 252.66875,
      "completions/mean_terminated_length": 47.0,
      "completions/min_length": 149.4,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.07155425219941348,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.15625,
      "kl": 0.011432503134710715,
      "learning_rate": 7.902101145705079e-06,
      "loss": -0.0005,
      "num_tokens": 3322326.0,
      "reward": 10.280498886108399,
      "reward_std": 13.853771209716797,
      "rewards/wrapper/mean": 5.140249271690846,
      "rewards/wrapper/std": 16.632223202288152,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.4,
      "completions/mean_length": 251.53125,
      "completions/mean_terminated_length": 5.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.07272727272727272,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.15625,
      "kl": 0.0049453877902124075,
      "learning_rate": 7.9015501142185e-06,
      "loss": -0.0059,
      "num_tokens": 3376043.0,
      "reward": 8.330785369873047,
      "reward_std": 10.894810342788697,
      "rewards/wrapper/mean": 4.165392602980137,
      "rewards/wrapper/std": 10.359532837569713,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.6,
      "completions/mean_length": 255.825,
      "completions/mean_terminated_length": 45.6,
      "completions/min_length": 250.4,
      "completions/min_terminated_length": 45.6,
      "epoch": 0.07390029325513196,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.59375,
      "kl": 0.005565386958187446,
      "learning_rate": 7.900988969943825e-06,
      "loss": 0.0001,
      "num_tokens": 3432859.0,
      "reward": 13.264854145050048,
      "reward_std": 18.112439727783205,
      "rewards/wrapper/mean": 6.6324268616735935,
      "rewards/wrapper/std": 20.8455373570323,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.0750733137829912,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.1796875,
      "kl": 0.0033622915856540204,
      "learning_rate": 7.900417714802381e-06,
      "loss": -0.0068,
      "num_tokens": 3485236.0,
      "reward": 10.746763801574707,
      "reward_std": 14.438395261764526,
      "rewards/wrapper/mean": 5.37338171377778,
      "rewards/wrapper/std": 18.321337571740152,
      "step": 320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 77.8,
      "completions/mean_length": 252.49375,
      "completions/mean_terminated_length": 66.4,
      "completions/min_length": 157.4,
      "completions/min_terminated_length": 55.0,
      "epoch": 0.07624633431085044,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.90625,
      "kl": 0.004275998589582742,
      "learning_rate": 7.899836350750111e-06,
      "loss": -0.0088,
      "num_tokens": 3539361.0,
      "reward": 6.864453983306885,
      "reward_std": 9.229621601104736,
      "rewards/wrapper/mean": 3.4322269305586817,
      "rewards/wrapper/std": 12.251788380742074,
      "step": 325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.2,
      "completions/mean_length": 250.6125,
      "completions/mean_terminated_length": 32.2,
      "completions/min_length": 134.6,
      "completions/min_terminated_length": 32.2,
      "epoch": 0.07741935483870968,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.59375,
      "kl": 0.011575464520137756,
      "learning_rate": 7.899244879777575e-06,
      "loss": -0.0178,
      "num_tokens": 3595203.0,
      "reward": 10.059148216247559,
      "reward_std": 11.052921676635743,
      "rewards/wrapper/mean": 5.029574017226696,
      "rewards/wrapper/std": 15.514526377618314,
      "step": 330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.6,
      "completions/mean_length": 255.3875,
      "completions/mean_terminated_length": 31.6,
      "completions/min_length": 236.4,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.07859237536656892,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.90625,
      "kl": 0.00798338907188736,
      "learning_rate": 7.898643303909933e-06,
      "loss": 0.0021,
      "num_tokens": 3649915.0,
      "reward": 12.55016393661499,
      "reward_std": 17.015945434570312,
      "rewards/wrapper/mean": 6.27508184760809,
      "rewards/wrapper/std": 17.497456189990043,
      "step": 335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 250.03125,
      "completions/mean_terminated_length": 10.3,
      "completions/min_length": 109.4,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.07976539589442816,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.6875,
      "kl": 0.011324020172469317,
      "learning_rate": 7.89803162520695e-06,
      "loss": -0.0073,
      "num_tokens": 3705082.0,
      "reward": 13.485107231140137,
      "reward_std": 18.413896560668945,
      "rewards/wrapper/mean": 6.7425536692142485,
      "rewards/wrapper/std": 19.012604074180125,
      "step": 340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.6,
      "completions/mean_length": 248.125,
      "completions/mean_terminated_length": 2.1,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.08093841642228738,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.421875,
      "kl": 0.005373219301691279,
      "learning_rate": 7.897409845762977e-06,
      "loss": -0.0265,
      "num_tokens": 3761870.0,
      "reward": 13.213876724243164,
      "reward_std": 17.01407127380371,
      "rewards/wrapper/mean": 6.606937965750694,
      "rewards/wrapper/std": 17.88588900715113,
      "step": 345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.2,
      "completions/mean_length": 249.0125,
      "completions/mean_terminated_length": 17.5,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 2.8,
      "epoch": 0.08211143695014662,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.09375,
      "kl": 0.006631963752442971,
      "learning_rate": 7.896777967706954e-06,
      "loss": -0.0242,
      "num_tokens": 3814768.0,
      "reward": 12.118576288223267,
      "reward_std": 16.33619499206543,
      "rewards/wrapper/mean": 6.059287798404694,
      "rewards/wrapper/std": 19.245304891467093,
      "step": 350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.2,
      "completions/mean_length": 252.80625,
      "completions/mean_terminated_length": 25.8,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.08328445747800586,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.984375,
      "kl": 0.0040073363459669055,
      "learning_rate": 7.896135993202392e-06,
      "loss": -0.0111,
      "num_tokens": 3868007.0,
      "reward": 5.220393860340119,
      "reward_std": 6.543023681640625,
      "rewards/wrapper/mean": 2.6101968064904213,
      "rewards/wrapper/std": 9.627190099656582,
      "step": 355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 252.8625,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 155.6,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.0844574780058651,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.3125,
      "kl": 0.020079531124792993,
      "learning_rate": 7.895483924447377e-06,
      "loss": -0.0065,
      "num_tokens": 3923273.0,
      "reward": 12.692752647399903,
      "reward_std": 15.342731380462647,
      "rewards/wrapper/mean": 6.346376179158687,
      "rewards/wrapper/std": 20.673074428737163,
      "step": 360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.08563049853372434,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 17.75,
      "kl": 0.043798852752661335,
      "learning_rate": 7.894821763674556e-06,
      "loss": -0.0052,
      "num_tokens": 3978401.0,
      "reward": 9.202295875549316,
      "reward_std": 9.355103588104248,
      "rewards/wrapper/mean": 4.60114776045084,
      "rewards/wrapper/std": 13.482706923782825,
      "step": 365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.08680351906158358,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.03125,
      "kl": 0.0192249774816446,
      "learning_rate": 7.89414951315113e-06,
      "loss": -0.0068,
      "num_tokens": 4035268.0,
      "reward": 7.144095826148987,
      "reward_std": 9.492268562316895,
      "rewards/wrapper/mean": 3.572047848254442,
      "rewards/wrapper/std": 10.409622764587402,
      "step": 370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 250.1375,
      "completions/mean_terminated_length": 10.1,
      "completions/min_length": 105.6,
      "completions/min_terminated_length": 3.2,
      "epoch": 0.08797653958944282,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 6.25,
      "kl": 0.0067635633982717994,
      "learning_rate": 7.893467175178848e-06,
      "loss": -0.0136,
      "num_tokens": 4089458.0,
      "reward": 5.858023273944855,
      "reward_std": 7.792664754390716,
      "rewards/wrapper/mean": 2.929011580348015,
      "rewards/wrapper/std": 8.176401071250439,
      "step": 375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.4,
      "completions/mean_length": 252.96875,
      "completions/mean_terminated_length": 5.4,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.08914956011730206,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.515625,
      "kl": 0.008263138856273144,
      "learning_rate": 7.892774752093993e-06,
      "loss": -0.0069,
      "num_tokens": 4144745.0,
      "reward": 5.752477717399597,
      "reward_std": 7.585747843980789,
      "rewards/wrapper/mean": 2.8762387074530125,
      "rewards/wrapper/std": 10.010685224831104,
      "step": 380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.8,
      "completions/mean_length": 252.9125,
      "completions/mean_terminated_length": 54.8,
      "completions/min_length": 157.2,
      "completions/min_terminated_length": 54.8,
      "epoch": 0.09032258064516129,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.3125,
      "kl": 0.008487808145582676,
      "learning_rate": 7.892072246267383e-06,
      "loss": -0.0105,
      "num_tokens": 4198865.0,
      "reward": 12.836762094497681,
      "reward_std": 15.75669355392456,
      "rewards/wrapper/mean": 6.418380816280842,
      "rewards/wrapper/std": 18.001925800740718,
      "step": 385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 249.625,
      "completions/mean_terminated_length": 0.8,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.09149560117302052,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.625,
      "kl": 0.02361058863461949,
      "learning_rate": 7.891359660104361e-06,
      "loss": -0.0022,
      "num_tokens": 4257033.0,
      "reward": 10.801888704299927,
      "reward_std": 13.946699237823486,
      "rewards/wrapper/mean": 5.400944182276726,
      "rewards/wrapper/std": 14.914789086580276,
      "step": 390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.2,
      "completions/mean_length": 251.26875,
      "completions/mean_terminated_length": 2.2,
      "completions/min_length": 104.6,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.09266862170087976,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.078125,
      "kl": 0.07726627183728851,
      "learning_rate": 7.89063699604478e-06,
      "loss": 0.0079,
      "num_tokens": 4307928.0,
      "reward": 9.3045334815979,
      "reward_std": 10.442511081695557,
      "rewards/wrapper/mean": 4.652266338467598,
      "rewards/wrapper/std": 14.739392180740833,
      "step": 395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.0,
      "completions/mean_length": 252.83125,
      "completions/mean_terminated_length": 1.0,
      "completions/min_length": 154.6,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.093841642228739,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.0625,
      "kl": 0.005269644845975563,
      "learning_rate": 7.889904256563e-06,
      "loss": -0.0066,
      "num_tokens": 4364827.0,
      "reward": 10.288627099990844,
      "reward_std": 13.034698617458343,
      "rewards/wrapper/mean": 5.144313305616379,
      "rewards/wrapper/std": 14.584010930359364,
      "step": 400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.2,
      "completions/mean_length": 250.80625,
      "completions/mean_terminated_length": 19.3,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.09501466275659824,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 88.0,
      "kl": 0.5907435442029965,
      "learning_rate": 7.88916144416788e-06,
      "loss": 0.0165,
      "num_tokens": 4419578.0,
      "reward": 7.900735855102539,
      "reward_std": 10.42778558731079,
      "rewards/wrapper/mean": 3.9503678739070893,
      "rewards/wrapper/std": 13.24696546792984,
      "step": 405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.09618768328445748,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.59375,
      "kl": 0.004958215163787827,
      "learning_rate": 7.888408561402767e-06,
      "loss": -0.0023,
      "num_tokens": 4475905.0,
      "reward": 7.845335531234741,
      "reward_std": 10.648851537704468,
      "rewards/wrapper/mean": 3.92266783118248,
      "rewards/wrapper/std": 13.168632271885873,
      "step": 410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 253.3375,
      "completions/mean_terminated_length": 17.2,
      "completions/min_length": 170.8,
      "completions/min_terminated_length": 17.2,
      "epoch": 0.09736070381231672,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.3125,
      "kl": 0.009949381230399013,
      "learning_rate": 7.887645610845491e-06,
      "loss": -0.0014,
      "num_tokens": 4528455.0,
      "reward": 6.3044956684112545,
      "reward_std": 8.397425222396851,
      "rewards/wrapper/mean": 3.1522478252649306,
      "rewards/wrapper/std": 13.238372421264648,
      "step": 415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 60.0,
      "completions/mean_length": 251.475,
      "completions/mean_terminated_length": 60.0,
      "completions/min_length": 111.2,
      "completions/min_terminated_length": 60.0,
      "epoch": 0.09853372434017596,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 9.5625,
      "kl": 0.009382255608215928,
      "learning_rate": 7.88687259510835e-06,
      "loss": -0.0136,
      "num_tokens": 4586407.0,
      "reward": 10.528643894195557,
      "reward_std": 14.410901546478271,
      "rewards/wrapper/mean": 5.264321990311146,
      "rewards/wrapper/std": 17.865098947286604,
      "step": 420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 254.41875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 205.4,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.09970674486803519,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.1875,
      "kl": 0.006355077913030982,
      "learning_rate": 7.886089516838104e-06,
      "loss": -0.0035,
      "num_tokens": 4640554.0,
      "reward": 8.880403900146485,
      "reward_std": 11.53792371749878,
      "rewards/wrapper/mean": 4.440201735496521,
      "rewards/wrapper/std": 14.214903639256955,
      "step": 425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 55.2,
      "completions/mean_length": 252.925,
      "completions/mean_terminated_length": 55.2,
      "completions/min_length": 157.6,
      "completions/min_terminated_length": 55.2,
      "epoch": 0.10087976539589442,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.9375,
      "kl": 0.011302582046482712,
      "learning_rate": 7.885296378715972e-06,
      "loss": -0.0074,
      "num_tokens": 4695284.0,
      "reward": 9.465247249603271,
      "reward_std": 12.294286060333253,
      "rewards/wrapper/mean": 4.732623440772295,
      "rewards/wrapper/std": 16.889464408159256,
      "step": 430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 64.8,
      "completions/mean_length": 249.25625,
      "completions/mean_terminated_length": 60.2,
      "completions/min_length": 106.8,
      "completions/min_terminated_length": 55.6,
      "epoch": 0.10205278592375366,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.0390625,
      "kl": 0.008649818610865622,
      "learning_rate": 7.884493183457612e-06,
      "loss": -0.0084,
      "num_tokens": 4747981.0,
      "reward": 10.74110836982727,
      "reward_std": 14.534211778640747,
      "rewards/wrapper/mean": 5.37055408731103,
      "rewards/wrapper/std": 17.95357619225979,
      "step": 435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.2,
      "completions/mean_length": 255.15625,
      "completions/mean_terminated_length": 24.2,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 24.2,
      "epoch": 0.1032258064516129,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.7421875,
      "kl": 0.01159596272627823,
      "learning_rate": 7.883679933813119e-06,
      "loss": -0.0014,
      "num_tokens": 4802888.0,
      "reward": 15.938543891906738,
      "reward_std": 20.0162145614624,
      "rewards/wrapper/mean": 7.9692716032266615,
      "rewards/wrapper/std": 23.760393367707728,
      "step": 440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 29.6,
      "completions/mean_length": 252.13125,
      "completions/mean_terminated_length": 15.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.10439882697947214,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.421875,
      "kl": 0.008417817368172108,
      "learning_rate": 7.882856632567015e-06,
      "loss": -0.0053,
      "num_tokens": 4859187.0,
      "reward": 17.78517837524414,
      "reward_std": 22.430026054382324,
      "rewards/wrapper/mean": 8.892589239776134,
      "rewards/wrapper/std": 25.629758620262145,
      "step": 445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.6,
      "completions/mean_length": 247.63125,
      "completions/mean_terminated_length": 35.9,
      "completions/min_length": 135.8,
      "completions/min_terminated_length": 33.4,
      "epoch": 0.10557184750733138,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 0.9765625,
      "kl": 0.005837788642384112,
      "learning_rate": 7.882023282538236e-06,
      "loss": -0.0218,
      "num_tokens": 4911118.0,
      "reward": 13.440273857116699,
      "reward_std": 12.466052770614624,
      "rewards/wrapper/mean": 6.720136827230453,
      "rewards/wrapper/std": 17.47331467717886,
      "step": 450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.8,
      "completions/mean_length": 244.96875,
      "completions/mean_terminated_length": 2.8,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.10674486803519062,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.5,
      "kl": 0.007168042741250246,
      "learning_rate": 7.881179886580125e-06,
      "loss": -0.0221,
      "num_tokens": 4963319.0,
      "reward": 12.285084342956543,
      "reward_std": 14.525907707214355,
      "rewards/wrapper/mean": 6.142542491853237,
      "rewards/wrapper/std": 17.427485939860343,
      "step": 455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.10791788856304986,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.828125,
      "kl": 0.009624737745616585,
      "learning_rate": 7.880326447580421e-06,
      "loss": -0.0021,
      "num_tokens": 5017498.0,
      "reward": 10.349391460418701,
      "reward_std": 13.977695178985595,
      "rewards/wrapper/mean": 5.174696030467748,
      "rewards/wrapper/std": 17.32550062686205,
      "step": 460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.6,
      "completions/mean_length": 251.40625,
      "completions/mean_terminated_length": 6.6,
      "completions/min_length": 109.0,
      "completions/min_terminated_length": 6.6,
      "epoch": 0.10909090909090909,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.0,
      "kl": 0.005783123133005575,
      "learning_rate": 7.879462968461254e-06,
      "loss": 0.0054,
      "num_tokens": 5075057.0,
      "reward": 12.299484062194825,
      "reward_std": 13.539479064941407,
      "rewards/wrapper/mean": 6.149741820991039,
      "rewards/wrapper/std": 17.295961599051953,
      "step": 465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.8,
      "completions/mean_length": 252.84375,
      "completions/mean_terminated_length": 26.4,
      "completions/min_length": 155.6,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.11026392961876832,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.375,
      "kl": 0.017190750857116653,
      "learning_rate": 7.878589452179124e-06,
      "loss": -0.0092,
      "num_tokens": 5129870.0,
      "reward": 16.291987419128418,
      "reward_std": 16.42507972717285,
      "rewards/wrapper/mean": 8.145993730425834,
      "rewards/wrapper/std": 23.72040745615959,
      "step": 470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.6,
      "completions/mean_length": 252.5,
      "completions/mean_terminated_length": 41.6,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 41.6,
      "epoch": 0.11143695014662756,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.609375,
      "kl": 0.026824597117956726,
      "learning_rate": 7.877705901724904e-06,
      "loss": -0.004,
      "num_tokens": 5185876.0,
      "reward": 9.22128176689148,
      "reward_std": 12.521512603759765,
      "rewards/wrapper/mean": 4.610640931129455,
      "rewards/wrapper/std": 15.985003382712602,
      "step": 475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 20.6,
      "completions/mean_length": 253.44375,
      "completions/mean_terminated_length": 20.6,
      "completions/min_length": 174.2,
      "completions/min_terminated_length": 20.6,
      "epoch": 0.1126099706744868,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.5859375,
      "kl": 0.008066412215703167,
      "learning_rate": 7.876812320123819e-06,
      "loss": -0.0074,
      "num_tokens": 5238537.0,
      "reward": 7.3396319389343265,
      "reward_std": 9.42587718963623,
      "rewards/wrapper/mean": 3.6698158286511897,
      "rewards/wrapper/std": 11.667912058532238,
      "step": 480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.6,
      "completions/mean_length": 254.35,
      "completions/mean_terminated_length": 49.6,
      "completions/min_length": 203.2,
      "completions/min_terminated_length": 49.6,
      "epoch": 0.11378299120234604,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.0,
      "kl": 0.005426285532303154,
      "learning_rate": 7.875908710435441e-06,
      "loss": -0.0052,
      "num_tokens": 5292019.0,
      "reward": 13.085040092468262,
      "reward_std": 13.977268314361572,
      "rewards/wrapper/mean": 6.542520047724247,
      "rewards/wrapper/std": 18.962931068241595,
      "step": 485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 79.4,
      "completions/mean_length": 247.86875,
      "completions/mean_terminated_length": 56.03333435058594,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 35.8,
      "epoch": 0.11495601173020528,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.21875,
      "kl": 0.0073792809271253645,
      "learning_rate": 7.874995075753678e-06,
      "loss": -0.0044,
      "num_tokens": 5344538.0,
      "reward": 6.792131614685059,
      "reward_std": 9.024149131774902,
      "rewards/wrapper/mean": 3.3960657209157943,
      "rewards/wrapper/std": 11.643630801141262,
      "step": 490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 122.4,
      "completions/mean_length": 252.16875,
      "completions/mean_terminated_length": 102.9,
      "completions/min_length": 134.6,
      "completions/min_terminated_length": 83.4,
      "epoch": 0.11612903225806452,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.375,
      "kl": 0.011405504535650835,
      "learning_rate": 7.874071419206767e-06,
      "loss": -0.0067,
      "num_tokens": 5399329.0,
      "reward": 10.500157880783082,
      "reward_std": 12.362210059165955,
      "rewards/wrapper/mean": 5.250078846514225,
      "rewards/wrapper/std": 16.170384666323663,
      "step": 495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.8,
      "completions/mean_length": 251.05625,
      "completions/mean_terminated_length": 30.0,
      "completions/min_length": 119.6,
      "completions/min_terminated_length": 17.2,
      "epoch": 0.11730205278592376,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 15.75,
      "kl": 0.023087952454807235,
      "learning_rate": 7.873137743957253e-06,
      "loss": -0.0053,
      "num_tokens": 5453188.0,
      "reward": 10.207341730594635,
      "reward_std": 13.235777711868286,
      "rewards/wrapper/mean": 5.1036708191037174,
      "rewards/wrapper/std": 15.854820623993874,
      "step": 500
    },
    {
      "epoch": 0.11730205278592376,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.805,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 64.59,
      "eval_completions/mean_length": 225.58,
      "eval_completions/mean_terminated_length": 54.89000015258789,
      "eval_completions/min_length": 160.77,
      "eval_completions/min_terminated_length": 45.57,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.008613987206481398,
      "eval_loss": -0.05724004656076431,
      "eval_num_tokens": 5453188.0,
      "eval_reward": 0.3156442906707525,
      "eval_reward_std": 0.15477610152214766,
      "eval_rewards/wrapper/mean": 0.1578221420943737,
      "eval_rewards/wrapper/std": 0.14301297422032802,
      "eval_runtime": 207.7513,
      "eval_samples_per_second": 0.963,
      "eval_steps_per_second": 0.241,
      "step": 500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.6,
      "completions/mean_length": 249.65,
      "completions/mean_terminated_length": 1.6,
      "completions/min_length": 52.8,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.11847507331378299,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.265625,
      "kl": 0.007007372000953182,
      "learning_rate": 7.872194053201988e-06,
      "loss": -0.0135,
      "num_tokens": 5505338.0,
      "reward": 5.762967586517334,
      "reward_std": 7.6124520778656,
      "rewards/wrapper/mean": 2.8814836353063584,
      "rewards/wrapper/std": 10.962222987413407,
      "step": 505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.11964809384164223,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.765625,
      "kl": 0.00901545921806246,
      "learning_rate": 7.871240350172112e-06,
      "loss": -0.0059,
      "num_tokens": 5561149.0,
      "reward": 10.71638011932373,
      "reward_std": 11.327287292480468,
      "rewards/wrapper/mean": 5.358189883828163,
      "rewards/wrapper/std": 15.345132572948932,
      "step": 510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.8,
      "completions/mean_length": 252.1,
      "completions/mean_terminated_length": 28.8,
      "completions/min_length": 131.2,
      "completions/min_terminated_length": 28.8,
      "epoch": 0.12082111436950146,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.0,
      "kl": 1.7968808292440372,
      "learning_rate": 7.870276638133056e-06,
      "loss": 0.0643,
      "num_tokens": 5615687.0,
      "reward": 10.27314796447754,
      "reward_std": 12.886303329467774,
      "rewards/wrapper/mean": 5.13657393977046,
      "rewards/wrapper/std": 17.148977878689767,
      "step": 515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.4,
      "completions/mean_length": 253.59375,
      "completions/mean_terminated_length": 25.4,
      "completions/min_length": 179.0,
      "completions/min_terminated_length": 25.4,
      "epoch": 0.1219941348973607,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.625,
      "kl": 0.008333416358800605,
      "learning_rate": 7.869302920384511e-06,
      "loss": -0.0036,
      "num_tokens": 5669610.0,
      "reward": 8.138755130767823,
      "reward_std": 9.697688674926757,
      "rewards/wrapper/mean": 4.069377472251654,
      "rewards/wrapper/std": 13.40217920690775,
      "step": 520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.4,
      "completions/mean_length": 248.93125,
      "completions/mean_terminated_length": 16.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 4.6,
      "epoch": 0.12316715542521994,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.25,
      "kl": 0.00565977388760075,
      "learning_rate": 7.868319200260435e-06,
      "loss": 0.0013,
      "num_tokens": 5723607.0,
      "reward": 7.519344139099121,
      "reward_std": 9.667561912536621,
      "rewards/wrapper/mean": 3.7596719533205034,
      "rewards/wrapper/std": 12.146075774729251,
      "step": 525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 74.2,
      "completions/mean_length": 249.6,
      "completions/mean_terminated_length": 61.06666717529297,
      "completions/min_length": 91.8,
      "completions/min_terminated_length": 40.6,
      "epoch": 0.12434017595307918,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.359375,
      "kl": 0.0132868135930039,
      "learning_rate": 7.867325481129026e-06,
      "loss": -0.0132,
      "num_tokens": 5779257.0,
      "reward": 13.06201467514038,
      "reward_std": 14.132675647735596,
      "rewards/wrapper/mean": 6.531007275730372,
      "rewards/wrapper/std": 19.79606983065605,
      "step": 530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 253.01875,
      "completions/mean_terminated_length": 7.0,
      "completions/min_length": 160.6,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.12551319648093842,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.953125,
      "kl": 0.011028506548609585,
      "learning_rate": 7.866321766392723e-06,
      "loss": -0.0078,
      "num_tokens": 5835452.0,
      "reward": 6.974220561981201,
      "reward_std": 9.202022171020507,
      "rewards/wrapper/mean": 3.4871101051568987,
      "rewards/wrapper/std": 11.520281651616097,
      "step": 535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.0,
      "completions/mean_length": 254.3625,
      "completions/mean_terminated_length": 50.0,
      "completions/min_length": 203.6,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.12668621700879765,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5625,
      "kl": 0.006271988293156028,
      "learning_rate": 7.86530805948819e-06,
      "loss": -0.0035,
      "num_tokens": 5888666.0,
      "reward": 11.376586723327637,
      "reward_std": 13.494106674194336,
      "rewards/wrapper/mean": 5.688293327391148,
      "rewards/wrapper/std": 18.444926972687245,
      "step": 540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.6,
      "completions/mean_length": 252.5375,
      "completions/mean_terminated_length": 21.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.1278592375366569,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.359375,
      "kl": 0.017584053613245488,
      "learning_rate": 7.864284363886301e-06,
      "loss": -0.0096,
      "num_tokens": 5946884.0,
      "reward": 6.535472047328949,
      "reward_std": 8.555399453639984,
      "rewards/wrapper/mean": 3.267736179381609,
      "rewards/wrapper/std": 9.66622311770916,
      "step": 545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 71.4,
      "completions/mean_length": 249.46875,
      "completions/mean_terminated_length": 49.3,
      "completions/min_length": 78.4,
      "completions/min_terminated_length": 27.2,
      "epoch": 0.12903225806451613,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.140625,
      "kl": 0.028963472304167227,
      "learning_rate": 7.863250683092132e-06,
      "loss": -0.0195,
      "num_tokens": 5998817.0,
      "reward": 12.318414115905762,
      "reward_std": 15.487536811828614,
      "rewards/wrapper/mean": 6.159206974506378,
      "rewards/wrapper/std": 18.252126486599444,
      "step": 550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13020527859237538,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.046875,
      "kl": 0.006062440911773592,
      "learning_rate": 7.862207020644947e-06,
      "loss": 0.0002,
      "num_tokens": 6053299.0,
      "reward": 7.462558031082153,
      "reward_std": 9.897032356262207,
      "rewards/wrapper/mean": 3.731279059499502,
      "rewards/wrapper/std": 11.677504101395607,
      "step": 555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.1313782991202346,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.140625,
      "kl": 0.005756572855170816,
      "learning_rate": 7.861153380118187e-06,
      "loss": 0.0002,
      "num_tokens": 6107719.0,
      "reward": 9.559343433380127,
      "reward_std": 12.309616780281067,
      "rewards/wrapper/mean": 4.77967184856534,
      "rewards/wrapper/std": 15.938154307007789,
      "step": 560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 61.8,
      "completions/mean_length": 249.9375,
      "completions/mean_terminated_length": 40.0,
      "completions/min_length": 69.4,
      "completions/min_terminated_length": 18.2,
      "epoch": 0.13255131964809383,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.09375,
      "kl": 0.0070392878842540085,
      "learning_rate": 7.860089765119458e-06,
      "loss": -0.0095,
      "num_tokens": 6162783.0,
      "reward": 15.076542854309082,
      "reward_std": 18.630818176269532,
      "rewards/wrapper/mean": 7.538271514326334,
      "rewards/wrapper/std": 20.744914372265338,
      "step": 565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13372434017595308,
      "frac_reward_zero_std": 0.075,
      "grad_norm": 5.1875,
      "kl": 0.015195795160252601,
      "learning_rate": 7.859016179290516e-06,
      "loss": 0.0006,
      "num_tokens": 6222655.0,
      "reward": 9.283301067352294,
      "reward_std": 10.608103895187378,
      "rewards/wrapper/mean": 4.641650436818599,
      "rewards/wrapper/std": 13.378304573893548,
      "step": 570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.6,
      "completions/mean_length": 254.325,
      "completions/mean_terminated_length": 24.4,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.1348973607038123,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.1875,
      "kl": 0.03376310399617068,
      "learning_rate": 7.857932626307261e-06,
      "loss": 0.0018,
      "num_tokens": 6276273.0,
      "reward": 12.699429130554199,
      "reward_std": 14.780900478363037,
      "rewards/wrapper/mean": 6.349714441597461,
      "rewards/wrapper/std": 23.842760853469372,
      "step": 575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 56.2,
      "completions/mean_length": 254.55625,
      "completions/mean_terminated_length": 56.2,
      "completions/min_length": 209.8,
      "completions/min_terminated_length": 56.2,
      "epoch": 0.13607038123167156,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.171875,
      "kl": 0.007011306460481137,
      "learning_rate": 7.856839109879712e-06,
      "loss": 0.0015,
      "num_tokens": 6333892.0,
      "reward": 7.93853178024292,
      "reward_std": 10.552415084838866,
      "rewards/wrapper/mean": 3.9692660093307497,
      "rewards/wrapper/std": 13.967398370802403,
      "step": 580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.0,
      "completions/mean_length": 254.20625,
      "completions/mean_terminated_length": 45.0,
      "completions/min_length": 198.6,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.1372434017595308,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.515625,
      "kl": 0.01811500685289502,
      "learning_rate": 7.855735633752014e-06,
      "loss": 0.0048,
      "num_tokens": 6391489.0,
      "reward": 7.980970191955566,
      "reward_std": 9.922795867919922,
      "rewards/wrapper/mean": 3.9904851004481317,
      "rewards/wrapper/std": 13.737288218736648,
      "step": 585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.4,
      "completions/mean_length": 249.45625,
      "completions/mean_terminated_length": 23.4,
      "completions/min_length": 108.8,
      "completions/min_terminated_length": 6.4,
      "epoch": 0.13841642228739004,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.140625,
      "kl": 0.008395724120782688,
      "learning_rate": 7.854622201702398e-06,
      "loss": -0.021,
      "num_tokens": 6445788.0,
      "reward": 17.261702919006346,
      "reward_std": 22.740974617004394,
      "rewards/wrapper/mean": 8.630851200222969,
      "rewards/wrapper/std": 25.095195826888084,
      "step": 590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.13958944281524927,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.234375,
      "kl": 0.006187805708032101,
      "learning_rate": 7.8534988175432e-06,
      "loss": -0.0054,
      "num_tokens": 6500061.0,
      "reward": 4.855136448144913,
      "reward_std": 6.186183905601501,
      "rewards/wrapper/mean": 2.4275682747364042,
      "rewards/wrapper/std": 8.418499158322811,
      "step": 595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.14076246334310852,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.84375,
      "kl": 0.005689225665992126,
      "learning_rate": 7.852365485120821e-06,
      "loss": -0.0123,
      "num_tokens": 6553787.0,
      "reward": 11.236301851272582,
      "reward_std": 12.74597978591919,
      "rewards/wrapper/mean": 5.618150828778743,
      "rewards/wrapper/std": 13.135591895878315,
      "step": 600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 59.0,
      "completions/mean_length": 254.64375,
      "completions/mean_terminated_length": 59.0,
      "completions/min_length": 212.6,
      "completions/min_terminated_length": 59.0,
      "epoch": 0.14193548387096774,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.34375,
      "kl": 0.006667055486468598,
      "learning_rate": 7.851222208315726e-06,
      "loss": -0.0047,
      "num_tokens": 6607744.0,
      "reward": 7.296946382522583,
      "reward_std": 9.803734683990479,
      "rewards/wrapper/mean": 3.648472948372364,
      "rewards/wrapper/std": 13.976402992010117,
      "step": 605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 248.2625,
      "completions/mean_terminated_length": 4.3,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.14310850439882697,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.65625,
      "kl": 0.023930460814153776,
      "learning_rate": 7.850068991042432e-06,
      "loss": -0.0114,
      "num_tokens": 6661002.0,
      "reward": 7.392018556594849,
      "reward_std": 8.457538390159607,
      "rewards/wrapper/mean": 3.696009010076523,
      "rewards/wrapper/std": 10.95611379891634,
      "step": 610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 75.0,
      "completions/mean_length": 251.94375,
      "completions/mean_terminated_length": 75.0,
      "completions/min_length": 126.2,
      "completions/min_terminated_length": 75.0,
      "epoch": 0.14428152492668622,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.671875,
      "kl": 17.328260349377523,
      "learning_rate": 7.848905837249485e-06,
      "loss": 0.6789,
      "num_tokens": 6717383.0,
      "reward": 6.301204872131348,
      "reward_std": 8.14721348285675,
      "rewards/wrapper/mean": 3.150602462887764,
      "rewards/wrapper/std": 10.698176135122775,
      "step": 615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.225,
      "completions/mean_terminated_length": 0.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.14545454545454545,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.0625,
      "kl": 0.007349775591865182,
      "learning_rate": 7.847732750919463e-06,
      "loss": -0.0085,
      "num_tokens": 6771751.0,
      "reward": 15.633229780197144,
      "reward_std": 19.575571060180664,
      "rewards/wrapper/mean": 7.816615104675293,
      "rewards/wrapper/std": 21.27004445493221,
      "step": 620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.6,
      "completions/mean_length": 252.68125,
      "completions/mean_terminated_length": 31.0,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 26.4,
      "epoch": 0.1466275659824047,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.53125,
      "kl": 1.286929903679993,
      "learning_rate": 7.846549736068945e-06,
      "loss": 0.0417,
      "num_tokens": 6829496.0,
      "reward": 10.100393962860107,
      "reward_std": 10.381993126869201,
      "rewards/wrapper/mean": 5.050196871161461,
      "rewards/wrapper/std": 15.772063083946705,
      "step": 625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.2,
      "completions/mean_length": 254.4375,
      "completions/mean_terminated_length": 1.2,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 1.2,
      "epoch": 0.14780058651026393,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.78125,
      "kl": 0.030532787676202135,
      "learning_rate": 7.845356796748507e-06,
      "loss": -0.0012,
      "num_tokens": 6881764.0,
      "reward": 8.524028420448303,
      "reward_std": 9.672714823484421,
      "rewards/wrapper/mean": 4.262014053016901,
      "rewards/wrapper/std": 12.955846460163594,
      "step": 630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.2,
      "completions/mean_length": 249.95,
      "completions/mean_terminated_length": 61.6,
      "completions/min_length": 112.2,
      "completions/min_terminated_length": 61.0,
      "epoch": 0.14897360703812318,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.8203125,
      "kl": 0.008349815453402697,
      "learning_rate": 7.844153937042703e-06,
      "loss": 0.0055,
      "num_tokens": 6937332.0,
      "reward": 7.351616859436035,
      "reward_std": 9.709289264678954,
      "rewards/wrapper/mean": 3.67580828666687,
      "rewards/wrapper/std": 11.747823464870454,
      "step": 635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.8,
      "completions/mean_length": 250.15,
      "completions/mean_terminated_length": 35.7,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.1501466275659824,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.0859375,
      "kl": 0.005067132035037503,
      "learning_rate": 7.84294116107006e-06,
      "loss": -0.0055,
      "num_tokens": 6991852.0,
      "reward": 15.24313793182373,
      "reward_std": 19.83392467498779,
      "rewards/wrapper/mean": 7.621568508446217,
      "rewards/wrapper/std": 21.900732143223287,
      "step": 640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.4,
      "completions/mean_length": 254.56875,
      "completions/mean_terminated_length": 5.4,
      "completions/min_length": 210.2,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.15131964809384163,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.5625,
      "kl": 0.22633971532341093,
      "learning_rate": 7.841718472983054e-06,
      "loss": 0.0046,
      "num_tokens": 7047317.0,
      "reward": 11.10000295639038,
      "reward_std": 13.981466102600098,
      "rewards/wrapper/mean": 5.550001335144043,
      "rewards/wrapper/std": 15.351305815577508,
      "step": 645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.4,
      "completions/mean_length": 254.50625,
      "completions/mean_terminated_length": 3.4,
      "completions/min_length": 208.2,
      "completions/min_terminated_length": 3.4,
      "epoch": 0.15249266862170088,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 35.25,
      "kl": 0.05593276094878093,
      "learning_rate": 7.840485876968097e-06,
      "loss": -0.0024,
      "num_tokens": 7099374.0,
      "reward": 12.740144157409668,
      "reward_std": 14.847570991516113,
      "rewards/wrapper/mean": 6.3700722143054005,
      "rewards/wrapper/std": 17.99122847020626,
      "step": 650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.1536656891495601,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.46875,
      "kl": 0.010588540998287498,
      "learning_rate": 7.839243377245529e-06,
      "loss": -0.004,
      "num_tokens": 7152777.0,
      "reward": 8.608040571212769,
      "reward_std": 9.49074192047119,
      "rewards/wrapper/mean": 4.304020477086306,
      "rewards/wrapper/std": 12.330823975801469,
      "step": 655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.6,
      "completions/mean_length": 253.06875,
      "completions/mean_terminated_length": 8.6,
      "completions/min_length": 162.2,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.15483870967741936,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.828125,
      "kl": 0.23861043564975262,
      "learning_rate": 7.8379909780696e-06,
      "loss": 0.0039,
      "num_tokens": 7208812.0,
      "reward": 9.388974571228028,
      "reward_std": 11.66281862258911,
      "rewards/wrapper/mean": 4.694487226009369,
      "rewards/wrapper/std": 13.111043818295002,
      "step": 660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.0,
      "completions/mean_length": 252.16875,
      "completions/mean_terminated_length": 31.0,
      "completions/min_length": 133.4,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.1560117302052786,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.0,
      "kl": 0.005851204495411366,
      "learning_rate": 7.836728683728452e-06,
      "loss": 0.0058,
      "num_tokens": 7263149.0,
      "reward": 11.78786849975586,
      "reward_std": 15.879752826690673,
      "rewards/wrapper/mean": 5.893934021890163,
      "rewards/wrapper/std": 17.91351638287306,
      "step": 665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 253.35625,
      "completions/mean_terminated_length": 17.8,
      "completions/min_length": 171.4,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.15718475073313784,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.96875,
      "kl": 0.00533767455490306,
      "learning_rate": 7.83545649854411e-06,
      "loss": -0.0037,
      "num_tokens": 7317060.0,
      "reward": 9.627443599700928,
      "reward_std": 10.127561569213867,
      "rewards/wrapper/mean": 4.813721719384193,
      "rewards/wrapper/std": 14.26107615828514,
      "step": 670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 23.6,
      "completions/mean_length": 250.34375,
      "completions/mean_terminated_length": 22.3,
      "completions/min_length": 123.4,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.15835777126099707,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.375,
      "kl": 0.005868964816909283,
      "learning_rate": 7.834174426872463e-06,
      "loss": -0.0032,
      "num_tokens": 7369351.0,
      "reward": 9.649084949493409,
      "reward_std": 12.152606201171874,
      "rewards/wrapper/mean": 4.824542417377233,
      "rewards/wrapper/std": 15.261625829339028,
      "step": 675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 53.8,
      "completions/mean_length": 251.2875,
      "completions/mean_terminated_length": 50.6,
      "completions/min_length": 149.8,
      "completions/min_terminated_length": 47.4,
      "epoch": 0.15953079178885632,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.09375,
      "kl": 0.00833343998529017,
      "learning_rate": 7.832882473103254e-06,
      "loss": -0.0141,
      "num_tokens": 7426519.0,
      "reward": 14.143692588806152,
      "reward_std": 15.034424209594727,
      "rewards/wrapper/mean": 7.0718462012708185,
      "rewards/wrapper/std": 21.05203797221184,
      "step": 680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.4,
      "completions/mean_length": 252.74375,
      "completions/mean_terminated_length": 49.4,
      "completions/min_length": 151.8,
      "completions/min_terminated_length": 49.4,
      "epoch": 0.16070381231671554,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.5,
      "kl": 0.010165076283738018,
      "learning_rate": 7.831580641660056e-06,
      "loss": -0.0069,
      "num_tokens": 7479772.0,
      "reward": 8.289137840270996,
      "reward_std": 11.20027780532837,
      "rewards/wrapper/mean": 4.1445689931511875,
      "rewards/wrapper/std": 12.911294972896576,
      "step": 685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 76.2,
      "completions/mean_length": 253.5875,
      "completions/mean_terminated_length": 50.7,
      "completions/min_length": 178.8,
      "completions/min_terminated_length": 25.2,
      "epoch": 0.16187683284457477,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.03125,
      "kl": 0.00548412193893455,
      "learning_rate": 7.83026893700027e-06,
      "loss": -0.0066,
      "num_tokens": 7534518.0,
      "reward": 13.755285120010376,
      "reward_std": 13.63194980621338,
      "rewards/wrapper/mean": 6.877642697840929,
      "rewards/wrapper/std": 19.16850001066923,
      "step": 690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 76.0,
      "completions/mean_length": 251.98125,
      "completions/mean_terminated_length": 63.2,
      "completions/min_length": 152.8,
      "completions/min_terminated_length": 50.4,
      "epoch": 0.16304985337243402,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.09375,
      "kl": 0.011029014608357101,
      "learning_rate": 7.828947363615096e-06,
      "loss": -0.0103,
      "num_tokens": 7590827.0,
      "reward": 12.973630714416505,
      "reward_std": 17.048142385482787,
      "rewards/wrapper/mean": 6.4868153288960455,
      "rewards/wrapper/std": 19.200295877456664,
      "step": 695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 248.0375,
      "completions/mean_terminated_length": 0.7,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.16422287390029325,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.546875,
      "kl": 0.006441613682545722,
      "learning_rate": 7.827615926029526e-06,
      "loss": -0.0086,
      "num_tokens": 7645821.0,
      "reward": 9.089998376369476,
      "reward_std": 10.087905651330948,
      "rewards/wrapper/mean": 4.544999056309462,
      "rewards/wrapper/std": 13.267100870609283,
      "step": 700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.2,
      "completions/mean_length": 250.3625,
      "completions/mean_terminated_length": 23.7,
      "completions/min_length": 125.6,
      "completions/min_terminated_length": 23.2,
      "epoch": 0.1653958944281525,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.2734375,
      "kl": 0.0045943191333208235,
      "learning_rate": 7.826274628802327e-06,
      "loss": -0.0068,
      "num_tokens": 7700815.0,
      "reward": 8.232522821426391,
      "reward_std": 11.1143310546875,
      "rewards/wrapper/mean": 4.116261105984449,
      "rewards/wrapper/std": 12.995249216258525,
      "step": 705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.8,
      "completions/mean_length": 253.01875,
      "completions/mean_terminated_length": 3.5,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.16656891495601173,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.40625,
      "kl": 0.007572338276077062,
      "learning_rate": 7.824923476526026e-06,
      "loss": -0.0044,
      "num_tokens": 7755876.0,
      "reward": 13.800330543518067,
      "reward_std": 17.5712028503418,
      "rewards/wrapper/mean": 6.900165131688118,
      "rewards/wrapper/std": 18.640123146772385,
      "step": 710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.2,
      "completions/mean_length": 253.65,
      "completions/mean_terminated_length": 27.2,
      "completions/min_length": 180.8,
      "completions/min_terminated_length": 27.2,
      "epoch": 0.16774193548387098,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.84375,
      "kl": 0.006318861967884004,
      "learning_rate": 7.823562473826892e-06,
      "loss": -0.0044,
      "num_tokens": 7813810.0,
      "reward": 7.7993542671203615,
      "reward_std": 10.558793354034425,
      "rewards/wrapper/mean": 3.8996770560741423,
      "rewards/wrapper/std": 12.23674759566784,
      "step": 715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 252.14375,
      "completions/mean_terminated_length": 15.2,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.1689149560117302,
      "frac_reward_zero_std": 0.075,
      "grad_norm": 2.5625,
      "kl": 0.008202216494828463,
      "learning_rate": 7.822191625364916e-06,
      "loss": 0.0079,
      "num_tokens": 7869451.0,
      "reward": 18.051965522766114,
      "reward_std": 21.227079582214355,
      "rewards/wrapper/mean": 9.02598342001438,
      "rewards/wrapper/std": 20.78606193512678,
      "step": 720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.81875,
      "completions/mean_terminated_length": 0.3,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.17008797653958943,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 7.8125,
      "kl": 0.008373394550289959,
      "learning_rate": 7.820810935833813e-06,
      "loss": -0.0059,
      "num_tokens": 7922738.0,
      "reward": 13.169144535064698,
      "reward_std": 18.08014087677002,
      "rewards/wrapper/mean": 6.5845720142126085,
      "rewards/wrapper/std": 18.33825700432062,
      "step": 725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.4,
      "completions/mean_length": 251.99375,
      "completions/mean_terminated_length": 25.4,
      "completions/min_length": 127.8,
      "completions/min_terminated_length": 25.4,
      "epoch": 0.17126099706744868,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.375,
      "kl": 0.006507234089076519,
      "learning_rate": 7.819420409960982e-06,
      "loss": -0.0115,
      "num_tokens": 7979947.0,
      "reward": 5.9373430728912355,
      "reward_std": 7.835532140731812,
      "rewards/wrapper/mean": 2.9686716251075267,
      "rewards/wrapper/std": 8.117770229279994,
      "step": 730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.1724340175953079,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.8984375,
      "kl": 0.0059782372671179475,
      "learning_rate": 7.818020052507503e-06,
      "loss": -0.0035,
      "num_tokens": 8033822.0,
      "reward": 8.168022966384887,
      "reward_std": 10.164891624450684,
      "rewards/wrapper/mean": 4.084011057019234,
      "rewards/wrapper/std": 12.692610569298267,
      "step": 735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 52.2,
      "completions/mean_length": 252.83125,
      "completions/mean_terminated_length": 52.2,
      "completions/min_length": 154.6,
      "completions/min_terminated_length": 52.2,
      "epoch": 0.17360703812316716,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.6640625,
      "kl": 0.006373942282516509,
      "learning_rate": 7.816609868268123e-06,
      "loss": 0.0013,
      "num_tokens": 8085969.0,
      "reward": 13.514916610717773,
      "reward_std": 16.967267417907713,
      "rewards/wrapper/mean": 6.757457870990038,
      "rewards/wrapper/std": 20.913749350607397,
      "step": 740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.6,
      "completions/mean_length": 250.0625,
      "completions/mean_terminated_length": 14.6,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 14.6,
      "epoch": 0.1747800586510264,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 7.71875,
      "kl": 0.015622739831451326,
      "learning_rate": 7.81518986207123e-06,
      "loss": -0.0026,
      "num_tokens": 8141105.0,
      "reward": 9.454648804664611,
      "reward_std": 12.46040769815445,
      "rewards/wrapper/mean": 4.727324414253235,
      "rewards/wrapper/std": 14.982401445508003,
      "step": 745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.8,
      "completions/mean_length": 250.56875,
      "completions/mean_terminated_length": 15.7,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.17595307917888564,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.890625,
      "kl": 0.023395044403150676,
      "learning_rate": 7.81376003877885e-06,
      "loss": -0.0032,
      "num_tokens": 8195538.0,
      "reward": 5.900467705726624,
      "reward_std": 7.061243009567261,
      "rewards/wrapper/mean": 2.950233814120293,
      "rewards/wrapper/std": 9.529572576284409,
      "step": 750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.4,
      "completions/mean_length": 249.94375,
      "completions/mean_terminated_length": 6.9,
      "completions/min_length": 106.8,
      "completions/min_terminated_length": 4.4,
      "epoch": 0.17712609970674487,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.734375,
      "kl": 0.027390728006139398,
      "learning_rate": 7.812320403286612e-06,
      "loss": -0.0116,
      "num_tokens": 8248127.0,
      "reward": 7.320503807067871,
      "reward_std": 7.543186902999878,
      "rewards/wrapper/mean": 3.660252057760954,
      "rewards/wrapper/std": 10.80094509869814,
      "step": 755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.8,
      "completions/mean_length": 253.85625,
      "completions/mean_terminated_length": 33.8,
      "completions/min_length": 187.4,
      "completions/min_terminated_length": 33.8,
      "epoch": 0.17829912023460412,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.875,
      "kl": 0.022175381588749588,
      "learning_rate": 7.810870960523749e-06,
      "loss": -0.0016,
      "num_tokens": 8301810.0,
      "reward": 8.605068969726563,
      "reward_std": 11.631190872192382,
      "rewards/wrapper/mean": 4.302534601837396,
      "rewards/wrapper/std": 15.27453635185957,
      "step": 760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.8,
      "completions/mean_length": 253.0125,
      "completions/mean_terminated_length": 6.8,
      "completions/min_length": 160.4,
      "completions/min_terminated_length": 6.8,
      "epoch": 0.17947214076246334,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.21875,
      "kl": 0.007904288393910974,
      "learning_rate": 7.809411715453069e-06,
      "loss": -0.0078,
      "num_tokens": 8357124.0,
      "reward": 9.56756021976471,
      "reward_std": 11.147935009002685,
      "rewards/wrapper/mean": 4.7837800078094,
      "rewards/wrapper/std": 16.28676289319992,
      "step": 765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 107.0,
      "completions/mean_length": 252.95625,
      "completions/mean_terminated_length": 103.8,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 100.6,
      "epoch": 0.18064516129032257,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.328125,
      "kl": 0.02297303997911513,
      "learning_rate": 7.807942673070945e-06,
      "loss": -0.0041,
      "num_tokens": 8413391.0,
      "reward": 10.628111362457275,
      "reward_std": 12.795229434967041,
      "rewards/wrapper/mean": 5.314055364578962,
      "rewards/wrapper/std": 16.77784028351307,
      "step": 770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 51.4,
      "completions/mean_length": 251.3625,
      "completions/mean_terminated_length": 31.8,
      "completions/min_length": 114.6,
      "completions/min_terminated_length": 12.2,
      "epoch": 0.18181818181818182,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.3046875,
      "kl": 0.006928546976996586,
      "learning_rate": 7.806463838407295e-06,
      "loss": -0.008,
      "num_tokens": 8466755.0,
      "reward": 13.176506042480469,
      "reward_std": 16.891181087493898,
      "rewards/wrapper/mean": 6.588253006339073,
      "rewards/wrapper/std": 16.821544279158115,
      "step": 775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.2,
      "completions/mean_length": 252.18125,
      "completions/mean_terminated_length": 31.2,
      "completions/min_length": 184.8,
      "completions/min_terminated_length": 31.2,
      "epoch": 0.18299120234604105,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.640625,
      "kl": 0.07739391865034122,
      "learning_rate": 7.804975216525566e-06,
      "loss": -0.0047,
      "num_tokens": 8521826.0,
      "reward": 6.241498494148255,
      "reward_std": 8.379155158996582,
      "rewards/wrapper/mean": 3.1207491770386695,
      "rewards/wrapper/std": 10.579415337741375,
      "step": 780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 252.9875,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 159.6,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.1841642228739003,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 2.046875,
      "kl": 0.00618872475461103,
      "learning_rate": 7.803476812522711e-06,
      "loss": -0.0075,
      "num_tokens": 8577354.0,
      "reward": 12.9086124420166,
      "reward_std": 16.800788116455077,
      "rewards/wrapper/mean": 6.454306278377771,
      "rewards/wrapper/std": 19.88245558142662,
      "step": 785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 254.49375,
      "completions/mean_terminated_length": 3.0,
      "completions/min_length": 207.8,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.18533724340175953,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7421875,
      "kl": 0.01572358557023108,
      "learning_rate": 7.801968631529187e-06,
      "loss": -0.0041,
      "num_tokens": 8631829.0,
      "reward": 7.105897712707519,
      "reward_std": 8.864458084106445,
      "rewards/wrapper/mean": 3.552948968857527,
      "rewards/wrapper/std": 12.096154929697514,
      "step": 790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.2,
      "completions/mean_length": 255.375,
      "completions/mean_terminated_length": 31.2,
      "completions/min_length": 236.0,
      "completions/min_terminated_length": 31.2,
      "epoch": 0.18651026392961878,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 5.25,
      "kl": 0.04917837719549425,
      "learning_rate": 7.800450678708914e-06,
      "loss": 0.0033,
      "num_tokens": 8687353.0,
      "reward": 15.010721778869629,
      "reward_std": 16.94651641845703,
      "rewards/wrapper/mean": 7.5053609274327755,
      "rewards/wrapper/std": 22.161951984465123,
      "step": 795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.0,
      "completions/mean_length": 250.5875,
      "completions/mean_terminated_length": 30.7,
      "completions/min_length": 132.8,
      "completions/min_terminated_length": 30.4,
      "epoch": 0.187683284457478,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.265625,
      "kl": 0.005762098281411454,
      "learning_rate": 7.79892295925928e-06,
      "loss": -0.0046,
      "num_tokens": 8739759.0,
      "reward": 11.818170356750489,
      "reward_std": 12.876243591308594,
      "rewards/wrapper/mean": 5.909084708243609,
      "rewards/wrapper/std": 18.03155415803194,
      "step": 800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 37.2,
      "completions/mean_length": 252.4875,
      "completions/mean_terminated_length": 32.5,
      "completions/min_length": 181.4,
      "completions/min_terminated_length": 27.8,
      "epoch": 0.18885630498533723,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.984375,
      "kl": 0.028124336060136555,
      "learning_rate": 7.797385478411107e-06,
      "loss": -0.002,
      "num_tokens": 8794821.0,
      "reward": 7.452442216873169,
      "reward_std": 8.096717083454132,
      "rewards/wrapper/mean": 3.7262209847569467,
      "rewards/wrapper/std": 10.810575023293495,
      "step": 805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.6,
      "completions/mean_length": 247.0375,
      "completions/mean_terminated_length": 9.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.19002932551319648,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 524.0,
      "kl": 0.06574524050229229,
      "learning_rate": 7.795838241428644e-06,
      "loss": 0.0005,
      "num_tokens": 8849903.0,
      "reward": 7.610436058044433,
      "reward_std": 9.26673491001129,
      "rewards/wrapper/mean": 3.805217783153057,
      "rewards/wrapper/std": 13.05225038230419,
      "step": 810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.1912023460410557,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6015625,
      "kl": 0.032787256489973514,
      "learning_rate": 7.794281253609542e-06,
      "loss": 0.0038,
      "num_tokens": 8903826.0,
      "reward": 9.780489444732666,
      "reward_std": 13.06036205291748,
      "rewards/wrapper/mean": 4.890244487673044,
      "rewards/wrapper/std": 16.01880385428667,
      "step": 815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.8,
      "completions/mean_length": 252.10625,
      "completions/mean_terminated_length": 28.8,
      "completions/min_length": 182.4,
      "completions/min_terminated_length": 28.8,
      "epoch": 0.19237536656891496,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.890625,
      "kl": 0.007354407967068255,
      "learning_rate": 7.79271452028484e-06,
      "loss": -0.0057,
      "num_tokens": 8960313.0,
      "reward": 12.27817931175232,
      "reward_std": 16.556332683563234,
      "rewards/wrapper/mean": 6.139089624583721,
      "rewards/wrapper/std": 15.078392013907433,
      "step": 820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.4,
      "completions/mean_length": 252.98125,
      "completions/mean_terminated_length": 37.0,
      "completions/min_length": 178.2,
      "completions/min_terminated_length": 24.6,
      "epoch": 0.1935483870967742,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.328125,
      "kl": 0.007571105333045125,
      "learning_rate": 7.791138046818944e-06,
      "loss": -0.0037,
      "num_tokens": 9014292.0,
      "reward": 14.692852687835693,
      "reward_std": 19.98781144618988,
      "rewards/wrapper/mean": 7.346426869183778,
      "rewards/wrapper/std": 20.598740892112254,
      "step": 825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.6,
      "completions/mean_length": 251.25,
      "completions/mean_terminated_length": 1.6,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.19472140762463344,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.78125,
      "kl": 0.009129003703128547,
      "learning_rate": 7.78955183860961e-06,
      "loss": -0.0121,
      "num_tokens": 9070082.0,
      "reward": 9.678040361404419,
      "reward_std": 12.95136342048645,
      "rewards/wrapper/mean": 4.839020009338856,
      "rewards/wrapper/std": 14.712241315841675,
      "step": 830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.0,
      "completions/mean_length": 249.91875,
      "completions/mean_terminated_length": 6.5,
      "completions/min_length": 105.4,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.19589442815249267,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.71875,
      "kl": 0.00882287950371392,
      "learning_rate": 7.787955901087924e-06,
      "loss": -0.0139,
      "num_tokens": 9126347.0,
      "reward": 8.241892063617707,
      "reward_std": 11.121530401706696,
      "rewards/wrapper/mean": 4.120945824682712,
      "rewards/wrapper/std": 13.859178911149503,
      "step": 835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.19706744868035192,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.265625,
      "kl": 0.0057106147520244125,
      "learning_rate": 7.786350239718285e-06,
      "loss": -0.0104,
      "num_tokens": 9184344.0,
      "reward": 7.929042911529541,
      "reward_std": 10.194645261764526,
      "rewards/wrapper/mean": 3.9645213529467584,
      "rewards/wrapper/std": 12.511058503389359,
      "step": 840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.2,
      "completions/mean_length": 255.75,
      "completions/mean_terminated_length": 43.2,
      "completions/min_length": 248.0,
      "completions/min_terminated_length": 43.2,
      "epoch": 0.19824046920821115,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.546875,
      "kl": 0.010326540493406356,
      "learning_rate": 7.784734859998386e-06,
      "loss": 0.0013,
      "num_tokens": 9241426.0,
      "reward": 12.294482326507568,
      "reward_std": 14.175844478607178,
      "rewards/wrapper/mean": 6.14724093079567,
      "rewards/wrapper/std": 16.098360952734946,
      "step": 845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.19941348973607037,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.0625,
      "kl": 0.006363995838910341,
      "learning_rate": 7.783109767459199e-06,
      "loss": -0.0111,
      "num_tokens": 9294562.0,
      "reward": 12.141466617584229,
      "reward_std": 16.330566787719725,
      "rewards/wrapper/mean": 6.070733168721199,
      "rewards/wrapper/std": 16.94215931892395,
      "step": 850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.8,
      "completions/mean_length": 245.30625,
      "completions/mean_terminated_length": 29.5,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.20058651026392962,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.15625,
      "kl": 0.007616071903612465,
      "learning_rate": 7.781474967664944e-06,
      "loss": -0.018,
      "num_tokens": 9348237.0,
      "reward": 7.399131870269775,
      "reward_std": 9.726300144195557,
      "rewards/wrapper/mean": 3.699565923213959,
      "rewards/wrapper/std": 13.051005025207996,
      "step": 855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.94375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 53.4,
      "completions/mean_length": 243.3,
      "completions/mean_terminated_length": 18.33333339691162,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.20175953079178885,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.21875,
      "kl": 0.01259395177476108,
      "learning_rate": 7.779830466213087e-06,
      "loss": -0.0218,
      "num_tokens": 9401783.0,
      "reward": 9.727622842788696,
      "reward_std": 11.499066877365113,
      "rewards/wrapper/mean": 4.863811122626066,
      "rewards/wrapper/std": 11.483076599240302,
      "step": 860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 252.925,
      "completions/mean_terminated_length": 4.0,
      "completions/min_length": 157.6,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.2029325513196481,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 5.5,
      "kl": 0.018720851698890328,
      "learning_rate": 7.778176268734307e-06,
      "loss": -0.0095,
      "num_tokens": 9464335.0,
      "reward": 11.788486242294312,
      "reward_std": 14.090129899978638,
      "rewards/wrapper/mean": 5.894242788851261,
      "rewards/wrapper/std": 16.64428468346596,
      "step": 865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 249.625,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.20410557184750733,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.015625,
      "kl": 12.23134752092883,
      "learning_rate": 7.776512380892478e-06,
      "loss": 0.4802,
      "num_tokens": 9518197.0,
      "reward": 12.99444284439087,
      "reward_std": 14.776373958587646,
      "rewards/wrapper/mean": 6.497221313416958,
      "rewards/wrapper/std": 17.821214818954466,
      "step": 870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 247.7625,
      "completions/mean_terminated_length": 36.1,
      "completions/min_length": 131.6,
      "completions/min_terminated_length": 29.2,
      "epoch": 0.20527859237536658,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.5546875,
      "kl": 0.22191111339488998,
      "learning_rate": 7.774838808384665e-06,
      "loss": -0.0061,
      "num_tokens": 9574929.0,
      "reward": 8.010742235183717,
      "reward_std": 8.707789611816406,
      "rewards/wrapper/mean": 4.005371156334877,
      "rewards/wrapper/std": 11.873735588788985,
      "step": 875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.0,
      "completions/mean_length": 249.74375,
      "completions/mean_terminated_length": 28.0,
      "completions/min_length": 130.4,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.2064516129032258,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.234375,
      "kl": 0.0091151007974986,
      "learning_rate": 7.773155556941077e-06,
      "loss": -0.0074,
      "num_tokens": 9628886.0,
      "reward": 6.747600078582764,
      "reward_std": 8.869162845611573,
      "rewards/wrapper/mean": 3.3737999342381952,
      "rewards/wrapper/std": 12.170971043407917,
      "step": 880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.4,
      "completions/mean_length": 249.64375,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 52.6,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.20762463343108503,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.53125,
      "kl": 0.008626295503927395,
      "learning_rate": 7.771462632325079e-06,
      "loss": -0.0022,
      "num_tokens": 9684261.0,
      "reward": 6.184603309631347,
      "reward_std": 8.200012350082398,
      "rewards/wrapper/mean": 3.0923014655709267,
      "rewards/wrapper/std": 11.65102232992649,
      "step": 885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 249.85625,
      "completions/mean_terminated_length": 4.3,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.20879765395894428,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.75,
      "kl": 0.006855689559597522,
      "learning_rate": 7.769760040333146e-06,
      "loss": -0.0166,
      "num_tokens": 9743200.0,
      "reward": 10.165173721313476,
      "reward_std": 13.681201267242432,
      "rewards/wrapper/mean": 5.082586967200041,
      "rewards/wrapper/std": 15.366864316165447,
      "step": 890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 72.4,
      "completions/mean_length": 250.275,
      "completions/mean_terminated_length": 72.4,
      "completions/min_length": 174.8,
      "completions/min_terminated_length": 72.4,
      "epoch": 0.2099706744868035,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.765625,
      "kl": 0.006981910020112991,
      "learning_rate": 7.768047786794854e-06,
      "loss": -0.0108,
      "num_tokens": 9798840.0,
      "reward": 12.545556449890137,
      "reward_std": 16.114229202270508,
      "rewards/wrapper/mean": 6.272777940332889,
      "rewards/wrapper/std": 16.644485236704348,
      "step": 895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 254.825,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 218.4,
      "completions/min_terminated_length": 13.6,
      "epoch": 0.21114369501466276,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.890625,
      "kl": 0.008089781797025353,
      "learning_rate": 7.766325877572865e-06,
      "loss": -0.0024,
      "num_tokens": 9855976.0,
      "reward": 8.491017055511474,
      "reward_std": 9.61400227546692,
      "rewards/wrapper/mean": 4.245508745312691,
      "rewards/wrapper/std": 13.363166551291943,
      "step": 900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 251.6,
      "completions/mean_terminated_length": 6.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.212316715542522,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.375,
      "kl": 85.43212811666308,
      "learning_rate": 7.764594318562897e-06,
      "loss": 3.401,
      "num_tokens": 9910584.0,
      "reward": 12.041954612731933,
      "reward_std": 12.423308753967286,
      "rewards/wrapper/mean": 6.020977398753166,
      "rewards/wrapper/std": 16.532723309099673,
      "step": 905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.4,
      "completions/mean_length": 252.65,
      "completions/mean_terminated_length": 46.4,
      "completions/min_length": 148.8,
      "completions/min_terminated_length": 46.4,
      "epoch": 0.21348973607038124,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.703125,
      "kl": 0.02620959288906306,
      "learning_rate": 7.76285311569371e-06,
      "loss": -0.0118,
      "num_tokens": 9969166.0,
      "reward": 10.540363311767578,
      "reward_std": 12.41097354888916,
      "rewards/wrapper/mean": 5.270181411504746,
      "rewards/wrapper/std": 16.059864945709705,
      "step": 910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.21466275659824047,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.078125,
      "kl": 0.006863275967771187,
      "learning_rate": 7.761102274927087e-06,
      "loss": -0.006,
      "num_tokens": 10026009.0,
      "reward": 10.822275352478027,
      "reward_std": 12.488915252685548,
      "rewards/wrapper/mean": 5.411137568205595,
      "rewards/wrapper/std": 17.650423718988897,
      "step": 915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 36.6,
      "completions/mean_length": 252.6125,
      "completions/mean_terminated_length": 22.7,
      "completions/min_length": 162.4,
      "completions/min_terminated_length": 8.8,
      "epoch": 0.21583577712609972,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.65625,
      "kl": 0.012789941893424838,
      "learning_rate": 7.759341802257804e-06,
      "loss": -0.0037,
      "num_tokens": 10081561.0,
      "reward": 8.169697475433349,
      "reward_std": 10.355749702453613,
      "rewards/wrapper/mean": 4.084848717600107,
      "rewards/wrapper/std": 14.507414634525777,
      "step": 920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 58.8,
      "completions/mean_length": 248.25625,
      "completions/mean_terminated_length": 47.0,
      "completions/min_length": 143.2,
      "completions/min_terminated_length": 40.8,
      "epoch": 0.21700879765395895,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.421875,
      "kl": 0.0058027503313496705,
      "learning_rate": 7.75757170371362e-06,
      "loss": -0.0189,
      "num_tokens": 10138170.0,
      "reward": 9.064529609680175,
      "reward_std": 10.007425928115845,
      "rewards/wrapper/mean": 4.532264867424965,
      "rewards/wrapper/std": 15.986943626403809,
      "step": 925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 53.2,
      "completions/mean_length": 251.2625,
      "completions/mean_terminated_length": 53.2,
      "completions/min_length": 104.4,
      "completions/min_terminated_length": 53.2,
      "epoch": 0.21818181818181817,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.8046875,
      "kl": 0.01003794745192863,
      "learning_rate": 7.755791985355252e-06,
      "loss": -0.0052,
      "num_tokens": 10188544.0,
      "reward": 12.584936428070069,
      "reward_std": 16.22059907913208,
      "rewards/wrapper/mean": 6.292468182742596,
      "rewards/wrapper/std": 17.238589255511762,
      "step": 930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.6,
      "completions/mean_length": 252.8875,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.21935483870967742,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.328125,
      "kl": 0.10706454229075461,
      "learning_rate": 7.754002653276356e-06,
      "loss": -0.0062,
      "num_tokens": 10240576.0,
      "reward": 8.845070493221282,
      "reward_std": 11.477675139904022,
      "rewards/wrapper/mean": 4.422535435855389,
      "rewards/wrapper/std": 14.686181424558162,
      "step": 935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 57.4,
      "completions/mean_length": 253.86875,
      "completions/mean_terminated_length": 46.0,
      "completions/min_length": 188.2,
      "completions/min_terminated_length": 34.6,
      "epoch": 0.22052785923753665,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.421875,
      "kl": 0.012510137457866222,
      "learning_rate": 7.752203713603501e-06,
      "loss": -0.0001,
      "num_tokens": 10297877.0,
      "reward": 15.567786598205567,
      "reward_std": 18.3408540725708,
      "rewards/wrapper/mean": 7.783893074095249,
      "rewards/wrapper/std": 18.60699598044157,
      "step": 940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.8,
      "completions/mean_length": 252.73125,
      "completions/mean_terminated_length": 24.6,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.2217008797653959,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 206.0,
      "kl": 0.020914323756005614,
      "learning_rate": 7.750395172496158e-06,
      "loss": -0.0082,
      "num_tokens": 10353394.0,
      "reward": 7.288831424713135,
      "reward_std": 9.706880664825439,
      "rewards/wrapper/mean": 3.6444156602025033,
      "rewards/wrapper/std": 11.424931126832963,
      "step": 945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.2,
      "completions/mean_length": 248.175,
      "completions/mean_terminated_length": 5.2,
      "completions/min_length": 107.6,
      "completions/min_terminated_length": 5.2,
      "epoch": 0.22287390029325513,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.875,
      "kl": 0.015235843160189689,
      "learning_rate": 7.748577036146666e-06,
      "loss": -0.0159,
      "num_tokens": 10408856.0,
      "reward": 7.371206140518188,
      "reward_std": 8.97930736541748,
      "rewards/wrapper/mean": 3.6856027841567993,
      "rewards/wrapper/std": 11.833311099559069,
      "step": 950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.22404692082111438,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 11.125,
      "kl": 0.014451591449324041,
      "learning_rate": 7.746749310780223e-06,
      "loss": -0.0044,
      "num_tokens": 10463535.0,
      "reward": 11.889365959167481,
      "reward_std": 8.928557300567627,
      "rewards/wrapper/mean": 5.944682708382606,
      "rewards/wrapper/std": 17.176417842507362,
      "step": 955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 99.6,
      "completions/mean_length": 251.64375,
      "completions/mean_terminated_length": 50.03333435058594,
      "completions/min_length": 170.4,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.2252199413489736,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.71875,
      "kl": 0.005338036478497088,
      "learning_rate": 7.744912002654856e-06,
      "loss": -0.0013,
      "num_tokens": 10520552.0,
      "reward": 7.731118607521057,
      "reward_std": 10.374876952171325,
      "rewards/wrapper/mean": 3.8655590668320654,
      "rewards/wrapper/std": 13.530312813818455,
      "step": 960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 247.75625,
      "completions/mean_terminated_length": 41.8,
      "completions/min_length": 91.8,
      "completions/min_terminated_length": 40.6,
      "epoch": 0.22639296187683283,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.28125,
      "kl": 0.011531842313706875,
      "learning_rate": 7.743065118061405e-06,
      "loss": -0.0041,
      "num_tokens": 10573747.0,
      "reward": 11.806914710998536,
      "reward_std": 15.965612602233886,
      "rewards/wrapper/mean": 5.90345728546381,
      "rewards/wrapper/std": 17.234433594346047,
      "step": 965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.2,
      "completions/mean_length": 254.5625,
      "completions/mean_terminated_length": 5.2,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 5.2,
      "epoch": 0.22756598240469209,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 9.25,
      "kl": 0.011473823472624645,
      "learning_rate": 7.741208663323497e-06,
      "loss": -0.0029,
      "num_tokens": 10626809.0,
      "reward": 11.198212456703185,
      "reward_std": 12.455942213535309,
      "rewards/wrapper/mean": 5.599106089770794,
      "rewards/wrapper/std": 15.173226012289524,
      "step": 970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.8,
      "completions/mean_length": 252.85625,
      "completions/mean_terminated_length": 1.8,
      "completions/min_length": 155.4,
      "completions/min_terminated_length": 1.8,
      "epoch": 0.2287390029325513,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.953125,
      "kl": 0.01461629854165949,
      "learning_rate": 7.739342644797526e-06,
      "loss": 0.0007,
      "num_tokens": 10680226.0,
      "reward": 7.582841587066651,
      "reward_std": 10.145219755172729,
      "rewards/wrapper/mean": 3.7914206713438032,
      "rewards/wrapper/std": 12.200360830128194,
      "step": 975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.0,
      "completions/mean_length": 251.5125,
      "completions/mean_terminated_length": 10.0,
      "completions/min_length": 112.4,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.22991202346041056,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.3203125,
      "kl": 1.5801498372165952,
      "learning_rate": 7.737467068872637e-06,
      "loss": 0.0504,
      "num_tokens": 10733206.0,
      "reward": 11.884287261962891,
      "reward_std": 14.595683097839355,
      "rewards/wrapper/mean": 5.942143467068672,
      "rewards/wrapper/std": 19.32650369256735,
      "step": 980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.2310850439882698,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.75,
      "kl": 0.009276495571248234,
      "learning_rate": 7.735581941970693e-06,
      "loss": -0.0059,
      "num_tokens": 10785268.0,
      "reward": 6.786200904846192,
      "reward_std": 8.93930425643921,
      "rewards/wrapper/mean": 3.393100444227457,
      "rewards/wrapper/std": 10.605686566233635,
      "step": 985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.23225806451612904,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.40625,
      "kl": 0.0071992134675383564,
      "learning_rate": 7.733687270546263e-06,
      "loss": -0.006,
      "num_tokens": 10839475.0,
      "reward": 12.208200645446777,
      "reward_std": 14.330049514770508,
      "rewards/wrapper/mean": 6.104100047051906,
      "rewards/wrapper/std": 16.828649199008943,
      "step": 990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.2,
      "completions/mean_length": 246.2875,
      "completions/mean_terminated_length": 47.2,
      "completions/min_length": 47.2,
      "completions/min_terminated_length": 47.2,
      "epoch": 0.23343108504398827,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.6875,
      "kl": 0.20198758316691964,
      "learning_rate": 7.731783061086594e-06,
      "loss": -0.0104,
      "num_tokens": 10894911.0,
      "reward": 9.800655174255372,
      "reward_std": 12.638169860839843,
      "rewards/wrapper/mean": 4.900327530503273,
      "rewards/wrapper/std": 17.312468548119067,
      "step": 995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.4,
      "completions/mean_length": 248.39375,
      "completions/mean_terminated_length": 11.9,
      "completions/min_length": 62.6,
      "completions/min_terminated_length": 11.4,
      "epoch": 0.23460410557184752,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.703125,
      "kl": 0.029017451941035687,
      "learning_rate": 7.729869320111593e-06,
      "loss": -0.0203,
      "num_tokens": 10952706.0,
      "reward": 8.67610182762146,
      "reward_std": 9.548018550872802,
      "rewards/wrapper/mean": 4.338050843402743,
      "rewards/wrapper/std": 12.254481440782547,
      "step": 1000
    },
    {
      "epoch": 0.23460410557184752,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.82,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 69.66,
      "eval_completions/mean_length": 230.335,
      "eval_completions/mean_terminated_length": 61.85166683197021,
      "eval_completions/min_length": 173.91,
      "eval_completions/min_terminated_length": 53.59,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.009674767768010496,
      "eval_loss": -0.04156604781746864,
      "eval_num_tokens": 10952706.0,
      "eval_reward": 0.31490315936505797,
      "eval_reward_std": 0.11863522203173488,
      "eval_rewards/wrapper/mean": 0.1574515798687935,
      "eval_rewards/wrapper/std": 0.11627076880075038,
      "eval_runtime": 211.1908,
      "eval_samples_per_second": 0.947,
      "eval_steps_per_second": 0.237,
      "step": 1000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.23577712609970675,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.5625,
      "kl": 0.011619134614011272,
      "learning_rate": 7.727946054173796e-06,
      "loss": -0.0046,
      "num_tokens": 11006125.0,
      "reward": 12.64652976989746,
      "reward_std": 15.572536277770997,
      "rewards/wrapper/mean": 6.323264981806278,
      "rewards/wrapper/std": 20.6168105751276,
      "step": 1005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.23695014662756597,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.453125,
      "kl": 0.006164612190332264,
      "learning_rate": 7.726013269858362e-06,
      "loss": -0.001,
      "num_tokens": 11061484.0,
      "reward": 11.502595329284668,
      "reward_std": 15.562606811523438,
      "rewards/wrapper/mean": 5.751297509670257,
      "rewards/wrapper/std": 18.69067438542843,
      "step": 1010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.23812316715542522,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.65625,
      "kl": 0.0124075862695463,
      "learning_rate": 7.724070973783033e-06,
      "loss": -0.0083,
      "num_tokens": 11116552.0,
      "reward": 9.195004653930663,
      "reward_std": 12.420144939422608,
      "rewards/wrapper/mean": 4.59750243127346,
      "rewards/wrapper/std": 16.103847907483576,
      "step": 1015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.8,
      "completions/mean_length": 248.475,
      "completions/mean_terminated_length": 7.7,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.23929618768328445,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.28125,
      "kl": 0.009327601094264537,
      "learning_rate": 7.722119172598117e-06,
      "loss": -0.0073,
      "num_tokens": 11170880.0,
      "reward": 8.787823486328126,
      "reward_std": 11.928912353515624,
      "rewards/wrapper/mean": 4.393911641836167,
      "rewards/wrapper/std": 14.354398925602435,
      "step": 1020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.2,
      "completions/mean_length": 252.3,
      "completions/mean_terminated_length": 35.2,
      "completions/min_length": 137.6,
      "completions/min_terminated_length": 35.2,
      "epoch": 0.2404692082111437,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 7.125,
      "kl": 0.007904788851737976,
      "learning_rate": 7.720157872986474e-06,
      "loss": -0.0067,
      "num_tokens": 11227748.0,
      "reward": 9.267306017875672,
      "reward_std": 12.493532657623291,
      "rewards/wrapper/mean": 4.633652974665165,
      "rewards/wrapper/std": 14.101441629230976,
      "step": 1025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 249.6625,
      "completions/mean_terminated_length": 2.0,
      "completions/min_length": 53.2,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.24164222873900293,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.03125,
      "kl": 0.013292332063429058,
      "learning_rate": 7.718187081663484e-06,
      "loss": -0.0012,
      "num_tokens": 11283032.0,
      "reward": 11.44096269607544,
      "reward_std": 15.433443355560303,
      "rewards/wrapper/mean": 5.72048115581274,
      "rewards/wrapper/std": 17.70729095637798,
      "step": 1030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24281524926686218,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.046875,
      "kl": 0.009489351080264895,
      "learning_rate": 7.716206805377021e-06,
      "loss": 0.0004,
      "num_tokens": 11339056.0,
      "reward": 11.168231201171874,
      "reward_std": 9.798115158081055,
      "rewards/wrapper/mean": 5.584115269035101,
      "rewards/wrapper/std": 14.987447142601013,
      "step": 1035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 111.2,
      "completions/mean_length": 249.9,
      "completions/mean_terminated_length": 107.6,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.2439882697947214,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.4765625,
      "kl": 0.009569767396897078,
      "learning_rate": 7.714217050907444e-06,
      "loss": -0.0182,
      "num_tokens": 11392794.0,
      "reward": 9.1754976272583,
      "reward_std": 11.94598445892334,
      "rewards/wrapper/mean": 4.587748650461435,
      "rewards/wrapper/std": 14.406213076412678,
      "step": 1040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.24516129032258063,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.609375,
      "kl": 0.006833912117872387,
      "learning_rate": 7.712217825067554e-06,
      "loss": -0.0123,
      "num_tokens": 11450883.0,
      "reward": 11.125387191772461,
      "reward_std": 14.580111122131347,
      "rewards/wrapper/mean": 5.562693519145251,
      "rewards/wrapper/std": 18.808959732949734,
      "step": 1045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.8,
      "completions/mean_length": 248.6875,
      "completions/mean_terminated_length": 21.8,
      "completions/min_length": 73.0,
      "completions/min_terminated_length": 21.8,
      "epoch": 0.24633431085043989,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 7.0625,
      "kl": 0.04593934025615454,
      "learning_rate": 7.710209134702588e-06,
      "loss": -0.0156,
      "num_tokens": 11506459.0,
      "reward": 9.748054599761963,
      "reward_std": 10.04455499649048,
      "rewards/wrapper/mean": 4.874027146399021,
      "rewards/wrapper/std": 14.737575414776803,
      "step": 1050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.0,
      "completions/mean_length": 249.3,
      "completions/mean_terminated_length": 17.333333587646486,
      "completions/min_length": 107.8,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.2475073313782991,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.03125,
      "kl": 0.011725964327342808,
      "learning_rate": 7.708190986690189e-06,
      "loss": -0.005,
      "num_tokens": 11560853.0,
      "reward": 6.436659145355224,
      "reward_std": 6.5048364162445065,
      "rewards/wrapper/mean": 3.218329684436321,
      "rewards/wrapper/std": 10.194329760968685,
      "step": 1055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.4,
      "completions/mean_length": 250.19375,
      "completions/mean_terminated_length": 39.9,
      "completions/min_length": 133.8,
      "completions/min_terminated_length": 31.4,
      "epoch": 0.24868035190615836,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 3.96875,
      "kl": 0.006699386925902218,
      "learning_rate": 7.706163387940381e-06,
      "loss": -0.0045,
      "num_tokens": 11617612.0,
      "reward": 10.259013462066651,
      "reward_std": 12.297688674926757,
      "rewards/wrapper/mean": 5.129506582021714,
      "rewards/wrapper/std": 16.151020860672,
      "step": 1060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.2,
      "completions/mean_length": 252.55,
      "completions/mean_terminated_length": 43.2,
      "completions/min_length": 145.6,
      "completions/min_terminated_length": 43.2,
      "epoch": 0.2498533724340176,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.0,
      "kl": 0.02774180765263736,
      "learning_rate": 7.704126345395549e-06,
      "loss": -0.0038,
      "num_tokens": 11672532.0,
      "reward": 8.056447982788086,
      "reward_std": 8.833860492706298,
      "rewards/wrapper/mean": 4.02822390422225,
      "rewards/wrapper/std": 14.670276536047458,
      "step": 1065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.2,
      "completions/mean_length": 248.75625,
      "completions/mean_terminated_length": 12.2,
      "completions/min_length": 105.6,
      "completions/min_terminated_length": 3.2,
      "epoch": 0.25102639296187684,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.6953125,
      "kl": 0.009839186503086239,
      "learning_rate": 7.702079866030408e-06,
      "loss": 0.0011,
      "num_tokens": 11726633.0,
      "reward": 11.247284412384033,
      "reward_std": 15.467829513549805,
      "rewards/wrapper/mean": 5.623642058670521,
      "rewards/wrapper/std": 18.223739244043827,
      "step": 1070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.4,
      "completions/mean_length": 252.84375,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.25219941348973607,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.453125,
      "kl": 0.026338630728423594,
      "learning_rate": 7.700023956851989e-06,
      "loss": -0.0101,
      "num_tokens": 11781894.0,
      "reward": 9.359408187866212,
      "reward_std": 10.791181874275207,
      "rewards/wrapper/mean": 4.679704067856074,
      "rewards/wrapper/std": 12.456576159596443,
      "step": 1075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.2,
      "completions/mean_length": 247.69375,
      "completions/mean_terminated_length": 37.9,
      "completions/min_length": 34.6,
      "completions/min_terminated_length": 34.6,
      "epoch": 0.2533724340175953,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.6875,
      "kl": 0.008290641068015248,
      "learning_rate": 7.697958624899609e-06,
      "loss": -0.0154,
      "num_tokens": 11836557.0,
      "reward": 4.384031456708908,
      "reward_std": 5.678225213289261,
      "rewards/wrapper/mean": 2.1920157223939896,
      "rewards/wrapper/std": 7.228950951993466,
      "step": 1080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.6,
      "completions/mean_length": 252.6875,
      "completions/mean_terminated_length": 47.6,
      "completions/min_length": 150.0,
      "completions/min_terminated_length": 47.6,
      "epoch": 0.2545454545454545,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 7.65625,
      "kl": 0.009902437112759798,
      "learning_rate": 7.695883877244846e-06,
      "loss": -0.0113,
      "num_tokens": 11891565.0,
      "reward": 9.418929600715638,
      "reward_std": 11.308019065856934,
      "rewards/wrapper/mean": 4.709464704990387,
      "rewards/wrapper/std": 14.645676551759243,
      "step": 1085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 70.0,
      "completions/mean_length": 253.3875,
      "completions/mean_terminated_length": 70.0,
      "completions/min_length": 172.4,
      "completions/min_terminated_length": 70.0,
      "epoch": 0.2557184750733138,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.203125,
      "kl": 0.008452445617876947,
      "learning_rate": 7.69379972099152e-06,
      "loss": -0.0044,
      "num_tokens": 11946271.0,
      "reward": 10.250184059143066,
      "reward_std": 13.61496181488037,
      "rewards/wrapper/mean": 5.125091888010502,
      "rewards/wrapper/std": 16.24077228009701,
      "step": 1090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 61.0,
      "completions/mean_length": 250.25625,
      "completions/mean_terminated_length": 30.733334350585938,
      "completions/min_length": 112.6,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.256891495601173,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.625,
      "kl": 0.22729696487658657,
      "learning_rate": 7.691706163275663e-06,
      "loss": -0.0026,
      "num_tokens": 12000630.0,
      "reward": 6.446021175384521,
      "reward_std": 6.5143946528434755,
      "rewards/wrapper/mean": 3.223010669648647,
      "rewards/wrapper/std": 10.89571967869997,
      "step": 1095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.25806451612903225,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.0078125,
      "kl": 0.0057296501705423,
      "learning_rate": 7.689603211265496e-06,
      "loss": -0.0061,
      "num_tokens": 12053895.0,
      "reward": 11.164654111862182,
      "reward_std": 14.292295026779176,
      "rewards/wrapper/mean": 5.58232696801424,
      "rewards/wrapper/std": 16.829090513288975,
      "step": 1100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.2592375366568915,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.375,
      "kl": 0.006397956318687647,
      "learning_rate": 7.68749087216141e-06,
      "loss": -0.006,
      "num_tokens": 12107008.0,
      "reward": 9.694752669334411,
      "reward_std": 11.153807973861694,
      "rewards/wrapper/mean": 4.84737599119544,
      "rewards/wrapper/std": 13.409111241996289,
      "step": 1105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.2,
      "completions/mean_length": 254.59375,
      "completions/mean_terminated_length": 6.2,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 6.2,
      "epoch": 0.26041055718475076,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.0625,
      "kl": 4.17301641855156,
      "learning_rate": 7.685369153195933e-06,
      "loss": 0.1615,
      "num_tokens": 12160781.0,
      "reward": 11.649996852874756,
      "reward_std": 15.833073997497559,
      "rewards/wrapper/mean": 5.824998654425144,
      "rewards/wrapper/std": 17.238114669919014,
      "step": 1110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.0,
      "completions/mean_length": 251.4875,
      "completions/mean_terminated_length": 20.26666717529297,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.26158357771261,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 8.25,
      "kl": 0.009280881262384355,
      "learning_rate": 7.683238061633712e-06,
      "loss": -0.0052,
      "num_tokens": 12215855.0,
      "reward": 12.724572658538818,
      "reward_std": 13.146955060958863,
      "rewards/wrapper/mean": 6.362286276370287,
      "rewards/wrapper/std": 15.493781666457654,
      "step": 1115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 89.6,
      "completions/mean_length": 250.86875,
      "completions/mean_terminated_length": 68.1,
      "completions/min_length": 97.8,
      "completions/min_terminated_length": 46.6,
      "epoch": 0.2627565982404692,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.140625,
      "kl": 0.004430928025976754,
      "learning_rate": 7.68109760477148e-06,
      "loss": -0.0145,
      "num_tokens": 12269000.0,
      "reward": 10.06506805419922,
      "reward_std": 13.367911243438721,
      "rewards/wrapper/mean": 5.032533720880747,
      "rewards/wrapper/std": 14.828080916404724,
      "step": 1120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.26392961876832843,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.640625,
      "kl": 0.012790444202255457,
      "learning_rate": 7.678947789938045e-06,
      "loss": -0.0039,
      "num_tokens": 12323763.0,
      "reward": 6.5787577629089355,
      "reward_std": 8.753919792175292,
      "rewards/wrapper/mean": 3.289378835260868,
      "rewards/wrapper/std": 10.194012176990508,
      "step": 1125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 59.4,
      "completions/mean_length": 251.4625,
      "completions/mean_terminated_length": 46.4,
      "completions/min_length": 135.8,
      "completions/min_terminated_length": 33.4,
      "epoch": 0.26510263929618766,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.7890625,
      "kl": 0.010952617693692445,
      "learning_rate": 7.676788624494249e-06,
      "loss": -0.0055,
      "num_tokens": 12376033.0,
      "reward": 9.405997359752655,
      "reward_std": 10.985446679592133,
      "rewards/wrapper/mean": 4.702998787909746,
      "rewards/wrapper/std": 12.160600701719522,
      "step": 1130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 57.4,
      "completions/mean_length": 252.99375,
      "completions/mean_terminated_length": 57.4,
      "completions/min_length": 159.8,
      "completions/min_terminated_length": 57.4,
      "epoch": 0.26627565982404694,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.4375,
      "kl": 0.2351893066195771,
      "learning_rate": 7.674620115832949e-06,
      "loss": 0.0148,
      "num_tokens": 12432798.0,
      "reward": 8.022018957138062,
      "reward_std": 10.71765694618225,
      "rewards/wrapper/mean": 4.011009331047535,
      "rewards/wrapper/std": 12.21592505723238,
      "step": 1135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.6,
      "completions/mean_length": 253.7875,
      "completions/mean_terminated_length": 31.6,
      "completions/min_length": 185.2,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.26744868035190617,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.2578125,
      "kl": 0.016819122969172894,
      "learning_rate": 7.672442271379e-06,
      "loss": -0.0055,
      "num_tokens": 12490174.0,
      "reward": 8.455910956859588,
      "reward_std": 11.398876094818116,
      "rewards/wrapper/mean": 4.2279553160071375,
      "rewards/wrapper/std": 15.768989896774292,
      "step": 1140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.6,
      "completions/mean_length": 250.2875,
      "completions/mean_terminated_length": 11.0,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.2686217008797654,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 8.0,
      "kl": 0.02696207492845133,
      "learning_rate": 7.670255098589216e-06,
      "loss": -0.014,
      "num_tokens": 12543940.0,
      "reward": 6.251084423065185,
      "reward_std": 8.070647144317627,
      "rewards/wrapper/mean": 3.125542238354683,
      "rewards/wrapper/std": 10.576351188123226,
      "step": 1145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.2697947214076246,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 10.5625,
      "kl": 0.014508222823496907,
      "learning_rate": 7.668058604952354e-06,
      "loss": -0.0051,
      "num_tokens": 12600511.0,
      "reward": 12.813151550292968,
      "reward_std": 15.152927589416503,
      "rewards/wrapper/mean": 6.406575272977352,
      "rewards/wrapper/std": 19.122226648032665,
      "step": 1150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.2709677419354839,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.21875,
      "kl": 0.005419670697301626,
      "learning_rate": 7.66585279798908e-06,
      "loss": -0.0048,
      "num_tokens": 12656646.0,
      "reward": 10.259989547729493,
      "reward_std": 10.767040920257568,
      "rewards/wrapper/mean": 5.129994577169418,
      "rewards/wrapper/std": 14.785299123823643,
      "step": 1155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.2721407624633431,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.28125,
      "kl": 0.027052150969393552,
      "learning_rate": 7.663637685251955e-06,
      "loss": -0.0109,
      "num_tokens": 12709572.0,
      "reward": 12.856212615966797,
      "reward_std": 17.106783866882324,
      "rewards/wrapper/mean": 6.42810637652874,
      "rewards/wrapper/std": 19.706878601014616,
      "step": 1160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.6,
      "completions/mean_length": 249.93125,
      "completions/mean_terminated_length": 3.666666793823242,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.27331378299120235,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 3.875,
      "kl": 0.01902551531093195,
      "learning_rate": 7.6614132743254e-06,
      "loss": -0.0208,
      "num_tokens": 12767009.0,
      "reward": 11.094143390655518,
      "reward_std": 15.04799976348877,
      "rewards/wrapper/mean": 5.547071680426598,
      "rewards/wrapper/std": 15.767345032095909,
      "step": 1165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 56.4,
      "completions/mean_length": 252.9625,
      "completions/mean_terminated_length": 56.4,
      "completions/min_length": 158.8,
      "completions/min_terminated_length": 56.4,
      "epoch": 0.2744868035190616,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.59375,
      "kl": 3.983160498877987,
      "learning_rate": 7.659179572825669e-06,
      "loss": 0.1554,
      "num_tokens": 12822107.0,
      "reward": 12.889067268371582,
      "reward_std": 17.192436599731444,
      "rewards/wrapper/mean": 6.444533663988113,
      "rewards/wrapper/std": 18.103868405520917,
      "step": 1170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 76.8,
      "completions/mean_length": 249.7,
      "completions/mean_terminated_length": 67.5,
      "completions/min_length": 109.4,
      "completions/min_terminated_length": 58.2,
      "epoch": 0.2756598240469208,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.6015625,
      "kl": 0.0451830686070025,
      "learning_rate": 7.65693658840083e-06,
      "loss": -0.0085,
      "num_tokens": 12878973.0,
      "reward": 9.611810493469239,
      "reward_std": 13.04067497253418,
      "rewards/wrapper/mean": 4.805905170738697,
      "rewards/wrapper/std": 17.624341449141504,
      "step": 1175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.8,
      "completions/mean_length": 255.76875,
      "completions/mean_terminated_length": 43.8,
      "completions/min_length": 248.6,
      "completions/min_terminated_length": 43.8,
      "epoch": 0.2768328445747801,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.765625,
      "kl": 0.00828095116885379,
      "learning_rate": 7.654684328730737e-06,
      "loss": -0.0003,
      "num_tokens": 12935770.0,
      "reward": 6.8795403957366945,
      "reward_std": 9.140692472457886,
      "rewards/wrapper/mean": 3.439770007133484,
      "rewards/wrapper/std": 13.50705413967371,
      "step": 1180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.2,
      "completions/mean_length": 251.24375,
      "completions/mean_terminated_length": 0.8,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.2780058651026393,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.328125,
      "kl": 0.008541014278307557,
      "learning_rate": 7.652422801526998e-06,
      "loss": -0.0184,
      "num_tokens": 12990883.0,
      "reward": 10.144163513183594,
      "reward_std": 13.843765115737915,
      "rewards/wrapper/mean": 5.072081534564495,
      "rewards/wrapper/std": 15.462312346696853,
      "step": 1185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.2,
      "completions/mean_length": 248.175,
      "completions/mean_terminated_length": 29.6,
      "completions/min_length": 112.6,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.27917888563049853,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 0.8359375,
      "kl": 0.016829893505200744,
      "learning_rate": 7.650152014532953e-06,
      "loss": -0.0205,
      "num_tokens": 13044743.0,
      "reward": 9.684790706634521,
      "reward_std": 13.179641246795654,
      "rewards/wrapper/mean": 4.8423951178789135,
      "rewards/wrapper/std": 16.63096822053194,
      "step": 1190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.8,
      "completions/mean_length": 251.31875,
      "completions/mean_terminated_length": 3.8,
      "completions/min_length": 106.2,
      "completions/min_terminated_length": 3.8,
      "epoch": 0.28035190615835776,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.71875,
      "kl": 0.006020776781952009,
      "learning_rate": 7.647871975523648e-06,
      "loss": -0.0139,
      "num_tokens": 13097946.0,
      "reward": 6.571639347076416,
      "reward_std": 8.726711702346801,
      "rewards/wrapper/mean": 3.285819558799267,
      "rewards/wrapper/std": 11.842679353058339,
      "step": 1195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.28152492668621704,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 10.375,
      "kl": 0.03795347143895924,
      "learning_rate": 7.645582692305809e-06,
      "loss": -0.006,
      "num_tokens": 13154027.0,
      "reward": 11.074889278411865,
      "reward_std": 9.10773811340332,
      "rewards/wrapper/mean": 5.537444531917572,
      "rewards/wrapper/std": 15.783117219805717,
      "step": 1200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.4,
      "completions/mean_length": 249.35,
      "completions/mean_terminated_length": 14.466667175292969,
      "completions/min_length": 154.8,
      "completions/min_terminated_length": 1.2,
      "epoch": 0.28269794721407626,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.265625,
      "kl": 0.006860299198888242,
      "learning_rate": 7.643284172717809e-06,
      "loss": -0.0026,
      "num_tokens": 13207235.0,
      "reward": 12.508953714370728,
      "reward_std": 16.71675834655762,
      "rewards/wrapper/mean": 6.254476898163557,
      "rewards/wrapper/std": 16.900203044712544,
      "step": 1205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.0,
      "completions/mean_length": 250.81875,
      "completions/mean_terminated_length": 39.0,
      "completions/min_length": 90.2,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.2838709677419355,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.5625,
      "kl": 0.010175050923135132,
      "learning_rate": 7.64097642462965e-06,
      "loss": -0.0141,
      "num_tokens": 13261454.0,
      "reward": 10.52227783203125,
      "reward_std": 11.177353668212891,
      "rewards/wrapper/mean": 5.261138796061277,
      "rewards/wrapper/std": 14.039105215668679,
      "step": 1210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 82.2,
      "completions/mean_length": 247.41875,
      "completions/mean_terminated_length": 62.4,
      "completions/min_length": 42.6,
      "completions/min_terminated_length": 42.6,
      "epoch": 0.2850439882697947,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.0,
      "kl": 0.012304930877871812,
      "learning_rate": 7.638659455942934e-06,
      "loss": -0.0229,
      "num_tokens": 13315821.0,
      "reward": 4.5331168413162235,
      "reward_std": 5.74956374168396,
      "rewards/wrapper/mean": 2.266558450460434,
      "rewards/wrapper/std": 7.662119425088167,
      "step": 1215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 111.8,
      "completions/mean_length": 250.13125,
      "completions/mean_terminated_length": 79.4,
      "completions/min_length": 98.2,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.28621700879765394,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.390625,
      "kl": 0.007477234344696626,
      "learning_rate": 7.636333274590826e-06,
      "loss": -0.009,
      "num_tokens": 13368678.0,
      "reward": 7.7060727834701535,
      "reward_std": 10.061940121650697,
      "rewards/wrapper/mean": 3.853036458790302,
      "rewards/wrapper/std": 10.947172378003597,
      "step": 1220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.2,
      "completions/mean_length": 253.25,
      "completions/mean_terminated_length": 43.6,
      "completions/min_length": 191.6,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.2873900293255132,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.734375,
      "kl": 0.015387153357733042,
      "learning_rate": 7.63399788853804e-06,
      "loss": -0.0022,
      "num_tokens": 13424186.0,
      "reward": 10.410561656951904,
      "reward_std": 13.186028957366943,
      "rewards/wrapper/mean": 5.205280630290508,
      "rewards/wrapper/std": 13.38996929973364,
      "step": 1225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.8,
      "completions/mean_length": 247.6625,
      "completions/mean_terminated_length": 20.2,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.28856304985337244,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 44.75,
      "kl": 0.08414683194132522,
      "learning_rate": 7.631653305780806e-06,
      "loss": -0.0171,
      "num_tokens": 13478848.0,
      "reward": 6.07905797958374,
      "reward_std": 7.165961527824402,
      "rewards/wrapper/mean": 3.039528689533472,
      "rewards/wrapper/std": 8.566541536152362,
      "step": 1230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.28973607038123167,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.09375,
      "kl": 0.012354453129228204,
      "learning_rate": 7.629299534346842e-06,
      "loss": 0.0103,
      "num_tokens": 13535398.0,
      "reward": 9.07586328983307,
      "reward_std": 10.113778376579285,
      "rewards/wrapper/mean": 4.53793145492673,
      "rewards/wrapper/std": 13.287790149450302,
      "step": 1235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 254.84375,
      "completions/mean_terminated_length": 14.2,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.2909090909090909,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.3828125,
      "kl": 0.007052855577785522,
      "learning_rate": 7.626936582295328e-06,
      "loss": -0.0023,
      "num_tokens": 13588821.0,
      "reward": 11.41899070739746,
      "reward_std": 14.385398948192597,
      "rewards/wrapper/mean": 5.709495208412409,
      "rewards/wrapper/std": 15.28487433195114,
      "step": 1240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.2920821114369501,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.390625,
      "kl": 0.06638398257782682,
      "learning_rate": 7.624564457716878e-06,
      "loss": -0.0149,
      "num_tokens": 13642985.0,
      "reward": 17.59623432159424,
      "reward_std": 19.738516807556152,
      "rewards/wrapper/mean": 8.798117038607597,
      "rewards/wrapper/std": 21.251633982360364,
      "step": 1245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 117.2,
      "completions/mean_length": 253.2625,
      "completions/mean_terminated_length": 117.2,
      "completions/min_length": 168.4,
      "completions/min_terminated_length": 117.2,
      "epoch": 0.2932551319648094,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.328125,
      "kl": 0.023045004159212113,
      "learning_rate": 7.622183168733512e-06,
      "loss": -0.0004,
      "num_tokens": 13696105.0,
      "reward": 5.633255100250244,
      "reward_std": 7.433923816680908,
      "rewards/wrapper/mean": 2.8166275203227995,
      "rewards/wrapper/std": 11.261727234721183,
      "step": 1250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.6,
      "completions/mean_length": 253.81875,
      "completions/mean_terminated_length": 32.6,
      "completions/min_length": 186.2,
      "completions/min_terminated_length": 32.6,
      "epoch": 0.2944281524926686,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.15625,
      "kl": 0.007092752261087298,
      "learning_rate": 7.619792723498629e-06,
      "loss": 0.0075,
      "num_tokens": 13750444.0,
      "reward": 11.408452892303467,
      "reward_std": 14.730864334106446,
      "rewards/wrapper/mean": 5.7042262017726895,
      "rewards/wrapper/std": 17.131004671752454,
      "step": 1255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.0,
      "completions/mean_length": 250.0125,
      "completions/mean_terminated_length": 13.0,
      "completions/min_length": 115.4,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.29560117302052785,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.25,
      "kl": 1.5145615183922927,
      "learning_rate": 7.617393130196977e-06,
      "loss": 0.0421,
      "num_tokens": 13803322.0,
      "reward": 11.319013595581055,
      "reward_std": 13.331906461715699,
      "rewards/wrapper/mean": 5.659506534039974,
      "rewards/wrapper/std": 16.614761224389078,
      "step": 1260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.2,
      "completions/mean_length": 250.26875,
      "completions/mean_terminated_length": 20.9,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 20.6,
      "epoch": 0.2967741935483871,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 3.59375,
      "kl": 0.008018044813070446,
      "learning_rate": 7.614984397044628e-06,
      "loss": -0.012,
      "num_tokens": 13856751.0,
      "reward": 7.36223726272583,
      "reward_std": 9.809179973602294,
      "rewards/wrapper/mean": 3.6811186604201795,
      "rewards/wrapper/std": 11.822094440460205,
      "step": 1265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 252.54375,
      "completions/mean_terminated_length": 43.0,
      "completions/min_length": 145.4,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.29794721407624636,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.828125,
      "kl": 0.004893030686071142,
      "learning_rate": 7.6125665322889466e-06,
      "loss": -0.0021,
      "num_tokens": 13907542.0,
      "reward": 6.66135311126709,
      "reward_std": 8.785630035400391,
      "rewards/wrapper/mean": 3.330676446855068,
      "rewards/wrapper/std": 11.304787519574166,
      "step": 1270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.8,
      "completions/mean_length": 252.63125,
      "completions/mean_terminated_length": 45.8,
      "completions/min_length": 148.2,
      "completions/min_terminated_length": 45.8,
      "epoch": 0.2991202346041056,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.609375,
      "kl": 5.395943081658333,
      "learning_rate": 7.610139544208566e-06,
      "loss": 0.2097,
      "num_tokens": 13965193.0,
      "reward": 10.49851016998291,
      "reward_std": 11.999550914764404,
      "rewards/wrapper/mean": 5.249254953861237,
      "rewards/wrapper/std": 15.486617393791676,
      "step": 1275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.8,
      "completions/mean_length": 252.25,
      "completions/mean_terminated_length": 17.1,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 5.4,
      "epoch": 0.3002932551319648,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.5625,
      "kl": 0.012830907647730783,
      "learning_rate": 7.607703441113355e-06,
      "loss": -0.0125,
      "num_tokens": 14017931.0,
      "reward": 9.211493253707886,
      "reward_std": 10.461997652053833,
      "rewards/wrapper/mean": 4.605746623873711,
      "rewards/wrapper/std": 13.777990686893464,
      "step": 1280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 89.6,
      "completions/mean_length": 253.3875,
      "completions/mean_terminated_length": 86.4,
      "completions/min_length": 185.6,
      "completions/min_terminated_length": 83.2,
      "epoch": 0.30146627565982403,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.265625,
      "kl": 0.012887239316478371,
      "learning_rate": 7.605258231344392e-06,
      "loss": 0.0047,
      "num_tokens": 14074503.0,
      "reward": 7.647303819656372,
      "reward_std": 10.293930107355118,
      "rewards/wrapper/mean": 3.8236515186727047,
      "rewards/wrapper/std": 11.564179126918315,
      "step": 1285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.8,
      "completions/mean_length": 249.975,
      "completions/mean_terminated_length": 44.9,
      "completions/min_length": 90.2,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.30263929618768326,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.96875,
      "kl": 0.014583267294801772,
      "learning_rate": 7.602803923273938e-06,
      "loss": 0.0005,
      "num_tokens": 14127933.0,
      "reward": 13.534779834747315,
      "reward_std": 17.292752075195313,
      "rewards/wrapper/mean": 6.767389929294586,
      "rewards/wrapper/std": 19.772680358588694,
      "step": 1290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.4,
      "completions/mean_length": 251.30625,
      "completions/mean_terminated_length": 3.4,
      "completions/min_length": 105.8,
      "completions/min_terminated_length": 3.4,
      "epoch": 0.30381231671554254,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.0703125,
      "kl": 0.006688924302579835,
      "learning_rate": 7.600340525305404e-06,
      "loss": -0.0058,
      "num_tokens": 14182722.0,
      "reward": 8.628307437896728,
      "reward_std": 11.121209239959716,
      "rewards/wrapper/mean": 4.314153614640236,
      "rewards/wrapper/std": 16.781408032774927,
      "step": 1295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.6,
      "completions/mean_length": 251.46875,
      "completions/mean_terminated_length": 8.6,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.30498533724340177,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.546875,
      "kl": 0.008998627658002079,
      "learning_rate": 7.5978680458733254e-06,
      "loss": -0.0098,
      "num_tokens": 14237021.0,
      "reward": 14.251729774475098,
      "reward_std": 13.466583633422852,
      "rewards/wrapper/mean": 7.125864551961422,
      "rewards/wrapper/std": 18.563405425846575,
      "step": 1300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 44.2,
      "completions/mean_length": 252.58125,
      "completions/mean_terminated_length": 44.2,
      "completions/min_length": 146.6,
      "completions/min_terminated_length": 44.2,
      "epoch": 0.306158357771261,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.078125,
      "kl": 0.005243840476032347,
      "learning_rate": 7.5953864934433305e-06,
      "loss": -0.0049,
      "num_tokens": 14289700.0,
      "reward": 7.625714588165283,
      "reward_std": 10.161770915985107,
      "rewards/wrapper/mean": 3.8128572389483453,
      "rewards/wrapper/std": 12.751371662318707,
      "step": 1305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.8,
      "completions/mean_length": 251.50625,
      "completions/mean_terminated_length": 9.8,
      "completions/min_length": 112.2,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.3073313782991202,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 2.203125,
      "kl": 0.008733793662395328,
      "learning_rate": 7.592895876512114e-06,
      "loss": -0.011,
      "num_tokens": 14347077.0,
      "reward": 7.226803135871887,
      "reward_std": 9.705256414413451,
      "rewards/wrapper/mean": 3.6134014263749124,
      "rewards/wrapper/std": 12.593166868388654,
      "step": 1310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 251.96875,
      "completions/mean_terminated_length": 12.4,
      "completions/min_length": 161.4,
      "completions/min_terminated_length": 7.8,
      "epoch": 0.3085043988269795,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.9921875,
      "kl": 0.008413564675720409,
      "learning_rate": 7.590396203607408e-06,
      "loss": -0.0103,
      "num_tokens": 14398824.0,
      "reward": 8.467525911331176,
      "reward_std": 9.760836601257324,
      "rewards/wrapper/mean": 4.233762781322002,
      "rewards/wrapper/std": 13.273331837356091,
      "step": 1315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.0,
      "completions/mean_length": 252.025,
      "completions/mean_terminated_length": 8.8,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.3096774193548387,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.265625,
      "kl": 0.011639650189317763,
      "learning_rate": 7.58788748328795e-06,
      "loss": -0.0051,
      "num_tokens": 14454298.0,
      "reward": 12.13127155303955,
      "reward_std": 15.99749984741211,
      "rewards/wrapper/mean": 6.065636083483696,
      "rewards/wrapper/std": 19.705180183053017,
      "step": 1320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.31085043988269795,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.609375,
      "kl": 0.01836782739846967,
      "learning_rate": 7.585369724143458e-06,
      "loss": -0.0068,
      "num_tokens": 14509264.0,
      "reward": 11.664790630340576,
      "reward_std": 13.763594150543213,
      "rewards/wrapper/mean": 5.832394993305206,
      "rewards/wrapper/std": 16.633710739016532,
      "step": 1325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 52.6,
      "completions/mean_length": 251.24375,
      "completions/mean_terminated_length": 52.6,
      "completions/min_length": 103.8,
      "completions/min_terminated_length": 52.6,
      "epoch": 0.3120234604105572,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.7109375,
      "kl": 0.00812934830901213,
      "learning_rate": 7.582842934794593e-06,
      "loss": -0.0138,
      "num_tokens": 14562739.0,
      "reward": 12.168029403686523,
      "reward_std": 13.295973205566407,
      "rewards/wrapper/mean": 6.084014493227005,
      "rewards/wrapper/std": 18.827900260686874,
      "step": 1330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 23.8,
      "completions/mean_length": 251.94375,
      "completions/mean_terminated_length": 23.8,
      "completions/min_length": 126.2,
      "completions/min_terminated_length": 23.8,
      "epoch": 0.3131964809384164,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.0859375,
      "kl": 0.006739077181555331,
      "learning_rate": 7.580307123892941e-06,
      "loss": -0.0124,
      "num_tokens": 14616972.0,
      "reward": 11.35460147857666,
      "reward_std": 15.193605709075928,
      "rewards/wrapper/mean": 5.677300703525543,
      "rewards/wrapper/std": 17.427980916202067,
      "step": 1335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 64.0,
      "completions/mean_length": 251.60625,
      "completions/mean_terminated_length": 53.3,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 42.6,
      "epoch": 0.3143695014662757,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 8.1875,
      "kl": 0.011302015569526702,
      "learning_rate": 7.577762300120974e-06,
      "loss": -0.008,
      "num_tokens": 14675993.0,
      "reward": 11.836063861846924,
      "reward_std": 15.175680541992188,
      "rewards/wrapper/mean": 5.918032126128674,
      "rewards/wrapper/std": 17.08586499094963,
      "step": 1340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 60.0,
      "completions/mean_length": 246.71875,
      "completions/mean_terminated_length": 35.53333358764648,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 14.8,
      "epoch": 0.3155425219941349,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.078125,
      "kl": 0.014866840979084372,
      "learning_rate": 7.575208472192025e-06,
      "loss": -0.0225,
      "num_tokens": 14728314.0,
      "reward": 8.228108072280884,
      "reward_std": 10.156515312194824,
      "rewards/wrapper/mean": 4.114054039120674,
      "rewards/wrapper/std": 11.845364609360695,
      "step": 1345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.8,
      "completions/mean_length": 252.00625,
      "completions/mean_terminated_length": 25.8,
      "completions/min_length": 128.2,
      "completions/min_terminated_length": 25.8,
      "epoch": 0.31671554252199413,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.578125,
      "kl": 0.11884763344423846,
      "learning_rate": 7.572645648850256e-06,
      "loss": -0.0057,
      "num_tokens": 14780373.0,
      "reward": 11.06895570755005,
      "reward_std": 14.749995613098145,
      "rewards/wrapper/mean": 5.5344778671860695,
      "rewards/wrapper/std": 17.256612426042558,
      "step": 1350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.4,
      "completions/mean_length": 252.09375,
      "completions/mean_terminated_length": 28.4,
      "completions/min_length": 182.0,
      "completions/min_terminated_length": 28.4,
      "epoch": 0.31788856304985336,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.984375,
      "kl": 0.00944446304347366,
      "learning_rate": 7.570073838870627e-06,
      "loss": -0.0071,
      "num_tokens": 14838402.0,
      "reward": 7.713986945152283,
      "reward_std": 8.773927760124206,
      "rewards/wrapper/mean": 3.8569933280348776,
      "rewards/wrapper/std": 12.51425680667162,
      "step": 1355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.2,
      "completions/mean_length": 252.7125,
      "completions/mean_terminated_length": 45.5,
      "completions/min_length": 196.4,
      "completions/min_terminated_length": 42.8,
      "epoch": 0.31906158357771264,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.140625,
      "kl": 0.01574853319907561,
      "learning_rate": 7.567493051058871e-06,
      "loss": 0.0067,
      "num_tokens": 14892928.0,
      "reward": 15.730212306976318,
      "reward_std": 16.052964973449708,
      "rewards/wrapper/mean": 7.865105799585581,
      "rewards/wrapper/std": 22.611429415643215,
      "step": 1360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.6,
      "completions/mean_length": 251.89375,
      "completions/mean_terminated_length": 11.2,
      "completions/min_length": 154.4,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.32023460410557186,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.7265625,
      "kl": 0.011211365048075095,
      "learning_rate": 7.56490329425146e-06,
      "loss": -0.01,
      "num_tokens": 14948657.0,
      "reward": 10.49436092376709,
      "reward_std": 14.247867679595947,
      "rewards/wrapper/mean": 5.247180543094873,
      "rewards/wrapper/std": 16.597414763271807,
      "step": 1365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.3214076246334311,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.6875,
      "kl": 0.008569952449761331,
      "learning_rate": 7.562304577315573e-06,
      "loss": 0.0003,
      "num_tokens": 15007042.0,
      "reward": 13.28712511062622,
      "reward_std": 13.76039524078369,
      "rewards/wrapper/mean": 6.643562447279692,
      "rewards/wrapper/std": 18.46181525737047,
      "step": 1370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.0,
      "completions/mean_length": 254.26875,
      "completions/mean_terminated_length": 47.0,
      "completions/min_length": 200.6,
      "completions/min_terminated_length": 47.0,
      "epoch": 0.3225806451612903,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.0625,
      "kl": 0.009434047807008027,
      "learning_rate": 7.559696909149068e-06,
      "loss": -0.0026,
      "num_tokens": 15068173.0,
      "reward": 4.01227194070816,
      "reward_std": 4.248266899585724,
      "rewards/wrapper/mean": 2.006135963648558,
      "rewards/wrapper/std": 6.169549755752087,
      "step": 1375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.6,
      "completions/mean_length": 254.2875,
      "completions/mean_terminated_length": 47.6,
      "completions/min_length": 201.2,
      "completions/min_terminated_length": 47.6,
      "epoch": 0.32375366568914954,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.5,
      "kl": 0.008193347556516527,
      "learning_rate": 7.557080298680456e-06,
      "loss": -0.003,
      "num_tokens": 15127993.0,
      "reward": 10.501580429077148,
      "reward_std": 14.141244888305664,
      "rewards/wrapper/mean": 5.250790251791477,
      "rewards/wrapper/std": 16.1909792765975,
      "step": 1380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.8,
      "completions/mean_length": 254.70625,
      "completions/mean_terminated_length": 9.8,
      "completions/min_length": 214.6,
      "completions/min_terminated_length": 9.8,
      "epoch": 0.3249266862170088,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.875,
      "kl": 0.0087321916827932,
      "learning_rate": 7.554454754868861e-06,
      "loss": -0.0026,
      "num_tokens": 15183854.0,
      "reward": 9.186988854408265,
      "reward_std": 12.355615091323852,
      "rewards/wrapper/mean": 4.593494184315205,
      "rewards/wrapper/std": 14.244051401317119,
      "step": 1385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.8,
      "completions/mean_length": 247.25,
      "completions/mean_terminated_length": 26.8,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 26.8,
      "epoch": 0.32609970674486805,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.75,
      "kl": 0.009691889875102788,
      "learning_rate": 7.551820286703997e-06,
      "loss": -0.0212,
      "num_tokens": 15237238.0,
      "reward": 12.978730201721191,
      "reward_std": 17.73060188293457,
      "rewards/wrapper/mean": 6.489365118741989,
      "rewards/wrapper/std": 19.83084503412247,
      "step": 1390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 97.0,
      "completions/mean_length": 254.23125,
      "completions/mean_terminated_length": 97.0,
      "completions/min_length": 199.4,
      "completions/min_terminated_length": 97.0,
      "epoch": 0.32727272727272727,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.71875,
      "kl": 0.007169304159469902,
      "learning_rate": 7.549176903206133e-06,
      "loss": 0.004,
      "num_tokens": 15295067.0,
      "reward": 14.387743473052979,
      "reward_std": 19.485210800170897,
      "rewards/wrapper/mean": 7.193871764093638,
      "rewards/wrapper/std": 21.868296499550343,
      "step": 1395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.6,
      "completions/mean_length": 255.075,
      "completions/mean_terminated_length": 21.6,
      "completions/min_length": 226.4,
      "completions/min_terminated_length": 21.6,
      "epoch": 0.3284457478005865,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.15625,
      "kl": 0.007469373976346105,
      "learning_rate": 7.546524613426066e-06,
      "loss": -0.0011,
      "num_tokens": 15353999.0,
      "reward": 10.911634540557861,
      "reward_std": 14.8988431930542,
      "rewards/wrapper/mean": 5.455817250907421,
      "rewards/wrapper/std": 19.719540111720562,
      "step": 1400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 80.6,
      "completions/mean_length": 250.58125,
      "completions/mean_terminated_length": 66.7,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 52.8,
      "epoch": 0.3296187683284457,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.8125,
      "kl": 0.008937957452144474,
      "learning_rate": 7.543863426445082e-06,
      "loss": -0.0028,
      "num_tokens": 15407088.0,
      "reward": 9.857917308807373,
      "reward_std": 11.83640947341919,
      "rewards/wrapper/mean": 4.928958788514137,
      "rewards/wrapper/std": 15.389665246009827,
      "step": 1405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.6,
      "completions/mean_length": 254.48125,
      "completions/mean_terminated_length": 2.6,
      "completions/min_length": 207.4,
      "completions/min_terminated_length": 2.6,
      "epoch": 0.330791788856305,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.921875,
      "kl": 0.012071310554165392,
      "learning_rate": 7.5411933513749375e-06,
      "loss": -0.0037,
      "num_tokens": 15461157.0,
      "reward": 11.119661998748779,
      "reward_std": 11.121147727966308,
      "rewards/wrapper/mean": 5.5598307564854625,
      "rewards/wrapper/std": 16.329788361489772,
      "step": 1410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 59.2,
      "completions/mean_length": 253.05,
      "completions/mean_terminated_length": 59.2,
      "completions/min_length": 161.6,
      "completions/min_terminated_length": 59.2,
      "epoch": 0.33196480938416423,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.125,
      "kl": 0.011578649946022779,
      "learning_rate": 7.538514397357817e-06,
      "loss": -0.0086,
      "num_tokens": 15517341.0,
      "reward": 9.766520977020264,
      "reward_std": 12.796609210968018,
      "rewards/wrapper/mean": 4.883260330557823,
      "rewards/wrapper/std": 15.593793278932571,
      "step": 1415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.6,
      "completions/mean_length": 252.25625,
      "completions/mean_terminated_length": 17.9,
      "completions/min_length": 155.8,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.33313782991202345,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.53125,
      "kl": 0.004327619081595913,
      "learning_rate": 7.535826573566306e-06,
      "loss": -0.011,
      "num_tokens": 15569564.0,
      "reward": 10.319400787353516,
      "reward_std": 13.982878303527832,
      "rewards/wrapper/mean": 5.1597000800073145,
      "rewards/wrapper/std": 16.632020924985408,
      "step": 1420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 254.9125,
      "completions/mean_terminated_length": 16.4,
      "completions/min_length": 221.2,
      "completions/min_terminated_length": 16.4,
      "epoch": 0.3343108504398827,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5859375,
      "kl": 0.009104005433619023,
      "learning_rate": 7.533129889203364e-06,
      "loss": -0.0021,
      "num_tokens": 15622916.0,
      "reward": 11.106964683532714,
      "reward_std": 11.095128536224365,
      "rewards/wrapper/mean": 5.553482050448656,
      "rewards/wrapper/std": 14.927113994956017,
      "step": 1425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 44.6,
      "completions/mean_length": 252.84375,
      "completions/mean_terminated_length": 40.7,
      "completions/min_length": 190.4,
      "completions/min_terminated_length": 36.8,
      "epoch": 0.33548387096774196,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.9375,
      "kl": 0.014619318360928446,
      "learning_rate": 7.530424353502283e-06,
      "loss": -0.0096,
      "num_tokens": 15678809.0,
      "reward": 13.947904205322265,
      "reward_std": 15.49264030456543,
      "rewards/wrapper/mean": 6.973952141404152,
      "rewards/wrapper/std": 19.970910519361496,
      "step": 1430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.4,
      "completions/mean_length": 252.75,
      "completions/mean_terminated_length": 33.0,
      "completions/min_length": 170.2,
      "completions/min_terminated_length": 16.6,
      "epoch": 0.3366568914956012,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.046875,
      "kl": 0.00951800765178632,
      "learning_rate": 7.527709975726663e-06,
      "loss": -0.0063,
      "num_tokens": 15737421.0,
      "reward": 10.923522877693177,
      "reward_std": 11.951661324501037,
      "rewards/wrapper/mean": 5.461761482059956,
      "rewards/wrapper/std": 15.331962569057941,
      "step": 1435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.2,
      "completions/mean_length": 254.59375,
      "completions/mean_terminated_length": 6.2,
      "completions/min_length": 211.0,
      "completions/min_terminated_length": 6.2,
      "epoch": 0.3378299120234604,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.390625,
      "kl": 0.008740646217484027,
      "learning_rate": 7.5249867651703825e-06,
      "loss": -0.0029,
      "num_tokens": 15790706.0,
      "reward": 11.986185383796691,
      "reward_std": 13.874305212497712,
      "rewards/wrapper/mean": 5.993092510849237,
      "rewards/wrapper/std": 15.854976122826338,
      "step": 1440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.33900293255131964,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.3984375,
      "kl": 0.005966075940523297,
      "learning_rate": 7.522254731157557e-06,
      "loss": -0.011,
      "num_tokens": 15845562.0,
      "reward": 9.148483896255494,
      "reward_std": 11.106124210357667,
      "rewards/wrapper/mean": 4.574241859093308,
      "rewards/wrapper/std": 15.179227907955646,
      "step": 1445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.34017595307917886,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.84375,
      "kl": 0.02206381254363805,
      "learning_rate": 7.519513883042518e-06,
      "loss": -0.0016,
      "num_tokens": 15900385.0,
      "reward": 13.788352870941162,
      "reward_std": 17.711102962493896,
      "rewards/wrapper/mean": 6.89417629390955,
      "rewards/wrapper/std": 20.102886700630187,
      "step": 1450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.4,
      "completions/mean_length": 255.50625,
      "completions/mean_terminated_length": 35.4,
      "completions/min_length": 240.2,
      "completions/min_terminated_length": 35.4,
      "epoch": 0.34134897360703814,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.796875,
      "kl": 0.008329333225265145,
      "learning_rate": 7.516764230209772e-06,
      "loss": -0.0004,
      "num_tokens": 15957806.0,
      "reward": 10.32720980644226,
      "reward_std": 12.660326385498047,
      "rewards/wrapper/mean": 5.163604502379894,
      "rewards/wrapper/std": 14.365816079080105,
      "step": 1455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.0,
      "completions/mean_length": 253.675,
      "completions/mean_terminated_length": 28.0,
      "completions/min_length": 181.6,
      "completions/min_terminated_length": 28.0,
      "epoch": 0.34252199413489737,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.3671875,
      "kl": 0.005196410208009183,
      "learning_rate": 7.514005782073976e-06,
      "loss": 0.0034,
      "num_tokens": 16012790.0,
      "reward": 11.027314805984497,
      "reward_std": 14.765119647979736,
      "rewards/wrapper/mean": 5.513657581061125,
      "rewards/wrapper/std": 18.939475986361504,
      "step": 1460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.4,
      "completions/mean_length": 247.1625,
      "completions/mean_terminated_length": 11.233333587646484,
      "completions/min_length": 104.6,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.3436950146627566,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.296875,
      "kl": 0.024585752293933182,
      "learning_rate": 7.5112385480799005e-06,
      "loss": -0.0166,
      "num_tokens": 16065540.0,
      "reward": 10.954252338409423,
      "reward_std": 12.776335978507996,
      "rewards/wrapper/mean": 5.477126209437847,
      "rewards/wrapper/std": 15.678541065752507,
      "step": 1465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.4,
      "completions/mean_length": 254.125,
      "completions/mean_terminated_length": 42.4,
      "completions/min_length": 196.0,
      "completions/min_terminated_length": 42.4,
      "epoch": 0.3448680351906158,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.125,
      "kl": 0.009685445297509431,
      "learning_rate": 7.5084625377023954e-06,
      "loss": -0.0056,
      "num_tokens": 16121824.0,
      "reward": 7.70174765586853,
      "reward_std": 8.262192821502685,
      "rewards/wrapper/mean": 3.8508735738694666,
      "rewards/wrapper/std": 14.08769258260727,
      "step": 1470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.3460410557184751,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 0.9296875,
      "kl": 0.1006514249893371,
      "learning_rate": 7.505677760446367e-06,
      "loss": -0.001,
      "num_tokens": 16178669.0,
      "reward": 15.888223457336426,
      "reward_std": 19.840636253356934,
      "rewards/wrapper/mean": 7.944112040102482,
      "rewards/wrapper/std": 21.836421263217925,
      "step": 1475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.3472140762463343,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.859375,
      "kl": 0.029327043099328876,
      "learning_rate": 7.502884225846729e-06,
      "loss": 0.0017,
      "num_tokens": 16230501.0,
      "reward": 7.629909253120422,
      "reward_std": 8.137647867202759,
      "rewards/wrapper/mean": 3.8149545326828957,
      "rewards/wrapper/std": 11.658829681575298,
      "step": 1480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.34838709677419355,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 5.625,
      "kl": 0.01167833567596972,
      "learning_rate": 7.50008194346839e-06,
      "loss": -0.0121,
      "num_tokens": 16287069.0,
      "reward": 10.761643028259277,
      "reward_std": 12.282279825210571,
      "rewards/wrapper/mean": 5.380821162462235,
      "rewards/wrapper/std": 13.490466183423996,
      "step": 1485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.2,
      "completions/mean_length": 250.95625,
      "completions/mean_terminated_length": 42.9,
      "completions/min_length": 145.0,
      "completions/min_terminated_length": 42.6,
      "epoch": 0.3495601173020528,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.640625,
      "kl": 0.015173644432798028,
      "learning_rate": 7.497270922906204e-06,
      "loss": -0.0121,
      "num_tokens": 16342400.0,
      "reward": 7.586162424087524,
      "reward_std": 9.830938339233398,
      "rewards/wrapper/mean": 3.793080995231867,
      "rewards/wrapper/std": 12.029572662711143,
      "step": 1490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.8,
      "completions/mean_length": 254.4875,
      "completions/mean_terminated_length": 2.8,
      "completions/min_length": 207.6,
      "completions/min_terminated_length": 2.8,
      "epoch": 0.350733137829912,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.25,
      "kl": 0.008850508206523954,
      "learning_rate": 7.494451173784947e-06,
      "loss": -0.0056,
      "num_tokens": 16398928.0,
      "reward": 10.645478534698487,
      "reward_std": 12.192581272125244,
      "rewards/wrapper/mean": 5.322738918662071,
      "rewards/wrapper/std": 16.82269820868969,
      "step": 1495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 63.2,
      "completions/mean_length": 253.175,
      "completions/mean_terminated_length": 63.2,
      "completions/min_length": 165.6,
      "completions/min_terminated_length": 63.2,
      "epoch": 0.3519061583577713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7734375,
      "kl": 0.47186655312543735,
      "learning_rate": 7.491622705759279e-06,
      "loss": 0.0119,
      "num_tokens": 16452334.0,
      "reward": 13.618064212799073,
      "reward_std": 16.841291904449463,
      "rewards/wrapper/mean": 6.809031952917576,
      "rewards/wrapper/std": 19.131683690845968,
      "step": 1500
    },
    {
      "epoch": 0.3519061583577713,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.845,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 57.44,
      "eval_completions/mean_length": 233.8025,
      "eval_completions/mean_terminated_length": 49.33000011444092,
      "eval_completions/min_length": 185.48,
      "eval_completions/min_terminated_length": 42.12,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.011634215260855854,
      "eval_loss": -0.030341310426592827,
      "eval_num_tokens": 16452334.0,
      "eval_reward": 0.40629449486732483,
      "eval_reward_std": 0.2625969736929983,
      "eval_rewards/wrapper/mean": 0.2031472486257553,
      "eval_rewards/wrapper/std": 0.21003973964601755,
      "eval_runtime": 207.7988,
      "eval_samples_per_second": 0.962,
      "eval_steps_per_second": 0.241,
      "step": 1500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 249.63125,
      "completions/mean_terminated_length": 0.7,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.3530791788856305,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.8125,
      "kl": 0.008165108982939272,
      "learning_rate": 7.488785528513715e-06,
      "loss": -0.0143,
      "num_tokens": 16509875.0,
      "reward": 9.125647592544556,
      "reward_std": 12.147355389595031,
      "rewards/wrapper/mean": 4.562823601812124,
      "rewards/wrapper/std": 14.803350380063057,
      "step": 1505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.35425219941348973,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.8671875,
      "kl": 0.02630561958067119,
      "learning_rate": 7.485939651762588e-06,
      "loss": -0.0074,
      "num_tokens": 16568078.0,
      "reward": 8.876133251190186,
      "reward_std": 10.792211532592773,
      "rewards/wrapper/mean": 4.438066463172436,
      "rewards/wrapper/std": 12.372373120486737,
      "step": 1510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 90.6,
      "completions/mean_length": 250.8375,
      "completions/mean_terminated_length": 69.5,
      "completions/min_length": 99.6,
      "completions/min_terminated_length": 48.4,
      "epoch": 0.35542521994134896,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1171875,
      "kl": 0.007182114117313176,
      "learning_rate": 7.483085085250019e-06,
      "loss": -0.0057,
      "num_tokens": 16622066.0,
      "reward": 10.847808790206908,
      "reward_std": 13.713920974731446,
      "rewards/wrapper/mean": 5.423904552310705,
      "rewards/wrapper/std": 17.696707151830196,
      "step": 1515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.35659824046920824,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.296875,
      "kl": 1.0365399254136718,
      "learning_rate": 7.480221838749882e-06,
      "loss": 0.0364,
      "num_tokens": 16682705.0,
      "reward": 7.309674024581909,
      "reward_std": 9.755594110488891,
      "rewards/wrapper/mean": 3.6548370026051997,
      "rewards/wrapper/std": 10.389201259613037,
      "step": 1520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.6,
      "completions/mean_length": 252.66875,
      "completions/mean_terminated_length": 23.6,
      "completions/min_length": 154.2,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.35777126099706746,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.234375,
      "kl": 0.007430961437057704,
      "learning_rate": 7.477349922065771e-06,
      "loss": -0.0063,
      "num_tokens": 16735422.0,
      "reward": 14.080436992645264,
      "reward_std": 16.378932380676268,
      "rewards/wrapper/mean": 7.040218336880207,
      "rewards/wrapper/std": 18.554736307263376,
      "step": 1525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.0,
      "completions/mean_length": 250.8875,
      "completions/mean_terminated_length": 41.0,
      "completions/min_length": 143.4,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.3589442815249267,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.953125,
      "kl": 0.007186479715164751,
      "learning_rate": 7.474469345030966e-06,
      "loss": -0.0064,
      "num_tokens": 16787668.0,
      "reward": 6.7414408206939695,
      "reward_std": 8.522413969039917,
      "rewards/wrapper/mean": 3.370720238983631,
      "rewards/wrapper/std": 10.809870810806752,
      "step": 1530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 254.8375,
      "completions/mean_terminated_length": 14.0,
      "completions/min_length": 218.8,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3601173020527859,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.6875,
      "kl": 0.015209858620073647,
      "learning_rate": 7.471580117508398e-06,
      "loss": 0.0033,
      "num_tokens": 16843118.0,
      "reward": 9.573083400726318,
      "reward_std": 12.79993715286255,
      "rewards/wrapper/mean": 4.78654208779335,
      "rewards/wrapper/std": 15.178947728872298,
      "step": 1535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.36129032258064514,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.03125,
      "kl": 0.06851877669105307,
      "learning_rate": 7.468682249390621e-06,
      "loss": 0.0002,
      "num_tokens": 16896579.0,
      "reward": 7.265839624404907,
      "reward_std": 9.448273944854737,
      "rewards/wrapper/mean": 3.632919803261757,
      "rewards/wrapper/std": 11.570318593084812,
      "step": 1540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 151.8,
      "completions/mean_length": 252.75,
      "completions/mean_terminated_length": 137.6,
      "completions/min_length": 174.6,
      "completions/min_terminated_length": 123.4,
      "epoch": 0.3624633431085044,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.28125,
      "kl": 0.011325775182922371,
      "learning_rate": 7.465775750599767e-06,
      "loss": -0.0059,
      "num_tokens": 16950423.0,
      "reward": 11.469275760650635,
      "reward_std": 12.777053165435792,
      "rewards/wrapper/mean": 5.734637747704983,
      "rewards/wrapper/std": 16.157272858917715,
      "step": 1545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.8,
      "completions/mean_length": 252.25625,
      "completions/mean_terminated_length": 33.8,
      "completions/min_length": 136.2,
      "completions/min_terminated_length": 33.8,
      "epoch": 0.36363636363636365,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.796875,
      "kl": 0.007997657801024616,
      "learning_rate": 7.462860631087526e-06,
      "loss": -0.0103,
      "num_tokens": 17006628.0,
      "reward": 11.788213729858398,
      "reward_std": 15.076388835906982,
      "rewards/wrapper/mean": 5.894107177108526,
      "rewards/wrapper/std": 17.494142431020737,
      "step": 1550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.0,
      "completions/mean_length": 253.83125,
      "completions/mean_terminated_length": 33.0,
      "completions/min_length": 186.6,
      "completions/min_terminated_length": 33.0,
      "epoch": 0.36480938416422287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9296875,
      "kl": 0.006896321999374777,
      "learning_rate": 7.459936900835101e-06,
      "loss": -0.0029,
      "num_tokens": 17061199.0,
      "reward": 15.098381996154785,
      "reward_std": 20.83281021118164,
      "rewards/wrapper/mean": 7.549191132187843,
      "rewards/wrapper/std": 22.724813936650754,
      "step": 1555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.0,
      "completions/mean_length": 255.2125,
      "completions/mean_terminated_length": 26.0,
      "completions/min_length": 230.8,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.3659824046920821,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.75,
      "kl": 0.012470835470594466,
      "learning_rate": 7.45700456985318e-06,
      "loss": -0.0012,
      "num_tokens": 17115033.0,
      "reward": 12.148631858825684,
      "reward_std": 15.902271842956543,
      "rewards/wrapper/mean": 6.0743159070611,
      "rewards/wrapper/std": 15.158780360221863,
      "step": 1560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 51.8,
      "completions/mean_length": 252.81875,
      "completions/mean_terminated_length": 51.8,
      "completions/min_length": 154.2,
      "completions/min_terminated_length": 51.8,
      "epoch": 0.3671554252199413,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.8203125,
      "kl": 0.006677528831642121,
      "learning_rate": 7.454063648181896e-06,
      "loss": -0.0029,
      "num_tokens": 17168222.0,
      "reward": 13.805829715728759,
      "reward_std": 15.897808837890626,
      "rewards/wrapper/mean": 6.902914525568486,
      "rewards/wrapper/std": 22.55402392446995,
      "step": 1565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 29.2,
      "completions/mean_length": 248.925,
      "completions/mean_terminated_length": 14.9,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.3683284457478006,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.71875,
      "kl": 0.021633249043952675,
      "learning_rate": 7.451114145890799e-06,
      "loss": -0.0192,
      "num_tokens": 17222840.0,
      "reward": 9.334974765777588,
      "reward_std": 12.756710720062255,
      "rewards/wrapper/mean": 4.667487615346909,
      "rewards/wrapper/std": 15.754800505936146,
      "step": 1570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 253.25,
      "completions/mean_terminated_length": 14.4,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 14.4,
      "epoch": 0.36950146627565983,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.875,
      "kl": 0.0058940518880262974,
      "learning_rate": 7.448156073078817e-06,
      "loss": -0.0087,
      "num_tokens": 17275464.0,
      "reward": 9.2894437789917,
      "reward_std": 11.119479942321778,
      "rewards/wrapper/mean": 4.644721812009811,
      "rewards/wrapper/std": 12.392537288367748,
      "step": 1575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 253.2,
      "completions/mean_terminated_length": 6.4,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.37067448680351905,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.046875,
      "kl": 0.008174310147296637,
      "learning_rate": 7.445189439874223e-06,
      "loss": -0.0035,
      "num_tokens": 17331486.0,
      "reward": 9.974001216888428,
      "reward_std": 12.6971941947937,
      "rewards/wrapper/mean": 4.987000489979982,
      "rewards/wrapper/std": 15.624969989061356,
      "step": 1580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3718475073313783,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 8.0625,
      "kl": 0.010091668064706027,
      "learning_rate": 7.442214256434603e-06,
      "loss": 0.0004,
      "num_tokens": 17387346.0,
      "reward": 6.865322303771973,
      "reward_std": 8.876708436012269,
      "rewards/wrapper/mean": 3.432661159336567,
      "rewards/wrapper/std": 11.23374333679676,
      "step": 1585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 44.4,
      "completions/mean_length": 254.1875,
      "completions/mean_terminated_length": 44.4,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 44.4,
      "epoch": 0.37302052785923756,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.28125,
      "kl": 0.007138772174948826,
      "learning_rate": 7.439230532946815e-06,
      "loss": -0.0055,
      "num_tokens": 17439276.0,
      "reward": 9.094755506515503,
      "reward_std": 11.259793186187744,
      "rewards/wrapper/mean": 4.5473778083920475,
      "rewards/wrapper/std": 13.361431784927845,
      "step": 1590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 252.81875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 154.2,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.3741935483870968,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.90625,
      "kl": 0.006641448987647891,
      "learning_rate": 7.436238279626959e-06,
      "loss": -0.0091,
      "num_tokens": 17492901.0,
      "reward": 9.107149982452393,
      "reward_std": 12.01515827178955,
      "rewards/wrapper/mean": 4.553574965894223,
      "rewards/wrapper/std": 13.188546454906463,
      "step": 1595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.2,
      "completions/mean_length": 251.36875,
      "completions/mean_terminated_length": 2.8,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.375366568914956,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.21875,
      "kl": 0.0073716026323381815,
      "learning_rate": 7.433237506720342e-06,
      "loss": -0.0184,
      "num_tokens": 17546222.0,
      "reward": 11.634984397888184,
      "reward_std": 12.310375213623047,
      "rewards/wrapper/mean": 5.81749247610569,
      "rewards/wrapper/std": 17.150740154087543,
      "step": 1600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.37653958944281524,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.0,
      "kl": 0.007083708542631939,
      "learning_rate": 7.430228224501438e-06,
      "loss": -0.0058,
      "num_tokens": 17601677.0,
      "reward": 7.806136894226074,
      "reward_std": 8.510241031646729,
      "rewards/wrapper/mean": 3.903068270534277,
      "rewards/wrapper/std": 14.243344616889953,
      "step": 1605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.2,
      "completions/mean_length": 254.71875,
      "completions/mean_terminated_length": 10.2,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.37771260997067446,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.8125,
      "kl": 0.011267528822645544,
      "learning_rate": 7.427210443273859e-06,
      "loss": -0.003,
      "num_tokens": 17655154.0,
      "reward": 11.17388744354248,
      "reward_std": 12.301093673706054,
      "rewards/wrapper/mean": 5.586943505704403,
      "rewards/wrapper/std": 16.70587693154812,
      "step": 1610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.0,
      "completions/mean_length": 251.1625,
      "completions/mean_terminated_length": 50.0,
      "completions/min_length": 101.2,
      "completions/min_terminated_length": 50.0,
      "epoch": 0.37888563049853374,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.671875,
      "kl": 0.007835835078731179,
      "learning_rate": 7.424184173370319e-06,
      "loss": 0.0061,
      "num_tokens": 17708750.0,
      "reward": 13.72214469909668,
      "reward_std": 15.858495712280273,
      "rewards/wrapper/mean": 6.861072225868702,
      "rewards/wrapper/std": 21.06146321594715,
      "step": 1615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.0,
      "completions/mean_length": 254.33125,
      "completions/mean_terminated_length": 49.0,
      "completions/min_length": 202.6,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.38005865102639297,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.90625,
      "kl": 0.0077929800259880725,
      "learning_rate": 7.421149425152591e-06,
      "loss": -0.0013,
      "num_tokens": 17763259.0,
      "reward": 9.450750017166138,
      "reward_std": 9.001422214508057,
      "rewards/wrapper/mean": 4.725374779850244,
      "rewards/wrapper/std": 13.301532693952321,
      "step": 1620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 57.8,
      "completions/mean_length": 253.29375,
      "completions/mean_terminated_length": 48.1,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 38.4,
      "epoch": 0.3812316715542522,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.1640625,
      "kl": 0.010976305836811662,
      "learning_rate": 7.418106209011485e-06,
      "loss": 0.0035,
      "num_tokens": 17816420.0,
      "reward": 8.98981170654297,
      "reward_std": 12.001905918121338,
      "rewards/wrapper/mean": 4.494905859231949,
      "rewards/wrapper/std": 14.715555727481842,
      "step": 1625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.8,
      "completions/mean_length": 249.6625,
      "completions/mean_terminated_length": 1.3,
      "completions/min_length": 103.2,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.3824046920821114,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.75,
      "kl": 0.007970065460540354,
      "learning_rate": 7.415054535366797e-06,
      "loss": -0.0185,
      "num_tokens": 17871970.0,
      "reward": 15.410069465637207,
      "reward_std": 18.368908309936522,
      "rewards/wrapper/mean": 7.705034771561623,
      "rewards/wrapper/std": 21.16511830240488,
      "step": 1630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 93.0,
      "completions/mean_length": 251.19375,
      "completions/mean_terminated_length": 80.2,
      "completions/min_length": 118.6,
      "completions/min_terminated_length": 67.4,
      "epoch": 0.3835777126099707,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.625,
      "kl": 0.03646875837584958,
      "learning_rate": 7.411994414667286e-06,
      "loss": -0.0052,
      "num_tokens": 17926495.0,
      "reward": 11.289712238311768,
      "reward_std": 12.078022670745849,
      "rewards/wrapper/mean": 5.644856164604425,
      "rewards/wrapper/std": 15.732601109147073,
      "step": 1635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 247.1,
      "completions/mean_terminated_length": 15.0,
      "completions/min_length": 65.2,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.3847507331378299,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.359375,
      "kl": 0.010182082024402916,
      "learning_rate": 7.4089258573906325e-06,
      "loss": -0.0291,
      "num_tokens": 17982111.0,
      "reward": 10.671856796741485,
      "reward_std": 14.430114448070526,
      "rewards/wrapper/mean": 5.335928474366665,
      "rewards/wrapper/std": 17.302095092833042,
      "step": 1640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.0,
      "completions/mean_length": 253.49375,
      "completions/mean_terminated_length": 11.1,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.38592375366568915,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.28125,
      "kl": 0.027646390511654316,
      "learning_rate": 7.4058488740434015e-06,
      "loss": -0.0053,
      "num_tokens": 18035278.0,
      "reward": 11.211683654785157,
      "reward_std": 12.046236991882324,
      "rewards/wrapper/mean": 5.605841771513224,
      "rewards/wrapper/std": 16.439643205702303,
      "step": 1645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.8,
      "completions/mean_length": 253.6375,
      "completions/mean_terminated_length": 26.8,
      "completions/min_length": 180.4,
      "completions/min_terminated_length": 26.8,
      "epoch": 0.3870967741935484,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 19.125,
      "kl": 0.023396419017808513,
      "learning_rate": 7.402763475161009e-06,
      "loss": 0.005,
      "num_tokens": 18090086.0,
      "reward": 10.315212440490722,
      "reward_std": 10.757926654815673,
      "rewards/wrapper/mean": 5.157606067508459,
      "rewards/wrapper/std": 16.255353631079196,
      "step": 1650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.2,
      "completions/mean_length": 252.64375,
      "completions/mean_terminated_length": 46.2,
      "completions/min_length": 148.6,
      "completions/min_terminated_length": 46.2,
      "epoch": 0.3882697947214076,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.7578125,
      "kl": 0.012765820871572941,
      "learning_rate": 7.3996696713076875e-06,
      "loss": 0.0038,
      "num_tokens": 18144855.0,
      "reward": 9.633971977233887,
      "reward_std": 12.852963256835938,
      "rewards/wrapper/mean": 4.81698562502861,
      "rewards/wrapper/std": 14.805128015577793,
      "step": 1655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.8,
      "completions/mean_length": 247.71875,
      "completions/mean_terminated_length": 21.3,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.3894428152492669,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 1.5703125,
      "kl": 0.008341539406683297,
      "learning_rate": 7.3965674730764436e-06,
      "loss": -0.0171,
      "num_tokens": 18199476.0,
      "reward": 12.23201961517334,
      "reward_std": 14.92741813659668,
      "rewards/wrapper/mean": 6.116009667515755,
      "rewards/wrapper/std": 18.56237207353115,
      "step": 1660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.3906158357771261,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.09375,
      "kl": 0.008628571312874556,
      "learning_rate": 7.393456891089031e-06,
      "loss": -0.0047,
      "num_tokens": 18254951.0,
      "reward": 13.272594833374024,
      "reward_std": 16.658662605285645,
      "rewards/wrapper/mean": 6.636297233402729,
      "rewards/wrapper/std": 19.94486008733511,
      "step": 1665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 251.71875,
      "completions/mean_terminated_length": 8.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.39178885630498533,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.46875,
      "kl": 0.015108229930046947,
      "learning_rate": 7.3903379359959035e-06,
      "loss": -0.0113,
      "num_tokens": 18309606.0,
      "reward": 11.978076171875,
      "reward_std": 15.562600898742676,
      "rewards/wrapper/mean": 5.989037749916315,
      "rewards/wrapper/std": 17.648951482772826,
      "step": 1670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 88.0,
      "completions/mean_length": 250.7625,
      "completions/mean_terminated_length": 58.66666717529297,
      "completions/min_length": 146.4,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.39296187683284456,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.859375,
      "kl": 0.009045041864737868,
      "learning_rate": 7.387210618476187e-06,
      "loss": -0.0051,
      "num_tokens": 18364170.0,
      "reward": 7.711848163604737,
      "reward_std": 10.005695056915282,
      "rewards/wrapper/mean": 3.8559240214526653,
      "rewards/wrapper/std": 13.164579983055592,
      "step": 1675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 56.2,
      "completions/mean_length": 253.86875,
      "completions/mean_terminated_length": 55.5,
      "completions/min_length": 208.4,
      "completions/min_terminated_length": 54.8,
      "epoch": 0.39413489736070384,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.25,
      "kl": 0.010809017776045949,
      "learning_rate": 7.38407494923764e-06,
      "loss": -0.0047,
      "num_tokens": 18419115.0,
      "reward": 11.691289234161378,
      "reward_std": 14.441447448730468,
      "rewards/wrapper/mean": 5.845644051581621,
      "rewards/wrapper/std": 17.613725888729096,
      "step": 1680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.4,
      "completions/mean_length": 247.5375,
      "completions/mean_terminated_length": 18.2,
      "completions/min_length": 110.4,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.39530791788856307,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.4375,
      "kl": 0.006498944421764463,
      "learning_rate": 7.380930939016617e-06,
      "loss": -0.0026,
      "num_tokens": 18470913.0,
      "reward": 11.800182819366455,
      "reward_std": 13.530944919586181,
      "rewards/wrapper/mean": 5.900091470777989,
      "rewards/wrapper/std": 14.850279198586941,
      "step": 1685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 65.0,
      "completions/mean_length": 251.83125,
      "completions/mean_terminated_length": 43.5,
      "completions/min_length": 124.4,
      "completions/min_terminated_length": 22.0,
      "epoch": 0.3964809384164223,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.53125,
      "kl": 0.0053620882332324985,
      "learning_rate": 7.377778598578028e-06,
      "loss": -0.015,
      "num_tokens": 18523112.0,
      "reward": 11.4480149269104,
      "reward_std": 15.00893726348877,
      "rewards/wrapper/mean": 5.724007427692413,
      "rewards/wrapper/std": 17.372301462292672,
      "step": 1690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.3976539589442815,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.2734375,
      "kl": 0.017944533191621304,
      "learning_rate": 7.37461793871531e-06,
      "loss": 0.0032,
      "num_tokens": 18578531.0,
      "reward": 6.788930177688599,
      "reward_std": 8.851951217651367,
      "rewards/wrapper/mean": 3.3944652788341045,
      "rewards/wrapper/std": 11.30336948186159,
      "step": 1695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.4,
      "completions/mean_length": 255.38125,
      "completions/mean_terminated_length": 31.4,
      "completions/min_length": 236.2,
      "completions/min_terminated_length": 31.4,
      "epoch": 0.39882697947214074,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.015625,
      "kl": 0.009708100673742592,
      "learning_rate": 7.371448970250383e-06,
      "loss": 0.0022,
      "num_tokens": 18634028.0,
      "reward": 11.817505073547363,
      "reward_std": 14.400478649139405,
      "rewards/wrapper/mean": 5.908752170950175,
      "rewards/wrapper/std": 17.49454737752676,
      "step": 1700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 79.6,
      "completions/mean_length": 248.90625,
      "completions/mean_terminated_length": 33.93333435058594,
      "completions/min_length": 104.6,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.4,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.4375,
      "kl": 0.013681471673771739,
      "learning_rate": 7.368271704033615e-06,
      "loss": -0.0172,
      "num_tokens": 18687007.0,
      "reward": 8.214220666885376,
      "reward_std": 11.032522630691528,
      "rewards/wrapper/mean": 4.107110323756933,
      "rewards/wrapper/std": 13.84426678419113,
      "step": 1705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 59.4,
      "completions/mean_length": 251.46875,
      "completions/mean_terminated_length": 29.9,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.40117302052785925,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.0,
      "kl": 0.0418191681150347,
      "learning_rate": 7.365086150943786e-06,
      "loss": -0.0087,
      "num_tokens": 18746950.0,
      "reward": 13.3557297706604,
      "reward_std": 14.804699611663818,
      "rewards/wrapper/mean": 6.677864947915078,
      "rewards/wrapper/std": 18.732959206402302,
      "step": 1710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.4,
      "completions/mean_length": 252.875,
      "completions/mean_terminated_length": 2.4,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.4023460410557185,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.21875,
      "kl": 0.012403641873970628,
      "learning_rate": 7.3618923218880465e-06,
      "loss": -0.0037,
      "num_tokens": 18802882.0,
      "reward": 6.81656813621521,
      "reward_std": 8.471649742126464,
      "rewards/wrapper/mean": 3.408283967524767,
      "rewards/wrapper/std": 10.526500597596169,
      "step": 1715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.0,
      "completions/mean_length": 253.89375,
      "completions/mean_terminated_length": 35.0,
      "completions/min_length": 188.6,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.4035190615835777,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.53125,
      "kl": 0.013926792331039906,
      "learning_rate": 7.35869022780189e-06,
      "loss": -0.0055,
      "num_tokens": 18857201.0,
      "reward": 10.40798740386963,
      "reward_std": 11.334413433074952,
      "rewards/wrapper/mean": 5.203993559628725,
      "rewards/wrapper/std": 17.202790300548077,
      "step": 1720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.4046920821114369,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.609375,
      "kl": 0.0062289016088470815,
      "learning_rate": 7.355479879649102e-06,
      "loss": -0.0048,
      "num_tokens": 18911494.0,
      "reward": 9.848146200180054,
      "reward_std": 10.52330822944641,
      "rewards/wrapper/mean": 4.924072936177254,
      "rewards/wrapper/std": 12.834916192293168,
      "step": 1725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 253.3375,
      "completions/mean_terminated_length": 17.2,
      "completions/min_length": 170.8,
      "completions/min_terminated_length": 17.2,
      "epoch": 0.4058651026392962,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.421875,
      "kl": 0.009612555714556947,
      "learning_rate": 7.352261288421734e-06,
      "loss": 0.0099,
      "num_tokens": 18965412.0,
      "reward": 14.73887882232666,
      "reward_std": 18.674267578125,
      "rewards/wrapper/mean": 7.369439592212439,
      "rewards/wrapper/std": 19.213176207244395,
      "step": 1730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.40703812316715543,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.109375,
      "kl": 0.013163172616623342,
      "learning_rate": 7.349034465140059e-06,
      "loss": -0.0045,
      "num_tokens": 19024361.0,
      "reward": 12.89403257369995,
      "reward_std": 14.669578742980956,
      "rewards/wrapper/mean": 6.447016255557537,
      "rewards/wrapper/std": 18.42922862917185,
      "step": 1735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 251.59375,
      "completions/mean_terminated_length": 12.6,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 12.6,
      "epoch": 0.40821114369501466,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.40625,
      "kl": 0.01098422622308135,
      "learning_rate": 7.345799420852538e-06,
      "loss": -0.0102,
      "num_tokens": 19081464.0,
      "reward": 6.755143082141876,
      "reward_std": 9.144136524200439,
      "rewards/wrapper/mean": 3.3775712579488752,
      "rewards/wrapper/std": 10.79396327584982,
      "step": 1740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 254.95625,
      "completions/mean_terminated_length": 17.8,
      "completions/min_length": 222.6,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.4093841642228739,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 1.0234375,
      "kl": 0.016762871819082648,
      "learning_rate": 7.342556166635778e-06,
      "loss": 0.003,
      "num_tokens": 19139861.0,
      "reward": 11.431461834907532,
      "reward_std": 14.701215863227844,
      "rewards/wrapper/mean": 5.715730750560761,
      "rewards/wrapper/std": 18.74812933206558,
      "step": 1745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 253.26875,
      "completions/mean_terminated_length": 15.0,
      "completions/min_length": 168.6,
      "completions/min_terminated_length": 15.0,
      "epoch": 0.41055718475073316,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.0,
      "kl": 0.02420689011923969,
      "learning_rate": 7.3393047135944975e-06,
      "loss": -0.0026,
      "num_tokens": 19193342.0,
      "reward": 10.389392566680907,
      "reward_std": 11.804220390319824,
      "rewards/wrapper/mean": 5.194696100801229,
      "rewards/wrapper/std": 15.313809236884117,
      "step": 1750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 251.65625,
      "completions/mean_terminated_length": 7.5,
      "completions/min_length": 154.2,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.4117302052785924,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.6484375,
      "kl": 0.008320258150342852,
      "learning_rate": 7.336045072861489e-06,
      "loss": -0.0107,
      "num_tokens": 19248133.0,
      "reward": 12.969413948059081,
      "reward_std": 17.84697332382202,
      "rewards/wrapper/mean": 6.484706741571427,
      "rewards/wrapper/std": 17.889204749464987,
      "step": 1755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.2,
      "completions/mean_length": 252.05625,
      "completions/mean_terminated_length": 22.5,
      "completions/min_length": 171.4,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.4129032258064516,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.3828125,
      "kl": 0.006936083594337106,
      "learning_rate": 7.332777255597575e-06,
      "loss": -0.0072,
      "num_tokens": 19300942.0,
      "reward": 9.992176389694214,
      "reward_std": 10.995404851436614,
      "rewards/wrapper/mean": 4.996087930724025,
      "rewards/wrapper/std": 13.14226104170084,
      "step": 1760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.8,
      "completions/mean_length": 251.03125,
      "completions/mean_terminated_length": 45.8,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 45.8,
      "epoch": 0.41407624633431084,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.578125,
      "kl": 0.008393231546506286,
      "learning_rate": 7.3295012729915785e-06,
      "loss": -0.0081,
      "num_tokens": 19355175.0,
      "reward": 14.870281219482422,
      "reward_std": 18.440231704711913,
      "rewards/wrapper/mean": 7.435140260308981,
      "rewards/wrapper/std": 21.349429170787335,
      "step": 1765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 250.5375,
      "completions/mean_terminated_length": 30.0,
      "completions/min_length": 81.2,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.41524926686217006,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.453125,
      "kl": 0.007583037635777145,
      "learning_rate": 7.326217136260277e-06,
      "loss": -0.0148,
      "num_tokens": 19407093.0,
      "reward": 12.376422429084778,
      "reward_std": 14.544963467121125,
      "rewards/wrapper/mean": 6.18821112215519,
      "rewards/wrapper/std": 15.661508214473724,
      "step": 1770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 253.7375,
      "completions/mean_terminated_length": 30.0,
      "completions/min_length": 183.6,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.41642228739002934,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.8515625,
      "kl": 0.010185779200401156,
      "learning_rate": 7.322924856648371e-06,
      "loss": -0.0048,
      "num_tokens": 19461935.0,
      "reward": 11.91078872680664,
      "reward_std": 16.30748119354248,
      "rewards/wrapper/mean": 5.955394900590181,
      "rewards/wrapper/std": 19.23618437051773,
      "step": 1775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 251.625,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 13.6,
      "epoch": 0.41759530791788857,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.09375,
      "kl": 0.17002868838608265,
      "learning_rate": 7.319624445428436e-06,
      "loss": 0.005,
      "num_tokens": 19520653.0,
      "reward": 12.678552627563477,
      "reward_std": 16.436040306091307,
      "rewards/wrapper/mean": 6.339276467263699,
      "rewards/wrapper/std": 21.000767435133458,
      "step": 1780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 72.2,
      "completions/mean_length": 252.54375,
      "completions/mean_terminated_length": 58.6,
      "completions/min_length": 147.4,
      "completions/min_terminated_length": 45.0,
      "epoch": 0.4187683284457478,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.6328125,
      "kl": 0.00664262983482331,
      "learning_rate": 7.316315913900893e-06,
      "loss": -0.0056,
      "num_tokens": 19573212.0,
      "reward": 7.194902086257935,
      "reward_std": 9.6545166015625,
      "rewards/wrapper/mean": 3.5974511459469793,
      "rewards/wrapper/std": 12.101305271685124,
      "step": 1785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.8,
      "completions/mean_length": 252.5625,
      "completions/mean_terminated_length": 21.9,
      "completions/min_length": 154.6,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.419941348973607,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.5,
      "kl": 0.0394431886379607,
      "learning_rate": 7.312999273393968e-06,
      "loss": -0.0076,
      "num_tokens": 19627970.0,
      "reward": 7.889170932769775,
      "reward_std": 10.530238914489747,
      "rewards/wrapper/mean": 3.9445855379104615,
      "rewards/wrapper/std": 13.939225174486637,
      "step": 1790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.6,
      "completions/mean_length": 255.16875,
      "completions/mean_terminated_length": 24.6,
      "completions/min_length": 229.4,
      "completions/min_terminated_length": 24.6,
      "epoch": 0.4211143695014663,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.5,
      "kl": 0.009630634234054015,
      "learning_rate": 7.309674535263649e-06,
      "loss": -0.0015,
      "num_tokens": 19685435.0,
      "reward": 8.898407530784606,
      "reward_std": 11.805877017974854,
      "rewards/wrapper/mean": 4.4492038011550905,
      "rewards/wrapper/std": 14.308778963983059,
      "step": 1795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.4,
      "completions/mean_length": 254.44375,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 206.2,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.4222873900293255,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 7.90625,
      "kl": 0.02276097269495949,
      "learning_rate": 7.3063417108936525e-06,
      "loss": 0.0015,
      "num_tokens": 19742088.0,
      "reward": 6.068336296081543,
      "reward_std": 7.929187393188476,
      "rewards/wrapper/mean": 3.034168167412281,
      "rewards/wrapper/std": 11.404951086640358,
      "step": 1800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.0,
      "completions/mean_length": 250.9,
      "completions/mean_terminated_length": 14.4,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.42346041055718475,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.96875,
      "kl": 0.019500624504871666,
      "learning_rate": 7.3030008116953775e-06,
      "loss": -0.0126,
      "num_tokens": 19797162.0,
      "reward": 6.141694736480713,
      "reward_std": 8.104972839355469,
      "rewards/wrapper/mean": 3.07084731683135,
      "rewards/wrapper/std": 11.21379586905241,
      "step": 1805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 143.2,
      "completions/mean_length": 249.61875,
      "completions/mean_terminated_length": 112.73333435058593,
      "completions/min_length": 92.2,
      "completions/min_terminated_length": 92.2,
      "epoch": 0.424633431085044,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.21875,
      "kl": 0.23462779039982706,
      "learning_rate": 7.299651849107875e-06,
      "loss": -0.0037,
      "num_tokens": 19854501.0,
      "reward": 8.947885489463806,
      "reward_std": 11.50197262763977,
      "rewards/wrapper/mean": 4.473942489922047,
      "rewards/wrapper/std": 14.429775257408618,
      "step": 1810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.0,
      "completions/mean_length": 251.075,
      "completions/mean_terminated_length": 29.3,
      "completions/min_length": 114.0,
      "completions/min_terminated_length": 11.6,
      "epoch": 0.4258064516129032,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 7.21875,
      "kl": 0.012706667324528098,
      "learning_rate": 7.296294834597802e-06,
      "loss": 0.0008,
      "num_tokens": 19910181.0,
      "reward": 7.768388175964356,
      "reward_std": 9.072725200653077,
      "rewards/wrapper/mean": 3.884194038808346,
      "rewards/wrapper/std": 12.262552881240845,
      "step": 1815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.4,
      "completions/mean_length": 253.09375,
      "completions/mean_terminated_length": 9.4,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.4269794721407625,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.4375,
      "kl": 0.02032384374178946,
      "learning_rate": 7.292929779659388e-06,
      "loss": -0.0085,
      "num_tokens": 19966670.0,
      "reward": 7.567059135437011,
      "reward_std": 10.188446760177612,
      "rewards/wrapper/mean": 3.783529528230429,
      "rewards/wrapper/std": 12.873230685293674,
      "step": 1820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.0,
      "completions/mean_length": 251.73125,
      "completions/mean_terminated_length": 17.0,
      "completions/min_length": 119.4,
      "completions/min_terminated_length": 17.0,
      "epoch": 0.4281524926686217,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.6875,
      "kl": 0.02145702773705125,
      "learning_rate": 7.289556695814387e-06,
      "loss": -0.012,
      "num_tokens": 20023081.0,
      "reward": 9.922704219818115,
      "reward_std": 13.515721893310547,
      "rewards/wrapper/mean": 4.961352105438709,
      "rewards/wrapper/std": 16.33309898674488,
      "step": 1825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.42932551319648093,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.0625,
      "kl": 0.007976790505927056,
      "learning_rate": 7.28617559461205e-06,
      "loss": -0.0047,
      "num_tokens": 20079482.0,
      "reward": 6.184163045883179,
      "reward_std": 6.993026924133301,
      "rewards/wrapper/mean": 3.092081458866596,
      "rewards/wrapper/std": 8.93438842445612,
      "step": 1830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 77.4,
      "completions/mean_length": 249.44375,
      "completions/mean_terminated_length": 54.2,
      "completions/min_length": 82.2,
      "completions/min_terminated_length": 31.0,
      "epoch": 0.43049853372434016,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.09375,
      "kl": 0.010702864232007414,
      "learning_rate": 7.2827864876290725e-06,
      "loss": -0.0112,
      "num_tokens": 20133555.0,
      "reward": 12.384488344192505,
      "reward_std": 16.911638498306274,
      "rewards/wrapper/mean": 6.192244322597981,
      "rewards/wrapper/std": 19.98247754573822,
      "step": 1835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 250.1625,
      "completions/mean_terminated_length": 9.2,
      "completions/min_length": 104.8,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.43167155425219944,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.203125,
      "kl": 8.578012371703517,
      "learning_rate": 7.2793893864695675e-06,
      "loss": 0.3315,
      "num_tokens": 20187869.0,
      "reward": 12.463616633415223,
      "reward_std": 16.961518478393554,
      "rewards/wrapper/mean": 6.231808027625084,
      "rewards/wrapper/std": 20.683779430389404,
      "step": 1840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.6,
      "completions/mean_length": 250.46875,
      "completions/mean_terminated_length": 20.0,
      "completions/min_length": 114.8,
      "completions/min_terminated_length": 12.4,
      "epoch": 0.43284457478005867,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.171875,
      "kl": 0.03384362782817334,
      "learning_rate": 7.275984302765016e-06,
      "loss": -0.0105,
      "num_tokens": 20244016.0,
      "reward": 11.264978408813477,
      "reward_std": 12.805488967895508,
      "rewards/wrapper/mean": 5.63248887732625,
      "rewards/wrapper/std": 17.072499746084212,
      "step": 1845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.4,
      "completions/mean_length": 255.44375,
      "completions/mean_terminated_length": 33.4,
      "completions/min_length": 238.2,
      "completions/min_terminated_length": 33.4,
      "epoch": 0.4340175953079179,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.375,
      "kl": 0.008770254044793546,
      "learning_rate": 7.272571248174231e-06,
      "loss": 0.0012,
      "num_tokens": 20297411.0,
      "reward": 11.228783702850341,
      "reward_std": 14.728189849853516,
      "rewards/wrapper/mean": 5.614392015337944,
      "rewards/wrapper/std": 16.68226896971464,
      "step": 1850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.4351906158357771,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 7.5,
      "kl": 0.010837229958269745,
      "learning_rate": 7.269150234383318e-06,
      "loss": -0.0102,
      "num_tokens": 20355551.0,
      "reward": 11.516048622131347,
      "reward_std": 13.84256067276001,
      "rewards/wrapper/mean": 5.758024173974991,
      "rewards/wrapper/std": 18.53322400599718,
      "step": 1855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.2,
      "completions/mean_length": 252.4625,
      "completions/mean_terminated_length": 20.3,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.43636363636363634,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.390625,
      "kl": 0.007430908730020747,
      "learning_rate": 7.2657212731056345e-06,
      "loss": -0.0031,
      "num_tokens": 20409427.0,
      "reward": 9.576006889343262,
      "reward_std": 12.781783771514892,
      "rewards/wrapper/mean": 4.788003156334161,
      "rewards/wrapper/std": 14.093942853808404,
      "step": 1860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 44.0,
      "completions/mean_length": 247.83125,
      "completions/mean_terminated_length": 33.5,
      "completions/min_length": 119.6,
      "completions/min_terminated_length": 17.2,
      "epoch": 0.4375366568914956,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.578125,
      "kl": 0.04859507377259433,
      "learning_rate": 7.262284376081749e-06,
      "loss": -0.0194,
      "num_tokens": 20466058.0,
      "reward": 8.321262574195861,
      "reward_std": 5.054714918136597,
      "rewards/wrapper/mean": 4.160631164908409,
      "rewards/wrapper/std": 9.609023374319076,
      "step": 1865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 250.04375,
      "completions/mean_terminated_length": 14.2,
      "completions/min_length": 65.4,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.43870967741935485,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.015625,
      "kl": 0.017362323612906037,
      "learning_rate": 7.258839555079402e-06,
      "loss": -0.0054,
      "num_tokens": 20521151.0,
      "reward": 5.06641993522644,
      "reward_std": 3.91677873134613,
      "rewards/wrapper/mean": 2.5332097202539443,
      "rewards/wrapper/std": 8.172790160775184,
      "step": 1870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 73.0,
      "completions/mean_length": 251.2,
      "completions/mean_terminated_length": 51.6,
      "completions/min_length": 132.6,
      "completions/min_terminated_length": 30.2,
      "epoch": 0.4398826979472141,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.546875,
      "kl": 0.008750611578579991,
      "learning_rate": 7.255386821893465e-06,
      "loss": -0.0106,
      "num_tokens": 20574001.0,
      "reward": 8.642753672599792,
      "reward_std": 9.593472319841386,
      "rewards/wrapper/mean": 4.321376763284206,
      "rewards/wrapper/std": 13.84940035790205,
      "step": 1875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4410557184750733,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.515625,
      "kl": 0.008011186274234205,
      "learning_rate": 7.251926188345901e-06,
      "loss": 0.0003,
      "num_tokens": 20631611.0,
      "reward": 11.901252555847169,
      "reward_std": 16.12425422668457,
      "rewards/wrapper/mean": 5.950626049935818,
      "rewards/wrapper/std": 19.04661168754101,
      "step": 1880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 18.8,
      "completions/mean_length": 250.19375,
      "completions/mean_terminated_length": 10.3,
      "completions/min_length": 104.2,
      "completions/min_terminated_length": 1.8,
      "epoch": 0.4422287390029325,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.1875,
      "kl": 0.006020620831986889,
      "learning_rate": 7.248457666285724e-06,
      "loss": -0.0141,
      "num_tokens": 20683374.0,
      "reward": 11.824284934997559,
      "reward_std": 14.59802188873291,
      "rewards/wrapper/mean": 5.912142033874988,
      "rewards/wrapper/std": 18.940556921064854,
      "step": 1885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.4,
      "completions/mean_length": 248.66875,
      "completions/mean_terminated_length": 36.333334350585936,
      "completions/min_length": 69.6,
      "completions/min_terminated_length": 18.4,
      "epoch": 0.4434017595307918,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 7.21875,
      "kl": 0.008943629602435976,
      "learning_rate": 7.244981267588955e-06,
      "loss": -0.0154,
      "num_tokens": 20738729.0,
      "reward": 10.507330799102784,
      "reward_std": 10.409684324264527,
      "rewards/wrapper/mean": 5.253665325790644,
      "rewards/wrapper/std": 14.284284387528896,
      "step": 1890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44457478005865103,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9921875,
      "kl": 0.01356151889776811,
      "learning_rate": 7.241497004158588e-06,
      "loss": 0.0005,
      "num_tokens": 20790895.0,
      "reward": 12.190250253677368,
      "reward_std": 14.93087453842163,
      "rewards/wrapper/mean": 6.095125179737806,
      "rewards/wrapper/std": 16.378230841457842,
      "step": 1895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.6,
      "completions/mean_length": 251.25,
      "completions/mean_terminated_length": 1.6,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.44574780058651026,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 7.25,
      "kl": 0.010270661639515311,
      "learning_rate": 7.238004887924543e-06,
      "loss": -0.012,
      "num_tokens": 20847969.0,
      "reward": 12.254180431365967,
      "reward_std": 14.935155391693115,
      "rewards/wrapper/mean": 6.12709027454257,
      "rewards/wrapper/std": 16.741011860966683,
      "step": 1900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.0,
      "completions/mean_length": 246.50625,
      "completions/mean_terminated_length": 1.9,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.4469208211143695,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.75,
      "kl": 0.031814318336546424,
      "learning_rate": 7.234504930843625e-06,
      "loss": -0.0165,
      "num_tokens": 20903034.0,
      "reward": 8.4836496591568,
      "reward_std": 11.438926529884338,
      "rewards/wrapper/mean": 4.241824831068516,
      "rewards/wrapper/std": 11.069314436614514,
      "step": 1905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.44809384164222876,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.5,
      "kl": 0.028099820134229958,
      "learning_rate": 7.230997144899492e-06,
      "loss": -0.0089,
      "num_tokens": 20962770.0,
      "reward": 15.655290794372558,
      "reward_std": 21.4853853225708,
      "rewards/wrapper/mean": 7.827645578980446,
      "rewards/wrapper/std": 22.036782597005367,
      "step": 1910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 250.03125,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 13.6,
      "epoch": 0.449266862170088,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.046875,
      "kl": 0.008698656456544995,
      "learning_rate": 7.227481542102603e-06,
      "loss": -0.0205,
      "num_tokens": 21015545.0,
      "reward": 9.024892663955688,
      "reward_std": 11.932024049758912,
      "rewards/wrapper/mean": 4.512446265667677,
      "rewards/wrapper/std": 15.796873818337918,
      "step": 1915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.0,
      "completions/mean_length": 249.29375,
      "completions/mean_terminated_length": 41.0,
      "completions/min_length": 143.4,
      "completions/min_terminated_length": 41.0,
      "epoch": 0.4504398826979472,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.421875,
      "kl": 0.008335027925204486,
      "learning_rate": 7.223958134490182e-06,
      "loss": -0.0107,
      "num_tokens": 21068274.0,
      "reward": 14.141839408874512,
      "reward_std": 16.80069694519043,
      "rewards/wrapper/mean": 7.0709196768701075,
      "rewards/wrapper/std": 18.76930390149355,
      "step": 1920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.0,
      "completions/mean_length": 252.45625,
      "completions/mean_terminated_length": 25.3,
      "completions/min_length": 164.2,
      "completions/min_terminated_length": 10.6,
      "epoch": 0.45161290322580644,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.765625,
      "kl": 0.004524467344162985,
      "learning_rate": 7.2204269341261774e-06,
      "loss": -0.0035,
      "num_tokens": 21120481.0,
      "reward": 10.868282413482666,
      "reward_std": 11.861543273925781,
      "rewards/wrapper/mean": 5.434141282737255,
      "rewards/wrapper/std": 16.583185213804246,
      "step": 1925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 37.2,
      "completions/mean_length": 253.9625,
      "completions/mean_terminated_length": 37.2,
      "completions/min_length": 190.8,
      "completions/min_terminated_length": 37.2,
      "epoch": 0.45278592375366566,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.6328125,
      "kl": 0.008211625734111294,
      "learning_rate": 7.21688795310122e-06,
      "loss": -0.0075,
      "num_tokens": 21174425.0,
      "reward": 8.30243649482727,
      "reward_std": 11.125893306732177,
      "rewards/wrapper/mean": 4.1512182362377645,
      "rewards/wrapper/std": 15.107886047661305,
      "step": 1930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 23.6,
      "completions/mean_length": 255.1375,
      "completions/mean_terminated_length": 23.6,
      "completions/min_length": 228.4,
      "completions/min_terminated_length": 23.6,
      "epoch": 0.45395894428152495,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.5625,
      "kl": 0.010393868479877711,
      "learning_rate": 7.213341203532579e-06,
      "loss": -0.0012,
      "num_tokens": 21227715.0,
      "reward": 8.592939472198486,
      "reward_std": 11.629208850860596,
      "rewards/wrapper/mean": 4.296469537913799,
      "rewards/wrapper/std": 15.236213786900043,
      "step": 1935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 89.0,
      "completions/mean_length": 249.8,
      "completions/mean_terminated_length": 68.53333435058593,
      "completions/min_length": 99.8,
      "completions/min_terminated_length": 48.6,
      "epoch": 0.45513196480938417,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.9375,
      "kl": 0.023260063456837086,
      "learning_rate": 7.209786697564124e-06,
      "loss": -0.0113,
      "num_tokens": 21286759.0,
      "reward": 14.108087921142578,
      "reward_std": 14.388849067687989,
      "rewards/wrapper/mean": 7.054043973982334,
      "rewards/wrapper/std": 18.971005833148958,
      "step": 1940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.6,
      "completions/mean_length": 250.65625,
      "completions/mean_terminated_length": 17.1,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.4563049853372434,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.640625,
      "kl": 0.013732324563898146,
      "learning_rate": 7.206224447366281e-06,
      "loss": -0.0036,
      "num_tokens": 21342288.0,
      "reward": 15.73800368309021,
      "reward_std": 18.553266048431396,
      "rewards/wrapper/mean": 7.869001491367817,
      "rewards/wrapper/std": 21.538618184626102,
      "step": 1945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.0,
      "completions/mean_length": 253.45625,
      "completions/mean_terminated_length": 21.0,
      "completions/min_length": 174.6,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.4574780058651026,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.203125,
      "kl": 0.007772558298893273,
      "learning_rate": 7.202654465135994e-06,
      "loss": -0.0075,
      "num_tokens": 21395321.0,
      "reward": 10.073754501342773,
      "reward_std": 9.745565795898438,
      "rewards/wrapper/mean": 5.036876889318227,
      "rewards/wrapper/std": 14.711842876672744,
      "step": 1950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4586510263929619,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.2578125,
      "kl": 0.016830851417034866,
      "learning_rate": 7.1990767630966786e-06,
      "loss": 0.0007,
      "num_tokens": 21449241.0,
      "reward": 11.806681632995605,
      "reward_std": 16.12475757598877,
      "rewards/wrapper/mean": 5.903340773284436,
      "rewards/wrapper/std": 18.54739146232605,
      "step": 1955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.4,
      "completions/mean_length": 252.94375,
      "completions/mean_terminated_length": 2.3,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.45982404692082113,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.84375,
      "kl": 0.02298996753524989,
      "learning_rate": 7.195491353498185e-06,
      "loss": -0.0097,
      "num_tokens": 21503506.0,
      "reward": 9.964939308166503,
      "reward_std": 11.606767654418945,
      "rewards/wrapper/mean": 4.982469742745161,
      "rewards/wrapper/std": 15.793828999996185,
      "step": 1960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46099706744868035,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.84375,
      "kl": 0.012611826619831845,
      "learning_rate": 7.191898248616752e-06,
      "loss": 0.0005,
      "num_tokens": 21556986.0,
      "reward": 9.814716625213624,
      "reward_std": 10.976065969467163,
      "rewards/wrapper/mean": 4.907358513772488,
      "rewards/wrapper/std": 14.037784579396249,
      "step": 1965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.6,
      "completions/mean_length": 255.60625,
      "completions/mean_terminated_length": 38.6,
      "completions/min_length": 243.4,
      "completions/min_terminated_length": 38.6,
      "epoch": 0.4621700879765396,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.328125,
      "kl": 0.0076864651869982484,
      "learning_rate": 7.188297460754966e-06,
      "loss": 0.0011,
      "num_tokens": 21613243.0,
      "reward": 12.246005964279174,
      "reward_std": 14.45817790031433,
      "rewards/wrapper/mean": 6.123002929985523,
      "rewards/wrapper/std": 14.876671414077283,
      "step": 1970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.0,
      "completions/mean_length": 250.35,
      "completions/mean_terminated_length": 24.0,
      "completions/min_length": 75.2,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.4633431085043988,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.515625,
      "kl": 0.007964454928878695,
      "learning_rate": 7.18468900224172e-06,
      "loss": -0.016,
      "num_tokens": 21667595.0,
      "reward": 6.403034138679504,
      "reward_std": 8.485719972848893,
      "rewards/wrapper/mean": 3.2015167769044637,
      "rewards/wrapper/std": 11.11855943724513,
      "step": 1975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.94375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 65.2,
      "completions/mean_length": 243.6625,
      "completions/mean_terminated_length": 61.5,
      "completions/min_length": 59.6,
      "completions/min_terminated_length": 59.6,
      "epoch": 0.4645161290322581,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.765625,
      "kl": 0.007507404690841213,
      "learning_rate": 7.1810728854321735e-06,
      "loss": -0.0286,
      "num_tokens": 21721657.0,
      "reward": 16.982488250732423,
      "reward_std": 20.268542289733887,
      "rewards/wrapper/mean": 8.491244368255138,
      "rewards/wrapper/std": 18.553547403216363,
      "step": 1980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.4,
      "completions/mean_length": 251.9625,
      "completions/mean_terminated_length": 24.4,
      "completions/min_length": 126.8,
      "completions/min_terminated_length": 24.4,
      "epoch": 0.4656891495601173,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 8.875,
      "kl": 0.06541569416876883,
      "learning_rate": 7.177449122707703e-06,
      "loss": -0.0055,
      "num_tokens": 21775327.0,
      "reward": 9.9188814163208,
      "reward_std": 12.333803415298462,
      "rewards/wrapper/mean": 4.95944052785635,
      "rewards/wrapper/std": 12.621320475637912,
      "step": 1985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 68.8,
      "completions/mean_length": 251.75625,
      "completions/mean_terminated_length": 55.2,
      "completions/min_length": 144.0,
      "completions/min_terminated_length": 41.6,
      "epoch": 0.46686217008797654,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.3046875,
      "kl": 0.010084632772486658,
      "learning_rate": 7.17381772647587e-06,
      "loss": -0.012,
      "num_tokens": 21830376.0,
      "reward": 8.562131118774413,
      "reward_std": 10.988706159591676,
      "rewards/wrapper/mean": 4.281065583229065,
      "rewards/wrapper/std": 12.385915765166283,
      "step": 1990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.46803519061583576,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.9375,
      "kl": 0.010933949542231858,
      "learning_rate": 7.170178709170365e-06,
      "loss": -0.0121,
      "num_tokens": 21885670.0,
      "reward": 12.820589447021485,
      "reward_std": 17.374348068237303,
      "rewards/wrapper/mean": 6.410294429957867,
      "rewards/wrapper/std": 16.29390445202589,
      "step": 1995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 92.2,
      "completions/mean_length": 252.48125,
      "completions/mean_terminated_length": 92.2,
      "completions/min_length": 143.4,
      "completions/min_terminated_length": 92.2,
      "epoch": 0.46920821114369504,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.125,
      "kl": 0.0378818953060545,
      "learning_rate": 7.1665320832509805e-06,
      "loss": -0.0031,
      "num_tokens": 21938619.0,
      "reward": 10.933886623382568,
      "reward_std": 14.852296829223633,
      "rewards/wrapper/mean": 5.466943139582872,
      "rewards/wrapper/std": 16.55307368338108,
      "step": 2000
    },
    {
      "epoch": 0.46920821114369504,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.81,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 77.34,
      "eval_completions/mean_length": 231.14,
      "eval_completions/mean_terminated_length": 69.25166679382325,
      "eval_completions/min_length": 176.15,
      "eval_completions/min_terminated_length": 60.95,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.012026752880774438,
      "eval_loss": -0.026698114350438118,
      "eval_num_tokens": 21938619.0,
      "eval_reward": 0.391553550735116,
      "eval_reward_std": 0.2365354063967243,
      "eval_rewards/wrapper/mean": 0.19577677831053733,
      "eval_rewards/wrapper/std": 0.1914515098161064,
      "eval_runtime": 208.8731,
      "eval_samples_per_second": 0.958,
      "eval_steps_per_second": 0.239,
      "step": 2000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 29.4,
      "completions/mean_length": 255.31875,
      "completions/mean_terminated_length": 29.4,
      "completions/min_length": 234.2,
      "completions/min_terminated_length": 29.4,
      "epoch": 0.47038123167155427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.15625,
      "kl": 0.009880531148519367,
      "learning_rate": 7.162877861203553e-06,
      "loss": 0.0024,
      "num_tokens": 21994548.0,
      "reward": 9.461054754257201,
      "reward_std": 10.576078653335571,
      "rewards/wrapper/mean": 4.730527497828007,
      "rewards/wrapper/std": 14.311911128461361,
      "step": 2005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.4715542521994135,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.140625,
      "kl": 0.010484227410051972,
      "learning_rate": 7.159216055539936e-06,
      "loss": -0.0096,
      "num_tokens": 22053638.0,
      "reward": 9.75552453994751,
      "reward_std": 13.13539743423462,
      "rewards/wrapper/mean": 4.877762029320001,
      "rewards/wrapper/std": 16.15574167072773,
      "step": 2010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 18.4,
      "completions/mean_length": 253.375,
      "completions/mean_terminated_length": 18.4,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 18.4,
      "epoch": 0.4727272727272727,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.03125,
      "kl": 0.0069014692155178635,
      "learning_rate": 7.155546678797941e-06,
      "loss": -0.0071,
      "num_tokens": 22109996.0,
      "reward": 5.2139427185058596,
      "reward_std": 6.810544824600219,
      "rewards/wrapper/mean": 2.6069714561104775,
      "rewards/wrapper/std": 8.123496209084987,
      "step": 2015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.8,
      "completions/mean_length": 246.24375,
      "completions/mean_terminated_length": 31.2,
      "completions/min_length": 16.6,
      "completions/min_terminated_length": 16.6,
      "epoch": 0.47390029325513194,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.296875,
      "kl": 0.009175126685295253,
      "learning_rate": 7.1518697435413075e-06,
      "loss": -0.0085,
      "num_tokens": 22161447.0,
      "reward": 16.252300643920897,
      "reward_std": 18.46790657043457,
      "rewards/wrapper/mean": 8.126150195300578,
      "rewards/wrapper/std": 22.689970228075982,
      "step": 2020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 253.05,
      "completions/mean_terminated_length": 8.0,
      "completions/min_length": 161.6,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.4750733137829912,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.21875,
      "kl": 0.013811478717252612,
      "learning_rate": 7.148185262359653e-06,
      "loss": -0.0064,
      "num_tokens": 22215827.0,
      "reward": 8.836444938182831,
      "reward_std": 11.80629455447197,
      "rewards/wrapper/mean": 4.418222548812627,
      "rewards/wrapper/std": 12.121624572575092,
      "step": 2025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47624633431085045,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.40625,
      "kl": 0.011954761703964322,
      "learning_rate": 7.144493247868432e-06,
      "loss": 0.0005,
      "num_tokens": 22270149.0,
      "reward": 8.726956057548524,
      "reward_std": 10.781459975242615,
      "rewards/wrapper/mean": 4.363477950543166,
      "rewards/wrapper/std": 11.56369944959879,
      "step": 2030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.0,
      "completions/mean_length": 247.7125,
      "completions/mean_terminated_length": 19.56666679382324,
      "completions/min_length": 116.4,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.4774193548387097,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.390625,
      "kl": 0.017661917663645,
      "learning_rate": 7.140793712708894e-06,
      "loss": -0.02,
      "num_tokens": 22324755.0,
      "reward": 11.873594665527344,
      "reward_std": 16.035013008117676,
      "rewards/wrapper/mean": 5.936797216534615,
      "rewards/wrapper/std": 18.508428135514258,
      "step": 2035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.4785923753665689,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.53125,
      "kl": 0.006388617679476738,
      "learning_rate": 7.137086669548035e-06,
      "loss": -0.0167,
      "num_tokens": 22379342.0,
      "reward": 9.060737133026123,
      "reward_std": 10.055862641334533,
      "rewards/wrapper/mean": 4.530368596315384,
      "rewards/wrapper/std": 15.233650147914886,
      "step": 2040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.6,
      "completions/mean_length": 252.1,
      "completions/mean_terminated_length": 14.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.4797653958944281,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.375,
      "kl": 8129.877822359197,
      "learning_rate": 7.1333721310785614e-06,
      "loss": 325.1902,
      "num_tokens": 22435590.0,
      "reward": 9.408462858200073,
      "reward_std": 12.48482882976532,
      "rewards/wrapper/mean": 4.704231335222721,
      "rewards/wrapper/std": 13.56791399270296,
      "step": 2045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.8,
      "completions/mean_length": 251.25625,
      "completions/mean_terminated_length": 1.8,
      "completions/min_length": 104.2,
      "completions/min_terminated_length": 1.8,
      "epoch": 0.4809384164222874,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.765625,
      "kl": 0.010197655879892409,
      "learning_rate": 7.129650110018844e-06,
      "loss": -0.0065,
      "num_tokens": 22488429.0,
      "reward": 10.777331662178039,
      "reward_std": 13.09556884765625,
      "rewards/wrapper/mean": 5.388665563613176,
      "rewards/wrapper/std": 15.516864584386349,
      "step": 2050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 254.4125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 205.2,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.48211143695014663,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 16.125,
      "kl": 0.029901218856684862,
      "learning_rate": 7.12592061911287e-06,
      "loss": 0.0062,
      "num_tokens": 22540891.0,
      "reward": 5.5157770156860355,
      "reward_std": 6.8765318393707275,
      "rewards/wrapper/mean": 2.757888501882553,
      "rewards/wrapper/std": 9.093967694044114,
      "step": 2055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.48328445747800586,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.5703125,
      "kl": 0.00937183212954551,
      "learning_rate": 7.122183671130207e-06,
      "loss": -0.016,
      "num_tokens": 22593954.0,
      "reward": 10.615709495544433,
      "reward_std": 13.608506298065185,
      "rewards/wrapper/mean": 5.307854762673378,
      "rewards/wrapper/std": 14.481095506250858,
      "step": 2060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.6,
      "completions/mean_length": 252.88125,
      "completions/mean_terminated_length": 2.6,
      "completions/min_length": 156.2,
      "completions/min_terminated_length": 2.6,
      "epoch": 0.4844574780058651,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.3203125,
      "kl": 0.009229195350781084,
      "learning_rate": 7.118439278865949e-06,
      "loss": -0.0063,
      "num_tokens": 22646413.0,
      "reward": 10.647134971618652,
      "reward_std": 13.339782333374023,
      "rewards/wrapper/mean": 5.323567350953818,
      "rewards/wrapper/std": 16.633514940738678,
      "step": 2065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.48563049853372436,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.3125,
      "kl": 0.017320739361457525,
      "learning_rate": 7.114687455140686e-06,
      "loss": -0.0119,
      "num_tokens": 22703215.0,
      "reward": 10.626597595214843,
      "reward_std": 11.47060546875,
      "rewards/wrapper/mean": 5.313298827409744,
      "rewards/wrapper/std": 14.610971334576607,
      "step": 2070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.4868035190615836,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.625,
      "kl": 0.026328472554450855,
      "learning_rate": 7.110928212800449e-06,
      "loss": -0.0015,
      "num_tokens": 22760596.0,
      "reward": 8.179675102233887,
      "reward_std": 9.763381147384644,
      "rewards/wrapper/mean": 4.089837612211705,
      "rewards/wrapper/std": 12.722039490938187,
      "step": 2075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.4879765395894428,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.53125,
      "kl": 0.018763077515177428,
      "learning_rate": 7.107161564716671e-06,
      "loss": -0.0055,
      "num_tokens": 22814613.0,
      "reward": 7.410968685150147,
      "reward_std": 9.938138008117676,
      "rewards/wrapper/mean": 3.705484404414892,
      "rewards/wrapper/std": 12.76738702505827,
      "step": 2080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.0,
      "completions/mean_length": 253.9875,
      "completions/mean_terminated_length": 38.0,
      "completions/min_length": 191.6,
      "completions/min_terminated_length": 38.0,
      "epoch": 0.48914956011730204,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 7.21875,
      "kl": 0.015139080863445998,
      "learning_rate": 7.10338752378614e-06,
      "loss": -0.0032,
      "num_tokens": 22871085.0,
      "reward": 12.965310859680176,
      "reward_std": 14.322172927856446,
      "rewards/wrapper/mean": 6.482655397057533,
      "rewards/wrapper/std": 19.77724291831255,
      "step": 2085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.49032258064516127,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.03125,
      "kl": 0.008022710657678544,
      "learning_rate": 7.099606102930959e-06,
      "loss": -0.0072,
      "num_tokens": 22924535.0,
      "reward": 9.030936241149902,
      "reward_std": 11.908495712280274,
      "rewards/wrapper/mean": 4.515467864274979,
      "rewards/wrapper/std": 14.924723632633686,
      "step": 2090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.6,
      "completions/mean_length": 253.7875,
      "completions/mean_terminated_length": 31.6,
      "completions/min_length": 185.2,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.49149560117302055,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.71875,
      "kl": 0.006183647876605391,
      "learning_rate": 7.095817315098498e-06,
      "loss": -0.0064,
      "num_tokens": 22977949.0,
      "reward": 12.94020755290985,
      "reward_std": 17.86543025970459,
      "rewards/wrapper/mean": 6.470103675872087,
      "rewards/wrapper/std": 16.391293506324292,
      "step": 2095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.49266862170087977,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.5,
      "kl": 0.008084136416437104,
      "learning_rate": 7.092021173261353e-06,
      "loss": -0.0109,
      "num_tokens": 23031935.0,
      "reward": 11.518471813201904,
      "reward_std": 11.456220531463623,
      "rewards/wrapper/mean": 5.759235548973083,
      "rewards/wrapper/std": 16.237758734822272,
      "step": 2100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 251.6375,
      "completions/mean_terminated_length": 7.1,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.493841642228739,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 0.9375,
      "kl": 0.008848378155380487,
      "learning_rate": 7.088217690417298e-06,
      "loss": -0.0107,
      "num_tokens": 23086489.0,
      "reward": 7.520121216773987,
      "reward_std": 10.031231105327606,
      "rewards/wrapper/mean": 3.7600605204701423,
      "rewards/wrapper/std": 12.67145141363144,
      "step": 2105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.0,
      "completions/mean_length": 255.4,
      "completions/mean_terminated_length": 32.0,
      "completions/min_length": 236.8,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.4950146627565982,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.65625,
      "kl": 0.008108193590305745,
      "learning_rate": 7.084406879589242e-06,
      "loss": -0.0012,
      "num_tokens": 23144123.0,
      "reward": 11.643420743942261,
      "reward_std": 13.64364709854126,
      "rewards/wrapper/mean": 5.82171031832695,
      "rewards/wrapper/std": 14.45436689555645,
      "step": 2110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.6,
      "completions/mean_length": 253.00625,
      "completions/mean_terminated_length": 6.6,
      "completions/min_length": 160.2,
      "completions/min_terminated_length": 6.6,
      "epoch": 0.4961876832844575,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.046875,
      "kl": 0.007070534571539611,
      "learning_rate": 7.080588753825184e-06,
      "loss": -0.0102,
      "num_tokens": 23198110.0,
      "reward": 10.695003128051757,
      "reward_std": 14.557994079589843,
      "rewards/wrapper/mean": 5.347501567006111,
      "rewards/wrapper/std": 17.22459286004305,
      "step": 2115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 75.2,
      "completions/mean_length": 250.3625,
      "completions/mean_terminated_length": 53.9,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 32.6,
      "epoch": 0.49736070381231673,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.765625,
      "kl": 0.029336131096351893,
      "learning_rate": 7.076763326198173e-06,
      "loss": -0.0016,
      "num_tokens": 23253760.0,
      "reward": 8.677525091171265,
      "reward_std": 9.848081493377686,
      "rewards/wrapper/mean": 4.338762363791465,
      "rewards/wrapper/std": 13.28867315351963,
      "step": 2120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 247.06875,
      "completions/mean_terminated_length": 5.65,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.49853372434017595,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.578125,
      "kl": 0.017404579918365925,
      "learning_rate": 7.072930609806254e-06,
      "loss": -0.0171,
      "num_tokens": 23306149.0,
      "reward": 15.353384113311767,
      "reward_std": 19.154958724975586,
      "rewards/wrapper/mean": 7.676692083477974,
      "rewards/wrapper/std": 22.459415701031684,
      "step": 2125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 85.6,
      "completions/mean_length": 252.28125,
      "completions/mean_terminated_length": 68.3,
      "completions/min_length": 153.4,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.4997067448680352,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.4453125,
      "kl": 0.006470509018981829,
      "learning_rate": 7.0690906177724305e-06,
      "loss": 0.0063,
      "num_tokens": 23363424.0,
      "reward": 9.307399272918701,
      "reward_std": 12.280207061767578,
      "rewards/wrapper/mean": 4.653699503093958,
      "rewards/wrapper/std": 15.558639793097973,
      "step": 2130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 65.2,
      "completions/mean_length": 251.64375,
      "completions/mean_terminated_length": 62.9,
      "completions/min_length": 163.0,
      "completions/min_terminated_length": 60.6,
      "epoch": 0.5008797653958944,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.875,
      "kl": 0.017257986776530742,
      "learning_rate": 7.065243363244619e-06,
      "loss": -0.0119,
      "num_tokens": 23418599.0,
      "reward": 12.853836822509766,
      "reward_std": 17.099227905273438,
      "rewards/wrapper/mean": 6.4269180707633495,
      "rewards/wrapper/std": 18.025887221097946,
      "step": 2135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 251.41875,
      "completions/mean_terminated_length": 7.0,
      "completions/min_length": 109.4,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.5020527859237537,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.921875,
      "kl": 0.008581590990070253,
      "learning_rate": 7.0613888593956e-06,
      "loss": -0.0054,
      "num_tokens": 23472810.0,
      "reward": 8.985012340545655,
      "reward_std": 11.706706619262695,
      "rewards/wrapper/mean": 4.492506121098995,
      "rewards/wrapper/std": 12.430897434055804,
      "step": 2140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5032258064516129,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.890625,
      "kl": 0.008975341753102838,
      "learning_rate": 7.057527119422977e-06,
      "loss": -0.0053,
      "num_tokens": 23526291.0,
      "reward": 10.154117774963378,
      "reward_std": 13.665915584564209,
      "rewards/wrapper/mean": 5.077058912813664,
      "rewards/wrapper/std": 15.374008457362653,
      "step": 2145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.8,
      "completions/mean_length": 252.4125,
      "completions/mean_terminated_length": 38.8,
      "completions/min_length": 141.2,
      "completions/min_terminated_length": 38.8,
      "epoch": 0.5043988269794721,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5703125,
      "kl": 0.006695700716227293,
      "learning_rate": 7.0536581565491265e-06,
      "loss": -0.0117,
      "num_tokens": 23577593.0,
      "reward": 7.732948541641235,
      "reward_std": 8.718576312065125,
      "rewards/wrapper/mean": 3.866474460810423,
      "rewards/wrapper/std": 11.597916722297668,
      "step": 2150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 248.03125,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.5055718475073314,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.21875,
      "kl": 0.009025942167500033,
      "learning_rate": 7.049781984021159e-06,
      "loss": -0.0187,
      "num_tokens": 23630638.0,
      "reward": 10.552233409881591,
      "reward_std": 13.876630878448486,
      "rewards/wrapper/mean": 5.276116743683815,
      "rewards/wrapper/std": 15.018184214830399,
      "step": 2155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.2,
      "completions/mean_length": 253.8125,
      "completions/mean_terminated_length": 16.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5067448680351906,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.140625,
      "kl": 0.008229665062390267,
      "learning_rate": 7.04589861511087e-06,
      "loss": -0.0064,
      "num_tokens": 23683908.0,
      "reward": 7.022137629985809,
      "reward_std": 9.132097482681274,
      "rewards/wrapper/mean": 3.511068840324879,
      "rewards/wrapper/std": 10.933140191435815,
      "step": 2160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 36.4,
      "completions/mean_length": 253.9375,
      "completions/mean_terminated_length": 36.4,
      "completions/min_length": 190.0,
      "completions/min_terminated_length": 36.4,
      "epoch": 0.5079178885630499,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.4375,
      "kl": 0.014441122498828918,
      "learning_rate": 7.042008063114695e-06,
      "loss": -0.0074,
      "num_tokens": 23739054.0,
      "reward": 7.9544504404067995,
      "reward_std": 10.569594264030457,
      "rewards/wrapper/mean": 3.977225196361542,
      "rewards/wrapper/std": 12.171872541308403,
      "step": 2165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.2,
      "completions/mean_length": 251.9,
      "completions/mean_terminated_length": 22.2,
      "completions/min_length": 175.8,
      "completions/min_terminated_length": 22.2,
      "epoch": 0.509090909090909,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.1875,
      "kl": 0.013165635778568685,
      "learning_rate": 7.038110341353661e-06,
      "loss": 0.0052,
      "num_tokens": 23796234.0,
      "reward": 11.378597354888916,
      "reward_std": 15.319499397277832,
      "rewards/wrapper/mean": 5.689298801869154,
      "rewards/wrapper/std": 20.10531617105007,
      "step": 2170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 248.03125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5102639296187683,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.8984375,
      "kl": 0.007043931004591286,
      "learning_rate": 7.034205463173349e-06,
      "loss": -0.0194,
      "num_tokens": 23849139.0,
      "reward": 11.2071537733078,
      "reward_std": 12.747740030288696,
      "rewards/wrapper/mean": 5.6035766273736956,
      "rewards/wrapper/std": 14.315529163181782,
      "step": 2175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.2,
      "completions/mean_length": 252.49375,
      "completions/mean_terminated_length": 40.1,
      "completions/min_length": 193.6,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.5114369501466276,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.984375,
      "kl": 0.013299105316400528,
      "learning_rate": 7.030293441943839e-06,
      "loss": -0.0089,
      "num_tokens": 23904442.0,
      "reward": 14.169266891479491,
      "reward_std": 15.299544715881348,
      "rewards/wrapper/mean": 7.084633606672287,
      "rewards/wrapper/std": 20.22032133191824,
      "step": 2180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 74.4,
      "completions/mean_length": 247.14375,
      "completions/mean_terminated_length": 45.2,
      "completions/min_length": 81.8,
      "completions/min_terminated_length": 30.6,
      "epoch": 0.5126099706744868,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.78125,
      "kl": 0.02666954748565331,
      "learning_rate": 7.02637429105967e-06,
      "loss": -0.0197,
      "num_tokens": 23961361.0,
      "reward": 13.424666404724121,
      "reward_std": 15.76526699066162,
      "rewards/wrapper/mean": 6.712333005666733,
      "rewards/wrapper/std": 18.18206671178341,
      "step": 2185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.4,
      "completions/mean_length": 254.50625,
      "completions/mean_terminated_length": 3.4,
      "completions/min_length": 208.2,
      "completions/min_terminated_length": 3.4,
      "epoch": 0.513782991202346,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.375,
      "kl": 0.00966759123839438,
      "learning_rate": 7.022448023939792e-06,
      "loss": -0.0031,
      "num_tokens": 24016330.0,
      "reward": 10.31696891784668,
      "reward_std": 13.620968246459961,
      "rewards/wrapper/mean": 5.158484085649252,
      "rewards/wrapper/std": 15.674263837933541,
      "step": 2190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.6,
      "completions/mean_length": 252.09375,
      "completions/mean_terminated_length": 28.6,
      "completions/min_length": 131.0,
      "completions/min_terminated_length": 28.6,
      "epoch": 0.5149560117302053,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.234375,
      "kl": 0.008295352436834946,
      "learning_rate": 7.018514654027522e-06,
      "loss": -0.0095,
      "num_tokens": 24071905.0,
      "reward": 11.781395196914673,
      "reward_std": 15.11465334892273,
      "rewards/wrapper/mean": 5.890697306394577,
      "rewards/wrapper/std": 16.348299649357795,
      "step": 2195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5161290322580645,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.6875,
      "kl": 0.01148320131469518,
      "learning_rate": 7.014574194790494e-06,
      "loss": -0.0071,
      "num_tokens": 24131471.0,
      "reward": 10.274917244911194,
      "reward_std": 13.521960470080376,
      "rewards/wrapper/mean": 5.137458457052707,
      "rewards/wrapper/std": 14.553812845051288,
      "step": 2200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.0,
      "completions/mean_length": 248.66875,
      "completions/mean_terminated_length": 19.7,
      "completions/min_length": 120.8,
      "completions/min_terminated_length": 18.4,
      "epoch": 0.5173020527859238,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.46875,
      "kl": 0.008623561984859408,
      "learning_rate": 7.010626659720619e-06,
      "loss": -0.0215,
      "num_tokens": 24182886.0,
      "reward": 11.50489158630371,
      "reward_std": 12.356374740600586,
      "rewards/wrapper/mean": 5.75244573764503,
      "rewards/wrapper/std": 15.028051799535751,
      "step": 2205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.518475073313783,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.125,
      "kl": 0.014326384535524993,
      "learning_rate": 7.006672062334031e-06,
      "loss": -0.0095,
      "num_tokens": 24235856.0,
      "reward": 10.057607388496399,
      "reward_std": 10.844542121887207,
      "rewards/wrapper/mean": 5.028803963959217,
      "rewards/wrapper/std": 14.027101680636406,
      "step": 2210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.2,
      "completions/mean_length": 251.70625,
      "completions/mean_terminated_length": 16.2,
      "completions/min_length": 118.6,
      "completions/min_terminated_length": 16.2,
      "epoch": 0.5196480938416422,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.3671875,
      "kl": 0.0070623870589770375,
      "learning_rate": 7.0027104161710485e-06,
      "loss": 0.0003,
      "num_tokens": 24289943.0,
      "reward": 7.508664560317993,
      "reward_std": 9.97425332069397,
      "rewards/wrapper/mean": 3.7543324276804926,
      "rewards/wrapper/std": 12.59233037829399,
      "step": 2215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5208211143695015,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.09375,
      "kl": 0.011236746236681939,
      "learning_rate": 6.9987417347961224e-06,
      "loss": 0.0004,
      "num_tokens": 24350043.0,
      "reward": 10.7836181640625,
      "reward_std": 14.692389869689942,
      "rewards/wrapper/mean": 5.391809102892876,
      "rewards/wrapper/std": 17.6582034394145,
      "step": 2220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.6,
      "completions/mean_length": 250.875,
      "completions/mean_terminated_length": 36.1,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.5219941348973607,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.375,
      "kl": 0.014934242086019367,
      "learning_rate": 6.994766031797795e-06,
      "loss": -0.011,
      "num_tokens": 24404613.0,
      "reward": 12.124627828598022,
      "reward_std": 12.525247192382812,
      "rewards/wrapper/mean": 6.062313592433929,
      "rewards/wrapper/std": 15.312794582545758,
      "step": 2225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.4,
      "completions/mean_length": 249.8375,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 5.6,
      "epoch": 0.52316715542522,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 15.3125,
      "kl": 0.05197887100512162,
      "learning_rate": 6.990783320788646e-06,
      "loss": -0.0027,
      "num_tokens": 24458525.0,
      "reward": 10.043572521209716,
      "reward_std": 13.56480016708374,
      "rewards/wrapper/mean": 5.021785932034254,
      "rewards/wrapper/std": 14.084061123430729,
      "step": 2230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.4,
      "completions/mean_length": 251.025,
      "completions/mean_terminated_length": 45.4,
      "completions/min_length": 147.8,
      "completions/min_terminated_length": 45.4,
      "epoch": 0.5243401759530791,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.953125,
      "kl": 0.008444994036108256,
      "learning_rate": 6.98679361540525e-06,
      "loss": -0.008,
      "num_tokens": 24515231.0,
      "reward": 9.886309909820557,
      "reward_std": 9.906683957576751,
      "rewards/wrapper/mean": 4.9431547477841375,
      "rewards/wrapper/std": 13.339074079692363,
      "step": 2235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 29.2,
      "completions/mean_length": 252.2875,
      "completions/mean_terminated_length": 17.5,
      "completions/min_length": 159.4,
      "completions/min_terminated_length": 5.8,
      "epoch": 0.5255131964809384,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.4375,
      "kl": 0.4571570298052393,
      "learning_rate": 6.9827969293081375e-06,
      "loss": 0.0155,
      "num_tokens": 24570455.0,
      "reward": 10.244660663604737,
      "reward_std": 11.858344125747681,
      "rewards/wrapper/mean": 5.122330310195684,
      "rewards/wrapper/std": 16.16204769462347,
      "step": 2240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.6,
      "completions/mean_length": 254.38125,
      "completions/mean_terminated_length": 50.6,
      "completions/min_length": 204.2,
      "completions/min_terminated_length": 50.6,
      "epoch": 0.5266862170087977,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.453125,
      "kl": 0.040043732104822996,
      "learning_rate": 6.97879327618173e-06,
      "loss": -0.0034,
      "num_tokens": 24626558.0,
      "reward": 10.697834300994874,
      "reward_std": 14.070580577850341,
      "rewards/wrapper/mean": 5.348917351663113,
      "rewards/wrapper/std": 14.375663158297538,
      "step": 2245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.4,
      "completions/mean_length": 252.5,
      "completions/mean_terminated_length": 27.1,
      "completions/min_length": 166.4,
      "completions/min_terminated_length": 12.8,
      "epoch": 0.5278592375366569,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.046875,
      "kl": 0.007097694149706513,
      "learning_rate": 6.97478266973431e-06,
      "loss": -0.0077,
      "num_tokens": 24680370.0,
      "reward": 7.563847708702087,
      "reward_std": 9.385326147079468,
      "rewards/wrapper/mean": 3.7819239191710947,
      "rewards/wrapper/std": 11.115535339713096,
      "step": 2250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.6,
      "completions/mean_length": 251.18125,
      "completions/mean_terminated_length": 50.6,
      "completions/min_length": 101.8,
      "completions/min_terminated_length": 50.6,
      "epoch": 0.5290322580645161,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.46875,
      "kl": 0.010719807958230377,
      "learning_rate": 6.970765123697969e-06,
      "loss": -0.0095,
      "num_tokens": 24738315.0,
      "reward": 18.66574192047119,
      "reward_std": 23.168578147888184,
      "rewards/wrapper/mean": 9.332871111482381,
      "rewards/wrapper/std": 24.021143828332423,
      "step": 2255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.8,
      "completions/mean_length": 251.7,
      "completions/mean_terminated_length": 8.1,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5302052785923753,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.546875,
      "kl": 0.08032823919784278,
      "learning_rate": 6.966740651828553e-06,
      "loss": -0.0097,
      "num_tokens": 24793259.0,
      "reward": 12.07797212600708,
      "reward_std": 13.894320297241212,
      "rewards/wrapper/mean": 6.038985838741064,
      "rewards/wrapper/std": 17.31892890483141,
      "step": 2260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.6,
      "completions/mean_length": 250.4,
      "completions/mean_terminated_length": 25.6,
      "completions/min_length": 76.8,
      "completions/min_terminated_length": 25.6,
      "epoch": 0.5313782991202346,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.5,
      "kl": 0.009525609156116843,
      "learning_rate": 6.962709267905628e-06,
      "loss": -0.0004,
      "num_tokens": 24848745.0,
      "reward": 17.720832443237306,
      "reward_std": 21.870363998413087,
      "rewards/wrapper/mean": 8.860416962206363,
      "rewards/wrapper/std": 28.17476643770933,
      "step": 2265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 75.8,
      "completions/mean_length": 245.925,
      "completions/mean_terminated_length": 38.266667175292966,
      "completions/min_length": 65.4,
      "completions/min_terminated_length": 14.2,
      "epoch": 0.5325513196480939,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.09375,
      "kl": 0.008869636815506964,
      "learning_rate": 6.9586709857324235e-06,
      "loss": -0.014,
      "num_tokens": 24903285.0,
      "reward": 7.448293590545655,
      "reward_std": 9.88570761680603,
      "rewards/wrapper/mean": 3.724146793037653,
      "rewards/wrapper/std": 12.0461391761899,
      "step": 2270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.4,
      "completions/mean_length": 251.3125,
      "completions/mean_terminated_length": 1.9,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.533724340175953,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.75,
      "kl": 0.006731168960686773,
      "learning_rate": 6.954625819135789e-06,
      "loss": -0.0132,
      "num_tokens": 24959077.0,
      "reward": 14.303616046905518,
      "reward_std": 19.654699897766115,
      "rewards/wrapper/mean": 7.151808172464371,
      "rewards/wrapper/std": 20.88992646113038,
      "step": 2275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.4,
      "completions/mean_length": 252.68125,
      "completions/mean_terminated_length": 47.4,
      "completions/min_length": 149.8,
      "completions/min_terminated_length": 47.4,
      "epoch": 0.5348973607038123,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.625,
      "kl": 0.008235664875246584,
      "learning_rate": 6.950573781966145e-06,
      "loss": -0.01,
      "num_tokens": 25016086.0,
      "reward": 10.721317195892334,
      "reward_std": 13.564059352874756,
      "rewards/wrapper/mean": 5.360658337175846,
      "rewards/wrapper/std": 15.573051902651788,
      "step": 2280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.2,
      "completions/mean_length": 251.65,
      "completions/mean_terminated_length": 7.3,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5360703812316715,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.0703125,
      "kl": 0.016059622971806676,
      "learning_rate": 6.946514888097435e-06,
      "loss": -0.0099,
      "num_tokens": 25071402.0,
      "reward": 11.941667938232422,
      "reward_std": 13.991264152526856,
      "rewards/wrapper/mean": 5.970833889394998,
      "rewards/wrapper/std": 17.43187249600887,
      "step": 2285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.4,
      "completions/mean_length": 253.125,
      "completions/mean_terminated_length": 10.4,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 10.4,
      "epoch": 0.5372434017595308,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.09375,
      "kl": 0.009205941716209054,
      "learning_rate": 6.942449151427085e-06,
      "loss": -0.0041,
      "num_tokens": 25124486.0,
      "reward": 9.145212745666504,
      "reward_std": 12.24248571395874,
      "rewards/wrapper/mean": 4.572606243938208,
      "rewards/wrapper/std": 13.682215167582035,
      "step": 2290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.2,
      "completions/mean_length": 253.2,
      "completions/mean_terminated_length": 6.4,
      "completions/min_length": 206.4,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.5384164222873901,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.984375,
      "kl": 0.18185594582464545,
      "learning_rate": 6.9383765858759435e-06,
      "loss": -0.0005,
      "num_tokens": 25176610.0,
      "reward": 13.980056762695312,
      "reward_std": 17.20791530609131,
      "rewards/wrapper/mean": 6.990028128027916,
      "rewards/wrapper/std": 20.499118688702584,
      "step": 2295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 36.8,
      "completions/mean_length": 252.5375,
      "completions/mean_terminated_length": 21.5,
      "completions/min_length": 159.8,
      "completions/min_terminated_length": 6.2,
      "epoch": 0.5395894428152492,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.71875,
      "kl": 0.013887450261972845,
      "learning_rate": 6.9342972053882475e-06,
      "loss": -0.0067,
      "num_tokens": 25235306.0,
      "reward": 7.798388671875,
      "reward_std": 9.488762283325196,
      "rewards/wrapper/mean": 3.8991942696273325,
      "rewards/wrapper/std": 14.203517746925353,
      "step": 2300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 90.4,
      "completions/mean_length": 254.025,
      "completions/mean_terminated_length": 90.4,
      "completions/min_length": 192.8,
      "completions/min_terminated_length": 90.4,
      "epoch": 0.5407624633431085,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 0.94921875,
      "kl": 0.1488804242340848,
      "learning_rate": 6.930211023931562e-06,
      "loss": 0.0008,
      "num_tokens": 25288734.0,
      "reward": 11.9146014213562,
      "reward_std": 12.40765314102173,
      "rewards/wrapper/mean": 5.957300490140915,
      "rewards/wrapper/std": 16.380452224612235,
      "step": 2305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 252.825,
      "completions/mean_terminated_length": 0.8,
      "completions/min_length": 154.4,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.5419354838709678,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.375,
      "kl": 0.009590476250741631,
      "learning_rate": 6.926118055496741e-06,
      "loss": -0.0109,
      "num_tokens": 25343804.0,
      "reward": 10.353494834899902,
      "reward_std": 11.441220569610596,
      "rewards/wrapper/mean": 5.176747385412455,
      "rewards/wrapper/std": 12.015816460549832,
      "step": 2310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.2,
      "completions/mean_length": 253.01875,
      "completions/mean_terminated_length": 37.0,
      "completions/min_length": 178.4,
      "completions/min_terminated_length": 24.8,
      "epoch": 0.543108504398827,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5546875,
      "kl": 0.019391293311491607,
      "learning_rate": 6.922018314097876e-06,
      "loss": -0.0031,
      "num_tokens": 25399647.0,
      "reward": 10.329414510726929,
      "reward_std": 13.49524603486061,
      "rewards/wrapper/mean": 5.164707355946303,
      "rewards/wrapper/std": 15.286242140829563,
      "step": 2315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 252.9875,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 159.6,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.5442815249266862,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.09375,
      "kl": 0.008706731640268117,
      "learning_rate": 6.917911813772251e-06,
      "loss": 0.0019,
      "num_tokens": 25454231.0,
      "reward": 7.387438726425171,
      "reward_std": 9.80572988986969,
      "rewards/wrapper/mean": 3.6937191992998124,
      "rewards/wrapper/std": 12.426317961513996,
      "step": 2320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.4,
      "completions/mean_length": 248.4375,
      "completions/mean_terminated_length": 4.733333587646484,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5454545454545454,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.21875,
      "kl": 0.008474712888710201,
      "learning_rate": 6.913798568580287e-06,
      "loss": -0.0237,
      "num_tokens": 25510329.0,
      "reward": 11.03807897567749,
      "reward_std": 12.529253387451172,
      "rewards/wrapper/mean": 5.519039383530616,
      "rewards/wrapper/std": 15.573815928399563,
      "step": 2325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.0,
      "completions/mean_length": 246.59375,
      "completions/mean_terminated_length": 32.6,
      "completions/min_length": 19.2,
      "completions/min_terminated_length": 19.2,
      "epoch": 0.5466275659824047,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 12.125,
      "kl": 0.021075761376414447,
      "learning_rate": 6.909678592605505e-06,
      "loss": -0.016,
      "num_tokens": 25563666.0,
      "reward": 12.097493743896484,
      "reward_std": 16.431555366516115,
      "rewards/wrapper/mean": 6.048747086524964,
      "rewards/wrapper/std": 19.726453380286692,
      "step": 2330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.0,
      "completions/mean_length": 252.04375,
      "completions/mean_terminated_length": 27.0,
      "completions/min_length": 129.4,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.547800586510264,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.53125,
      "kl": 0.010052182164508849,
      "learning_rate": 6.905551899954469e-06,
      "loss": -0.0028,
      "num_tokens": 25618809.0,
      "reward": 7.563618755340576,
      "reward_std": 8.234925365447998,
      "rewards/wrapper/mean": 3.781809412688017,
      "rewards/wrapper/std": 11.576200237870216,
      "step": 2335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.0,
      "completions/mean_length": 250.35625,
      "completions/mean_terminated_length": 24.0,
      "completions/min_length": 126.4,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.5489736070381231,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.609375,
      "kl": 0.007738465024158358,
      "learning_rate": 6.9014185047567374e-06,
      "loss": -0.0186,
      "num_tokens": 25669328.0,
      "reward": 7.976767730712891,
      "reward_std": 9.955007457733155,
      "rewards/wrapper/mean": 3.9883838906884193,
      "rewards/wrapper/std": 11.497270411252975,
      "step": 2340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5501466275659824,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.21875,
      "kl": 0.00954286667983979,
      "learning_rate": 6.897278421164825e-06,
      "loss": -0.0053,
      "num_tokens": 25723649.0,
      "reward": 10.446186733245849,
      "reward_std": 11.945374870300293,
      "rewards/wrapper/mean": 5.223093181848526,
      "rewards/wrapper/std": 15.632076103985309,
      "step": 2345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 53.6,
      "completions/mean_length": 252.88125,
      "completions/mean_terminated_length": 49.8,
      "completions/min_length": 199.6,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.5513196480938416,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.796875,
      "kl": 0.01686981668462977,
      "learning_rate": 6.893131663354141e-06,
      "loss": -0.0092,
      "num_tokens": 25778384.0,
      "reward": 12.496757245063781,
      "reward_std": 14.88165216445923,
      "rewards/wrapper/mean": 6.24837853461504,
      "rewards/wrapper/std": 19.739536590874195,
      "step": 2350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5524926686217009,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 7.84375,
      "kl": 0.49535912594292314,
      "learning_rate": 6.8889782455229516e-06,
      "loss": 0.0142,
      "num_tokens": 25834571.0,
      "reward": 17.93525218963623,
      "reward_std": 22.702994346618652,
      "rewards/wrapper/mean": 8.967626057565212,
      "rewards/wrapper/std": 25.6280776694417,
      "step": 2355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5536656891495602,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.125,
      "kl": 0.052939243521541356,
      "learning_rate": 6.884818181892319e-06,
      "loss": -0.0096,
      "num_tokens": 25891789.0,
      "reward": 8.508272314071656,
      "reward_std": 10.7423424243927,
      "rewards/wrapper/mean": 4.254136118665338,
      "rewards/wrapper/std": 14.045457898080349,
      "step": 2360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.4,
      "completions/mean_length": 254.34375,
      "completions/mean_terminated_length": 49.4,
      "completions/min_length": 203.0,
      "completions/min_terminated_length": 49.4,
      "epoch": 0.5548387096774193,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.203125,
      "kl": 0.014628489618189633,
      "learning_rate": 6.8806514867060685e-06,
      "loss": 0.0015,
      "num_tokens": 25947872.0,
      "reward": 14.109557342529296,
      "reward_std": 17.481741905212402,
      "rewards/wrapper/mean": 7.054778654873371,
      "rewards/wrapper/std": 21.674049939215184,
      "step": 2365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.2,
      "completions/mean_length": 252.8375,
      "completions/mean_terminated_length": 1.2,
      "completions/min_length": 154.8,
      "completions/min_terminated_length": 1.2,
      "epoch": 0.5560117302052786,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.953125,
      "kl": 0.028208025731146336,
      "learning_rate": 6.876478174230728e-06,
      "loss": 0.0025,
      "num_tokens": 26001552.0,
      "reward": 12.057064437866211,
      "reward_std": 16.252457237243654,
      "rewards/wrapper/mean": 6.028532239794731,
      "rewards/wrapper/std": 19.421438497304916,
      "step": 2370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.6,
      "completions/mean_length": 249.34375,
      "completions/mean_terminated_length": 26.3,
      "completions/min_length": 112.4,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.5571847507331378,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.359375,
      "kl": 0.008724843431264163,
      "learning_rate": 6.872298258755484e-06,
      "loss": -0.0051,
      "num_tokens": 26055861.0,
      "reward": 8.755217671394348,
      "reward_std": 11.814749300479889,
      "rewards/wrapper/mean": 4.377608657628298,
      "rewards/wrapper/std": 13.853392350673676,
      "step": 2375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 251.26875,
      "completions/mean_terminated_length": 1.2,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5583577712609971,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.03125,
      "kl": 0.008417540992377326,
      "learning_rate": 6.868111754592126e-06,
      "loss": -0.0133,
      "num_tokens": 26108722.0,
      "reward": 6.4707801818847654,
      "reward_std": 6.961957550048828,
      "rewards/wrapper/mean": 3.235390084981918,
      "rewards/wrapper/std": 9.7925319314003,
      "step": 2380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 34.8,
      "completions/mean_length": 250.69375,
      "completions/mean_terminated_length": 26.5,
      "completions/min_length": 120.6,
      "completions/min_terminated_length": 18.2,
      "epoch": 0.5595307917888563,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.71875,
      "kl": 0.008799165394157172,
      "learning_rate": 6.863918676075011e-06,
      "loss": -0.0172,
      "num_tokens": 26163919.0,
      "reward": 7.298964881896973,
      "reward_std": 9.784769356250763,
      "rewards/wrapper/mean": 3.6494823902845384,
      "rewards/wrapper/std": 9.869220197200775,
      "step": 2385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 251.225,
      "completions/mean_terminated_length": 0.8,
      "completions/min_length": 103.2,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.5607038123167155,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.734375,
      "kl": 0.010733005020301788,
      "learning_rate": 6.859719037561e-06,
      "loss": 0.0067,
      "num_tokens": 26217157.0,
      "reward": 9.287919998168945,
      "reward_std": 10.79558277130127,
      "rewards/wrapper/mean": 4.643959843367338,
      "rewards/wrapper/std": 12.492424213886261,
      "step": 2390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5618768328445748,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.609375,
      "kl": 0.00881088743917644,
      "learning_rate": 6.855512853429417e-06,
      "loss": -0.0047,
      "num_tokens": 26271704.0,
      "reward": 12.773279190063477,
      "reward_std": 16.686813735961913,
      "rewards/wrapper/mean": 6.386639550328255,
      "rewards/wrapper/std": 16.06548334211111,
      "step": 2395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.4,
      "completions/mean_length": 251.46875,
      "completions/mean_terminated_length": 8.4,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 8.4,
      "epoch": 0.5630498533724341,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.75,
      "kl": 0.008467439271043986,
      "learning_rate": 6.851300138081998e-06,
      "loss": 0.002,
      "num_tokens": 26324537.0,
      "reward": 9.383643126487732,
      "reward_std": 11.786495923995972,
      "rewards/wrapper/mean": 4.691821337491274,
      "rewards/wrapper/std": 10.728658132255077,
      "step": 2400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.0,
      "completions/mean_length": 253.14375,
      "completions/mean_terminated_length": 44.5,
      "completions/min_length": 180.6,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.5642228739002932,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.203125,
      "kl": 0.00841777388122864,
      "learning_rate": 6.847080905942841e-06,
      "loss": -0.0103,
      "num_tokens": 26379500.0,
      "reward": 15.23821144104004,
      "reward_std": 19.51053657531738,
      "rewards/wrapper/mean": 7.619105443358421,
      "rewards/wrapper/std": 20.655081064999102,
      "step": 2405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.5653958944281525,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.046875,
      "kl": 0.009163353778421878,
      "learning_rate": 6.84285517145836e-06,
      "loss": -0.0166,
      "num_tokens": 26431411.0,
      "reward": 13.655599784851074,
      "reward_std": 18.63932514190674,
      "rewards/wrapper/mean": 6.827799582481385,
      "rewards/wrapper/std": 20.419064237177373,
      "step": 2410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 248.51875,
      "completions/mean_terminated_length": 9.3,
      "completions/min_length": 53.4,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.5665689149560117,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.765625,
      "kl": 0.008064938080497085,
      "learning_rate": 6.838622949097228e-06,
      "loss": -0.0118,
      "num_tokens": 26486158.0,
      "reward": 7.38580174446106,
      "reward_std": 9.37550323009491,
      "rewards/wrapper/mean": 3.6929009795188903,
      "rewards/wrapper/std": 11.389765891432763,
      "step": 2415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.8,
      "completions/mean_length": 255.175,
      "completions/mean_terminated_length": 24.8,
      "completions/min_length": 229.6,
      "completions/min_terminated_length": 24.8,
      "epoch": 0.567741935483871,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.484375,
      "kl": 0.014842483540996909,
      "learning_rate": 6.834384253350335e-06,
      "loss": 0.0,
      "num_tokens": 26539338.0,
      "reward": 9.526783275604249,
      "reward_std": 12.541070747375489,
      "rewards/wrapper/mean": 4.763391713798046,
      "rewards/wrapper/std": 14.62897629737854,
      "step": 2420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5689149560117303,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.015625,
      "kl": 0.008617074706126004,
      "learning_rate": 6.8301390987307355e-06,
      "loss": -0.0059,
      "num_tokens": 26594891.0,
      "reward": 9.922335815429687,
      "reward_std": 12.18459119796753,
      "rewards/wrapper/mean": 4.961168044060469,
      "rewards/wrapper/std": 13.656743614375591,
      "step": 2425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 77.4,
      "completions/mean_length": 251.225,
      "completions/mean_terminated_length": 72.4,
      "completions/min_length": 169.8,
      "completions/min_terminated_length": 67.4,
      "epoch": 0.5700879765395894,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.9375,
      "kl": 0.007735758402850479,
      "learning_rate": 6.8258874997735975e-06,
      "loss": 0.0074,
      "num_tokens": 26651603.0,
      "reward": 7.229635429382324,
      "reward_std": 9.8571928024292,
      "rewards/wrapper/mean": 3.6148177579045297,
      "rewards/wrapper/std": 12.254036985337734,
      "step": 2430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.0,
      "completions/mean_length": 247.0375,
      "completions/mean_terminated_length": 38.3,
      "completions/min_length": 27.6,
      "completions/min_terminated_length": 27.6,
      "epoch": 0.5712609970674487,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.84375,
      "kl": 0.010122451512143017,
      "learning_rate": 6.821629471036154e-06,
      "loss": -0.0266,
      "num_tokens": 26705361.0,
      "reward": 9.642913770675658,
      "reward_std": 7.355159568786621,
      "rewards/wrapper/mean": 4.821456654369831,
      "rewards/wrapper/std": 13.488919001817703,
      "step": 2435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.5724340175953079,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3152.0,
      "kl": 1.9757349454564974,
      "learning_rate": 6.817365027097655e-06,
      "loss": 0.0677,
      "num_tokens": 26757754.0,
      "reward": 5.890293455123901,
      "reward_std": 6.86049211025238,
      "rewards/wrapper/mean": 2.9451463639736177,
      "rewards/wrapper/std": 10.447056712210179,
      "step": 2440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 251.78125,
      "completions/mean_terminated_length": 18.6,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 18.6,
      "epoch": 0.5736070381231672,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 7.875,
      "kl": 0.0200483936117962,
      "learning_rate": 6.813094182559314e-06,
      "loss": -0.0131,
      "num_tokens": 26812285.0,
      "reward": 17.4649169921875,
      "reward_std": 20.268056106567382,
      "rewards/wrapper/mean": 8.732458454370498,
      "rewards/wrapper/std": 23.30886591821909,
      "step": 2445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.0,
      "completions/mean_length": 252.70625,
      "completions/mean_terminated_length": 24.3,
      "completions/min_length": 154.2,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.5747800586510264,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.8125,
      "kl": 0.011486495216377079,
      "learning_rate": 6.8088169520442604e-06,
      "loss": -0.0067,
      "num_tokens": 26865718.0,
      "reward": 14.725182628631591,
      "reward_std": 19.531952953338624,
      "rewards/wrapper/mean": 7.362591397762299,
      "rewards/wrapper/std": 16.93528108596802,
      "step": 2450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.5759530791788856,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.4296875,
      "kl": 0.015283348388038576,
      "learning_rate": 6.804533350197491e-06,
      "loss": -0.005,
      "num_tokens": 26920061.0,
      "reward": 12.328321695327759,
      "reward_std": 13.61722400188446,
      "rewards/wrapper/mean": 6.1641609571874145,
      "rewards/wrapper/std": 15.86318702250719,
      "step": 2455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.2,
      "completions/mean_length": 250.65625,
      "completions/mean_terminated_length": 44.2,
      "completions/min_length": 90.4,
      "completions/min_terminated_length": 39.2,
      "epoch": 0.5771260997067449,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 1.640625,
      "kl": 0.011669400706887245,
      "learning_rate": 6.800243391685812e-06,
      "loss": -0.0067,
      "num_tokens": 26975658.0,
      "reward": 7.230981612205506,
      "reward_std": 8.025412940979004,
      "rewards/wrapper/mean": 3.615490733087063,
      "rewards/wrapper/std": 9.987757830321788,
      "step": 2460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.6,
      "completions/mean_length": 251.5,
      "completions/mean_terminated_length": 30.6,
      "completions/min_length": 115.0,
      "completions/min_terminated_length": 12.6,
      "epoch": 0.5782991202346041,
      "frac_reward_zero_std": 0.075,
      "grad_norm": 5.0,
      "kl": 0.00790996109135449,
      "learning_rate": 6.795947091197802e-06,
      "loss": -0.0036,
      "num_tokens": 27033648.0,
      "reward": 12.457160663604736,
      "reward_std": 13.068470573425293,
      "rewards/wrapper/mean": 6.228580264747142,
      "rewards/wrapper/std": 17.349486616253852,
      "step": 2465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.0,
      "completions/mean_length": 250.41875,
      "completions/mean_terminated_length": 38.6,
      "completions/min_length": 185.8,
      "completions/min_terminated_length": 32.2,
      "epoch": 0.5794721407624633,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.3828125,
      "kl": 0.009441101230913773,
      "learning_rate": 6.791644463443747e-06,
      "loss": -0.0107,
      "num_tokens": 27088861.0,
      "reward": 9.027909755706787,
      "reward_std": 11.057364964485169,
      "rewards/wrapper/mean": 4.513954804837704,
      "rewards/wrapper/std": 15.522229766845703,
      "step": 2470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.5806451612903226,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.046875,
      "kl": 0.0075355375884100795,
      "learning_rate": 6.787335523155603e-06,
      "loss": -0.0198,
      "num_tokens": 27142882.0,
      "reward": 10.263352251052856,
      "reward_std": 11.702549076080322,
      "rewards/wrapper/mean": 5.131676015257836,
      "rewards/wrapper/std": 14.765554384887219,
      "step": 2475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.8,
      "completions/mean_length": 252.475,
      "completions/mean_terminated_length": 40.8,
      "completions/min_length": 143.2,
      "completions/min_terminated_length": 40.8,
      "epoch": 0.5818181818181818,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.484375,
      "kl": 0.009854745410848409,
      "learning_rate": 6.783020285086934e-06,
      "loss": -0.0079,
      "num_tokens": 27199384.0,
      "reward": 12.724866485595703,
      "reward_std": 15.704174327850343,
      "rewards/wrapper/mean": 6.362433303892613,
      "rewards/wrapper/std": 19.918368512392043,
      "step": 2480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 55.2,
      "completions/mean_length": 252.24375,
      "completions/mean_terminated_length": 52.3,
      "completions/min_length": 151.8,
      "completions/min_terminated_length": 49.4,
      "epoch": 0.5829912023460411,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.96875,
      "kl": 2.5806705773225986,
      "learning_rate": 6.778698764012874e-06,
      "loss": 0.1059,
      "num_tokens": 27254611.0,
      "reward": 11.302087688446045,
      "reward_std": 12.61121587753296,
      "rewards/wrapper/mean": 5.651043940335512,
      "rewards/wrapper/std": 16.543170015513898,
      "step": 2485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.2,
      "completions/mean_length": 253.55625,
      "completions/mean_terminated_length": 24.2,
      "completions/min_length": 177.8,
      "completions/min_terminated_length": 24.2,
      "epoch": 0.5841642228739002,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.1015625,
      "kl": 0.005001664778683334,
      "learning_rate": 6.7743709747300635e-06,
      "loss": -0.0087,
      "num_tokens": 27306574.0,
      "reward": 17.763720893859862,
      "reward_std": 21.10177364349365,
      "rewards/wrapper/mean": 8.881860357522964,
      "rewards/wrapper/std": 25.570963774621486,
      "step": 2490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 52.4,
      "completions/mean_length": 252.8375,
      "completions/mean_terminated_length": 52.4,
      "completions/min_length": 154.8,
      "completions/min_terminated_length": 52.4,
      "epoch": 0.5853372434017595,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.78125,
      "kl": 0.08458915337687359,
      "learning_rate": 6.770036932056609e-06,
      "loss": -0.0047,
      "num_tokens": 27359112.0,
      "reward": 9.331131935119629,
      "reward_std": 12.55569248199463,
      "rewards/wrapper/mean": 4.665565884113311,
      "rewards/wrapper/std": 14.336715736985207,
      "step": 2495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.2,
      "completions/mean_length": 253.025,
      "completions/mean_terminated_length": 7.2,
      "completions/min_length": 160.8,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.5865102639296188,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.96875,
      "kl": 0.026178717741277068,
      "learning_rate": 6.765696650832026e-06,
      "loss": -0.0082,
      "num_tokens": 27410764.0,
      "reward": 10.922028636932373,
      "reward_std": 14.060818576812744,
      "rewards/wrapper/mean": 5.461014226078987,
      "rewards/wrapper/std": 21.613298135995866,
      "step": 2500
    },
    {
      "epoch": 0.5865102639296188,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.8,
      "eval_completions/max_length": 255.95,
      "eval_completions/max_terminated_length": 72.19,
      "eval_completions/mean_length": 227.0475,
      "eval_completions/mean_terminated_length": 64.31,
      "eval_completions/min_length": 158.85,
      "eval_completions/min_terminated_length": 56.45,
      "eval_frac_reward_zero_std": 0.015,
      "eval_kl": 0.015076845483854413,
      "eval_loss": -0.026887383311986923,
      "eval_num_tokens": 27410764.0,
      "eval_reward": 0.34842764906585216,
      "eval_reward_std": 0.17556372756604105,
      "eval_rewards/wrapper/mean": 0.17421382576227187,
      "eval_rewards/wrapper/std": 0.15659947301028296,
      "eval_runtime": 208.2404,
      "eval_samples_per_second": 0.96,
      "eval_steps_per_second": 0.24,
      "step": 2500
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 6.0,
      "completions/mean_length": 249.7875,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 57.2,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.587683284457478,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.4375,
      "kl": 0.008145063684787601,
      "learning_rate": 6.761350145917192e-06,
      "loss": -0.0178,
      "num_tokens": 27463364.0,
      "reward": 5.690246820449829,
      "reward_std": 6.9504313468933105,
      "rewards/wrapper/mean": 2.845123402774334,
      "rewards/wrapper/std": 9.402719949185848,
      "step": 2505
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 250.225,
      "completions/mean_terminated_length": 20.0,
      "completions/min_length": 71.2,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.5888563049853373,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.125,
      "kl": 0.011013640067540109,
      "learning_rate": 6.756997432194293e-06,
      "loss": -0.0043,
      "num_tokens": 27518224.0,
      "reward": 15.580005931854249,
      "reward_std": 20.939382457733153,
      "rewards/wrapper/mean": 7.790003002434969,
      "rewards/wrapper/std": 21.323382918536662,
      "step": 2510
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.0,
      "completions/mean_length": 252.8875,
      "completions/mean_terminated_length": 54.0,
      "completions/min_length": 156.4,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.5900293255131965,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 7.96875,
      "kl": 0.02926081788027659,
      "learning_rate": 6.752638524566773e-06,
      "loss": -0.0009,
      "num_tokens": 27572688.0,
      "reward": 8.543451595306397,
      "reward_std": 11.566871976852417,
      "rewards/wrapper/mean": 4.271725799143314,
      "rewards/wrapper/std": 16.63720283508301,
      "step": 2515
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.6,
      "completions/mean_length": 249.9625,
      "completions/mean_terminated_length": 11.6,
      "completions/min_length": 62.8,
      "completions/min_terminated_length": 11.6,
      "epoch": 0.5912023460410557,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.296875,
      "kl": 0.009021649765782059,
      "learning_rate": 6.748273437959286e-06,
      "loss": -0.0176,
      "num_tokens": 27629564.0,
      "reward": 14.583421611785889,
      "reward_std": 17.03965835571289,
      "rewards/wrapper/mean": 7.29171050414443,
      "rewards/wrapper/std": 17.830930642783642,
      "step": 2520
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.0,
      "completions/mean_length": 249.91875,
      "completions/mean_terminated_length": 10.0,
      "completions/min_length": 112.4,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.592375366568915,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.9140625,
      "kl": 0.007613116281572729,
      "learning_rate": 6.74390218731764e-06,
      "loss": -0.0146,
      "num_tokens": 27683597.0,
      "reward": 11.419633483886718,
      "reward_std": 15.065901374816894,
      "rewards/wrapper/mean": 5.709816740453244,
      "rewards/wrapper/std": 17.053964272141457,
      "step": 2525
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.5935483870967742,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 9.875,
      "kl": 0.010646293580066413,
      "learning_rate": 6.7395247876087505e-06,
      "loss": -0.0128,
      "num_tokens": 27738890.0,
      "reward": 11.766228675842285,
      "reward_std": 13.879878425598145,
      "rewards/wrapper/mean": 5.883114151656628,
      "rewards/wrapper/std": 15.828456656634808,
      "step": 2530
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.6,
      "completions/mean_length": 250.9,
      "completions/mean_terminated_length": 41.6,
      "completions/min_length": 92.8,
      "completions/min_terminated_length": 41.6,
      "epoch": 0.5947214076246334,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.84375,
      "kl": 0.009409766842145473,
      "learning_rate": 6.735141253820584e-06,
      "loss": -0.003,
      "num_tokens": 27793140.0,
      "reward": 10.182917737960816,
      "reward_std": 11.574919128417969,
      "rewards/wrapper/mean": 5.091458834707737,
      "rewards/wrapper/std": 16.947440457344054,
      "step": 2535
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.0,
      "completions/mean_length": 252.2,
      "completions/mean_terminated_length": 32.0,
      "completions/min_length": 134.4,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.5958944281524927,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.453125,
      "kl": 0.014364969654707238,
      "learning_rate": 6.730751600962113e-06,
      "loss": -0.0112,
      "num_tokens": 27845464.0,
      "reward": 10.166706693172454,
      "reward_std": 11.566449183225632,
      "rewards/wrapper/mean": 5.0833531498908995,
      "rewards/wrapper/std": 15.070533008873463,
      "step": 2540
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.8,
      "completions/mean_length": 253.60625,
      "completions/mean_terminated_length": 25.8,
      "completions/min_length": 179.4,
      "completions/min_terminated_length": 25.8,
      "epoch": 0.5970674486803519,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.125,
      "kl": 0.01348751963232644,
      "learning_rate": 6.7263558440632615e-06,
      "loss": 0.006,
      "num_tokens": 27897069.0,
      "reward": 6.304814243316651,
      "reward_std": 5.5823561668396,
      "rewards/wrapper/mean": 3.1524071991443634,
      "rewards/wrapper/std": 9.431627669930458,
      "step": 2545
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 61.4,
      "completions/mean_length": 254.71875,
      "completions/mean_terminated_length": 61.4,
      "completions/min_length": 215.0,
      "completions/min_terminated_length": 61.4,
      "epoch": 0.5982404692082112,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.59375,
      "kl": 0.007102605036925525,
      "learning_rate": 6.721953998174848e-06,
      "loss": -0.0031,
      "num_tokens": 27953170.0,
      "reward": 10.140501308441163,
      "reward_std": 13.48496742248535,
      "rewards/wrapper/mean": 5.0702507749199865,
      "rewards/wrapper/std": 14.650724425911903,
      "step": 2550
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 55.2,
      "completions/mean_length": 252.93125,
      "completions/mean_terminated_length": 52.5,
      "completions/min_length": 203.4,
      "completions/min_terminated_length": 49.8,
      "epoch": 0.5994134897360703,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.65625,
      "kl": 0.027316716557834297,
      "learning_rate": 6.717546078368546e-06,
      "loss": -0.0006,
      "num_tokens": 28008075.0,
      "reward": 5.827312445640564,
      "reward_std": 7.616882419586181,
      "rewards/wrapper/mean": 2.913656205683947,
      "rewards/wrapper/std": 9.296160396933555,
      "step": 2555
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.6,
      "completions/mean_length": 252.94375,
      "completions/mean_terminated_length": 4.6,
      "completions/min_length": 158.2,
      "completions/min_terminated_length": 4.6,
      "epoch": 0.6005865102639296,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 0.90625,
      "kl": 0.00896136105293408,
      "learning_rate": 6.713132099736822e-06,
      "loss": -0.0105,
      "num_tokens": 28062328.0,
      "reward": 13.819943046569824,
      "reward_std": 18.820354843139647,
      "rewards/wrapper/mean": 6.9099711433053015,
      "rewards/wrapper/std": 20.776973666250704,
      "step": 2560
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.0,
      "completions/mean_length": 250.0625,
      "completions/mean_terminated_length": 43.7,
      "completions/min_length": 88.6,
      "completions/min_terminated_length": 37.4,
      "epoch": 0.6017595307917889,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.59375,
      "kl": 0.012951006775256246,
      "learning_rate": 6.708712077392889e-06,
      "loss": -0.0146,
      "num_tokens": 28118812.0,
      "reward": 11.193282270431519,
      "reward_std": 13.171154010295869,
      "rewards/wrapper/mean": 5.596641125530004,
      "rewards/wrapper/std": 16.368498238921166,
      "step": 2565
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.2,
      "completions/mean_length": 255.59375,
      "completions/mean_terminated_length": 38.2,
      "completions/min_length": 243.0,
      "completions/min_terminated_length": 38.2,
      "epoch": 0.6029325513196481,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 7.96875,
      "kl": 0.011676612915471197,
      "learning_rate": 6.704286026470651e-06,
      "loss": -0.0004,
      "num_tokens": 28174469.0,
      "reward": 13.969676399230957,
      "reward_std": 14.664949417114258,
      "rewards/wrapper/mean": 6.984838116168976,
      "rewards/wrapper/std": 19.052175915241243,
      "step": 2570
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.8,
      "completions/mean_length": 253.325,
      "completions/mean_terminated_length": 16.8,
      "completions/min_length": 170.4,
      "completions/min_terminated_length": 16.8,
      "epoch": 0.6041055718475073,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.03125,
      "kl": 0.033306200080551206,
      "learning_rate": 6.699853962124658e-06,
      "loss": -0.0036,
      "num_tokens": 28230099.0,
      "reward": 10.326721906661987,
      "reward_std": 13.107465887069703,
      "rewards/wrapper/mean": 5.163360907882452,
      "rewards/wrapper/std": 15.956426008045673,
      "step": 2575
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.6,
      "completions/mean_length": 251.75625,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 157.6,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6052785923753665,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.4296875,
      "kl": 0.006026765506248921,
      "learning_rate": 6.695415899530045e-06,
      "loss": -0.0031,
      "num_tokens": 28282456.0,
      "reward": 8.02139835357666,
      "reward_std": 9.612114334106446,
      "rewards/wrapper/mean": 4.010698922723532,
      "rewards/wrapper/std": 13.752987106144428,
      "step": 2580
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.6,
      "completions/mean_length": 251.25,
      "completions/mean_terminated_length": 1.6,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 1.6,
      "epoch": 0.6064516129032258,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.7421875,
      "kl": 0.006650674215052277,
      "learning_rate": 6.690971853882488e-06,
      "loss": -0.0122,
      "num_tokens": 28338506.0,
      "reward": 11.529253673553466,
      "reward_std": 12.749757957458495,
      "rewards/wrapper/mean": 5.764626702666282,
      "rewards/wrapper/std": 16.920075453817844,
      "step": 2585
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.2,
      "completions/mean_length": 253.275,
      "completions/mean_terminated_length": 15.2,
      "completions/min_length": 168.8,
      "completions/min_terminated_length": 15.2,
      "epoch": 0.6076246334310851,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.6875,
      "kl": 0.007390762877184898,
      "learning_rate": 6.686521840398147e-06,
      "loss": -0.006,
      "num_tokens": 28393026.0,
      "reward": 8.514682960510253,
      "reward_std": 9.638944625854492,
      "rewards/wrapper/mean": 4.257341559231281,
      "rewards/wrapper/std": 13.977068457007409,
      "step": 2590
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.8,
      "completions/mean_length": 249.81875,
      "completions/mean_terminated_length": 3.7,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 2.6,
      "epoch": 0.6087976539589443,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.65625,
      "kl": 0.009915889461990447,
      "learning_rate": 6.682065874313614e-06,
      "loss": -0.0171,
      "num_tokens": 28448633.0,
      "reward": 9.647655391693116,
      "reward_std": 12.866382217407226,
      "rewards/wrapper/mean": 4.823827692866326,
      "rewards/wrapper/std": 14.450115689635277,
      "step": 2595
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.2,
      "completions/mean_length": 244.54375,
      "completions/mean_terminated_length": 37.7,
      "completions/min_length": 84.4,
      "completions/min_terminated_length": 33.2,
      "epoch": 0.6099706744868035,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.75,
      "kl": 0.03516354828607291,
      "learning_rate": 6.677603970885869e-06,
      "loss": -0.0222,
      "num_tokens": 28503032.0,
      "reward": 7.7402863025665285,
      "reward_std": 10.311076450347901,
      "rewards/wrapper/mean": 3.870143134891987,
      "rewards/wrapper/std": 14.02377125620842,
      "step": 2600
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.0,
      "completions/mean_length": 250.075,
      "completions/mean_terminated_length": 14.3,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 13.6,
      "epoch": 0.6111436950146627,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 5.03125,
      "kl": 0.008182111865608022,
      "learning_rate": 6.67313614539221e-06,
      "loss": -0.0034,
      "num_tokens": 28556298.0,
      "reward": 8.295944690704346,
      "reward_std": 10.63902931213379,
      "rewards/wrapper/mean": 4.147972152382136,
      "rewards/wrapper/std": 12.825438100099564,
      "step": 2605
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.4,
      "completions/mean_length": 249.84375,
      "completions/mean_terminated_length": 4.1,
      "completions/min_length": 106.2,
      "completions/min_terminated_length": 3.8,
      "epoch": 0.612316715542522,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.90625,
      "kl": 0.006537263444624841,
      "learning_rate": 6.668662413130221e-06,
      "loss": -0.012,
      "num_tokens": 28609081.0,
      "reward": 11.645491218566894,
      "reward_std": 15.850448608398438,
      "rewards/wrapper/mean": 5.8227458745241165,
      "rewards/wrapper/std": 17.563245555758478,
      "step": 2610
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.8,
      "completions/mean_length": 250.03125,
      "completions/mean_terminated_length": 13.8,
      "completions/min_length": 65.0,
      "completions/min_terminated_length": 13.8,
      "epoch": 0.6134897360703813,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 14.9375,
      "kl": 0.009183612850029022,
      "learning_rate": 6.66418278941771e-06,
      "loss": -0.0135,
      "num_tokens": 28660034.0,
      "reward": 8.45309820175171,
      "reward_std": 11.176138877868652,
      "rewards/wrapper/mean": 4.226549039781093,
      "rewards/wrapper/std": 13.238977485895157,
      "step": 2615
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.8,
      "completions/mean_length": 248.725,
      "completions/mean_terminated_length": 22.8,
      "completions/min_length": 125.2,
      "completions/min_terminated_length": 22.8,
      "epoch": 0.6146627565982404,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.6953125,
      "kl": 0.007062935788417235,
      "learning_rate": 6.659697289592652e-06,
      "loss": -0.0188,
      "num_tokens": 28711502.0,
      "reward": 7.796641778945923,
      "reward_std": 10.286471939086914,
      "rewards/wrapper/mean": 3.898320996761322,
      "rewards/wrapper/std": 12.970464818179607,
      "step": 2620
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 251.325,
      "completions/mean_terminated_length": 4.0,
      "completions/min_length": 106.4,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.6158357771260997,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.71875,
      "kl": 0.008498370146844536,
      "learning_rate": 6.655205929013143e-06,
      "loss": -0.0101,
      "num_tokens": 28765306.0,
      "reward": 10.583085918426514,
      "reward_std": 13.03041124343872,
      "rewards/wrapper/mean": 5.2915429577231405,
      "rewards/wrapper/std": 13.423381480574609,
      "step": 2625
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.0,
      "completions/mean_length": 247.00625,
      "completions/mean_terminated_length": 13.13333339691162,
      "completions/min_length": 61.4,
      "completions/min_terminated_length": 10.2,
      "epoch": 0.617008797653959,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.53125,
      "kl": 0.01540077495155856,
      "learning_rate": 6.650708723057348e-06,
      "loss": -0.0035,
      "num_tokens": 28822733.0,
      "reward": 8.799090671539307,
      "reward_std": 11.789759016036987,
      "rewards/wrapper/mean": 4.399545115232468,
      "rewards/wrapper/std": 12.522874061763286,
      "step": 2630
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.8,
      "completions/mean_length": 249.34375,
      "completions/mean_terminated_length": 22.6,
      "completions/min_length": 53.6,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.6181818181818182,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.8203125,
      "kl": 0.008354636456351728,
      "learning_rate": 6.6462056871234466e-06,
      "loss": -0.0171,
      "num_tokens": 28878822.0,
      "reward": 8.222666358947754,
      "reward_std": 10.734997510910034,
      "rewards/wrapper/mean": 4.111333182454109,
      "rewards/wrapper/std": 13.712842452526093,
      "step": 2635
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 154.6,
      "completions/mean_length": 251.74375,
      "completions/mean_terminated_length": 154.1,
      "completions/min_length": 153.6,
      "completions/min_terminated_length": 153.6,
      "epoch": 0.6193548387096774,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.765625,
      "kl": 0.009392125299200416,
      "learning_rate": 6.641696836629576e-06,
      "loss": -0.0076,
      "num_tokens": 28932785.0,
      "reward": 13.914123153686523,
      "reward_std": 18.75023546218872,
      "rewards/wrapper/mean": 6.957061505317688,
      "rewards/wrapper/std": 19.568567314743994,
      "step": 2640
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.8,
      "completions/mean_length": 250.23125,
      "completions/mean_terminated_length": 6.866667175292969,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6205278592375366,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.625,
      "kl": 0.00819388214731589,
      "learning_rate": 6.637182187013788e-06,
      "loss": -0.0055,
      "num_tokens": 28984320.0,
      "reward": 5.599798417091369,
      "reward_std": 7.3513828158378605,
      "rewards/wrapper/mean": 2.7998990304768085,
      "rewards/wrapper/std": 8.210212644934654,
      "step": 2645
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 25.0,
      "completions/mean_length": 250.3875,
      "completions/mean_terminated_length": 25.0,
      "completions/min_length": 127.4,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.6217008797653959,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.375,
      "kl": 0.0097636858234182,
      "learning_rate": 6.632661753733982e-06,
      "loss": -0.004,
      "num_tokens": 29040122.0,
      "reward": 10.327516424655915,
      "reward_std": 14.078668093681335,
      "rewards/wrapper/mean": 5.16375821903348,
      "rewards/wrapper/std": 15.739317643642426,
      "step": 2650
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 249.6375,
      "completions/mean_terminated_length": 0.5333333492279053,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6228739002932552,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.765625,
      "kl": 0.017314812494441868,
      "learning_rate": 6.628135552267869e-06,
      "loss": -0.0056,
      "num_tokens": 29099046.0,
      "reward": 8.878870403766632,
      "reward_std": 9.9872851729393,
      "rewards/wrapper/mean": 4.439434761554002,
      "rewards/wrapper/std": 10.966332286596298,
      "step": 2655
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.2,
      "completions/mean_length": 255.6875,
      "completions/mean_terminated_length": 41.2,
      "completions/min_length": 246.0,
      "completions/min_terminated_length": 41.2,
      "epoch": 0.6240469208211143,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.25,
      "kl": 0.016126143047586083,
      "learning_rate": 6.6236035981129045e-06,
      "loss": 0.0013,
      "num_tokens": 29156898.0,
      "reward": 9.977191162109374,
      "reward_std": 12.882474327087403,
      "rewards/wrapper/mean": 4.988595449924469,
      "rewards/wrapper/std": 15.649570155143739,
      "step": 2660
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.0,
      "completions/mean_length": 250.41875,
      "completions/mean_terminated_length": 26.0,
      "completions/min_length": 128.4,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.6252199413489736,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 5.71875,
      "kl": 0.013472359464503824,
      "learning_rate": 6.6190659067862444e-06,
      "loss": -0.0151,
      "num_tokens": 29213789.0,
      "reward": 12.702929973602295,
      "reward_std": 15.42300910949707,
      "rewards/wrapper/mean": 6.351464556157589,
      "rewards/wrapper/std": 16.331206111609937,
      "step": 2665
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 18.6,
      "completions/mean_length": 253.38125,
      "completions/mean_terminated_length": 18.6,
      "completions/min_length": 172.2,
      "completions/min_terminated_length": 18.6,
      "epoch": 0.6263929618768328,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.125,
      "kl": 0.00791556949261576,
      "learning_rate": 6.614522493824686e-06,
      "loss": -0.0066,
      "num_tokens": 29268354.0,
      "reward": 7.424413585662842,
      "reward_std": 9.934688711166382,
      "rewards/wrapper/mean": 3.7122067116200923,
      "rewards/wrapper/std": 11.696235999464989,
      "step": 2670
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.4,
      "completions/mean_length": 250.21875,
      "completions/mean_terminated_length": 9.9,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6275659824046921,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.40625,
      "kl": 0.005988685146439821,
      "learning_rate": 6.609973374784615e-06,
      "loss": -0.0126,
      "num_tokens": 29321065.0,
      "reward": 11.359949398040772,
      "reward_std": 15.09793529510498,
      "rewards/wrapper/mean": 5.679974632710218,
      "rewards/wrapper/std": 18.337996226549148,
      "step": 2675
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.2,
      "completions/mean_length": 250.425,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.6287390029325514,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 0.95703125,
      "kl": 0.008262872393243016,
      "learning_rate": 6.605418565241957e-06,
      "loss": -0.012,
      "num_tokens": 29376085.0,
      "reward": 20.049555778503418,
      "reward_std": 23.516221237182616,
      "rewards/wrapper/mean": 10.024778033792973,
      "rewards/wrapper/std": 26.077826090157032,
      "step": 2680
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.8,
      "completions/mean_length": 252.65625,
      "completions/mean_terminated_length": 36.3,
      "completions/min_length": 183.4,
      "completions/min_terminated_length": 29.8,
      "epoch": 0.6299120234604105,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.53125,
      "kl": 0.00597726086853072,
      "learning_rate": 6.600858080792127e-06,
      "loss": -0.0033,
      "num_tokens": 29430674.0,
      "reward": 11.03137435913086,
      "reward_std": 14.728109073638915,
      "rewards/wrapper/mean": 5.515687373280525,
      "rewards/wrapper/std": 18.126582558453084,
      "step": 2685
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 39.6,
      "completions/mean_length": 254.0375,
      "completions/mean_terminated_length": 39.6,
      "completions/min_length": 193.2,
      "completions/min_terminated_length": 39.6,
      "epoch": 0.6310850439882698,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.34375,
      "kl": 0.008559392578899861,
      "learning_rate": 6.596291937049959e-06,
      "loss": 0.0071,
      "num_tokens": 29483068.0,
      "reward": 17.887986612319946,
      "reward_std": 21.92697615623474,
      "rewards/wrapper/mean": 8.943993638455868,
      "rewards/wrapper/std": 18.718515367805956,
      "step": 2690
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 35.0,
      "completions/mean_length": 251.36875,
      "completions/mean_terminated_length": 33.5,
      "completions/min_length": 134.4,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.632258064516129,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.453125,
      "kl": 0.014107034134212881,
      "learning_rate": 6.5917201496496735e-06,
      "loss": -0.0028,
      "num_tokens": 29536873.0,
      "reward": 8.071206760406493,
      "reward_std": 9.532784819602966,
      "rewards/wrapper/mean": 4.035603339970112,
      "rewards/wrapper/std": 13.035564199090004,
      "step": 2695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 94.2,
      "completions/mean_length": 251.6125,
      "completions/mean_terminated_length": 63.4,
      "completions/min_length": 135.0,
      "completions/min_terminated_length": 32.6,
      "epoch": 0.6334310850439883,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.34375,
      "kl": 0.007457331806654111,
      "learning_rate": 6.5871427342448105e-06,
      "loss": -0.0076,
      "num_tokens": 29588683.0,
      "reward": 8.361358261108398,
      "reward_std": 10.859839820861817,
      "rewards/wrapper/mean": 4.180679216235876,
      "rewards/wrapper/std": 13.797844186425209,
      "step": 2700
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.6,
      "completions/mean_length": 249.525,
      "completions/mean_terminated_length": 47.2,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 45.8,
      "epoch": 0.6346041055718475,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 7.3125,
      "kl": 0.011775065213441849,
      "learning_rate": 6.58255970650818e-06,
      "loss": -0.0114,
      "num_tokens": 29642651.0,
      "reward": 12.164882373809814,
      "reward_std": 10.960504055023193,
      "rewards/wrapper/mean": 6.082441242039204,
      "rewards/wrapper/std": 16.81056024134159,
      "step": 2705
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.2,
      "completions/mean_length": 253.49375,
      "completions/mean_terminated_length": 22.2,
      "completions/min_length": 175.8,
      "completions/min_terminated_length": 22.2,
      "epoch": 0.6357771260997067,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.0625,
      "kl": 0.012871231907047332,
      "learning_rate": 6.5779710821318105e-06,
      "loss": -0.0079,
      "num_tokens": 29698884.0,
      "reward": 8.56117124557495,
      "reward_std": 11.631469535827637,
      "rewards/wrapper/mean": 4.28058585524559,
      "rewards/wrapper/std": 12.213414934277534,
      "step": 2710
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.8,
      "completions/mean_length": 253.35625,
      "completions/mean_terminated_length": 17.8,
      "completions/min_length": 171.4,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.636950146627566,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.640625,
      "kl": 0.00925441893050447,
      "learning_rate": 6.573376876826891e-06,
      "loss": -0.0078,
      "num_tokens": 29753909.0,
      "reward": 12.767994022369384,
      "reward_std": 17.376163291931153,
      "rewards/wrapper/mean": 6.38399690464139,
      "rewards/wrapper/std": 19.173024424910544,
      "step": 2715
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.4,
      "completions/mean_length": 254.475,
      "completions/mean_terminated_length": 2.4,
      "completions/min_length": 207.2,
      "completions/min_terminated_length": 2.4,
      "epoch": 0.6381231671554253,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.4375,
      "kl": 0.007609774707816541,
      "learning_rate": 6.568777106323721e-06,
      "loss": -0.0044,
      "num_tokens": 29806425.0,
      "reward": 15.84351224899292,
      "reward_std": 19.228125762939452,
      "rewards/wrapper/mean": 7.9217560440301895,
      "rewards/wrapper/std": 18.802449291944505,
      "step": 2720
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.6,
      "completions/mean_length": 252.0625,
      "completions/mean_terminated_length": 24.5,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 22.4,
      "epoch": 0.6392961876832844,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 0.9453125,
      "kl": 0.00825467858230695,
      "learning_rate": 6.5641717863716515e-06,
      "loss": -0.0112,
      "num_tokens": 29860335.0,
      "reward": 11.479648876190186,
      "reward_std": 13.858703422546387,
      "rewards/wrapper/mean": 5.739824234694242,
      "rewards/wrapper/std": 15.419359780848026,
      "step": 2725
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 17.2,
      "completions/mean_length": 250.425,
      "completions/mean_terminated_length": 13.2,
      "completions/min_length": 162.8,
      "completions/min_terminated_length": 9.2,
      "epoch": 0.6404692082111437,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.9375,
      "kl": 0.02650681862141937,
      "learning_rate": 6.559560932739037e-06,
      "loss": -0.0188,
      "num_tokens": 29916441.0,
      "reward": 14.781854248046875,
      "reward_std": 15.919013786315919,
      "rewards/wrapper/mean": 7.390926908701658,
      "rewards/wrapper/std": 21.141387024521826,
      "step": 2730
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.6,
      "completions/mean_length": 250.76875,
      "completions/mean_terminated_length": 20.5,
      "completions/min_length": 109.8,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.6416422287390029,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.796875,
      "kl": 0.13726555263856427,
      "learning_rate": 6.554944561213182e-06,
      "loss": -0.0118,
      "num_tokens": 29974952.0,
      "reward": 10.810014820098877,
      "reward_std": 14.398545169830323,
      "rewards/wrapper/mean": 5.405007231235504,
      "rewards/wrapper/std": 16.08275884240866,
      "step": 2735
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6428152492668622,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 3.625,
      "kl": 0.009235573734622448,
      "learning_rate": 6.550322687600278e-06,
      "loss": -0.0009,
      "num_tokens": 30030699.0,
      "reward": 13.479443836212159,
      "reward_std": 14.499934005737305,
      "rewards/wrapper/mean": 6.739722138643264,
      "rewards/wrapper/std": 16.795974485576153,
      "step": 2740
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.0,
      "completions/mean_length": 248.95,
      "completions/mean_terminated_length": 15.3,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.6439882697947215,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.703125,
      "kl": 0.00881982856662944,
      "learning_rate": 6.54569532772536e-06,
      "loss": -0.0089,
      "num_tokens": 30086385.0,
      "reward": 14.04744815826416,
      "reward_std": 17.4063777923584,
      "rewards/wrapper/mean": 7.023723734170199,
      "rewards/wrapper/std": 19.03030771613121,
      "step": 2745
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.4,
      "completions/mean_length": 252.65625,
      "completions/mean_terminated_length": 30.5,
      "completions/min_length": 168.2,
      "completions/min_terminated_length": 14.6,
      "epoch": 0.6451612903225806,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.359375,
      "kl": 0.07983345453976654,
      "learning_rate": 6.541062497432242e-06,
      "loss": 0.0084,
      "num_tokens": 30140448.0,
      "reward": 5.343929433822632,
      "reward_std": 6.9545831203460695,
      "rewards/wrapper/mean": 2.671964705735445,
      "rewards/wrapper/std": 10.572839736938477,
      "step": 2750
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.6,
      "completions/mean_length": 254.91875,
      "completions/mean_terminated_length": 16.6,
      "completions/min_length": 221.4,
      "completions/min_terminated_length": 16.6,
      "epoch": 0.6463343108504399,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.984375,
      "kl": 0.01167663314845413,
      "learning_rate": 6.536424212583478e-06,
      "loss": 0.0045,
      "num_tokens": 30199283.0,
      "reward": 12.16322021484375,
      "reward_std": 12.57860621213913,
      "rewards/wrapper/mean": 6.081610155105591,
      "rewards/wrapper/std": 19.400627340376378,
      "step": 2755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.8,
      "completions/mean_length": 246.4375,
      "completions/mean_terminated_length": 0.8,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.6475073313782991,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.28125,
      "kl": 0.011256003042217345,
      "learning_rate": 6.531780489060287e-06,
      "loss": -0.0291,
      "num_tokens": 30253005.0,
      "reward": 10.376868152618409,
      "reward_std": 13.655181932449342,
      "rewards/wrapper/mean": 5.18843387439847,
      "rewards/wrapper/std": 16.041875714063643,
      "step": 2760
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.2,
      "completions/mean_length": 252.49375,
      "completions/mean_terminated_length": 20.8,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6486803519061584,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.296875,
      "kl": 0.006306805438362062,
      "learning_rate": 6.527131342762519e-06,
      "loss": -0.0122,
      "num_tokens": 30305632.0,
      "reward": 14.130721759796142,
      "reward_std": 17.248423194885255,
      "rewards/wrapper/mean": 7.065360965207219,
      "rewards/wrapper/std": 18.418144088238478,
      "step": 2765
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 46.4,
      "completions/mean_length": 251.05625,
      "completions/mean_terminated_length": 24.3,
      "completions/min_length": 104.6,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.6498533724340176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.03125,
      "kl": 0.007100276905111969,
      "learning_rate": 6.522476789608584e-06,
      "loss": -0.004,
      "num_tokens": 30359495.0,
      "reward": 7.962774801254272,
      "reward_std": 10.023509168624878,
      "rewards/wrapper/mean": 3.98138741850853,
      "rewards/wrapper/std": 12.435541369020939,
      "step": 2770
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.6,
      "completions/mean_length": 251.53125,
      "completions/mean_terminated_length": 10.6,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 10.6,
      "epoch": 0.6510263929618768,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.6484375,
      "kl": 0.018682882434222847,
      "learning_rate": 6.517816845535409e-06,
      "loss": -0.0133,
      "num_tokens": 30415146.0,
      "reward": 11.046603441238403,
      "reward_std": 14.976951122283936,
      "rewards/wrapper/mean": 5.523301954567432,
      "rewards/wrapper/std": 16.57356094866991,
      "step": 2775
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.0,
      "completions/mean_length": 251.6375,
      "completions/mean_terminated_length": 14.0,
      "completions/min_length": 116.4,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.6521994134897361,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 0.9765625,
      "kl": 0.15719263151986523,
      "learning_rate": 6.513151526498379e-06,
      "loss": 0.0002,
      "num_tokens": 30469236.0,
      "reward": 11.077020359039306,
      "reward_std": 14.02384262084961,
      "rewards/wrapper/mean": 5.538510248064995,
      "rewards/wrapper/std": 14.277467794716358,
      "step": 2780
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 88.0,
      "completions/mean_length": 252.35,
      "completions/mean_terminated_length": 88.0,
      "completions/min_length": 139.2,
      "completions/min_terminated_length": 88.0,
      "epoch": 0.6533724340175953,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.21875,
      "kl": 2.1136296992423014,
      "learning_rate": 6.508480848471282e-06,
      "loss": 0.0867,
      "num_tokens": 30524428.0,
      "reward": 8.025190353393555,
      "reward_std": 10.717722511291504,
      "rewards/wrapper/mean": 4.012595250457525,
      "rewards/wrapper/std": 15.221745024621487,
      "step": 2785
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.2,
      "completions/mean_length": 250.36875,
      "completions/mean_terminated_length": 12.3,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6545454545454545,
      "frac_reward_zero_std": 0.075,
      "grad_norm": 3.140625,
      "kl": 0.030153904797043652,
      "learning_rate": 6.503804827446254e-06,
      "loss": -0.008,
      "num_tokens": 30579919.0,
      "reward": 10.410851192474365,
      "reward_std": 13.341778182983399,
      "rewards/wrapper/mean": 5.2054255366325375,
      "rewards/wrapper/std": 16.092943432927132,
      "step": 2790
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 2.0,
      "completions/mean_length": 248.08125,
      "completions/mean_terminated_length": 1.2,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6557184750733138,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.59375,
      "kl": 0.007615790021372959,
      "learning_rate": 6.499123479433728e-06,
      "loss": -0.0049,
      "num_tokens": 30633020.0,
      "reward": 6.076622056961059,
      "reward_std": 7.86311776638031,
      "rewards/wrapper/mean": 3.038311021029949,
      "rewards/wrapper/std": 8.92110146433115,
      "step": 2795
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.6,
      "completions/mean_length": 253.56875,
      "completions/mean_terminated_length": 24.6,
      "completions/min_length": 178.2,
      "completions/min_terminated_length": 24.6,
      "epoch": 0.656891495601173,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.59375,
      "kl": 0.005625285796122625,
      "learning_rate": 6.494436820462371e-06,
      "loss": -0.0074,
      "num_tokens": 30685167.0,
      "reward": 11.33204174041748,
      "reward_std": 14.154102516174316,
      "rewards/wrapper/mean": 5.666020432859659,
      "rewards/wrapper/std": 16.62561158388853,
      "step": 2800
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 56.6,
      "completions/mean_length": 248.575,
      "completions/mean_terminated_length": 37.9,
      "completions/min_length": 70.4,
      "completions/min_terminated_length": 19.2,
      "epoch": 0.6580645161290323,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.59375,
      "kl": 0.0125009736395441,
      "learning_rate": 6.489744866579038e-06,
      "loss": -0.0202,
      "num_tokens": 30740001.0,
      "reward": 7.082417011260986,
      "reward_std": 8.872758960723877,
      "rewards/wrapper/mean": 3.5412084154784678,
      "rewards/wrapper/std": 11.814745858311653,
      "step": 2805
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6592375366568914,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 34.75,
      "kl": 0.07425311693223194,
      "learning_rate": 6.4850476338487135e-06,
      "loss": 0.003,
      "num_tokens": 30795647.0,
      "reward": 12.623469734191895,
      "reward_std": 16.824228286743164,
      "rewards/wrapper/mean": 6.311734789609909,
      "rewards/wrapper/std": 18.106909097731112,
      "step": 2810
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.6604105571847507,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.046875,
      "kl": 0.013532165088690817,
      "learning_rate": 6.480345138354457e-06,
      "loss": -0.0032,
      "num_tokens": 30850124.0,
      "reward": 12.670428943634032,
      "reward_std": 16.832879066467285,
      "rewards/wrapper/mean": 6.335214430838823,
      "rewards/wrapper/std": 17.93000815808773,
      "step": 2815
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.4,
      "completions/mean_length": 249.30625,
      "completions/mean_terminated_length": 14.2,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.66158357771261,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.4765625,
      "kl": 0.011149391869548709,
      "learning_rate": 6.475637396197346e-06,
      "loss": -0.01,
      "num_tokens": 30902123.0,
      "reward": 10.397428131103515,
      "reward_std": 12.043423748016357,
      "rewards/wrapper/mean": 5.198714216798544,
      "rewards/wrapper/std": 14.887701985239982,
      "step": 2820
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 96.2,
      "completions/mean_length": 247.48125,
      "completions/mean_terminated_length": 68.5,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 40.8,
      "epoch": 0.6627565982404692,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6.46875,
      "kl": 0.012524871563073248,
      "learning_rate": 6.470924423496421e-06,
      "loss": -0.0267,
      "num_tokens": 30955618.0,
      "reward": 14.662075805664063,
      "reward_std": 17.22254581451416,
      "rewards/wrapper/mean": 7.331037894636393,
      "rewards/wrapper/std": 17.588551034033298,
      "step": 2825
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.4,
      "completions/mean_length": 251.875,
      "completions/mean_terminated_length": 21.4,
      "completions/min_length": 175.0,
      "completions/min_terminated_length": 21.4,
      "epoch": 0.6639296187683285,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.890625,
      "kl": 0.007945716701215133,
      "learning_rate": 6.466206236388636e-06,
      "loss": -0.0116,
      "num_tokens": 31009472.0,
      "reward": 13.417535018920898,
      "reward_std": 17.303636741638183,
      "rewards/wrapper/mean": 6.708767288178206,
      "rewards/wrapper/std": 20.54477540552616,
      "step": 2830
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.6,
      "completions/mean_length": 250.59375,
      "completions/mean_terminated_length": 24.7,
      "completions/min_length": 120.2,
      "completions/min_terminated_length": 17.8,
      "epoch": 0.6651026392961877,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.40625,
      "kl": 0.007562464033253491,
      "learning_rate": 6.461482851028794e-06,
      "loss": -0.0107,
      "num_tokens": 31066489.0,
      "reward": 10.607298517227173,
      "reward_std": 11.593081283569337,
      "rewards/wrapper/mean": 5.30364919602871,
      "rewards/wrapper/std": 17.242631320655345,
      "step": 2835
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 23.2,
      "completions/mean_length": 250.3375,
      "completions/mean_terminated_length": 23.2,
      "completions/min_length": 176.8,
      "completions/min_terminated_length": 23.2,
      "epoch": 0.6662756598240469,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.84375,
      "kl": 0.008082286594435573,
      "learning_rate": 6.4567542835894985e-06,
      "loss": -0.0186,
      "num_tokens": 31119453.0,
      "reward": 11.698405933380126,
      "reward_std": 15.217379951477051,
      "rewards/wrapper/mean": 5.849203032255173,
      "rewards/wrapper/std": 18.357003271579742,
      "step": 2840
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6674486803519062,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 28.75,
      "kl": 0.03912512751994655,
      "learning_rate": 6.452020550261098e-06,
      "loss": 0.0016,
      "num_tokens": 31171969.0,
      "reward": 12.70831356048584,
      "reward_std": 14.184439086914063,
      "rewards/wrapper/mean": 6.3541566789150234,
      "rewards/wrapper/std": 18.155715675652026,
      "step": 2845
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.8,
      "completions/mean_length": 251.1625,
      "completions/mean_terminated_length": 48.8,
      "completions/min_length": 150.2,
      "completions/min_terminated_length": 47.8,
      "epoch": 0.6686217008797654,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.5859375,
      "kl": 0.021190780331380665,
      "learning_rate": 6.447281667251626e-06,
      "loss": -0.0041,
      "num_tokens": 31227067.0,
      "reward": 13.802499532699585,
      "reward_std": 16.760237789154054,
      "rewards/wrapper/mean": 6.901249774545431,
      "rewards/wrapper/std": 17.009926618635653,
      "step": 2850
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6697947214076246,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.53125,
      "kl": 0.020001567632425575,
      "learning_rate": 6.4425376507867485e-06,
      "loss": 0.0008,
      "num_tokens": 31280919.0,
      "reward": 5.958142876625061,
      "reward_std": 7.7278544187545775,
      "rewards/wrapper/mean": 2.9790713407099245,
      "rewards/wrapper/std": 10.234820060431957,
      "step": 2855
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6709677419354839,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.71875,
      "kl": 0.05707511976361275,
      "learning_rate": 6.4377885171097104e-06,
      "loss": -0.0053,
      "num_tokens": 31337926.0,
      "reward": 11.579971981048583,
      "reward_std": 15.459444427490235,
      "rewards/wrapper/mean": 5.789985730499029,
      "rewards/wrapper/std": 16.77395656108856,
      "step": 2860
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.2,
      "completions/mean_length": 249.95,
      "completions/mean_terminated_length": 57.2,
      "completions/min_length": 103.4,
      "completions/min_terminated_length": 52.2,
      "epoch": 0.6721407624633431,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.859375,
      "kl": 0.007303468359168619,
      "learning_rate": 6.4330342824812745e-06,
      "loss": -0.0168,
      "num_tokens": 31394784.0,
      "reward": 6.7388733386993405,
      "reward_std": 8.405215740203857,
      "rewards/wrapper/mean": 3.3694365844130516,
      "rewards/wrapper/std": 12.588712561130524,
      "step": 2865
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.8,
      "completions/mean_length": 249.95,
      "completions/mean_terminated_length": 7.5,
      "completions/min_length": 109.6,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.6733137829912024,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 9.5625,
      "kl": 0.20785174979828297,
      "learning_rate": 6.4282749631796725e-06,
      "loss": -0.0062,
      "num_tokens": 31450594.0,
      "reward": 12.838958740234375,
      "reward_std": 16.73229236602783,
      "rewards/wrapper/mean": 6.41947939991951,
      "rewards/wrapper/std": 16.986108617484568,
      "step": 2870
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 100.0,
      "completions/mean_length": 253.1,
      "completions/mean_terminated_length": 81.6,
      "completions/min_length": 165.6,
      "completions/min_terminated_length": 63.2,
      "epoch": 0.6744868035190615,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.046875,
      "kl": 0.014510875556152315,
      "learning_rate": 6.42351057550054e-06,
      "loss": 0.0028,
      "num_tokens": 31504968.0,
      "reward": 5.3186607837677,
      "reward_std": 7.012968826293945,
      "rewards/wrapper/mean": 2.6593304432928564,
      "rewards/wrapper/std": 10.752986335754395,
      "step": 2875
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6756598240469208,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 6.1875,
      "kl": 0.017193890095222743,
      "learning_rate": 6.418741135756875e-06,
      "loss": -0.0069,
      "num_tokens": 31557632.0,
      "reward": 13.5635555267334,
      "reward_std": 16.37178087234497,
      "rewards/wrapper/mean": 6.781777499616146,
      "rewards/wrapper/std": 18.871901808679105,
      "step": 2880
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.8,
      "completions/mean_length": 254.7375,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 215.6,
      "completions/min_terminated_length": 10.8,
      "epoch": 0.6768328445747801,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.34375,
      "kl": 0.00942248246865347,
      "learning_rate": 6.413966660278967e-06,
      "loss": -0.0015,
      "num_tokens": 31614106.0,
      "reward": 10.58498935699463,
      "reward_std": 11.666668128967284,
      "rewards/wrapper/mean": 5.292494739592075,
      "rewards/wrapper/std": 15.16362506300211,
      "step": 2885
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 51.0,
      "completions/mean_length": 252.79375,
      "completions/mean_terminated_length": 51.0,
      "completions/min_length": 153.4,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.6780058651026393,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.84375,
      "kl": 0.012383062462322413,
      "learning_rate": 6.409187165414346e-06,
      "loss": -0.0014,
      "num_tokens": 31669557.0,
      "reward": 10.968196487426757,
      "reward_std": 14.180962562561035,
      "rewards/wrapper/mean": 5.484098115563393,
      "rewards/wrapper/std": 18.82552878111601,
      "step": 2890
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.4,
      "completions/mean_length": 248.49375,
      "completions/mean_terminated_length": 8.0,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 4.6,
      "epoch": 0.6791788856304986,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.3125,
      "kl": 0.009327814332209527,
      "learning_rate": 6.404402667527736e-06,
      "loss": -0.0163,
      "num_tokens": 31726770.0,
      "reward": 10.455389595031738,
      "reward_std": 14.166728591918945,
      "rewards/wrapper/mean": 5.2276949137449265,
      "rewards/wrapper/std": 15.657369413971901,
      "step": 2895
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 37.0,
      "completions/mean_length": 253.95625,
      "completions/mean_terminated_length": 37.0,
      "completions/min_length": 190.6,
      "completions/min_terminated_length": 37.0,
      "epoch": 0.6803519061583577,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.453125,
      "kl": 0.00922186360694468,
      "learning_rate": 6.399613183000983e-06,
      "loss": -0.0063,
      "num_tokens": 31780625.0,
      "reward": 5.889966726303101,
      "reward_std": 5.738338303565979,
      "rewards/wrapper/mean": 2.9449833787977697,
      "rewards/wrapper/std": 10.102134810388089,
      "step": 2900
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.6,
      "completions/mean_length": 251.46875,
      "completions/mean_terminated_length": 8.6,
      "completions/min_length": 111.0,
      "completions/min_terminated_length": 8.6,
      "epoch": 0.681524926686217,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 2.1875,
      "kl": 0.011204652744345367,
      "learning_rate": 6.394818728233014e-06,
      "loss": -0.0103,
      "num_tokens": 31838018.0,
      "reward": 11.55151720046997,
      "reward_std": 15.863847255706787,
      "rewards/wrapper/mean": 5.775758402049542,
      "rewards/wrapper/std": 17.470771422982217,
      "step": 2905
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.8,
      "completions/mean_length": 252.91875,
      "completions/mean_terminated_length": 3.8,
      "completions/min_length": 157.4,
      "completions/min_terminated_length": 3.8,
      "epoch": 0.6826979472140763,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.84375,
      "kl": 0.028127940092235803,
      "learning_rate": 6.3900193196397675e-06,
      "loss": -0.0086,
      "num_tokens": 31894569.0,
      "reward": 11.12969675064087,
      "reward_std": 15.122749328613281,
      "rewards/wrapper/mean": 5.564848321676254,
      "rewards/wrapper/std": 15.775805968046189,
      "step": 2910
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.0,
      "completions/mean_length": 248.1,
      "completions/mean_terminated_length": 42.5,
      "completions/min_length": 86.2,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.6838709677419355,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.96875,
      "kl": 4.824623215675819,
      "learning_rate": 6.385214973654147e-06,
      "loss": 0.1788,
      "num_tokens": 31949579.0,
      "reward": 9.268371200561523,
      "reward_std": 12.328631019592285,
      "rewards/wrapper/mean": 4.6341855227947235,
      "rewards/wrapper/std": 14.82547686547041,
      "step": 2915
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.6850439882697947,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.1875,
      "kl": 0.04323632463347167,
      "learning_rate": 6.380405706725961e-06,
      "loss": -0.0057,
      "num_tokens": 32002908.0,
      "reward": 10.425172328948975,
      "reward_std": 14.20661792755127,
      "rewards/wrapper/mean": 5.212585891783237,
      "rewards/wrapper/std": 17.869454950094223,
      "step": 2920
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.8,
      "completions/mean_length": 255.675,
      "completions/mean_terminated_length": 40.8,
      "completions/min_length": 245.6,
      "completions/min_terminated_length": 40.8,
      "epoch": 0.6862170087976539,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.59375,
      "kl": 0.010727729741483926,
      "learning_rate": 6.375591535321866e-06,
      "loss": 0.0011,
      "num_tokens": 32058924.0,
      "reward": 7.594053506851196,
      "reward_std": 9.99413022994995,
      "rewards/wrapper/mean": 3.7970266461372377,
      "rewards/wrapper/std": 12.754188150167465,
      "step": 2925
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.6873900293255132,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.03125,
      "kl": 0.0054875939211342485,
      "learning_rate": 6.370772475925312e-06,
      "loss": -0.0048,
      "num_tokens": 32113721.0,
      "reward": 12.846275234222412,
      "reward_std": 15.05757179260254,
      "rewards/wrapper/mean": 6.423137576878071,
      "rewards/wrapper/std": 18.93209269195795,
      "step": 2930
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6885630498533725,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.28125,
      "kl": 0.012189660430885851,
      "learning_rate": 6.365948545036486e-06,
      "loss": -0.0078,
      "num_tokens": 32171397.0,
      "reward": 13.03080415725708,
      "reward_std": 15.789403009414674,
      "rewards/wrapper/mean": 6.515402068197727,
      "rewards/wrapper/std": 20.668898472189902,
      "step": 2935
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.6,
      "completions/mean_length": 254.13125,
      "completions/mean_terminated_length": 42.6,
      "completions/min_length": 196.2,
      "completions/min_terminated_length": 42.6,
      "epoch": 0.6897360703812316,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.640625,
      "kl": 0.007035130372969434,
      "learning_rate": 6.361119759172254e-06,
      "loss": -0.0039,
      "num_tokens": 32225176.0,
      "reward": 11.792992210388183,
      "reward_std": 15.923147010803223,
      "rewards/wrapper/mean": 5.896495893597603,
      "rewards/wrapper/std": 16.489003255963326,
      "step": 2940
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 252.01875,
      "completions/mean_terminated_length": 13.2,
      "completions/min_length": 166.4,
      "completions/min_terminated_length": 12.8,
      "epoch": 0.6909090909090909,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 9.5,
      "kl": 0.011763886036351322,
      "learning_rate": 6.3562861348661025e-06,
      "loss": -0.005,
      "num_tokens": 32280057.0,
      "reward": 12.035215187072755,
      "reward_std": 14.319508934020996,
      "rewards/wrapper/mean": 6.017607763409615,
      "rewards/wrapper/std": 17.326738145947456,
      "step": 2945
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 100.0,
      "completions/mean_length": 252.725,
      "completions/mean_terminated_length": 100.0,
      "completions/min_length": 151.2,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.6920821114369502,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.609375,
      "kl": 0.008140194357838481,
      "learning_rate": 6.351447688668089e-06,
      "loss": -0.0017,
      "num_tokens": 32330873.0,
      "reward": 7.528527927398682,
      "reward_std": 8.575776290893554,
      "rewards/wrapper/mean": 3.764264015108347,
      "rewards/wrapper/std": 12.360572703182697,
      "step": 2950
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 31.0,
      "completions/mean_length": 251.10625,
      "completions/mean_terminated_length": 16.2,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.6932551319648094,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.53125,
      "kl": 0.011117254314012826,
      "learning_rate": 6.34660443714478e-06,
      "loss": -0.0064,
      "num_tokens": 32385796.0,
      "reward": 13.417228507995606,
      "reward_std": 14.974496126174927,
      "rewards/wrapper/mean": 6.708614060282708,
      "rewards/wrapper/std": 16.140495674312113,
      "step": 2955
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.8,
      "completions/mean_length": 248.34375,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 10.8,
      "epoch": 0.6944281524926686,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 0.875,
      "kl": 0.011824999994132668,
      "learning_rate": 6.341756396879192e-06,
      "loss": -0.0234,
      "num_tokens": 32439957.0,
      "reward": 12.452883625030518,
      "reward_std": 15.025022745132446,
      "rewards/wrapper/mean": 6.226441939175129,
      "rewards/wrapper/std": 16.66017941981554,
      "step": 2960
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.2,
      "completions/mean_length": 252.89375,
      "completions/mean_terminated_length": 54.2,
      "completions/min_length": 156.6,
      "completions/min_terminated_length": 54.2,
      "epoch": 0.6956011730205278,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.828125,
      "kl": 0.009212367539294063,
      "learning_rate": 6.33690358447074e-06,
      "loss": -0.0097,
      "num_tokens": 32494362.0,
      "reward": 13.592586326599122,
      "reward_std": 18.491394233703613,
      "rewards/wrapper/mean": 6.796293315291405,
      "rewards/wrapper/std": 20.610531900823116,
      "step": 2965
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 93.6,
      "completions/mean_length": 252.525,
      "completions/mean_terminated_length": 93.6,
      "completions/min_length": 144.8,
      "completions/min_terminated_length": 93.6,
      "epoch": 0.6967741935483871,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.21875,
      "kl": 0.009233022644184529,
      "learning_rate": 6.33204601653518e-06,
      "loss": -0.0024,
      "num_tokens": 32549066.0,
      "reward": 8.715370655059814,
      "reward_std": 9.768978691101074,
      "rewards/wrapper/mean": 4.357685124874115,
      "rewards/wrapper/std": 16.73828110843897,
      "step": 2970
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 104.0,
      "completions/mean_length": 250.35,
      "completions/mean_terminated_length": 100.8,
      "completions/min_length": 148.8,
      "completions/min_terminated_length": 97.6,
      "epoch": 0.6979472140762464,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 0.9296875,
      "kl": 0.007445700804237276,
      "learning_rate": 6.327183709704547e-06,
      "loss": -0.0089,
      "num_tokens": 32601098.0,
      "reward": 8.694724130630494,
      "reward_std": 11.72841203212738,
      "rewards/wrapper/mean": 4.347361895442009,
      "rewards/wrapper/std": 12.822503638267516,
      "step": 2975
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.4,
      "completions/mean_length": 251.34375,
      "completions/mean_terminated_length": 2.4,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.6991202346041056,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 4.1875,
      "kl": 0.029423248756211252,
      "learning_rate": 6.322316680627107e-06,
      "loss": -0.0098,
      "num_tokens": 32657423.0,
      "reward": 15.263641691207885,
      "reward_std": 20.547141981124877,
      "rewards/wrapper/mean": 7.6318209052085875,
      "rewards/wrapper/std": 18.74337030798197,
      "step": 2980
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.0,
      "completions/mean_length": 248.48125,
      "completions/mean_terminated_length": 9.4,
      "completions/min_length": 58.0,
      "completions/min_terminated_length": 6.8,
      "epoch": 0.7002932551319648,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.421875,
      "kl": 0.009029289579484612,
      "learning_rate": 6.317444945967288e-06,
      "loss": -0.0006,
      "num_tokens": 32714476.0,
      "reward": 13.580448102951049,
      "reward_std": 15.056785011291504,
      "rewards/wrapper/mean": 6.790223602950573,
      "rewards/wrapper/std": 16.98922117650509,
      "step": 2985
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.0,
      "completions/mean_length": 249.85625,
      "completions/mean_terminated_length": 6.2,
      "completions/min_length": 106.8,
      "completions/min_terminated_length": 4.4,
      "epoch": 0.701466275659824,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.4609375,
      "kl": 0.010367827408481389,
      "learning_rate": 6.312568522405635e-06,
      "loss": -0.012,
      "num_tokens": 32768535.0,
      "reward": 8.727212715148926,
      "reward_std": 9.299742698669434,
      "rewards/wrapper/mean": 4.363606084138155,
      "rewards/wrapper/std": 11.761549571156502,
      "step": 2990
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 8.2,
      "completions/mean_length": 253.0625,
      "completions/mean_terminated_length": 4.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7026392961876833,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.34375,
      "kl": 0.010755813180003316,
      "learning_rate": 6.307687426638746e-06,
      "loss": -0.0087,
      "num_tokens": 32822397.0,
      "reward": 10.127997446060181,
      "reward_std": 12.69483847618103,
      "rewards/wrapper/mean": 5.0639987081289295,
      "rewards/wrapper/std": 14.818000476062299,
      "step": 2995
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 11.4,
      "completions/mean_length": 251.55625,
      "completions/mean_terminated_length": 11.4,
      "completions/min_length": 113.8,
      "completions/min_terminated_length": 11.4,
      "epoch": 0.7038123167155426,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.125,
      "kl": 0.007638441288145259,
      "learning_rate": 6.302801675379216e-06,
      "loss": -0.0098,
      "num_tokens": 32875248.0,
      "reward": 14.06427435874939,
      "reward_std": 18.11653337478638,
      "rewards/wrapper/mean": 7.032136972993612,
      "rewards/wrapper/std": 19.8682512819767,
      "step": 3000
    },
    {
      "epoch": 0.7038123167155426,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.79,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 71.57,
      "eval_completions/mean_length": 222.775,
      "eval_completions/mean_terminated_length": 60.508333358764645,
      "eval_completions/min_length": 146.37,
      "eval_completions/min_terminated_length": 49.09,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.013644674248062073,
      "eval_loss": -0.05559740215539932,
      "eval_num_tokens": 32875248.0,
      "eval_reward": 0.34767198249697684,
      "eval_reward_std": 0.19411193696781992,
      "eval_rewards/wrapper/mean": 0.1738359948247671,
      "eval_rewards/wrapper/std": 0.1692263395804912,
      "eval_runtime": 208.5757,
      "eval_samples_per_second": 0.959,
      "eval_steps_per_second": 0.24,
      "step": 3000
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 20.0,
      "completions/mean_length": 250.23125,
      "completions/mean_terminated_length": 20.0,
      "completions/min_length": 122.4,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.7049853372434017,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.75,
      "kl": 0.012254834035411477,
      "learning_rate": 6.297911285355579e-06,
      "loss": 0.0123,
      "num_tokens": 32930043.0,
      "reward": 10.28910961151123,
      "reward_std": 12.967611694335938,
      "rewards/wrapper/mean": 5.144554616510868,
      "rewards/wrapper/std": 18.04752763658762,
      "step": 3005
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.4,
      "completions/mean_length": 252.84375,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.706158357771261,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.015625,
      "kl": 0.012843809789046645,
      "learning_rate": 6.293016273312254e-06,
      "loss": -0.0044,
      "num_tokens": 32985718.0,
      "reward": 15.941922378540038,
      "reward_std": 19.729792022705077,
      "rewards/wrapper/mean": 7.970961252599954,
      "rewards/wrapper/std": 22.57847531288862,
      "step": 3010
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 254.41875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 205.4,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.7073313782991202,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.1171875,
      "kl": 0.011954522877931594,
      "learning_rate": 6.288116656009485e-06,
      "loss": 0.0036,
      "num_tokens": 33040551.0,
      "reward": 14.066415405273437,
      "reward_std": 16.348633670806883,
      "rewards/wrapper/mean": 7.033207412064075,
      "rewards/wrapper/std": 19.8112138196826,
      "step": 3015
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 5.0,
      "completions/mean_length": 251.35625,
      "completions/mean_terminated_length": 5.0,
      "completions/min_length": 107.4,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.7085043988269795,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.75,
      "kl": 0.013446322188246995,
      "learning_rate": 6.283212450223284e-06,
      "loss": -0.0117,
      "num_tokens": 33094550.0,
      "reward": 8.66706485748291,
      "reward_std": 11.540789794921874,
      "rewards/wrapper/mean": 4.333532364666462,
      "rewards/wrapper/std": 15.913649466633796,
      "step": 3020
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 50.2,
      "completions/mean_length": 253.8375,
      "completions/mean_terminated_length": 42.3,
      "completions/min_length": 188.0,
      "completions/min_terminated_length": 34.4,
      "epoch": 0.7096774193548387,
      "frac_reward_zero_std": 0.0625,
      "grad_norm": 1.7109375,
      "kl": 0.007783821260090917,
      "learning_rate": 6.278303672745373e-06,
      "loss": -0.002,
      "num_tokens": 33149128.0,
      "reward": 8.587579703330993,
      "reward_std": 10.724381399154662,
      "rewards/wrapper/mean": 4.293789640069008,
      "rewards/wrapper/std": 12.531948786973953,
      "step": 3025
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.8,
      "completions/mean_length": 254.23125,
      "completions/mean_terminated_length": 45.8,
      "completions/min_length": 199.4,
      "completions/min_terminated_length": 45.8,
      "epoch": 0.7108504398826979,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.0,
      "kl": 0.010787489754147828,
      "learning_rate": 6.2733903403831275e-06,
      "loss": -0.0075,
      "num_tokens": 33206507.0,
      "reward": 8.99125509262085,
      "reward_std": 11.77661190032959,
      "rewards/wrapper/mean": 4.495627209544182,
      "rewards/wrapper/std": 14.157334826886654,
      "step": 3030
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 87.6,
      "completions/mean_length": 242.9375,
      "completions/mean_terminated_length": 48.46666717529297,
      "completions/min_length": 26.6,
      "completions/min_terminated_length": 26.6,
      "epoch": 0.7120234604105572,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 4.4375,
      "kl": 0.00714592015137896,
      "learning_rate": 6.268472469959519e-06,
      "loss": -0.0185,
      "num_tokens": 33261703.0,
      "reward": 13.6625545501709,
      "reward_std": 13.896155834197998,
      "rewards/wrapper/mean": 6.831276829540729,
      "rewards/wrapper/std": 19.921477034687996,
      "step": 3035
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.2,
      "completions/mean_length": 250.20625,
      "completions/mean_terminated_length": 10.7,
      "completions/min_length": 104.6,
      "completions/min_terminated_length": 2.2,
      "epoch": 0.7131964809384165,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.203125,
      "kl": 0.009536017797654495,
      "learning_rate": 6.263550078313057e-06,
      "loss": -0.0123,
      "num_tokens": 33317868.0,
      "reward": 11.820788192749024,
      "reward_std": 13.19154167175293,
      "rewards/wrapper/mean": 5.910393899679184,
      "rewards/wrapper/std": 16.527128563821314,
      "step": 3040
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.7143695014662756,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.6171875,
      "kl": 0.006540574179962277,
      "learning_rate": 6.2586231822977305e-06,
      "loss": -0.0106,
      "num_tokens": 33371771.0,
      "reward": 7.129058766365051,
      "reward_std": 9.376532649993896,
      "rewards/wrapper/mean": 3.5645292200148107,
      "rewards/wrapper/std": 11.052740156650543,
      "step": 3045
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.4,
      "completions/mean_length": 253.3125,
      "completions/mean_terminated_length": 16.4,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 16.4,
      "epoch": 0.7155425219941349,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.1484375,
      "kl": 0.016403516015270726,
      "learning_rate": 6.253691798782954e-06,
      "loss": 0.001,
      "num_tokens": 33423625.0,
      "reward": 16.984063339233398,
      "reward_std": 21.145125770568846,
      "rewards/wrapper/mean": 8.492031678557396,
      "rewards/wrapper/std": 25.235761691629886,
      "step": 3050
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.4,
      "completions/mean_length": 251.675,
      "completions/mean_terminated_length": 54.3,
      "completions/min_length": 156.6,
      "completions/min_terminated_length": 54.2,
      "epoch": 0.7167155425219941,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.671875,
      "kl": 0.010424506437266246,
      "learning_rate": 6.248755944653503e-06,
      "loss": -0.0033,
      "num_tokens": 33481279.0,
      "reward": 9.209318351745605,
      "reward_std": 11.9292555809021,
      "rewards/wrapper/mean": 4.604659250378608,
      "rewards/wrapper/std": 14.66360622793436,
      "step": 3055
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 40.2,
      "completions/mean_length": 252.4625,
      "completions/mean_terminated_length": 40.2,
      "completions/min_length": 193.8,
      "completions/min_terminated_length": 40.2,
      "epoch": 0.7178885630498534,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.90625,
      "kl": 0.01565061039291322,
      "learning_rate": 6.243815636809464e-06,
      "loss": -0.0126,
      "num_tokens": 33539275.0,
      "reward": 12.543126440048217,
      "reward_std": 15.04892885684967,
      "rewards/wrapper/mean": 6.271563523262739,
      "rewards/wrapper/std": 18.085418404638766,
      "step": 3060
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.0,
      "completions/mean_length": 249.95625,
      "completions/mean_terminated_length": 27.8,
      "completions/min_length": 170.0,
      "completions/min_terminated_length": 16.4,
      "epoch": 0.7190615835777127,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.671875,
      "kl": 0.00790031259530224,
      "learning_rate": 6.238870892166168e-06,
      "loss": -0.0126,
      "num_tokens": 33590184.0,
      "reward": 9.49119300842285,
      "reward_std": 11.98473072052002,
      "rewards/wrapper/mean": 4.745596365630627,
      "rewards/wrapper/std": 14.232426093518734,
      "step": 3065
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.7202346041055718,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.9296875,
      "kl": 0.011212410021107644,
      "learning_rate": 6.233921727654144e-06,
      "loss": -0.0058,
      "num_tokens": 33646111.0,
      "reward": 7.471917772293091,
      "reward_std": 9.847779417037964,
      "rewards/wrapper/mean": 3.735959121584892,
      "rewards/wrapper/std": 13.612964145839214,
      "step": 3070
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.6,
      "completions/mean_length": 254.225,
      "completions/mean_terminated_length": 45.6,
      "completions/min_length": 199.2,
      "completions/min_terminated_length": 45.6,
      "epoch": 0.7214076246334311,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.46875,
      "kl": 0.019665966625325382,
      "learning_rate": 6.2289681602190475e-06,
      "loss": -0.0001,
      "num_tokens": 33701441.0,
      "reward": 9.508408069610596,
      "reward_std": 12.26361608505249,
      "rewards/wrapper/mean": 4.754203618317843,
      "rewards/wrapper/std": 16.59772346019745,
      "step": 3075
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 34.2,
      "completions/mean_length": 252.45625,
      "completions/mean_terminated_length": 13.4,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7225806451612903,
      "frac_reward_zero_std": 0.0875,
      "grad_norm": 2.546875,
      "kl": 0.014641248155385256,
      "learning_rate": 6.224010206821615e-06,
      "loss": -0.0111,
      "num_tokens": 33757582.0,
      "reward": 7.073202276229859,
      "reward_std": 9.425168991088867,
      "rewards/wrapper/mean": 3.536601182818413,
      "rewards/wrapper/std": 10.393733787536622,
      "step": 3080
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.4,
      "completions/mean_length": 251.11875,
      "completions/mean_terminated_length": 24.5,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.7237536656891496,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.375,
      "kl": 0.008687148883473128,
      "learning_rate": 6.219047884437596e-06,
      "loss": -0.0055,
      "num_tokens": 33812913.0,
      "reward": 10.549963283538819,
      "reward_std": 14.273748397827148,
      "rewards/wrapper/mean": 5.27498158365488,
      "rewards/wrapper/std": 18.366497644782065,
      "step": 3085
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.2,
      "completions/mean_length": 251.43125,
      "completions/mean_terminated_length": 7.2,
      "completions/min_length": 160.8,
      "completions/min_terminated_length": 7.2,
      "epoch": 0.7249266862170088,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.359375,
      "kl": 0.009205997944809497,
      "learning_rate": 6.214081210057702e-06,
      "loss": -0.0055,
      "num_tokens": 33870604.0,
      "reward": 15.315609312057495,
      "reward_std": 20.723115539550783,
      "rewards/wrapper/mean": 7.657804708182812,
      "rewards/wrapper/std": 18.553164108097555,
      "step": 3090
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.726099706744868,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.40625,
      "kl": 0.012098168022930622,
      "learning_rate": 6.209110200687543e-06,
      "loss": -0.0096,
      "num_tokens": 33923694.0,
      "reward": 8.973456811904907,
      "reward_std": 10.326069641113282,
      "rewards/wrapper/mean": 4.486728381365538,
      "rewards/wrapper/std": 13.67483127862215,
      "step": 3095
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.0,
      "completions/mean_length": 250.9875,
      "completions/mean_terminated_length": 34.0,
      "completions/min_length": 128.4,
      "completions/min_terminated_length": 26.0,
      "epoch": 0.7272727272727273,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.7578125,
      "kl": 0.020142507256241515,
      "learning_rate": 6.2041348733475726e-06,
      "loss": -0.013,
      "num_tokens": 33976212.0,
      "reward": 12.24562635421753,
      "reward_std": 11.961190509796143,
      "rewards/wrapper/mean": 6.122812962532043,
      "rewards/wrapper/std": 17.585172924399377,
      "step": 3100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.7284457478005865,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 3.703125,
      "kl": 6.63561298425775,
      "learning_rate": 6.199155245073034e-06,
      "loss": 0.2629,
      "num_tokens": 34031094.0,
      "reward": 7.917291593551636,
      "reward_std": 9.857232570648193,
      "rewards/wrapper/mean": 3.9586454682052135,
      "rewards/wrapper/std": 12.87554216235876,
      "step": 3105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 32.0,
      "completions/mean_length": 255.4,
      "completions/mean_terminated_length": 32.0,
      "completions/min_length": 236.8,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.7296187683284457,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.296875,
      "kl": 0.007416438066866249,
      "learning_rate": 6.194171332913887e-06,
      "loss": -0.0023,
      "num_tokens": 34084466.0,
      "reward": 9.664254093170166,
      "reward_std": 12.404686951637268,
      "rewards/wrapper/mean": 4.832127270102501,
      "rewards/wrapper/std": 12.02934721559286,
      "step": 3110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.2,
      "completions/mean_length": 251.29375,
      "completions/mean_terminated_length": 54.2,
      "completions/min_length": 105.4,
      "completions/min_terminated_length": 54.2,
      "epoch": 0.730791788856305,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.21875,
      "kl": 0.04753340142779052,
      "learning_rate": 6.189183153934767e-06,
      "loss": -0.0032,
      "num_tokens": 34139469.0,
      "reward": 10.3969295501709,
      "reward_std": 10.79515585899353,
      "rewards/wrapper/mean": 5.19846440255642,
      "rewards/wrapper/std": 16.67407398223877,
      "step": 3115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.4,
      "completions/mean_length": 251.49375,
      "completions/mean_terminated_length": 9.4,
      "completions/min_length": 111.8,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.7319648093841642,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 8.375,
      "kl": 0.03143579112365842,
      "learning_rate": 6.1841907252149144e-06,
      "loss": -0.0128,
      "num_tokens": 34194524.0,
      "reward": 11.688684749603272,
      "reward_std": 15.52239112854004,
      "rewards/wrapper/mean": 5.844342230260372,
      "rewards/wrapper/std": 17.19078350365162,
      "step": 3120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 19.6,
      "completions/mean_length": 251.81875,
      "completions/mean_terminated_length": 19.6,
      "completions/min_length": 173.2,
      "completions/min_terminated_length": 19.6,
      "epoch": 0.7331378299120235,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.296875,
      "kl": 0.006614711100701243,
      "learning_rate": 6.1791940638481225e-06,
      "loss": -0.0056,
      "num_tokens": 34247523.0,
      "reward": 10.99114260673523,
      "reward_std": 12.279700326919556,
      "rewards/wrapper/mean": 5.495571257919073,
      "rewards/wrapper/std": 14.978180499374867,
      "step": 3125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.8,
      "completions/mean_length": 251.5375,
      "completions/mean_terminated_length": 10.8,
      "completions/min_length": 113.2,
      "completions/min_terminated_length": 10.8,
      "epoch": 0.7343108504398826,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.5,
      "kl": 0.011815444775857031,
      "learning_rate": 6.174193186942678e-06,
      "loss": -0.0111,
      "num_tokens": 34305905.0,
      "reward": 10.515061807632446,
      "reward_std": 14.016519451141358,
      "rewards/wrapper/mean": 5.257530699670315,
      "rewards/wrapper/std": 16.910930271446706,
      "step": 3130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 56.8,
      "completions/mean_length": 249.91875,
      "completions/mean_terminated_length": 54.5,
      "completions/min_length": 103.4,
      "completions/min_terminated_length": 52.2,
      "epoch": 0.7354838709677419,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.53125,
      "kl": 0.020315185002982617,
      "learning_rate": 6.169188111621298e-06,
      "loss": -0.0097,
      "num_tokens": 34360016.0,
      "reward": 16.148542070388793,
      "reward_std": 16.03440327644348,
      "rewards/wrapper/mean": 8.074271266907454,
      "rewards/wrapper/std": 18.308212214708327,
      "step": 3135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 49.2,
      "completions/mean_length": 251.55625,
      "completions/mean_terminated_length": 48.2,
      "completions/min_length": 149.6,
      "completions/min_terminated_length": 47.2,
      "epoch": 0.7366568914956012,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.0625,
      "kl": 0.08753183353692293,
      "learning_rate": 6.164178855021075e-06,
      "loss": -0.0093,
      "num_tokens": 34412503.0,
      "reward": 8.106660556793212,
      "reward_std": 10.737511825561523,
      "rewards/wrapper/mean": 4.053329988569021,
      "rewards/wrapper/std": 13.285885770618915,
      "step": 3140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.0,
      "completions/mean_length": 252.925,
      "completions/mean_terminated_length": 4.0,
      "completions/min_length": 157.6,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.7378299120234604,
      "frac_reward_zero_std": 0.05,
      "grad_norm": 1.875,
      "kl": 0.008428220392670483,
      "learning_rate": 6.159165434293425e-06,
      "loss": -0.0124,
      "num_tokens": 34467477.0,
      "reward": 12.283936882019043,
      "reward_std": 15.909385299682617,
      "rewards/wrapper/mean": 6.141968539357185,
      "rewards/wrapper/std": 19.165711463987826,
      "step": 3145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7390029325513197,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.25,
      "kl": 0.008187936479225754,
      "learning_rate": 6.154147866604011e-06,
      "loss": -0.0047,
      "num_tokens": 34521474.0,
      "reward": 10.439709758758545,
      "reward_std": 13.946194648742676,
      "rewards/wrapper/mean": 5.219854548573494,
      "rewards/wrapper/std": 16.24859150648117,
      "step": 3150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 73.2,
      "completions/mean_length": 251.9,
      "completions/mean_terminated_length": 36.8,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.7401759530791789,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.484375,
      "kl": 0.0075267312116920945,
      "learning_rate": 6.149126169132701e-06,
      "loss": 0.0008,
      "num_tokens": 34574796.0,
      "reward": 9.501663303375244,
      "reward_std": 11.618663597106934,
      "rewards/wrapper/mean": 4.750831536203623,
      "rewards/wrapper/std": 14.094446489214898,
      "step": 3155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7413489736070381,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.453125,
      "kl": 0.009569109929725528,
      "learning_rate": 6.144100359073504e-06,
      "loss": 0.0004,
      "num_tokens": 34630316.0,
      "reward": 10.490760707855225,
      "reward_std": 12.09967861175537,
      "rewards/wrapper/mean": 5.245380634069443,
      "rewards/wrapper/std": 14.713596984744072,
      "step": 3160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 41.2,
      "completions/mean_length": 249.3,
      "completions/mean_terminated_length": 26.1,
      "completions/min_length": 113.4,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.7425219941348974,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.546875,
      "kl": 0.009689736948348581,
      "learning_rate": 6.139070453634509e-06,
      "loss": -0.0111,
      "num_tokens": 34685220.0,
      "reward": 13.131814670562743,
      "reward_std": 15.438633823394776,
      "rewards/wrapper/mean": 6.5659067839384075,
      "rewards/wrapper/std": 19.385744975507258,
      "step": 3165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 107.0,
      "completions/mean_length": 250.38125,
      "completions/mean_terminated_length": 70.73333358764648,
      "completions/min_length": 149.2,
      "completions/min_terminated_length": 46.8,
      "epoch": 0.7436950146627566,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.375,
      "kl": 0.33361702859401704,
      "learning_rate": 6.1340364700378255e-06,
      "loss": 0.0094,
      "num_tokens": 34739987.0,
      "reward": 14.167833518981933,
      "reward_std": 19.107415008544923,
      "rewards/wrapper/mean": 7.083916249871254,
      "rewards/wrapper/std": 18.601954208314417,
      "step": 3170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 88.2,
      "completions/mean_length": 253.9625,
      "completions/mean_terminated_length": 63.3,
      "completions/min_length": 192.0,
      "completions/min_terminated_length": 38.4,
      "epoch": 0.7448680351906158,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 6.78125,
      "kl": 0.008973815105855465,
      "learning_rate": 6.128998425519528e-06,
      "loss": -0.0039,
      "num_tokens": 34797083.0,
      "reward": 9.621114826202392,
      "reward_std": 10.426493692398072,
      "rewards/wrapper/mean": 4.8105573311448095,
      "rewards/wrapper/std": 14.826824332773686,
      "step": 3175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 14.4,
      "completions/mean_length": 251.65,
      "completions/mean_terminated_length": 14.4,
      "completions/min_length": 116.8,
      "completions/min_terminated_length": 14.4,
      "epoch": 0.7460410557184751,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.75,
      "kl": 0.009251125366427004,
      "learning_rate": 6.123956337329597e-06,
      "loss": -0.0111,
      "num_tokens": 34848269.0,
      "reward": 12.609267926216125,
      "reward_std": 16.96615676879883,
      "rewards/wrapper/mean": 6.304634357988834,
      "rewards/wrapper/std": 16.025322619080544,
      "step": 3180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 60.6,
      "completions/mean_length": 251.54375,
      "completions/mean_terminated_length": 53.3,
      "completions/min_length": 148.4,
      "completions/min_terminated_length": 46.0,
      "epoch": 0.7472140762463343,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.359375,
      "kl": 0.010146375722251832,
      "learning_rate": 6.118910222731853e-06,
      "loss": -0.0057,
      "num_tokens": 34903840.0,
      "reward": 18.351296615600585,
      "reward_std": 22.918200492858887,
      "rewards/wrapper/mean": 9.175648310780526,
      "rewards/wrapper/std": 21.03637299388647,
      "step": 3185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.2,
      "completions/mean_length": 251.49375,
      "completions/mean_terminated_length": 4.8,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.7483870967741936,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.8515625,
      "kl": 0.1860305307782255,
      "learning_rate": 6.113860099003909e-06,
      "loss": -0.0054,
      "num_tokens": 34960899.0,
      "reward": 8.806917667388916,
      "reward_std": 10.08819055557251,
      "rewards/wrapper/mean": 4.403458857536316,
      "rewards/wrapper/std": 14.91646645218134,
      "step": 3190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 4.2,
      "completions/mean_length": 251.3375,
      "completions/mean_terminated_length": 4.2,
      "completions/min_length": 157.8,
      "completions/min_terminated_length": 4.2,
      "epoch": 0.7495601173020527,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.1171875,
      "kl": 0.00893011859152466,
      "learning_rate": 6.108805983437102e-06,
      "loss": -0.0037,
      "num_tokens": 35014629.0,
      "reward": 13.187407493591309,
      "reward_std": 18.10890293121338,
      "rewards/wrapper/mean": 6.593703691661358,
      "rewards/wrapper/std": 21.82013604640961,
      "step": 3195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 34.8,
      "completions/mean_length": 253.8875,
      "completions/mean_terminated_length": 34.8,
      "completions/min_length": 188.4,
      "completions/min_terminated_length": 34.8,
      "epoch": 0.750733137829912,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.859375,
      "kl": 0.010938762326259167,
      "learning_rate": 6.103747893336437e-06,
      "loss": -0.0027,
      "num_tokens": 35069609.0,
      "reward": 9.006208515167236,
      "reward_std": 10.240087795257569,
      "rewards/wrapper/mean": 4.503104318678379,
      "rewards/wrapper/std": 12.582425367832183,
      "step": 3200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 52.2,
      "completions/mean_length": 254.43125,
      "completions/mean_terminated_length": 52.2,
      "completions/min_length": 205.8,
      "completions/min_terminated_length": 52.2,
      "epoch": 0.7519061583577713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.734375,
      "kl": 0.010828335920814424,
      "learning_rate": 6.098685846020526e-06,
      "loss": -0.0045,
      "num_tokens": 35125844.0,
      "reward": 12.905813598632813,
      "reward_std": 17.641766929626463,
      "rewards/wrapper/mean": 6.4529064983129505,
      "rewards/wrapper/std": 19.059850125014783,
      "step": 3205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.0,
      "completions/mean_length": 252.55,
      "completions/mean_terminated_length": 36.3,
      "completions/min_length": 183.2,
      "completions/min_terminated_length": 29.6,
      "epoch": 0.7530791788856305,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.96875,
      "kl": 0.009136547101661563,
      "learning_rate": 6.093619858821535e-06,
      "loss": -0.0074,
      "num_tokens": 35180934.0,
      "reward": 9.771417665481568,
      "reward_std": 13.003374195098877,
      "rewards/wrapper/mean": 4.885708878934383,
      "rewards/wrapper/std": 16.47006680816412,
      "step": 3210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 13.6,
      "completions/mean_length": 253.225,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 167.2,
      "completions/min_terminated_length": 13.6,
      "epoch": 0.7542521994134898,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 6.6875,
      "kl": 0.014397739351261407,
      "learning_rate": 6.088549949085114e-06,
      "loss": 0.0026,
      "num_tokens": 35239758.0,
      "reward": 7.511144304275513,
      "reward_std": 9.265942859649659,
      "rewards/wrapper/mean": 3.7555719554424285,
      "rewards/wrapper/std": 12.816774183511734,
      "step": 3215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7554252199413489,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.0625,
      "kl": 0.010587751294951885,
      "learning_rate": 6.083476134170349e-06,
      "loss": 0.0004,
      "num_tokens": 35293992.0,
      "reward": 11.967586421966553,
      "reward_std": 16.125496673583985,
      "rewards/wrapper/mean": 5.983793088048697,
      "rewards/wrapper/std": 17.478325541317464,
      "step": 3220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 36.4,
      "completions/mean_length": 248.24375,
      "completions/mean_terminated_length": 31.5,
      "completions/min_length": 26.6,
      "completions/min_terminated_length": 26.6,
      "epoch": 0.7565982404692082,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.875,
      "kl": 0.01313765674130991,
      "learning_rate": 6.078398431449692e-06,
      "loss": -0.0266,
      "num_tokens": 35348929.0,
      "reward": 10.895433044433593,
      "reward_std": 13.076961612701416,
      "rewards/wrapper/mean": 5.44771647900343,
      "rewards/wrapper/std": 17.260293766856194,
      "step": 3225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 54.6,
      "completions/mean_length": 252.90625,
      "completions/mean_terminated_length": 54.6,
      "completions/min_length": 157.0,
      "completions/min_terminated_length": 54.6,
      "epoch": 0.7577712609970675,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.015625,
      "kl": 0.008978944190312177,
      "learning_rate": 6.073316858308911e-06,
      "loss": -0.0062,
      "num_tokens": 35403446.0,
      "reward": 5.560769200325012,
      "reward_std": 7.3834045171737674,
      "rewards/wrapper/mean": 2.7803845427930356,
      "rewards/wrapper/std": 9.4280636459589,
      "step": 3230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 7.0,
      "completions/mean_length": 251.41875,
      "completions/mean_terminated_length": 7.0,
      "completions/min_length": 109.4,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.7589442815249267,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.078125,
      "kl": 0.01288509035948664,
      "learning_rate": 6.068231432147023e-06,
      "loss": -0.0164,
      "num_tokens": 35458751.0,
      "reward": 10.370946216583253,
      "reward_std": 13.006755352020264,
      "rewards/wrapper/mean": 5.185473144054413,
      "rewards/wrapper/std": 16.655578370392323,
      "step": 3235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.2,
      "completions/mean_length": 252.14375,
      "completions/mean_terminated_length": 30.2,
      "completions/min_length": 132.6,
      "completions/min_terminated_length": 30.2,
      "epoch": 0.7601173020527859,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.21875,
      "kl": 1.4038491782092024,
      "learning_rate": 6.063142170376238e-06,
      "loss": 0.0486,
      "num_tokens": 35516916.0,
      "reward": 8.923083782196045,
      "reward_std": 10.14984426498413,
      "rewards/wrapper/mean": 4.461541792750358,
      "rewards/wrapper/std": 11.562783433496952,
      "step": 3240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 12.6,
      "completions/mean_length": 253.19375,
      "completions/mean_terminated_length": 12.6,
      "completions/min_length": 166.2,
      "completions/min_terminated_length": 12.6,
      "epoch": 0.7612903225806451,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.59375,
      "kl": 0.007983066863380373,
      "learning_rate": 6.058049090421904e-06,
      "loss": -0.0074,
      "num_tokens": 35568979.0,
      "reward": 15.244392204284669,
      "reward_std": 18.322880840301515,
      "rewards/wrapper/mean": 7.622196093201637,
      "rewards/wrapper/std": 19.97906378209591,
      "step": 3245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.2,
      "completions/mean_length": 252.025,
      "completions/mean_terminated_length": 19.7,
      "completions/min_length": 166.8,
      "completions/min_terminated_length": 13.2,
      "epoch": 0.7624633431085044,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 9.625,
      "kl": 0.008099046605639159,
      "learning_rate": 6.052952209722434e-06,
      "loss": -0.012,
      "num_tokens": 35622339.0,
      "reward": 14.022188472747803,
      "reward_std": 12.63869276046753,
      "rewards/wrapper/mean": 7.011093850433826,
      "rewards/wrapper/std": 20.01946667730808,
      "step": 3250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 97.8,
      "completions/mean_length": 251.06875,
      "completions/mean_terminated_length": 62.13333435058594,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 19.6,
      "epoch": 0.7636363636363637,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.3359375,
      "kl": 0.3200004163081758,
      "learning_rate": 6.047851545729257e-06,
      "loss": -0.0037,
      "num_tokens": 35675784.0,
      "reward": 11.79908652305603,
      "reward_std": 15.175419282913207,
      "rewards/wrapper/mean": 5.899543111026287,
      "rewards/wrapper/std": 17.951800467073916,
      "step": 3255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.7648093841642228,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.859375,
      "kl": 0.015531188854947687,
      "learning_rate": 6.042747115906762e-06,
      "loss": -0.0132,
      "num_tokens": 35730971.0,
      "reward": 16.25852451324463,
      "reward_std": 20.682630634307863,
      "rewards/wrapper/mean": 8.12926201224327,
      "rewards/wrapper/std": 20.433372582495213,
      "step": 3260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 55.0,
      "completions/mean_length": 249.73125,
      "completions/mean_terminated_length": 39.4,
      "completions/min_length": 134.0,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.7659824046920821,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.0390625,
      "kl": 0.009887203492689877,
      "learning_rate": 6.037638937732224e-06,
      "loss": -0.0043,
      "num_tokens": 35783812.0,
      "reward": 10.08267617225647,
      "reward_std": 13.750806903839111,
      "rewards/wrapper/mean": 5.041338118910789,
      "rewards/wrapper/std": 13.372113381326198,
      "step": 3265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7671554252199414,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.796875,
      "kl": 0.01692812864203006,
      "learning_rate": 6.0325270286957576e-06,
      "loss": 0.0007,
      "num_tokens": 35836060.0,
      "reward": 6.83661425113678,
      "reward_std": 9.127959537506104,
      "rewards/wrapper/mean": 3.4183069966733455,
      "rewards/wrapper/std": 13.14266570582986,
      "step": 3270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.6,
      "completions/mean_length": 252.5625,
      "completions/mean_terminated_length": 43.6,
      "completions/min_length": 146.0,
      "completions/min_terminated_length": 43.6,
      "epoch": 0.7683284457478006,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.03125,
      "kl": 0.21960210915422068,
      "learning_rate": 6.027411406300248e-06,
      "loss": 0.0001,
      "num_tokens": 35891990.0,
      "reward": 12.126427268981933,
      "reward_std": 16.43520584106445,
      "rewards/wrapper/mean": 6.063213557004929,
      "rewards/wrapper/std": 18.251873682439328,
      "step": 3275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 10.8,
      "completions/mean_length": 251.54375,
      "completions/mean_terminated_length": 9.1,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 7.4,
      "epoch": 0.7695014662756599,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.640625,
      "kl": 0.013229875633260234,
      "learning_rate": 6.022292088061295e-06,
      "loss": -0.0091,
      "num_tokens": 35944685.0,
      "reward": 10.55277452468872,
      "reward_std": 12.219483661651612,
      "rewards/wrapper/mean": 5.276387079060077,
      "rewards/wrapper/std": 15.1108578145504,
      "step": 3280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.0,
      "completions/mean_length": 251.075,
      "completions/mean_terminated_length": 46.9,
      "completions/min_length": 149.2,
      "completions/min_terminated_length": 46.8,
      "epoch": 0.770674486803519,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.6328125,
      "kl": 0.012836833647452296,
      "learning_rate": 6.0171690915071554e-06,
      "loss": -0.0138,
      "num_tokens": 36000061.0,
      "reward": 13.690535640716552,
      "reward_std": 13.481866836547852,
      "rewards/wrapper/mean": 6.84526747316122,
      "rewards/wrapper/std": 17.245069521665574,
      "step": 3285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 43.6,
      "completions/mean_length": 254.16875,
      "completions/mean_terminated_length": 21.9,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7718475073313783,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 3.015625,
      "kl": 0.009838975500315428,
      "learning_rate": 6.01204243417868e-06,
      "loss": -0.0024,
      "num_tokens": 36055320.0,
      "reward": 11.147965097427369,
      "reward_std": 14.937340641021729,
      "rewards/wrapper/mean": 5.573982398211956,
      "rewards/wrapper/std": 14.91929092258215,
      "step": 3290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 26.8,
      "completions/mean_length": 252.04375,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.7730205278592376,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.640625,
      "kl": 0.008335669967345893,
      "learning_rate": 6.0069121336292505e-06,
      "loss": 0.0033,
      "num_tokens": 36109309.0,
      "reward": 14.027770709991454,
      "reward_std": 14.90116481781006,
      "rewards/wrapper/mean": 7.0138855308294294,
      "rewards/wrapper/std": 21.48350759744644,
      "step": 3295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 16.0,
      "completions/mean_length": 250.125,
      "completions/mean_terminated_length": 8.4,
      "completions/min_length": 154.4,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.7741935483870968,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.78125,
      "kl": 0.007277981494553387,
      "learning_rate": 6.001778207424726e-06,
      "loss": -0.0179,
      "num_tokens": 36161223.0,
      "reward": 7.435875916481018,
      "reward_std": 9.074062395095826,
      "rewards/wrapper/mean": 3.717937920242548,
      "rewards/wrapper/std": 13.181463140249253,
      "step": 3300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 71.2,
      "completions/mean_length": 248.64375,
      "completions/mean_terminated_length": 51.4,
      "completions/min_length": 82.8,
      "completions/min_terminated_length": 31.6,
      "epoch": 0.775366568914956,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.96875,
      "kl": 0.01733542055590078,
      "learning_rate": 5.996640673143379e-06,
      "loss": -0.0158,
      "num_tokens": 36214038.0,
      "reward": 11.63428020477295,
      "reward_std": 13.726638221740723,
      "rewards/wrapper/mean": 5.81714008525014,
      "rewards/wrapper/std": 17.69838539212942,
      "step": 3305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7765395894428152,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 12.9375,
      "kl": 0.12717605533543974,
      "learning_rate": 5.991499548375836e-06,
      "loss": -0.0012,
      "num_tokens": 36271697.0,
      "reward": 10.735355949401855,
      "reward_std": 14.319485092163086,
      "rewards/wrapper/mean": 5.367678099870682,
      "rewards/wrapper/std": 15.712982186675072,
      "step": 3310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 69.6,
      "completions/mean_length": 250.18125,
      "completions/mean_terminated_length": 53.0,
      "completions/min_length": 87.6,
      "completions/min_terminated_length": 36.4,
      "epoch": 0.7777126099706745,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 2.921875,
      "kl": 0.1200019514246378,
      "learning_rate": 5.986354850725015e-06,
      "loss": 0.0005,
      "num_tokens": 36326110.0,
      "reward": 15.179692125320434,
      "reward_std": 20.558689689636232,
      "rewards/wrapper/mean": 7.589845579862595,
      "rewards/wrapper/std": 19.174075277149676,
      "step": 3315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 75.6,
      "completions/mean_length": 251.975,
      "completions/mean_terminated_length": 45.6,
      "completions/min_length": 184.2,
      "completions/min_terminated_length": 30.6,
      "epoch": 0.7788856304985338,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.625,
      "kl": 0.012654828454833478,
      "learning_rate": 5.98120659780607e-06,
      "loss": -0.0074,
      "num_tokens": 36381048.0,
      "reward": 20.327030181884766,
      "reward_std": 27.906171798706055,
      "rewards/wrapper/mean": 10.16351497322321,
      "rewards/wrapper/std": 27.119664253294467,
      "step": 3320
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.4,
      "completions/mean_length": 249.9,
      "completions/mean_terminated_length": 6.0,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 2.6,
      "epoch": 0.7800586510263929,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.984375,
      "kl": 0.007662113837432117,
      "learning_rate": 5.976054807246328e-06,
      "loss": -0.0185,
      "num_tokens": 36433154.0,
      "reward": 9.926510620117188,
      "reward_std": 10.952439212799073,
      "rewards/wrapper/mean": 4.963255329430103,
      "rewards/wrapper/std": 13.317852970957755,
      "step": 3325
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 29.8,
      "completions/mean_length": 253.73125,
      "completions/mean_terminated_length": 29.8,
      "completions/min_length": 183.4,
      "completions/min_terminated_length": 29.8,
      "epoch": 0.7812316715542522,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.8125,
      "kl": 0.007791096775326878,
      "learning_rate": 5.970899496685225e-06,
      "loss": -0.0042,
      "num_tokens": 36487157.0,
      "reward": 18.77137498855591,
      "reward_std": 24.687060356140137,
      "rewards/wrapper/mean": 9.385687156021595,
      "rewards/wrapper/std": 25.24630133062601,
      "step": 3330
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7824046920821114,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.3125,
      "kl": 0.10288106517400593,
      "learning_rate": 5.965740683774254e-06,
      "loss": -0.0034,
      "num_tokens": 36544046.0,
      "reward": 7.03903284072876,
      "reward_std": 9.311002731323242,
      "rewards/wrapper/mean": 3.519516411423683,
      "rewards/wrapper/std": 11.18458695858717,
      "step": 3335
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7835777126099707,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.015625,
      "kl": 0.036584659735672174,
      "learning_rate": 5.960578386176898e-06,
      "loss": 0.0015,
      "num_tokens": 36599552.0,
      "reward": 14.876870918273926,
      "reward_std": 19.170709991455077,
      "rewards/wrapper/mean": 7.438435123860836,
      "rewards/wrapper/std": 18.200231629610062,
      "step": 3340
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 22.2,
      "completions/mean_length": 253.49375,
      "completions/mean_terminated_length": 22.2,
      "completions/min_length": 175.8,
      "completions/min_terminated_length": 22.2,
      "epoch": 0.78475073313783,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.859375,
      "kl": 0.025927690404932945,
      "learning_rate": 5.955412621568571e-06,
      "loss": -0.0057,
      "num_tokens": 36653269.0,
      "reward": 8.59150498509407,
      "reward_std": 11.231444156169891,
      "rewards/wrapper/mean": 4.2957524582743645,
      "rewards/wrapper/std": 13.615456952154636,
      "step": 3345
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7859237536656891,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 4.125,
      "kl": 0.007560189696960151,
      "learning_rate": 5.950243407636558e-06,
      "loss": 0.0003,
      "num_tokens": 36706603.0,
      "reward": 8.770905303955079,
      "reward_std": 11.600796127319336,
      "rewards/wrapper/mean": 4.385452452301979,
      "rewards/wrapper/std": 13.816292996704579,
      "step": 3350
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 44.0,
      "completions/mean_length": 251.275,
      "completions/mean_terminated_length": 20.960000610351564,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.7870967741935484,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 4.34375,
      "kl": 0.00841328235110268,
      "learning_rate": 5.945070762079953e-06,
      "loss": -0.0035,
      "num_tokens": 36760247.0,
      "reward": 10.318192052841187,
      "reward_std": 13.998966562747956,
      "rewards/wrapper/mean": 5.159096036851406,
      "rewards/wrapper/std": 14.412682954967021,
      "step": 3355
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.6,
      "completions/mean_length": 254.7,
      "completions/mean_terminated_length": 9.6,
      "completions/min_length": 214.4,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.7882697947214077,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.046875,
      "kl": 0.01900848246878013,
      "learning_rate": 5.939894702609604e-06,
      "loss": -0.0022,
      "num_tokens": 36817131.0,
      "reward": 8.248554515838624,
      "reward_std": 11.029405975341797,
      "rewards/wrapper/mean": 4.124277160316706,
      "rewards/wrapper/std": 11.294475804269315,
      "step": 3360
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 70.8,
      "completions/mean_length": 251.8125,
      "completions/mean_terminated_length": 70.8,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 70.8,
      "epoch": 0.7894428152492668,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.3671875,
      "kl": 0.00594492913223803,
      "learning_rate": 5.934715246948042e-06,
      "loss": -0.0115,
      "num_tokens": 36870919.0,
      "reward": 14.39435272216797,
      "reward_std": 17.458001327514648,
      "rewards/wrapper/mean": 7.197176413238049,
      "rewards/wrapper/std": 20.577864629030227,
      "step": 3365
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 20.8,
      "completions/mean_length": 250.25,
      "completions/mean_terminated_length": 20.8,
      "completions/min_length": 72.0,
      "completions/min_terminated_length": 20.8,
      "epoch": 0.7906158357771261,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 5.90625,
      "kl": 0.009505171538330614,
      "learning_rate": 5.929532412829432e-06,
      "loss": -0.0091,
      "num_tokens": 36928641.0,
      "reward": 8.956029534339905,
      "reward_std": 9.786428260803223,
      "rewards/wrapper/mean": 4.4780147187411785,
      "rewards/wrapper/std": 14.131121622025967,
      "step": 3370
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 23.6,
      "completions/mean_length": 251.9375,
      "completions/mean_terminated_length": 23.6,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 23.6,
      "epoch": 0.7917888563049853,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 3.609375,
      "kl": 0.007368017232511193,
      "learning_rate": 5.924346217999501e-06,
      "loss": -0.0148,
      "num_tokens": 36983527.0,
      "reward": 13.003273391723633,
      "reward_std": 17.144739818572997,
      "rewards/wrapper/mean": 6.50163644105196,
      "rewards/wrapper/std": 18.4751975864172,
      "step": 3375
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 15.6,
      "completions/mean_length": 246.90625,
      "completions/mean_terminated_length": 10.95,
      "completions/min_length": 111.8,
      "completions/min_terminated_length": 9.4,
      "epoch": 0.7929618768328446,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.46875,
      "kl": 0.0075474835990462456,
      "learning_rate": 5.919156680215489e-06,
      "loss": -0.0109,
      "num_tokens": 37036544.0,
      "reward": 8.331317377090453,
      "reward_std": 9.312832927703857,
      "rewards/wrapper/mean": 4.165658417344093,
      "rewards/wrapper/std": 11.342847776412963,
      "step": 3380
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.4,
      "completions/mean_length": 252.84375,
      "completions/mean_terminated_length": 1.4,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 1.4,
      "epoch": 0.7941348973607039,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.296875,
      "kl": 0.01361710149794817,
      "learning_rate": 5.913963817246078e-06,
      "loss": -0.0069,
      "num_tokens": 37094437.0,
      "reward": 7.812090110778809,
      "reward_std": 10.394651508331298,
      "rewards/wrapper/mean": 3.906045150756836,
      "rewards/wrapper/std": 12.286448706686496,
      "step": 3385
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 28.2,
      "completions/mean_length": 253.68125,
      "completions/mean_terminated_length": 28.2,
      "completions/min_length": 181.8,
      "completions/min_terminated_length": 28.2,
      "epoch": 0.795307917888563,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.109375,
      "kl": 0.08241370252799243,
      "learning_rate": 5.908767646871337e-06,
      "loss": -0.0068,
      "num_tokens": 37148160.0,
      "reward": 8.166846323013306,
      "reward_std": 10.215306043624878,
      "rewards/wrapper/mean": 4.083422873914242,
      "rewards/wrapper/std": 12.841757401823997,
      "step": 3390
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 24.0,
      "completions/mean_length": 250.35,
      "completions/mean_terminated_length": 24.0,
      "completions/min_length": 75.2,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.7964809384164223,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.4140625,
      "kl": 0.00970684234634973,
      "learning_rate": 5.903568186882657e-06,
      "loss": 0.0066,
      "num_tokens": 37203162.0,
      "reward": 5.611361646652222,
      "reward_std": 6.9950720310211185,
      "rewards/wrapper/mean": 2.805680803209543,
      "rewards/wrapper/std": 8.580831627547742,
      "step": 3395
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.7976539589442815,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.703125,
      "kl": 0.009019218035973608,
      "learning_rate": 5.898365455082694e-06,
      "loss": -0.0022,
      "num_tokens": 37259433.0,
      "reward": 15.188376426696777,
      "reward_std": 19.650272560119628,
      "rewards/wrapper/mean": 7.594188006222248,
      "rewards/wrapper/std": 20.71901829689741,
      "step": 3400
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 9.6,
      "completions/mean_length": 254.7,
      "completions/mean_terminated_length": 9.6,
      "completions/min_length": 214.4,
      "completions/min_terminated_length": 9.6,
      "epoch": 0.7988269794721408,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 5.0625,
      "kl": 0.4169622597051784,
      "learning_rate": 5.8931594692853095e-06,
      "loss": 0.0127,
      "num_tokens": 37313919.0,
      "reward": 8.489449882507325,
      "reward_std": 9.544210720062257,
      "rewards/wrapper/mean": 4.244724971055985,
      "rewards/wrapper/std": 11.45752448141575,
      "step": 3405
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 21.2,
      "completions/mean_length": 251.8625,
      "completions/mean_terminated_length": 21.2,
      "completions/min_length": 123.6,
      "completions/min_terminated_length": 21.2,
      "epoch": 0.8,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 3.390625,
      "kl": 0.009147522901184858,
      "learning_rate": 5.887950247315501e-06,
      "loss": -0.0074,
      "num_tokens": 37368831.0,
      "reward": 9.47968680858612,
      "reward_std": 11.7788712143898,
      "rewards/wrapper/mean": 4.739843459427357,
      "rewards/wrapper/std": 12.508141206204892,
      "step": 3410
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 82.6,
      "completions/mean_length": 253.78125,
      "completions/mean_terminated_length": 82.6,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 82.6,
      "epoch": 0.8011730205278592,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.734375,
      "kl": 0.008823846664745361,
      "learning_rate": 5.88273780700935e-06,
      "loss": -0.0047,
      "num_tokens": 37423584.0,
      "reward": 12.176618194580078,
      "reward_std": 15.400644683837891,
      "rewards/wrapper/mean": 6.088309331983328,
      "rewards/wrapper/std": 16.16872684061527,
      "step": 3415
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 256.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8023460410557185,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 8.1875,
      "kl": 0.012897691410034894,
      "learning_rate": 5.8775221662139565e-06,
      "loss": 0.0005,
      "num_tokens": 37481854.0,
      "reward": 13.41852263212204,
      "reward_std": 14.628087675571441,
      "rewards/wrapper/mean": 6.709261292219162,
      "rewards/wrapper/std": 21.448935608565808,
      "step": 3420
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.99375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.2,
      "completions/mean_length": 254.40625,
      "completions/mean_terminated_length": 0.2,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 0.2,
      "epoch": 0.8035190615835777,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 2.921875,
      "kl": 0.008465129404794425,
      "learning_rate": 5.87230334278738e-06,
      "loss": -0.0059,
      "num_tokens": 37537499.0,
      "reward": 12.369258069992066,
      "reward_std": 10.207852268218994,
      "rewards/wrapper/mean": 6.184629016369581,
      "rewards/wrapper/std": 15.72431526631117,
      "step": 3425
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 45.6,
      "completions/mean_length": 252.625,
      "completions/mean_terminated_length": 45.6,
      "completions/min_length": 148.0,
      "completions/min_terminated_length": 45.6,
      "epoch": 0.804692082111437,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.796875,
      "kl": 0.010761903249658644,
      "learning_rate": 5.867081354598574e-06,
      "loss": -0.0016,
      "num_tokens": 37593419.0,
      "reward": 8.310170364379882,
      "reward_std": 11.01772403717041,
      "rewards/wrapper/mean": 4.155085320025682,
      "rewards/wrapper/std": 13.078730754554272,
      "step": 3430
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 47.2,
      "completions/mean_length": 251.4875,
      "completions/mean_terminated_length": 47.0,
      "completions/min_length": 149.2,
      "completions/min_terminated_length": 46.8,
      "epoch": 0.8058651026392962,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.390625,
      "kl": 0.00788930271519348,
      "learning_rate": 5.861856219527331e-06,
      "loss": -0.0151,
      "num_tokens": 37646569.0,
      "reward": 7.208945274353027,
      "reward_std": 9.432900524139404,
      "rewards/wrapper/mean": 3.6044726580381394,
      "rewards/wrapper/std": 11.457335326075555,
      "step": 3435
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 33.4,
      "completions/mean_length": 250.9875,
      "completions/mean_terminated_length": 22.4,
      "completions/min_length": 113.8,
      "completions/min_terminated_length": 11.4,
      "epoch": 0.8070381231671554,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 6651904.0,
      "kl": 507.7494637601543,
      "learning_rate": 5.856627955464216e-06,
      "loss": 20.3053,
      "num_tokens": 37702253.0,
      "reward": 11.832038593292236,
      "reward_std": 15.661580467224121,
      "rewards/wrapper/mean": 5.916019389778375,
      "rewards/wrapper/std": 17.451789928972723,
      "step": 3440
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 42.2,
      "completions/mean_length": 253.0875,
      "completions/mean_terminated_length": 39.4,
      "completions/min_length": 190.2,
      "completions/min_terminated_length": 36.6,
      "epoch": 0.8082111436950147,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 4.1875,
      "kl": 0.014341074018739164,
      "learning_rate": 5.851396580310511e-06,
      "loss": -0.0004,
      "num_tokens": 37756311.0,
      "reward": 9.03423089981079,
      "reward_std": 12.168891334533692,
      "rewards/wrapper/mean": 4.517115272581577,
      "rewards/wrapper/std": 13.540437084436416,
      "step": 3445
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 62.4,
      "completions/mean_length": 253.15,
      "completions/mean_terminated_length": 62.4,
      "completions/min_length": 164.8,
      "completions/min_terminated_length": 62.4,
      "epoch": 0.8093841642228738,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.5,
      "kl": 0.010428384994156659,
      "learning_rate": 5.846162111978145e-06,
      "loss": -0.0056,
      "num_tokens": 37814377.0,
      "reward": 5.7462818145751955,
      "reward_std": 6.842691504955292,
      "rewards/wrapper/mean": 2.8731407657265664,
      "rewards/wrapper/std": 9.179096294939518,
      "step": 3450
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 30.6,
      "completions/mean_length": 252.1625,
      "completions/mean_terminated_length": 15.5,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.8105571847507331,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.9296875,
      "kl": 0.010554290283471346,
      "learning_rate": 5.8409245683896415e-06,
      "loss": -0.0107,
      "num_tokens": 37871469.0,
      "reward": 8.885381197929382,
      "reward_std": 11.384269547462463,
      "rewards/wrapper/mean": 4.4426903769373896,
      "rewards/wrapper/std": 12.253298838436603,
      "step": 3455
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 48.8,
      "completions/mean_length": 252.725,
      "completions/mean_terminated_length": 48.8,
      "completions/min_length": 151.2,
      "completions/min_terminated_length": 48.8,
      "epoch": 0.8117302052785924,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 1.234375,
      "kl": 0.009817326813936234,
      "learning_rate": 5.835683967478055e-06,
      "loss": -0.0082,
      "num_tokens": 37926255.0,
      "reward": 10.214480829238891,
      "reward_std": 11.895071125030517,
      "rewards/wrapper/mean": 5.107240244746208,
      "rewards/wrapper/std": 17.879739120602608,
      "step": 3460
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 67.0,
      "completions/mean_length": 253.29375,
      "completions/mean_terminated_length": 67.0,
      "completions/min_length": 169.4,
      "completions/min_terminated_length": 67.0,
      "epoch": 0.8129032258064516,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 2.734375,
      "kl": 0.411629791429732,
      "learning_rate": 5.830440327186903e-06,
      "loss": 0.0107,
      "num_tokens": 37983936.0,
      "reward": 8.380476808547973,
      "reward_std": 11.220604085922242,
      "rewards/wrapper/mean": 4.190238507837057,
      "rewards/wrapper/std": 13.17393459379673,
      "step": 3465
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 53.4,
      "completions/mean_length": 249.68125,
      "completions/mean_terminated_length": 51.93333339691162,
      "completions/min_length": 153.6,
      "completions/min_terminated_length": 51.2,
      "epoch": 0.8140762463343109,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 1.140625,
      "kl": 0.01218703945633024,
      "learning_rate": 5.825193665470114e-06,
      "loss": -0.0192,
      "num_tokens": 38035703.0,
      "reward": 9.604938888549805,
      "reward_std": 12.615773582458496,
      "rewards/wrapper/mean": 4.802469128370285,
      "rewards/wrapper/std": 14.279499669373035,
      "step": 3470
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.98125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.6,
      "completions/mean_length": 251.21875,
      "completions/mean_terminated_length": 0.6,
      "completions/min_length": 103.0,
      "completions/min_terminated_length": 0.6,
      "epoch": 0.8152492668621701,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 1.3203125,
      "kl": 0.00477030526380986,
      "learning_rate": 5.819944000291961e-06,
      "loss": -0.0143,
      "num_tokens": 38089328.0,
      "reward": 10.100382041931152,
      "reward_std": 11.479272651672364,
      "rewards/wrapper/mean": 5.050190765410662,
      "rewards/wrapper/std": 14.452794567495584,
      "step": 3475
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 1.2,
      "completions/mean_length": 248.04375,
      "completions/mean_terminated_length": 1.1,
      "completions/min_length": 52.2,
      "completions/min_terminated_length": 1.0,
      "epoch": 0.8164222873900293,
      "frac_reward_zero_std": 0.0375,
      "grad_norm": 12.6875,
      "kl": 0.010600641707424075,
      "learning_rate": 5.814691349626997e-06,
      "loss": -0.024,
      "num_tokens": 38142733.0,
      "reward": 9.5110595703125,
      "reward_std": 11.635570430755616,
      "rewards/wrapper/mean": 4.755529856681823,
      "rewards/wrapper/std": 13.565742841362953,
      "step": 3480
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 38.6,
      "completions/mean_length": 250.8125,
      "completions/mean_terminated_length": 38.6,
      "completions/min_length": 141.0,
      "completions/min_terminated_length": 38.6,
      "epoch": 0.8175953079178886,
      "frac_reward_zero_std": 0.025,
      "grad_norm": 3.515625,
      "kl": 0.008213794010225683,
      "learning_rate": 5.809435731460002e-06,
      "loss": -0.0053,
      "num_tokens": 38197519.0,
      "reward": 8.368210363388062,
      "reward_std": 11.415078401565552,
      "rewards/wrapper/mean": 4.1841050907969475,
      "rewards/wrapper/std": 12.334974782168866,
      "step": 3485
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9875,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 0.4,
      "completions/mean_length": 252.8125,
      "completions/mean_terminated_length": 0.4,
      "completions/min_length": 154.0,
      "completions/min_terminated_length": 0.4,
      "epoch": 0.8187683284457478,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 1.2734375,
      "kl": 0.007487597275758162,
      "learning_rate": 5.804177163785915e-06,
      "loss": 0.0116,
      "num_tokens": 38251175.0,
      "reward": 6.888301229476928,
      "reward_std": 7.135924625396728,
      "rewards/wrapper/mean": 3.4441506803035735,
      "rewards/wrapper/std": 10.814507488906383,
      "step": 3490
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.975,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 3.2,
      "completions/mean_length": 249.7,
      "completions/mean_terminated_length": 3.2,
      "completions/min_length": 54.4,
      "completions/min_terminated_length": 3.2,
      "epoch": 0.819941348973607,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.9609375,
      "kl": 0.01530810545082204,
      "learning_rate": 5.798915664609773e-06,
      "loss": -0.018,
      "num_tokens": 38303053.0,
      "reward": 10.540728664398193,
      "reward_std": 12.398472499847411,
      "rewards/wrapper/mean": 5.2703643180429935,
      "rewards/wrapper/std": 13.671444369852543,
      "step": 3495
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.95,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 27.8,
      "completions/mean_length": 244.19375,
      "completions/mean_terminated_length": 13.6,
      "completions/min_length": 52.0,
      "completions/min_terminated_length": 0.8,
      "epoch": 0.8211143695014663,
      "frac_reward_zero_std": 0.0125,
      "grad_norm": 2.21875,
      "kl": 0.007007872418034822,
      "learning_rate": 5.7936512519466495e-06,
      "loss": -0.0243,
      "num_tokens": 38354696.0,
      "reward": 12.547660064697265,
      "reward_std": 15.266628074645997,
      "rewards/wrapper/mean": 6.273830074816942,
      "rewards/wrapper/std": 18.49634841531515,
      "step": 3500
    },
    {
      "epoch": 0.8211143695014663,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.7825,
      "eval_completions/max_length": 256.0,
      "eval_completions/max_terminated_length": 75.64,
      "eval_completions/mean_length": 224.9175,
      "eval_completions/mean_terminated_length": 61.223333587646486,
      "eval_completions/min_length": 155.16,
      "eval_completions/min_terminated_length": 45.08,
      "eval_frac_reward_zero_std": 0.005,
      "eval_kl": 0.013251524628140032,
      "eval_loss": -0.06226690113544464,
      "eval_num_tokens": 38354696.0,
      "eval_reward": 0.31256932340562343,
      "eval_reward_std": 0.15810628833714874,
      "eval_rewards/wrapper/mean": 0.15628466337919236,
      "eval_rewards/wrapper/std": 0.14292447288986296,
      "eval_runtime": 208.8173,
      "eval_samples_per_second": 0.958,
      "eval_steps_per_second": 0.239,
      "step": 3500
    }
  ],
  "logging_steps": 5,
  "max_steps": 8524,
  "num_input_tokens_seen": 38354696,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}