{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8211143695014663, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 1.2, "completions/mean_length": 254.4375, "completions/mean_terminated_length": 1.2, "completions/min_length": 206.0, "completions/min_terminated_length": 1.2, "epoch": 0.0011730205278592375, "frac_reward_zero_std": 0.0125, "grad_norm": 3.765625, "kl": 0.0012255147332325578, "learning_rate": 9.047771428571428e-07, "loss": -0.0012, "num_tokens": 54676.0, "reward": 7.508432197570801, "reward_std": 7.795059728622436, "rewards/wrapper/mean": 3.7542161136865615, "rewards/wrapper/std": 11.808628790080547, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 2.8, "completions/mean_length": 254.4875, "completions/mean_terminated_length": 2.8, "completions/min_length": 207.6, "completions/min_terminated_length": 2.8, "epoch": 0.002346041055718475, "frac_reward_zero_std": 0.05, "grad_norm": 2.1875, "kl": 0.002615465858252719, "learning_rate": 2.035748571428571e-06, "loss": -0.0035, "num_tokens": 113354.0, "reward": 6.667659282684326, "reward_std": 8.151447796821595, "rewards/wrapper/mean": 3.333829724043608, "rewards/wrapper/std": 12.321574296057225, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 250.7875, "completions/mean_terminated_length": 9.5, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.0035190615835777126, "frac_reward_zero_std": 0.05, "grad_norm": 4.9375, "kl": 0.005194541340461001, "learning_rate": 3.1667200000000002e-06, "loss": -0.0003, "num_tokens": 169594.0, "reward": 11.774119424819947, "reward_std": 13.959662055969238, "rewards/wrapper/mean": 5.887059649825096, "rewards/wrapper/std": 18.77371552735567, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 15.6, "completions/mean_length": 253.2875, "completions/mean_terminated_length": 15.6, "completions/min_length": 169.2, "completions/min_terminated_length": 15.6, "epoch": 0.00469208211143695, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.007731236255494878, "learning_rate": 4.297691428571428e-06, "loss": -0.0052, "num_tokens": 222882.0, "reward": 12.04462718963623, "reward_std": 15.45330753326416, "rewards/wrapper/mean": 6.022313681989909, "rewards/wrapper/std": 18.897284054756163, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 255.3375, "completions/mean_terminated_length": 30.0, "completions/min_length": 234.8, "completions/min_terminated_length": 30.0, "epoch": 0.005865102639296188, "frac_reward_zero_std": 0.0375, "grad_norm": 1.484375, "kl": 0.005328995548188687, "learning_rate": 5.428662857142858e-06, "loss": 0.0021, "num_tokens": 278444.0, "reward": 13.085074043273925, "reward_std": 16.36840648651123, "rewards/wrapper/mean": 6.542536787688732, "rewards/wrapper/std": 17.1350009560585, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 251.225, "completions/mean_terminated_length": 52.0, "completions/min_length": 103.2, "completions/min_terminated_length": 52.0, "epoch": 0.007038123167155425, "frac_reward_zero_std": 0.025, "grad_norm": 1.421875, "kl": 0.0037205405707936732, "learning_rate": 6.559634285714286e-06, "loss": -0.0098, "num_tokens": 332848.0, "reward": 9.3982759475708, "reward_std": 10.730877304077149, "rewards/wrapper/mean": 4.6991379834711555, "rewards/wrapper/std": 15.566500315070153, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 58.6, "completions/mean_length": 254.63125, "completions/mean_terminated_length": 58.6, "completions/min_length": 212.2, "completions/min_terminated_length": 58.6, "epoch": 0.008211143695014663, "frac_reward_zero_std": 0.0625, "grad_norm": 2.578125, "kl": 0.010630438197404146, "learning_rate": 7.690605714285714e-06, "loss": 0.0029, "num_tokens": 388563.0, "reward": 10.763528490066529, "reward_std": 7.830057907104492, "rewards/wrapper/mean": 5.381764186918735, "rewards/wrapper/std": 17.07995459139347, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 48.8, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 25.53333435058594, "completions/min_length": 108.4, "completions/min_terminated_length": 6.0, "epoch": 0.0093841642228739, "frac_reward_zero_std": 0.0375, "grad_norm": 1.1640625, "kl": 0.009318914514733479, "learning_rate": 7.916796747198757e-06, "loss": -0.012, "num_tokens": 443458.0, "reward": 7.394418716430664, "reward_std": 9.918637371063232, "rewards/wrapper/mean": 3.697209335118532, "rewards/wrapper/std": 11.686485758423805, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 7.8, "completions/mean_length": 253.04375, "completions/mean_terminated_length": 7.8, "completions/min_length": 161.4, "completions/min_terminated_length": 7.8, "epoch": 0.010557184750733138, "frac_reward_zero_std": 0.025, "grad_norm": 7.25, "kl": 0.009933865355560557, "learning_rate": 7.916783532705924e-06, "loss": -0.0065, "num_tokens": 500731.0, "reward": 6.865958595275879, "reward_std": 9.22301788330078, "rewards/wrapper/mean": 3.4329791098833082, "rewards/wrapper/std": 11.259305146336555, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 59.2, "completions/mean_length": 253.05, "completions/mean_terminated_length": 59.2, "completions/min_length": 161.6, "completions/min_terminated_length": 59.2, "epoch": 0.011730205278592375, "frac_reward_zero_std": 0.0125, "grad_norm": 2.625, "kl": 0.005990624788682908, "learning_rate": 7.916760153266633e-06, "loss": -0.0025, "num_tokens": 551543.0, "reward": 6.809884262084961, "reward_std": 8.566570162773132, "rewards/wrapper/mean": 3.4049422472715376, "rewards/wrapper/std": 8.249704784154892, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 93.2, "completions/mean_length": 252.5125, "completions/mean_terminated_length": 93.2, "completions/min_length": 144.4, "completions/min_terminated_length": 93.2, "epoch": 0.012903225806451613, "frac_reward_zero_std": 0.0625, "grad_norm": 2.265625, "kl": 0.007241980719845742, "learning_rate": 7.916726608960937e-06, "loss": 0.0009, "num_tokens": 606755.0, "reward": 14.835551500320435, "reward_std": 17.018505144119263, "rewards/wrapper/mean": 7.417775437235832, "rewards/wrapper/std": 18.011443032324316, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 5.6, "completions/mean_length": 254.575, "completions/mean_terminated_length": 5.6, "completions/min_length": 210.4, "completions/min_terminated_length": 5.6, "epoch": 0.01407624633431085, "frac_reward_zero_std": 0.025, "grad_norm": 1.328125, "kl": 0.02387903115595691, "learning_rate": 7.916682899903684e-06, "loss": -0.004, "num_tokens": 658779.0, "reward": 11.199076652526855, "reward_std": 12.978521537780761, "rewards/wrapper/mean": 5.5995381936430935, "rewards/wrapper/std": 18.202550745010377, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.015249266862170088, "frac_reward_zero_std": 0.0375, "grad_norm": 1.90625, "kl": 0.006103656144114211, "learning_rate": 7.916629026244537e-06, "loss": -0.0012, "num_tokens": 715315.0, "reward": 6.255694437026977, "reward_std": 8.289126348495483, "rewards/wrapper/mean": 3.1278470791876316, "rewards/wrapper/std": 10.367880092561245, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.016422287390029325, "frac_reward_zero_std": 0.025, "grad_norm": 1.9375, "kl": 0.006190836068708449, "learning_rate": 7.916564988167955e-06, "loss": -0.0098, "num_tokens": 769328.0, "reward": 9.829441356658936, "reward_std": 13.315379619598389, "rewards/wrapper/mean": 4.914720744639635, "rewards/wrapper/std": 14.687225022912026, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 20.8, "completions/mean_length": 255.05, "completions/mean_terminated_length": 20.8, "completions/min_length": 225.6, "completions/min_terminated_length": 20.8, "epoch": 0.017595307917888565, "frac_reward_zero_std": 0.0125, "grad_norm": 1.6796875, "kl": 0.010496543836779892, "learning_rate": 7.916490785893198e-06, "loss": -0.003, "num_tokens": 826122.0, "reward": 10.434588527679443, "reward_std": 12.015387630462646, "rewards/wrapper/mean": 5.217294257879257, "rewards/wrapper/std": 14.79298051893711, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 5.4, "completions/mean_length": 252.96875, "completions/mean_terminated_length": 5.4, "completions/min_length": 159.0, "completions/min_terminated_length": 5.4, "epoch": 0.0187683284457478, "frac_reward_zero_std": 0.0125, "grad_norm": 1.8984375, "kl": 0.16615661919931882, "learning_rate": 7.916406419674335e-06, "loss": 0.0112, "num_tokens": 881035.0, "reward": 9.748047590255737, "reward_std": 13.241822493076324, "rewards/wrapper/mean": 4.874023604393005, "rewards/wrapper/std": 16.71337246745825, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.01994134897360704, "frac_reward_zero_std": 0.025, "grad_norm": 2.484375, "kl": 0.005029451113659889, "learning_rate": 7.916311889800224e-06, "loss": 0.0002, "num_tokens": 935581.0, "reward": 10.057518577575683, "reward_std": 12.031916904449464, "rewards/wrapper/mean": 5.0287592664361, "rewards/wrapper/std": 14.356791715323926, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 5.2, "completions/mean_length": 252.96875, "completions/mean_terminated_length": 2.7, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.021114369501466276, "frac_reward_zero_std": 0.0375, "grad_norm": 2.171875, "kl": 0.0057046718808123845, "learning_rate": 7.916207196594537e-06, "loss": -0.0058, "num_tokens": 990930.0, "reward": 9.220219135284424, "reward_std": 11.743634796142578, "rewards/wrapper/mean": 4.610109446942806, "rewards/wrapper/std": 13.207088494300843, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 41.4, "completions/mean_length": 252.49375, "completions/mean_terminated_length": 41.4, "completions/min_length": 143.8, "completions/min_terminated_length": 41.4, "epoch": 0.022287390029325515, "frac_reward_zero_std": 0.0375, "grad_norm": 1.921875, "kl": 0.007934682531049475, "learning_rate": 7.916092340415737e-06, "loss": -0.008, "num_tokens": 1044417.0, "reward": 12.452170944213867, "reward_std": 11.926630926132201, "rewards/wrapper/mean": 6.226085089147091, "rewards/wrapper/std": 18.230350717902184, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 40.6, "completions/mean_length": 250.93125, "completions/mean_terminated_length": 21.5, "completions/min_length": 104.8, "completions/min_terminated_length": 2.4, "epoch": 0.02346041055718475, "frac_reward_zero_std": 0.025, "grad_norm": 1.71875, "kl": 0.13625025742221625, "learning_rate": 7.915967321657082e-06, "loss": -0.0075, "num_tokens": 1099482.0, "reward": 10.583320891857147, "reward_std": 13.258717286586762, "rewards/wrapper/mean": 5.291660659015179, "rewards/wrapper/std": 15.585099121928215, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 39.2, "completions/mean_length": 249.2375, "completions/mean_terminated_length": 19.9, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.02463343108504399, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.12125548404292204, "learning_rate": 7.915832140746629e-06, "loss": -0.0172, "num_tokens": 1152528.0, "reward": 10.50744104385376, "reward_std": 12.436288833618164, "rewards/wrapper/mean": 5.253720180690289, "rewards/wrapper/std": 16.398430436849594, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 24.4, "completions/mean_length": 248.775, "completions/mean_terminated_length": 12.5, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.025806451612903226, "frac_reward_zero_std": 0.0125, "grad_norm": 1.5, "kl": 0.00848652045824565, "learning_rate": 7.915686798147231e-06, "loss": -0.012, "num_tokens": 1203452.0, "reward": 8.89383053779602, "reward_std": 10.21728515625, "rewards/wrapper/mean": 4.446915102005005, "rewards/wrapper/std": 11.703204187750817, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 49.6, "completions/mean_length": 255.95, "completions/mean_terminated_length": 49.6, "completions/min_length": 254.4, "completions/min_terminated_length": 49.6, "epoch": 0.026979472140762465, "frac_reward_zero_std": 0.0125, "grad_norm": 3.875, "kl": 0.009458682424155995, "learning_rate": 7.915531294356533e-06, "loss": 0.0005, "num_tokens": 1257228.0, "reward": 12.8822021484375, "reward_std": 13.596014976501465, "rewards/wrapper/mean": 6.44110068231821, "rewards/wrapper/std": 20.19829418361187, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 252.89375, "completions/mean_terminated_length": 3.0, "completions/min_length": 156.6, "completions/min_terminated_length": 3.0, "epoch": 0.0281524926686217, "frac_reward_zero_std": 0.0375, "grad_norm": 3.5625, "kl": 0.007531364588066936, "learning_rate": 7.915365629906973e-06, "loss": -0.0119, "num_tokens": 1311561.0, "reward": 8.984685134887695, "reward_std": 11.791397857666016, "rewards/wrapper/mean": 4.492342208325863, "rewards/wrapper/std": 14.827356532216072, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 94.2, "completions/mean_length": 251.0625, "completions/mean_terminated_length": 74.6, "completions/min_length": 157.4, "completions/min_terminated_length": 55.0, "epoch": 0.02932551319648094, "frac_reward_zero_std": 0.0125, "grad_norm": 7.125, "kl": 0.006311549601377919, "learning_rate": 7.915189805365772e-06, "loss": -0.0016, "num_tokens": 1365499.0, "reward": 9.319353520870209, "reward_std": 12.09698166847229, "rewards/wrapper/mean": 4.6596766747534275, "rewards/wrapper/std": 11.877293466031551, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.030498533724340176, "frac_reward_zero_std": 0.025, "grad_norm": 2.84375, "kl": 0.005467874178430065, "learning_rate": 7.915003821334948e-06, "loss": -0.0086, "num_tokens": 1417623.0, "reward": 9.443776416778565, "reward_std": 11.862878894805908, "rewards/wrapper/mean": 4.72188790589571, "rewards/wrapper/std": 15.555906724929809, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 35.4, "completions/mean_length": 250.7125, "completions/mean_terminated_length": 31.1, "completions/min_length": 129.2, "completions/min_terminated_length": 26.8, "epoch": 0.03167155425219941, "frac_reward_zero_std": 0.025, "grad_norm": 2.078125, "kl": 0.007808281725738197, "learning_rate": 7.914807678451295e-06, "loss": -0.0047, "num_tokens": 1472927.0, "reward": 11.669925975799561, "reward_std": 15.550298118591309, "rewards/wrapper/mean": 5.834962645173073, "rewards/wrapper/std": 17.61610255539417, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 76.6, "completions/mean_length": 252.0, "completions/mean_terminated_length": 71.5, "completions/min_length": 168.8, "completions/min_terminated_length": 66.4, "epoch": 0.03284457478005865, "frac_reward_zero_std": 0.0125, "grad_norm": 2.625, "kl": 0.0043662395240971815, "learning_rate": 7.9146013773864e-06, "loss": 0.0015, "num_tokens": 1526883.0, "reward": 7.352222728729248, "reward_std": 9.59028902053833, "rewards/wrapper/mean": 3.6761114027351143, "rewards/wrapper/std": 13.92982299849391, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 48.4, "completions/mean_length": 251.1125, "completions/mean_terminated_length": 48.4, "completions/min_length": 99.6, "completions/min_terminated_length": 48.4, "epoch": 0.03401759530791789, "frac_reward_zero_std": 0.0375, "grad_norm": 1.6640625, "kl": 0.018718967185122892, "learning_rate": 7.914384918846623e-06, "loss": -0.0152, "num_tokens": 1579857.0, "reward": 10.049270629882812, "reward_std": 11.676184892654419, "rewards/wrapper/mean": 5.024635132402182, "rewards/wrapper/std": 16.037650553882123, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.03519061583577713, "frac_reward_zero_std": 0.0375, "grad_norm": 3.0, "kl": 0.004746401822194457, "learning_rate": 7.914158303573106e-06, "loss": -0.0036, "num_tokens": 1636140.0, "reward": 9.5964071393013, "reward_std": 12.794659090042114, "rewards/wrapper/mean": 4.79820346981287, "rewards/wrapper/std": 13.207125359773636, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.03636363636363636, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.005534662073478103, "learning_rate": 7.91392153234177e-06, "loss": -0.0149, "num_tokens": 1687599.0, "reward": 9.0966365814209, "reward_std": 11.15857810974121, "rewards/wrapper/mean": 4.5483182817697525, "rewards/wrapper/std": 14.566433542966843, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.0375366568914956, "frac_reward_zero_std": 0.0125, "grad_norm": 0.72265625, "kl": 0.0071763014071621, "learning_rate": 7.913674605963302e-06, "loss": 0.0028, "num_tokens": 1740307.0, "reward": 18.722765350341795, "reward_std": 25.852748107910156, "rewards/wrapper/mean": 9.361382472515107, "rewards/wrapper/std": 24.016877111792564, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 254.70625, "completions/mean_terminated_length": 61.0, "completions/min_length": 214.6, "completions/min_terminated_length": 61.0, "epoch": 0.03870967741935484, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.007732412667246535, "learning_rate": 7.913417525283167e-06, "loss": -0.0006, "num_tokens": 1797808.0, "reward": 12.486311435699463, "reward_std": 15.336132717132568, "rewards/wrapper/mean": 6.24315589889884, "rewards/wrapper/std": 18.720538268983365, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 103.2, "completions/mean_length": 248.0875, "completions/mean_terminated_length": 81.03333435058593, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "epoch": 0.03988269794721408, "frac_reward_zero_std": 0.0625, "grad_norm": 5.5625, "kl": 0.00886870157555677, "learning_rate": 7.913150291181591e-06, "loss": -0.0148, "num_tokens": 1850610.0, "reward": 11.940378427505493, "reward_std": 16.308357906341552, "rewards/wrapper/mean": 5.970189142972231, "rewards/wrapper/std": 17.508427077531813, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 82.8, "completions/mean_length": 250.6, "completions/mean_terminated_length": 68.66666717529297, "completions/min_length": 164.0, "completions/min_terminated_length": 61.6, "epoch": 0.04105571847507331, "frac_reward_zero_std": 0.0125, "grad_norm": 2.546875, "kl": 0.00591239562490955, "learning_rate": 7.912872904573574e-06, "loss": -0.0091, "num_tokens": 1906648.0, "reward": 10.338704681396484, "reward_std": 14.060916805267334, "rewards/wrapper/mean": 5.169352217018604, "rewards/wrapper/std": 15.273518617451192, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 3.6, "completions/mean_length": 254.5125, "completions/mean_terminated_length": 3.6, "completions/min_length": 208.4, "completions/min_terminated_length": 3.6, "epoch": 0.04222873900293255, "frac_reward_zero_std": 0.0125, "grad_norm": 0.85546875, "kl": 0.008821232226910069, "learning_rate": 7.912585366408867e-06, "loss": -0.002, "num_tokens": 1958564.0, "reward": 10.528805828094482, "reward_std": 11.93729567527771, "rewards/wrapper/mean": 5.264402637630701, "rewards/wrapper/std": 16.117270739376544, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 11.2, "completions/mean_length": 253.15, "completions/mean_terminated_length": 11.2, "completions/min_length": 164.8, "completions/min_terminated_length": 11.2, "epoch": 0.04340175953079179, "frac_reward_zero_std": 0.0125, "grad_norm": 1.1796875, "kl": 0.09926816275110469, "learning_rate": 7.912287677671986e-06, "loss": -0.0039, "num_tokens": 2012928.0, "reward": 11.147413158416748, "reward_std": 12.735891819000244, "rewards/wrapper/mean": 5.573706501722336, "rewards/wrapper/std": 15.083421854674816, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 11.6, "completions/mean_length": 251.5625, "completions/mean_terminated_length": 11.6, "completions/min_length": 114.0, "completions/min_terminated_length": 11.6, "epoch": 0.04457478005865103, "frac_reward_zero_std": 0.0125, "grad_norm": 1.4765625, "kl": 0.04385784288460855, "learning_rate": 7.911979839382199e-06, "loss": 0.0034, "num_tokens": 2067150.0, "reward": 7.201774215698242, "reward_std": 8.909545421600342, "rewards/wrapper/mean": 3.60088697001338, "rewards/wrapper/std": 11.909615388512611, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 48.2, "completions/mean_length": 254.30625, "completions/mean_terminated_length": 48.2, "completions/min_length": 201.8, "completions/min_terminated_length": 48.2, "epoch": 0.04574780058651026, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.016907342718332073, "learning_rate": 7.911661852593531e-06, "loss": -0.0034, "num_tokens": 2120981.0, "reward": 10.34285626411438, "reward_std": 11.99207363128662, "rewards/wrapper/mean": 5.171428105235099, "rewards/wrapper/std": 15.60823110193014, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 26.2, "completions/mean_length": 248.83125, "completions/mean_terminated_length": 13.4, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.0469208211143695, "frac_reward_zero_std": 0.0125, "grad_norm": 3.140625, "kl": 0.029390834498917683, "learning_rate": 7.911333718394748e-06, "loss": -0.0187, "num_tokens": 2178772.0, "reward": 10.648024845123292, "reward_std": 14.51836280822754, "rewards/wrapper/mean": 5.324012476205826, "rewards/wrapper/std": 16.51776341497898, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 9.2, "completions/mean_length": 249.89375, "completions/mean_terminated_length": 8.5, "completions/min_length": 110.2, "completions/min_terminated_length": 7.8, "epoch": 0.04809384164222874, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "kl": 0.006215279077878222, "learning_rate": 7.910995437909363e-06, "loss": 0.0135, "num_tokens": 2234131.0, "reward": 7.719930791854859, "reward_std": 9.464711093902588, "rewards/wrapper/mean": 3.8599654987454413, "rewards/wrapper/std": 12.777549323439597, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 18.0, "completions/mean_length": 253.3625, "completions/mean_terminated_length": 18.0, "completions/min_length": 171.6, "completions/min_terminated_length": 18.0, "epoch": 0.04926686217008798, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.24979069257969969, "learning_rate": 7.910647012295629e-06, "loss": 0.0059, "num_tokens": 2286511.0, "reward": 9.189921045303345, "reward_std": 10.320459508895874, "rewards/wrapper/mean": 4.594960490614175, "rewards/wrapper/std": 15.06650394052267, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 253.1125, "completions/mean_terminated_length": 10.0, "completions/min_length": 163.6, "completions/min_terminated_length": 10.0, "epoch": 0.05043988269794721, "frac_reward_zero_std": 0.0625, "grad_norm": 1.3203125, "kl": 0.8644496379303745, "learning_rate": 7.910288442746534e-06, "loss": 0.029, "num_tokens": 2344661.0, "reward": 10.684868431091308, "reward_std": 14.451215171813965, "rewards/wrapper/mean": 5.342434239387512, "rewards/wrapper/std": 17.4603207424283, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 51.2, "completions/mean_length": 252.8, "completions/mean_terminated_length": 51.2, "completions/min_length": 153.6, "completions/min_terminated_length": 51.2, "epoch": 0.05161290322580645, "frac_reward_zero_std": 0.0125, "grad_norm": 0.921875, "kl": 0.004707379778847098, "learning_rate": 7.909919730489803e-06, "loss": -0.0092, "num_tokens": 2397565.0, "reward": 10.836441993713379, "reward_std": 14.642048645019532, "rewards/wrapper/mean": 5.418220953643322, "rewards/wrapper/std": 15.194202147424221, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 112.2, "completions/mean_length": 248.325, "completions/mean_terminated_length": 56.53333435058594, "completions/min_length": 67.8, "completions/min_terminated_length": 16.6, "epoch": 0.05278592375366569, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.025090236740652472, "learning_rate": 7.909540876787885e-06, "loss": 0.0044, "num_tokens": 2453233.0, "reward": 11.105024528503417, "reward_std": 14.836073446273804, "rewards/wrapper/mean": 5.552512162923813, "rewards/wrapper/std": 18.147161222994328, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 17.8, "completions/mean_length": 243.8, "completions/mean_terminated_length": 8.266666793823243, "completions/min_length": 54.4, "completions/min_terminated_length": 3.2, "epoch": 0.05395894428152493, "frac_reward_zero_std": 0.0125, "grad_norm": 2.046875, "kl": 0.008077077101916075, "learning_rate": 7.909151882937952e-06, "loss": -0.0209, "num_tokens": 2505285.0, "reward": 8.832015323638917, "reward_std": 9.847680950164795, "rewards/wrapper/mean": 4.416007310897112, "rewards/wrapper/std": 12.637867455184459, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 61.6, "completions/mean_length": 251.5375, "completions/mean_terminated_length": 43.6, "completions/min_length": 188.2, "completions/min_terminated_length": 34.6, "epoch": 0.05513196480938416, "frac_reward_zero_std": 0.05, "grad_norm": 1.09375, "kl": 0.005555787222692743, "learning_rate": 7.9087527502719e-06, "loss": -0.007, "num_tokens": 2561367.0, "reward": 5.861324524879455, "reward_std": 7.810394310951233, "rewards/wrapper/mean": 2.930662341415882, "rewards/wrapper/std": 9.822330982983113, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.0563049853372434, "frac_reward_zero_std": 0.025, "grad_norm": 3.96875, "kl": 0.011543343169614672, "learning_rate": 7.908343480156331e-06, "loss": 0.0005, "num_tokens": 2617321.0, "reward": 8.890991735458375, "reward_std": 11.866082763671875, "rewards/wrapper/mean": 4.445495916903019, "rewards/wrapper/std": 16.05286959260702, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 25.8, "completions/mean_length": 252.00625, "completions/mean_terminated_length": 25.8, "completions/min_length": 128.2, "completions/min_terminated_length": 25.8, "epoch": 0.05747800586510264, "frac_reward_zero_std": 0.05, "grad_norm": 3.6875, "kl": 0.024089473049389198, "learning_rate": 7.907924073992568e-06, "loss": -0.0029, "num_tokens": 2671834.0, "reward": 10.378494834899902, "reward_std": 12.050719356536865, "rewards/wrapper/mean": 5.189247503876686, "rewards/wrapper/std": 17.33666645437479, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.05865102639296188, "frac_reward_zero_std": 0.0125, "grad_norm": 2.625, "kl": 0.022702468262286855, "learning_rate": 7.907494533216633e-06, "loss": -0.0048, "num_tokens": 2726430.0, "reward": 7.797803008556366, "reward_std": 10.616388177871704, "rewards/wrapper/mean": 3.8989015720784663, "rewards/wrapper/std": 13.724219344556332, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 47.6, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 47.6, "completions/min_length": 150.0, "completions/min_terminated_length": 47.6, "epoch": 0.05982404692082111, "frac_reward_zero_std": 0.025, "grad_norm": 3.234375, "kl": 0.007992285175714643, "learning_rate": 7.907054859299246e-06, "loss": -0.0079, "num_tokens": 2780486.0, "reward": 11.148936986923218, "reward_std": 15.111816167831421, "rewards/wrapper/mean": 5.574468304216862, "rewards/wrapper/std": 17.840139020979404, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 26.1, "completions/min_length": 177.8, "completions/min_terminated_length": 24.2, "epoch": 0.06099706744868035, "frac_reward_zero_std": 0.0125, "grad_norm": 3.53125, "kl": 0.009301980794407427, "learning_rate": 7.90660505374583e-06, "loss": -0.0008, "num_tokens": 2834880.0, "reward": 7.71119909286499, "reward_std": 10.172386932373048, "rewards/wrapper/mean": 3.8555997565388678, "rewards/wrapper/std": 12.606358571350574, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 249.625, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.06217008797653959, "frac_reward_zero_std": 0.025, "grad_norm": 4.78125, "kl": 0.005297352850902826, "learning_rate": 7.906145118096491e-06, "loss": -0.0149, "num_tokens": 2888866.0, "reward": 9.901272583007813, "reward_std": 11.926094055175781, "rewards/wrapper/mean": 4.9506362281739715, "rewards/wrapper/std": 13.668652732670306, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 38.4, "completions/mean_length": 253.04375, "completions/mean_terminated_length": 29.7, "completions/min_length": 174.6, "completions/min_terminated_length": 21.0, "epoch": 0.06334310850439882, "frac_reward_zero_std": 0.025, "grad_norm": 1.6328125, "kl": 0.003792907050228678, "learning_rate": 7.905675053926023e-06, "loss": -0.0024, "num_tokens": 2941299.0, "reward": 6.089825701713562, "reward_std": 8.013871765136718, "rewards/wrapper/mean": 3.0449130102992057, "rewards/wrapper/std": 10.364787791669368, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 4.4, "completions/mean_length": 249.7375, "completions/mean_terminated_length": 4.4, "completions/min_length": 55.6, "completions/min_terminated_length": 4.4, "epoch": 0.06451612903225806, "frac_reward_zero_std": 0.0125, "grad_norm": 6.59375, "kl": 0.12846245223772712, "learning_rate": 7.905194862843898e-06, "loss": 0.0065, "num_tokens": 2993889.0, "reward": 11.451298046112061, "reward_std": 14.382197093963622, "rewards/wrapper/mean": 5.7256487928330895, "rewards/wrapper/std": 14.491848316788673, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 30.6, "completions/mean_length": 252.15625, "completions/mean_terminated_length": 30.6, "completions/min_length": 133.0, "completions/min_terminated_length": 30.6, "epoch": 0.0656891495601173, "frac_reward_zero_std": 0.025, "grad_norm": 2.8125, "kl": 0.005170016887132078, "learning_rate": 7.904704546494267e-06, "loss": -0.0124, "num_tokens": 3047526.0, "reward": 13.99454574584961, "reward_std": 19.274290084838867, "rewards/wrapper/mean": 6.997273133695126, "rewards/wrapper/std": 22.01692125797272, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 89.6, "completions/mean_length": 254.0, "completions/mean_terminated_length": 89.6, "completions/min_length": 192.0, "completions/min_terminated_length": 89.6, "epoch": 0.06686217008797654, "frac_reward_zero_std": 0.05, "grad_norm": 4.4375, "kl": 0.009661910351132974, "learning_rate": 7.90420410655594e-06, "loss": -0.0058, "num_tokens": 3104106.0, "reward": 10.678810977935791, "reward_std": 11.440056610107423, "rewards/wrapper/mean": 5.3394052013754845, "rewards/wrapper/std": 17.356336744129656, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 39.8, "completions/mean_length": 252.44375, "completions/mean_terminated_length": 39.8, "completions/min_length": 142.2, "completions/min_terminated_length": 39.8, "epoch": 0.06803519061583578, "frac_reward_zero_std": 0.0375, "grad_norm": 1.671875, "kl": 0.0087807220290415, "learning_rate": 7.9036935447424e-06, "loss": -0.0017, "num_tokens": 3158783.0, "reward": 6.231310081481934, "reward_std": 8.16063642501831, "rewards/wrapper/mean": 3.115654981136322, "rewards/wrapper/std": 9.851075352728367, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 96.8, "completions/mean_length": 252.625, "completions/mean_terminated_length": 96.8, "completions/min_length": 148.0, "completions/min_terminated_length": 96.8, "epoch": 0.06920821114369502, "frac_reward_zero_std": 0.0, "grad_norm": 8.25, "kl": 0.004456112009938807, "learning_rate": 7.903172862801778e-06, "loss": -0.0015, "num_tokens": 3212223.0, "reward": 11.368969821929932, "reward_std": 15.1733154296875, "rewards/wrapper/mean": 5.68448467105627, "rewards/wrapper/std": 17.372170712053776, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 13.2, "completions/mean_length": 251.6125, "completions/mean_terminated_length": 13.2, "completions/min_length": 115.6, "completions/min_terminated_length": 13.2, "epoch": 0.07038123167155426, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.0058444807189516725, "learning_rate": 7.902642062516862e-06, "loss": -0.0076, "num_tokens": 3266219.0, "reward": 9.240291213989257, "reward_std": 12.255160140991212, "rewards/wrapper/mean": 4.620145709067583, "rewards/wrapper/std": 15.276380948722363, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 252.66875, "completions/mean_terminated_length": 47.0, "completions/min_length": 149.4, "completions/min_terminated_length": 47.0, "epoch": 0.07155425219941348, "frac_reward_zero_std": 0.0375, "grad_norm": 3.15625, "kl": 0.011432503134710715, "learning_rate": 7.902101145705079e-06, "loss": -0.0005, "num_tokens": 3322326.0, "reward": 10.280498886108399, "reward_std": 13.853771209716797, "rewards/wrapper/mean": 5.140249271690846, "rewards/wrapper/std": 16.632223202288152, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 10.4, "completions/mean_length": 251.53125, "completions/mean_terminated_length": 5.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.07272727272727272, "frac_reward_zero_std": 0.025, "grad_norm": 2.15625, "kl": 0.0049453877902124075, "learning_rate": 7.9015501142185e-06, "loss": -0.0059, "num_tokens": 3376043.0, "reward": 8.330785369873047, "reward_std": 10.894810342788697, "rewards/wrapper/mean": 4.165392602980137, "rewards/wrapper/std": 10.359532837569713, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 45.6, "completions/mean_length": 255.825, "completions/mean_terminated_length": 45.6, "completions/min_length": 250.4, "completions/min_terminated_length": 45.6, "epoch": 0.07390029325513196, "frac_reward_zero_std": 0.025, "grad_norm": 2.59375, "kl": 0.005565386958187446, "learning_rate": 7.900988969943825e-06, "loss": 0.0001, "num_tokens": 3432859.0, "reward": 13.264854145050048, "reward_std": 18.112439727783205, "rewards/wrapper/mean": 6.6324268616735935, "rewards/wrapper/std": 20.8455373570323, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.0750733137829912, "frac_reward_zero_std": 0.025, "grad_norm": 1.1796875, "kl": 0.0033622915856540204, "learning_rate": 7.900417714802381e-06, "loss": -0.0068, "num_tokens": 3485236.0, "reward": 10.746763801574707, "reward_std": 14.438395261764526, "rewards/wrapper/mean": 5.37338171377778, "rewards/wrapper/std": 18.321337571740152, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 77.8, "completions/mean_length": 252.49375, "completions/mean_terminated_length": 66.4, "completions/min_length": 157.4, "completions/min_terminated_length": 55.0, "epoch": 0.07624633431085044, "frac_reward_zero_std": 0.0125, "grad_norm": 2.90625, "kl": 0.004275998589582742, "learning_rate": 7.899836350750111e-06, "loss": -0.0088, "num_tokens": 3539361.0, "reward": 6.864453983306885, "reward_std": 9.229621601104736, "rewards/wrapper/mean": 3.4322269305586817, "rewards/wrapper/std": 12.251788380742074, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 32.2, "completions/mean_length": 250.6125, "completions/mean_terminated_length": 32.2, "completions/min_length": 134.6, "completions/min_terminated_length": 32.2, "epoch": 0.07741935483870968, "frac_reward_zero_std": 0.0375, "grad_norm": 4.59375, "kl": 0.011575464520137756, "learning_rate": 7.899244879777575e-06, "loss": -0.0178, "num_tokens": 3595203.0, "reward": 10.059148216247559, "reward_std": 11.052921676635743, "rewards/wrapper/mean": 5.029574017226696, "rewards/wrapper/std": 15.514526377618314, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 31.6, "completions/mean_length": 255.3875, "completions/mean_terminated_length": 31.6, "completions/min_length": 236.4, "completions/min_terminated_length": 31.6, "epoch": 0.07859237536656892, "frac_reward_zero_std": 0.0375, "grad_norm": 2.90625, "kl": 0.00798338907188736, "learning_rate": 7.898643303909933e-06, "loss": 0.0021, "num_tokens": 3649915.0, "reward": 12.55016393661499, "reward_std": 17.015945434570312, "rewards/wrapper/mean": 6.27508184760809, "rewards/wrapper/std": 17.497456189990043, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 10.3, "completions/min_length": 109.4, "completions/min_terminated_length": 7.0, "epoch": 0.07976539589442816, "frac_reward_zero_std": 0.0, "grad_norm": 3.6875, "kl": 0.011324020172469317, "learning_rate": 7.89803162520695e-06, "loss": -0.0073, "num_tokens": 3705082.0, "reward": 13.485107231140137, "reward_std": 18.413896560668945, "rewards/wrapper/mean": 6.7425536692142485, "rewards/wrapper/std": 19.012604074180125, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 3.6, "completions/mean_length": 248.125, "completions/mean_terminated_length": 2.1, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.08093841642228738, "frac_reward_zero_std": 0.0375, "grad_norm": 1.421875, "kl": 0.005373219301691279, "learning_rate": 7.897409845762977e-06, "loss": -0.0265, "num_tokens": 3761870.0, "reward": 13.213876724243164, "reward_std": 17.01407127380371, "rewards/wrapper/mean": 6.606937965750694, "rewards/wrapper/std": 17.88588900715113, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 32.2, "completions/mean_length": 249.0125, "completions/mean_terminated_length": 17.5, "completions/min_length": 54.0, "completions/min_terminated_length": 2.8, "epoch": 0.08211143695014662, "frac_reward_zero_std": 0.0375, "grad_norm": 3.09375, "kl": 0.006631963752442971, "learning_rate": 7.896777967706954e-06, "loss": -0.0242, "num_tokens": 3814768.0, "reward": 12.118576288223267, "reward_std": 16.33619499206543, "rewards/wrapper/mean": 6.059287798404694, "rewards/wrapper/std": 19.245304891467093, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 46.2, "completions/mean_length": 252.80625, "completions/mean_terminated_length": 25.8, "completions/min_length": 159.0, "completions/min_terminated_length": 5.4, "epoch": 0.08328445747800586, "frac_reward_zero_std": 0.025, "grad_norm": 2.984375, "kl": 0.0040073363459669055, "learning_rate": 7.896135993202392e-06, "loss": -0.0111, "num_tokens": 3868007.0, "reward": 5.220393860340119, "reward_std": 6.543023681640625, "rewards/wrapper/mean": 2.6101968064904213, "rewards/wrapper/std": 9.627190099656582, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 252.8625, "completions/mean_terminated_length": 2.0, "completions/min_length": 155.6, "completions/min_terminated_length": 2.0, "epoch": 0.0844574780058651, "frac_reward_zero_std": 0.0, "grad_norm": 4.3125, "kl": 0.020079531124792993, "learning_rate": 7.895483924447377e-06, "loss": -0.0065, "num_tokens": 3923273.0, "reward": 12.692752647399903, "reward_std": 15.342731380462647, "rewards/wrapper/mean": 6.346376179158687, "rewards/wrapper/std": 20.673074428737163, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.08563049853372434, "frac_reward_zero_std": 0.025, "grad_norm": 17.75, "kl": 0.043798852752661335, "learning_rate": 7.894821763674556e-06, "loss": -0.0052, "num_tokens": 3978401.0, "reward": 9.202295875549316, "reward_std": 9.355103588104248, "rewards/wrapper/mean": 4.60114776045084, "rewards/wrapper/std": 13.482706923782825, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.08680351906158358, "frac_reward_zero_std": 0.0125, "grad_norm": 5.03125, "kl": 0.0192249774816446, "learning_rate": 7.89414951315113e-06, "loss": -0.0068, "num_tokens": 4035268.0, "reward": 7.144095826148987, "reward_std": 9.492268562316895, "rewards/wrapper/mean": 3.572047848254442, "rewards/wrapper/std": 10.409622764587402, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 250.1375, "completions/mean_terminated_length": 10.1, "completions/min_length": 105.6, "completions/min_terminated_length": 3.2, "epoch": 0.08797653958944282, "frac_reward_zero_std": 0.05, "grad_norm": 6.25, "kl": 0.0067635633982717994, "learning_rate": 7.893467175178848e-06, "loss": -0.0136, "num_tokens": 4089458.0, "reward": 5.858023273944855, "reward_std": 7.792664754390716, "rewards/wrapper/mean": 2.929011580348015, "rewards/wrapper/std": 8.176401071250439, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 5.4, "completions/mean_length": 252.96875, "completions/mean_terminated_length": 5.4, "completions/min_length": 159.0, "completions/min_terminated_length": 5.4, "epoch": 0.08914956011730206, "frac_reward_zero_std": 0.0125, "grad_norm": 3.515625, "kl": 0.008263138856273144, "learning_rate": 7.892774752093993e-06, "loss": -0.0069, "num_tokens": 4144745.0, "reward": 5.752477717399597, "reward_std": 7.585747843980789, "rewards/wrapper/mean": 2.8762387074530125, "rewards/wrapper/std": 10.010685224831104, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 54.8, "completions/mean_length": 252.9125, "completions/mean_terminated_length": 54.8, "completions/min_length": 157.2, "completions/min_terminated_length": 54.8, "epoch": 0.09032258064516129, "frac_reward_zero_std": 0.025, "grad_norm": 4.3125, "kl": 0.008487808145582676, "learning_rate": 7.892072246267383e-06, "loss": -0.0105, "num_tokens": 4198865.0, "reward": 12.836762094497681, "reward_std": 15.75669355392456, "rewards/wrapper/mean": 6.418380816280842, "rewards/wrapper/std": 18.001925800740718, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 249.625, "completions/mean_terminated_length": 0.8, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.09149560117302052, "frac_reward_zero_std": 0.05, "grad_norm": 2.625, "kl": 0.02361058863461949, "learning_rate": 7.891359660104361e-06, "loss": -0.0022, "num_tokens": 4257033.0, "reward": 10.801888704299927, "reward_std": 13.946699237823486, "rewards/wrapper/mean": 5.400944182276726, "rewards/wrapper/std": 14.914789086580276, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 2.2, "completions/mean_length": 251.26875, "completions/mean_terminated_length": 2.2, "completions/min_length": 104.6, "completions/min_terminated_length": 2.2, "epoch": 0.09266862170087976, "frac_reward_zero_std": 0.05, "grad_norm": 1.078125, "kl": 0.07726627183728851, "learning_rate": 7.89063699604478e-06, "loss": 0.0079, "num_tokens": 4307928.0, "reward": 9.3045334815979, "reward_std": 10.442511081695557, "rewards/wrapper/mean": 4.652266338467598, "rewards/wrapper/std": 14.739392180740833, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.0, "completions/mean_length": 252.83125, "completions/mean_terminated_length": 1.0, "completions/min_length": 154.6, "completions/min_terminated_length": 1.0, "epoch": 0.093841642228739, "frac_reward_zero_std": 0.05, "grad_norm": 3.0625, "kl": 0.005269644845975563, "learning_rate": 7.889904256563e-06, "loss": -0.0066, "num_tokens": 4364827.0, "reward": 10.288627099990844, "reward_std": 13.034698617458343, "rewards/wrapper/mean": 5.144313305616379, "rewards/wrapper/std": 14.584010930359364, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 38.2, "completions/mean_length": 250.80625, "completions/mean_terminated_length": 19.3, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.09501466275659824, "frac_reward_zero_std": 0.0, "grad_norm": 88.0, "kl": 0.5907435442029965, "learning_rate": 7.88916144416788e-06, "loss": 0.0165, "num_tokens": 4419578.0, "reward": 7.900735855102539, "reward_std": 10.42778558731079, "rewards/wrapper/mean": 3.9503678739070893, "rewards/wrapper/std": 13.24696546792984, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.09618768328445748, "frac_reward_zero_std": 0.0375, "grad_norm": 5.59375, "kl": 0.004958215163787827, "learning_rate": 7.888408561402767e-06, "loss": -0.0023, "num_tokens": 4475905.0, "reward": 7.845335531234741, "reward_std": 10.648851537704468, "rewards/wrapper/mean": 3.92266783118248, "rewards/wrapper/std": 13.168632271885873, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 17.2, "completions/mean_length": 253.3375, "completions/mean_terminated_length": 17.2, "completions/min_length": 170.8, "completions/min_terminated_length": 17.2, "epoch": 0.09736070381231672, "frac_reward_zero_std": 0.0, "grad_norm": 4.3125, "kl": 0.009949381230399013, "learning_rate": 7.887645610845491e-06, "loss": -0.0014, "num_tokens": 4528455.0, "reward": 6.3044956684112545, "reward_std": 8.397425222396851, "rewards/wrapper/mean": 3.1522478252649306, "rewards/wrapper/std": 13.238372421264648, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 251.475, "completions/mean_terminated_length": 60.0, "completions/min_length": 111.2, "completions/min_terminated_length": 60.0, "epoch": 0.09853372434017596, "frac_reward_zero_std": 0.0125, "grad_norm": 9.5625, "kl": 0.009382255608215928, "learning_rate": 7.88687259510835e-06, "loss": -0.0136, "num_tokens": 4586407.0, "reward": 10.528643894195557, "reward_std": 14.410901546478271, "rewards/wrapper/mean": 5.264321990311146, "rewards/wrapper/std": 17.865098947286604, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 254.41875, "completions/mean_terminated_length": 0.6, "completions/min_length": 205.4, "completions/min_terminated_length": 0.6, "epoch": 0.09970674486803519, "frac_reward_zero_std": 0.0125, "grad_norm": 4.1875, "kl": 0.006355077913030982, "learning_rate": 7.886089516838104e-06, "loss": -0.0035, "num_tokens": 4640554.0, "reward": 8.880403900146485, "reward_std": 11.53792371749878, "rewards/wrapper/mean": 4.440201735496521, "rewards/wrapper/std": 14.214903639256955, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 55.2, "completions/mean_length": 252.925, "completions/mean_terminated_length": 55.2, "completions/min_length": 157.6, "completions/min_terminated_length": 55.2, "epoch": 0.10087976539589442, "frac_reward_zero_std": 0.05, "grad_norm": 3.9375, "kl": 0.011302582046482712, "learning_rate": 7.885296378715972e-06, "loss": -0.0074, "num_tokens": 4695284.0, "reward": 9.465247249603271, "reward_std": 12.294286060333253, "rewards/wrapper/mean": 4.732623440772295, "rewards/wrapper/std": 16.889464408159256, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 64.8, "completions/mean_length": 249.25625, "completions/mean_terminated_length": 60.2, "completions/min_length": 106.8, "completions/min_terminated_length": 55.6, "epoch": 0.10205278592375366, "frac_reward_zero_std": 0.025, "grad_norm": 1.0390625, "kl": 0.008649818610865622, "learning_rate": 7.884493183457612e-06, "loss": -0.0084, "num_tokens": 4747981.0, "reward": 10.74110836982727, "reward_std": 14.534211778640747, "rewards/wrapper/mean": 5.37055408731103, "rewards/wrapper/std": 17.95357619225979, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 24.2, "completions/mean_length": 255.15625, "completions/mean_terminated_length": 24.2, "completions/min_length": 229.0, "completions/min_terminated_length": 24.2, "epoch": 0.1032258064516129, "frac_reward_zero_std": 0.025, "grad_norm": 1.7421875, "kl": 0.01159596272627823, "learning_rate": 7.883679933813119e-06, "loss": -0.0014, "num_tokens": 4802888.0, "reward": 15.938543891906738, "reward_std": 20.0162145614624, "rewards/wrapper/mean": 7.9692716032266615, "rewards/wrapper/std": 23.760393367707728, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 29.6, "completions/mean_length": 252.13125, "completions/mean_terminated_length": 15.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.10439882697947214, "frac_reward_zero_std": 0.025, "grad_norm": 3.421875, "kl": 0.008417817368172108, "learning_rate": 7.882856632567015e-06, "loss": -0.0053, "num_tokens": 4859187.0, "reward": 17.78517837524414, "reward_std": 22.430026054382324, "rewards/wrapper/mean": 8.892589239776134, "rewards/wrapper/std": 25.629758620262145, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 38.6, "completions/mean_length": 247.63125, "completions/mean_terminated_length": 35.9, "completions/min_length": 135.8, "completions/min_terminated_length": 33.4, "epoch": 0.10557184750733138, "frac_reward_zero_std": 0.0125, "grad_norm": 0.9765625, "kl": 0.005837788642384112, "learning_rate": 7.882023282538236e-06, "loss": -0.0218, "num_tokens": 4911118.0, "reward": 13.440273857116699, "reward_std": 12.466052770614624, "rewards/wrapper/mean": 6.720136827230453, "rewards/wrapper/std": 17.47331467717886, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 4.8, "completions/mean_length": 244.96875, "completions/mean_terminated_length": 2.8, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.10674486803519062, "frac_reward_zero_std": 0.0375, "grad_norm": 5.5, "kl": 0.007168042741250246, "learning_rate": 7.881179886580125e-06, "loss": -0.0221, "num_tokens": 4963319.0, "reward": 12.285084342956543, "reward_std": 14.525907707214355, "rewards/wrapper/mean": 6.142542491853237, "rewards/wrapper/std": 17.427485939860343, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.10791788856304986, "frac_reward_zero_std": 0.025, "grad_norm": 1.828125, "kl": 0.009624737745616585, "learning_rate": 7.880326447580421e-06, "loss": -0.0021, "num_tokens": 5017498.0, "reward": 10.349391460418701, "reward_std": 13.977695178985595, "rewards/wrapper/mean": 5.174696030467748, "rewards/wrapper/std": 17.32550062686205, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 6.6, "completions/mean_length": 251.40625, "completions/mean_terminated_length": 6.6, "completions/min_length": 109.0, "completions/min_terminated_length": 6.6, "epoch": 0.10909090909090909, "frac_reward_zero_std": 0.0375, "grad_norm": 1.0, "kl": 0.005783123133005575, "learning_rate": 7.879462968461254e-06, "loss": 0.0054, "num_tokens": 5075057.0, "reward": 12.299484062194825, "reward_std": 13.539479064941407, "rewards/wrapper/mean": 6.149741820991039, "rewards/wrapper/std": 17.295961599051953, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 50.8, "completions/mean_length": 252.84375, "completions/mean_terminated_length": 26.4, "completions/min_length": 155.6, "completions/min_terminated_length": 2.0, "epoch": 0.11026392961876832, "frac_reward_zero_std": 0.0125, "grad_norm": 2.375, "kl": 0.017190750857116653, "learning_rate": 7.878589452179124e-06, "loss": -0.0092, "num_tokens": 5129870.0, "reward": 16.291987419128418, "reward_std": 16.42507972717285, "rewards/wrapper/mean": 8.145993730425834, "rewards/wrapper/std": 23.72040745615959, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 41.6, "completions/mean_length": 252.5, "completions/mean_terminated_length": 41.6, "completions/min_length": 144.0, "completions/min_terminated_length": 41.6, "epoch": 0.11143695014662756, "frac_reward_zero_std": 0.0125, "grad_norm": 2.609375, "kl": 0.026824597117956726, "learning_rate": 7.877705901724904e-06, "loss": -0.004, "num_tokens": 5185876.0, "reward": 9.22128176689148, "reward_std": 12.521512603759765, "rewards/wrapper/mean": 4.610640931129455, "rewards/wrapper/std": 15.985003382712602, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 20.6, "completions/mean_length": 253.44375, "completions/mean_terminated_length": 20.6, "completions/min_length": 174.2, "completions/min_terminated_length": 20.6, "epoch": 0.1126099706744868, "frac_reward_zero_std": 0.0125, "grad_norm": 1.5859375, "kl": 0.008066412215703167, "learning_rate": 7.876812320123819e-06, "loss": -0.0074, "num_tokens": 5238537.0, "reward": 7.3396319389343265, "reward_std": 9.42587718963623, "rewards/wrapper/mean": 3.6698158286511897, "rewards/wrapper/std": 11.667912058532238, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 49.6, "completions/mean_length": 254.35, "completions/mean_terminated_length": 49.6, "completions/min_length": 203.2, "completions/min_terminated_length": 49.6, "epoch": 0.11378299120234604, "frac_reward_zero_std": 0.025, "grad_norm": 5.0, "kl": 0.005426285532303154, "learning_rate": 7.875908710435441e-06, "loss": -0.0052, "num_tokens": 5292019.0, "reward": 13.085040092468262, "reward_std": 13.977268314361572, "rewards/wrapper/mean": 6.542520047724247, "rewards/wrapper/std": 18.962931068241595, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 79.4, "completions/mean_length": 247.86875, "completions/mean_terminated_length": 56.03333435058594, "completions/min_length": 87.0, "completions/min_terminated_length": 35.8, "epoch": 0.11495601173020528, "frac_reward_zero_std": 0.0375, "grad_norm": 1.21875, "kl": 0.0073792809271253645, "learning_rate": 7.874995075753678e-06, "loss": -0.0044, "num_tokens": 5344538.0, "reward": 6.792131614685059, "reward_std": 9.024149131774902, "rewards/wrapper/mean": 3.3960657209157943, "rewards/wrapper/std": 11.643630801141262, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 122.4, "completions/mean_length": 252.16875, "completions/mean_terminated_length": 102.9, "completions/min_length": 134.6, "completions/min_terminated_length": 83.4, "epoch": 0.11612903225806452, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.011405504535650835, "learning_rate": 7.874071419206767e-06, "loss": -0.0067, "num_tokens": 5399329.0, "reward": 10.500157880783082, "reward_std": 12.362210059165955, "rewards/wrapper/mean": 5.250078846514225, "rewards/wrapper/std": 16.170384666323663, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 42.8, "completions/mean_length": 251.05625, "completions/mean_terminated_length": 30.0, "completions/min_length": 119.6, "completions/min_terminated_length": 17.2, "epoch": 0.11730205278592376, "frac_reward_zero_std": 0.0, "grad_norm": 15.75, "kl": 0.023087952454807235, "learning_rate": 7.873137743957253e-06, "loss": -0.0053, "num_tokens": 5453188.0, "reward": 10.207341730594635, "reward_std": 13.235777711868286, "rewards/wrapper/mean": 5.1036708191037174, "rewards/wrapper/std": 15.854820623993874, "step": 500 }, { "epoch": 0.11730205278592376, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.805, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 64.59, "eval_completions/mean_length": 225.58, "eval_completions/mean_terminated_length": 54.89000015258789, "eval_completions/min_length": 160.77, "eval_completions/min_terminated_length": 45.57, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.008613987206481398, "eval_loss": -0.05724004656076431, "eval_num_tokens": 5453188.0, "eval_reward": 0.3156442906707525, "eval_reward_std": 0.15477610152214766, "eval_rewards/wrapper/mean": 0.1578221420943737, "eval_rewards/wrapper/std": 0.14301297422032802, "eval_runtime": 207.7513, "eval_samples_per_second": 0.963, "eval_steps_per_second": 0.241, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 1.6, "completions/mean_length": 249.65, "completions/mean_terminated_length": 1.6, "completions/min_length": 52.8, "completions/min_terminated_length": 1.6, "epoch": 0.11847507331378299, "frac_reward_zero_std": 0.025, "grad_norm": 2.265625, "kl": 0.007007372000953182, "learning_rate": 7.872194053201988e-06, "loss": -0.0135, "num_tokens": 5505338.0, "reward": 5.762967586517334, "reward_std": 7.6124520778656, "rewards/wrapper/mean": 2.8814836353063584, "rewards/wrapper/std": 10.962222987413407, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.11964809384164223, "frac_reward_zero_std": 0.0125, "grad_norm": 2.765625, "kl": 0.00901545921806246, "learning_rate": 7.871240350172112e-06, "loss": -0.0059, "num_tokens": 5561149.0, "reward": 10.71638011932373, "reward_std": 11.327287292480468, "rewards/wrapper/mean": 5.358189883828163, "rewards/wrapper/std": 15.345132572948932, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.8, "completions/mean_length": 252.1, "completions/mean_terminated_length": 28.8, "completions/min_length": 131.2, "completions/min_terminated_length": 28.8, "epoch": 0.12082111436950146, "frac_reward_zero_std": 0.0125, "grad_norm": 5.0, "kl": 1.7968808292440372, "learning_rate": 7.870276638133056e-06, "loss": 0.0643, "num_tokens": 5615687.0, "reward": 10.27314796447754, "reward_std": 12.886303329467774, "rewards/wrapper/mean": 5.13657393977046, "rewards/wrapper/std": 17.148977878689767, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 25.4, "completions/mean_length": 253.59375, "completions/mean_terminated_length": 25.4, "completions/min_length": 179.0, "completions/min_terminated_length": 25.4, "epoch": 0.1219941348973607, "frac_reward_zero_std": 0.025, "grad_norm": 3.625, "kl": 0.008333416358800605, "learning_rate": 7.869302920384511e-06, "loss": -0.0036, "num_tokens": 5669610.0, "reward": 8.138755130767823, "reward_std": 9.697688674926757, "rewards/wrapper/mean": 4.069377472251654, "rewards/wrapper/std": 13.40217920690775, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 27.4, "completions/mean_length": 248.93125, "completions/mean_terminated_length": 16.0, "completions/min_length": 107.0, "completions/min_terminated_length": 4.6, "epoch": 0.12316715542521994, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.00565977388760075, "learning_rate": 7.868319200260435e-06, "loss": 0.0013, "num_tokens": 5723607.0, "reward": 7.519344139099121, "reward_std": 9.667561912536621, "rewards/wrapper/mean": 3.7596719533205034, "rewards/wrapper/std": 12.146075774729251, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 74.2, "completions/mean_length": 249.6, "completions/mean_terminated_length": 61.06666717529297, "completions/min_length": 91.8, "completions/min_terminated_length": 40.6, "epoch": 0.12434017595307918, "frac_reward_zero_std": 0.025, "grad_norm": 2.359375, "kl": 0.0132868135930039, "learning_rate": 7.867325481129026e-06, "loss": -0.0132, "num_tokens": 5779257.0, "reward": 13.06201467514038, "reward_std": 14.132675647735596, "rewards/wrapper/mean": 6.531007275730372, "rewards/wrapper/std": 19.79606983065605, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 253.01875, "completions/mean_terminated_length": 7.0, "completions/min_length": 160.6, "completions/min_terminated_length": 7.0, "epoch": 0.12551319648093842, "frac_reward_zero_std": 0.025, "grad_norm": 2.953125, "kl": 0.011028506548609585, "learning_rate": 7.866321766392723e-06, "loss": -0.0078, "num_tokens": 5835452.0, "reward": 6.974220561981201, "reward_std": 9.202022171020507, "rewards/wrapper/mean": 3.4871101051568987, "rewards/wrapper/std": 11.520281651616097, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 254.3625, "completions/mean_terminated_length": 50.0, "completions/min_length": 203.6, "completions/min_terminated_length": 50.0, "epoch": 0.12668621700879765, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.006271988293156028, "learning_rate": 7.86530805948819e-06, "loss": -0.0035, "num_tokens": 5888666.0, "reward": 11.376586723327637, "reward_std": 13.494106674194336, "rewards/wrapper/mean": 5.688293327391148, "rewards/wrapper/std": 18.444926972687245, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 42.6, "completions/mean_length": 252.5375, "completions/mean_terminated_length": 21.5, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.1278592375366569, "frac_reward_zero_std": 0.025, "grad_norm": 3.359375, "kl": 0.017584053613245488, "learning_rate": 7.864284363886301e-06, "loss": -0.0096, "num_tokens": 5946884.0, "reward": 6.535472047328949, "reward_std": 8.555399453639984, "rewards/wrapper/mean": 3.267736179381609, "rewards/wrapper/std": 9.66622311770916, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 71.4, "completions/mean_length": 249.46875, "completions/mean_terminated_length": 49.3, "completions/min_length": 78.4, "completions/min_terminated_length": 27.2, "epoch": 0.12903225806451613, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.028963472304167227, "learning_rate": 7.863250683092132e-06, "loss": -0.0195, "num_tokens": 5998817.0, "reward": 12.318414115905762, "reward_std": 15.487536811828614, "rewards/wrapper/mean": 6.159206974506378, "rewards/wrapper/std": 18.252126486599444, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13020527859237538, "frac_reward_zero_std": 0.0125, "grad_norm": 2.046875, "kl": 0.006062440911773592, "learning_rate": 7.862207020644947e-06, "loss": 0.0002, "num_tokens": 6053299.0, "reward": 7.462558031082153, "reward_std": 9.897032356262207, "rewards/wrapper/mean": 3.731279059499502, "rewards/wrapper/std": 11.677504101395607, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.1313782991202346, "frac_reward_zero_std": 0.05, "grad_norm": 3.140625, "kl": 0.005756572855170816, "learning_rate": 7.861153380118187e-06, "loss": 0.0002, "num_tokens": 6107719.0, "reward": 9.559343433380127, "reward_std": 12.309616780281067, "rewards/wrapper/mean": 4.77967184856534, "rewards/wrapper/std": 15.938154307007789, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 61.8, "completions/mean_length": 249.9375, "completions/mean_terminated_length": 40.0, "completions/min_length": 69.4, "completions/min_terminated_length": 18.2, "epoch": 0.13255131964809383, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.0070392878842540085, "learning_rate": 7.860089765119458e-06, "loss": -0.0095, "num_tokens": 6162783.0, "reward": 15.076542854309082, "reward_std": 18.630818176269532, "rewards/wrapper/mean": 7.538271514326334, "rewards/wrapper/std": 20.744914372265338, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.13372434017595308, "frac_reward_zero_std": 0.075, "grad_norm": 5.1875, "kl": 0.015195795160252601, "learning_rate": 7.859016179290516e-06, "loss": 0.0006, "num_tokens": 6222655.0, "reward": 9.283301067352294, "reward_std": 10.608103895187378, "rewards/wrapper/mean": 4.641650436818599, "rewards/wrapper/std": 13.378304573893548, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 48.6, "completions/mean_length": 254.325, "completions/mean_terminated_length": 24.4, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.1348973607038123, "frac_reward_zero_std": 0.0125, "grad_norm": 2.1875, "kl": 0.03376310399617068, "learning_rate": 7.857932626307261e-06, "loss": 0.0018, "num_tokens": 6276273.0, "reward": 12.699429130554199, "reward_std": 14.780900478363037, "rewards/wrapper/mean": 6.349714441597461, "rewards/wrapper/std": 23.842760853469372, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 56.2, "completions/mean_length": 254.55625, "completions/mean_terminated_length": 56.2, "completions/min_length": 209.8, "completions/min_terminated_length": 56.2, "epoch": 0.13607038123167156, "frac_reward_zero_std": 0.0125, "grad_norm": 3.171875, "kl": 0.007011306460481137, "learning_rate": 7.856839109879712e-06, "loss": 0.0015, "num_tokens": 6333892.0, "reward": 7.93853178024292, "reward_std": 10.552415084838866, "rewards/wrapper/mean": 3.9692660093307497, "rewards/wrapper/std": 13.967398370802403, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 254.20625, "completions/mean_terminated_length": 45.0, "completions/min_length": 198.6, "completions/min_terminated_length": 45.0, "epoch": 0.1372434017595308, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.01811500685289502, "learning_rate": 7.855735633752014e-06, "loss": 0.0048, "num_tokens": 6391489.0, "reward": 7.980970191955566, "reward_std": 9.922795867919922, "rewards/wrapper/mean": 3.9904851004481317, "rewards/wrapper/std": 13.737288218736648, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 40.4, "completions/mean_length": 249.45625, "completions/mean_terminated_length": 23.4, "completions/min_length": 108.8, "completions/min_terminated_length": 6.4, "epoch": 0.13841642228739004, "frac_reward_zero_std": 0.025, "grad_norm": 2.140625, "kl": 0.008395724120782688, "learning_rate": 7.854622201702398e-06, "loss": -0.021, "num_tokens": 6445788.0, "reward": 17.261702919006346, "reward_std": 22.740974617004394, "rewards/wrapper/mean": 8.630851200222969, "rewards/wrapper/std": 25.095195826888084, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.13958944281524927, "frac_reward_zero_std": 0.05, "grad_norm": 2.234375, "kl": 0.006187805708032101, "learning_rate": 7.8534988175432e-06, "loss": -0.0054, "num_tokens": 6500061.0, "reward": 4.855136448144913, "reward_std": 6.186183905601501, "rewards/wrapper/mean": 2.4275682747364042, "rewards/wrapper/std": 8.418499158322811, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.14076246334310852, "frac_reward_zero_std": 0.0125, "grad_norm": 5.84375, "kl": 0.005689225665992126, "learning_rate": 7.852365485120821e-06, "loss": -0.0123, "num_tokens": 6553787.0, "reward": 11.236301851272582, "reward_std": 12.74597978591919, "rewards/wrapper/mean": 5.618150828778743, "rewards/wrapper/std": 13.135591895878315, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 254.64375, "completions/mean_terminated_length": 59.0, "completions/min_length": 212.6, "completions/min_terminated_length": 59.0, "epoch": 0.14193548387096774, "frac_reward_zero_std": 0.0125, "grad_norm": 1.34375, "kl": 0.006667055486468598, "learning_rate": 7.851222208315726e-06, "loss": -0.0047, "num_tokens": 6607744.0, "reward": 7.296946382522583, "reward_std": 9.803734683990479, "rewards/wrapper/mean": 3.648472948372364, "rewards/wrapper/std": 13.976402992010117, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 248.2625, "completions/mean_terminated_length": 4.3, "completions/min_length": 104.0, "completions/min_terminated_length": 1.6, "epoch": 0.14310850439882697, "frac_reward_zero_std": 0.0375, "grad_norm": 4.65625, "kl": 0.023930460814153776, "learning_rate": 7.850068991042432e-06, "loss": -0.0114, "num_tokens": 6661002.0, "reward": 7.392018556594849, "reward_std": 8.457538390159607, "rewards/wrapper/mean": 3.696009010076523, "rewards/wrapper/std": 10.95611379891634, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 251.94375, "completions/mean_terminated_length": 75.0, "completions/min_length": 126.2, "completions/min_terminated_length": 75.0, "epoch": 0.14428152492668622, "frac_reward_zero_std": 0.025, "grad_norm": 1.671875, "kl": 17.328260349377523, "learning_rate": 7.848905837249485e-06, "loss": 0.6789, "num_tokens": 6717383.0, "reward": 6.301204872131348, "reward_std": 8.14721348285675, "rewards/wrapper/mean": 3.150602462887764, "rewards/wrapper/std": 10.698176135122775, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.225, "completions/mean_terminated_length": 0.5, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.14545454545454545, "frac_reward_zero_std": 0.0, "grad_norm": 5.0625, "kl": 0.007349775591865182, "learning_rate": 7.847732750919463e-06, "loss": -0.0085, "num_tokens": 6771751.0, "reward": 15.633229780197144, "reward_std": 19.575571060180664, "rewards/wrapper/mean": 7.816615104675293, "rewards/wrapper/std": 21.27004445493221, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 35.6, "completions/mean_length": 252.68125, "completions/mean_terminated_length": 31.0, "completions/min_length": 180.0, "completions/min_terminated_length": 26.4, "epoch": 0.1466275659824047, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 1.286929903679993, "learning_rate": 7.846549736068945e-06, "loss": 0.0417, "num_tokens": 6829496.0, "reward": 10.100393962860107, "reward_std": 10.381993126869201, "rewards/wrapper/mean": 5.050196871161461, "rewards/wrapper/std": 15.772063083946705, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 1.2, "completions/mean_length": 254.4375, "completions/mean_terminated_length": 1.2, "completions/min_length": 206.0, "completions/min_terminated_length": 1.2, "epoch": 0.14780058651026393, "frac_reward_zero_std": 0.0125, "grad_norm": 1.78125, "kl": 0.030532787676202135, "learning_rate": 7.845356796748507e-06, "loss": -0.0012, "num_tokens": 6881764.0, "reward": 8.524028420448303, "reward_std": 9.672714823484421, "rewards/wrapper/mean": 4.262014053016901, "rewards/wrapper/std": 12.955846460163594, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 62.2, "completions/mean_length": 249.95, "completions/mean_terminated_length": 61.6, "completions/min_length": 112.2, "completions/min_terminated_length": 61.0, "epoch": 0.14897360703812318, "frac_reward_zero_std": 0.0375, "grad_norm": 1.8203125, "kl": 0.008349815453402697, "learning_rate": 7.844153937042703e-06, "loss": 0.0055, "num_tokens": 6937332.0, "reward": 7.351616859436035, "reward_std": 9.709289264678954, "rewards/wrapper/mean": 3.67580828666687, "rewards/wrapper/std": 11.747823464870454, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 62.8, "completions/mean_length": 250.15, "completions/mean_terminated_length": 35.7, "completions/min_length": 111.0, "completions/min_terminated_length": 8.6, "epoch": 0.1501466275659824, "frac_reward_zero_std": 0.05, "grad_norm": 1.0859375, "kl": 0.005067132035037503, "learning_rate": 7.84294116107006e-06, "loss": -0.0055, "num_tokens": 6991852.0, "reward": 15.24313793182373, "reward_std": 19.83392467498779, "rewards/wrapper/mean": 7.621568508446217, "rewards/wrapper/std": 21.900732143223287, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 5.4, "completions/mean_length": 254.56875, "completions/mean_terminated_length": 5.4, "completions/min_length": 210.2, "completions/min_terminated_length": 5.4, "epoch": 0.15131964809384163, "frac_reward_zero_std": 0.0125, "grad_norm": 6.5625, "kl": 0.22633971532341093, "learning_rate": 7.841718472983054e-06, "loss": 0.0046, "num_tokens": 7047317.0, "reward": 11.10000295639038, "reward_std": 13.981466102600098, "rewards/wrapper/mean": 5.550001335144043, "rewards/wrapper/std": 15.351305815577508, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 3.4, "completions/mean_length": 254.50625, "completions/mean_terminated_length": 3.4, "completions/min_length": 208.2, "completions/min_terminated_length": 3.4, "epoch": 0.15249266862170088, "frac_reward_zero_std": 0.0125, "grad_norm": 35.25, "kl": 0.05593276094878093, "learning_rate": 7.840485876968097e-06, "loss": -0.0024, "num_tokens": 7099374.0, "reward": 12.740144157409668, "reward_std": 14.847570991516113, "rewards/wrapper/mean": 6.3700722143054005, "rewards/wrapper/std": 17.99122847020626, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.1536656891495601, "frac_reward_zero_std": 0.025, "grad_norm": 4.46875, "kl": 0.010588540998287498, "learning_rate": 7.839243377245529e-06, "loss": -0.004, "num_tokens": 7152777.0, "reward": 8.608040571212769, "reward_std": 9.49074192047119, "rewards/wrapper/mean": 4.304020477086306, "rewards/wrapper/std": 12.330823975801469, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 8.6, "completions/mean_length": 253.06875, "completions/mean_terminated_length": 8.6, "completions/min_length": 162.2, "completions/min_terminated_length": 8.6, "epoch": 0.15483870967741936, "frac_reward_zero_std": 0.0125, "grad_norm": 2.828125, "kl": 0.23861043564975262, "learning_rate": 7.8379909780696e-06, "loss": 0.0039, "num_tokens": 7208812.0, "reward": 9.388974571228028, "reward_std": 11.66281862258911, "rewards/wrapper/mean": 4.694487226009369, "rewards/wrapper/std": 13.111043818295002, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 252.16875, "completions/mean_terminated_length": 31.0, "completions/min_length": 133.4, "completions/min_terminated_length": 31.0, "epoch": 0.1560117302052786, "frac_reward_zero_std": 0.025, "grad_norm": 5.0, "kl": 0.005851204495411366, "learning_rate": 7.836728683728452e-06, "loss": 0.0058, "num_tokens": 7263149.0, "reward": 11.78786849975586, "reward_std": 15.879752826690673, "rewards/wrapper/mean": 5.893934021890163, "rewards/wrapper/std": 17.91351638287306, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 17.8, "completions/mean_length": 253.35625, "completions/mean_terminated_length": 17.8, "completions/min_length": 171.4, "completions/min_terminated_length": 17.8, "epoch": 0.15718475073313784, "frac_reward_zero_std": 0.0125, "grad_norm": 2.96875, "kl": 0.00533767455490306, "learning_rate": 7.83545649854411e-06, "loss": -0.0037, "num_tokens": 7317060.0, "reward": 9.627443599700928, "reward_std": 10.127561569213867, "rewards/wrapper/mean": 4.813721719384193, "rewards/wrapper/std": 14.26107615828514, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 23.6, "completions/mean_length": 250.34375, "completions/mean_terminated_length": 22.3, "completions/min_length": 123.4, "completions/min_terminated_length": 21.0, "epoch": 0.15835777126099707, "frac_reward_zero_std": 0.0125, "grad_norm": 2.375, "kl": 0.005868964816909283, "learning_rate": 7.834174426872463e-06, "loss": -0.0032, "num_tokens": 7369351.0, "reward": 9.649084949493409, "reward_std": 12.152606201171874, "rewards/wrapper/mean": 4.824542417377233, "rewards/wrapper/std": 15.261625829339028, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 53.8, "completions/mean_length": 251.2875, "completions/mean_terminated_length": 50.6, "completions/min_length": 149.8, "completions/min_terminated_length": 47.4, "epoch": 0.15953079178885632, "frac_reward_zero_std": 0.025, "grad_norm": 2.09375, "kl": 0.00833343998529017, "learning_rate": 7.832882473103254e-06, "loss": -0.0141, "num_tokens": 7426519.0, "reward": 14.143692588806152, "reward_std": 15.034424209594727, "rewards/wrapper/mean": 7.0718462012708185, "rewards/wrapper/std": 21.05203797221184, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 49.4, "completions/mean_length": 252.74375, "completions/mean_terminated_length": 49.4, "completions/min_length": 151.8, "completions/min_terminated_length": 49.4, "epoch": 0.16070381231671554, "frac_reward_zero_std": 0.025, "grad_norm": 1.5, "kl": 0.010165076283738018, "learning_rate": 7.831580641660056e-06, "loss": -0.0069, "num_tokens": 7479772.0, "reward": 8.289137840270996, "reward_std": 11.20027780532837, "rewards/wrapper/mean": 4.1445689931511875, "rewards/wrapper/std": 12.911294972896576, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 76.2, "completions/mean_length": 253.5875, "completions/mean_terminated_length": 50.7, "completions/min_length": 178.8, "completions/min_terminated_length": 25.2, "epoch": 0.16187683284457477, "frac_reward_zero_std": 0.025, "grad_norm": 4.03125, "kl": 0.00548412193893455, "learning_rate": 7.83026893700027e-06, "loss": -0.0066, "num_tokens": 7534518.0, "reward": 13.755285120010376, "reward_std": 13.63194980621338, "rewards/wrapper/mean": 6.877642697840929, "rewards/wrapper/std": 19.16850001066923, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 251.98125, "completions/mean_terminated_length": 63.2, "completions/min_length": 152.8, "completions/min_terminated_length": 50.4, "epoch": 0.16304985337243402, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.011029014608357101, "learning_rate": 7.828947363615096e-06, "loss": -0.0103, "num_tokens": 7590827.0, "reward": 12.973630714416505, "reward_std": 17.048142385482787, "rewards/wrapper/mean": 6.4868153288960455, "rewards/wrapper/std": 19.200295877456664, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 248.0375, "completions/mean_terminated_length": 0.7, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.16422287390029325, "frac_reward_zero_std": 0.0125, "grad_norm": 1.546875, "kl": 0.006441613682545722, "learning_rate": 7.827615926029526e-06, "loss": -0.0086, "num_tokens": 7645821.0, "reward": 9.089998376369476, "reward_std": 10.087905651330948, "rewards/wrapper/mean": 4.544999056309462, "rewards/wrapper/std": 13.267100870609283, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.2, "completions/mean_length": 250.3625, "completions/mean_terminated_length": 23.7, "completions/min_length": 125.6, "completions/min_terminated_length": 23.2, "epoch": 0.1653958944281525, "frac_reward_zero_std": 0.0125, "grad_norm": 1.2734375, "kl": 0.0045943191333208235, "learning_rate": 7.826274628802327e-06, "loss": -0.0068, "num_tokens": 7700815.0, "reward": 8.232522821426391, "reward_std": 11.1143310546875, "rewards/wrapper/mean": 4.116261105984449, "rewards/wrapper/std": 12.995249216258525, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 6.8, "completions/mean_length": 253.01875, "completions/mean_terminated_length": 3.5, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.16656891495601173, "frac_reward_zero_std": 0.0125, "grad_norm": 5.40625, "kl": 0.007572338276077062, "learning_rate": 7.824923476526026e-06, "loss": -0.0044, "num_tokens": 7755876.0, "reward": 13.800330543518067, "reward_std": 17.5712028503418, "rewards/wrapper/mean": 6.900165131688118, "rewards/wrapper/std": 18.640123146772385, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 27.2, "completions/mean_length": 253.65, "completions/mean_terminated_length": 27.2, "completions/min_length": 180.8, "completions/min_terminated_length": 27.2, "epoch": 0.16774193548387098, "frac_reward_zero_std": 0.05, "grad_norm": 3.84375, "kl": 0.006318861967884004, "learning_rate": 7.823562473826892e-06, "loss": -0.0044, "num_tokens": 7813810.0, "reward": 7.7993542671203615, "reward_std": 10.558793354034425, "rewards/wrapper/mean": 3.8996770560741423, "rewards/wrapper/std": 12.23674759566784, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 252.14375, "completions/mean_terminated_length": 15.2, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.1689149560117302, "frac_reward_zero_std": 0.075, "grad_norm": 2.5625, "kl": 0.008202216494828463, "learning_rate": 7.822191625364916e-06, "loss": 0.0079, "num_tokens": 7869451.0, "reward": 18.051965522766114, "reward_std": 21.227079582214355, "rewards/wrapper/mean": 9.02598342001438, "rewards/wrapper/std": 20.78606193512678, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.81875, "completions/mean_terminated_length": 0.3, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.17008797653958943, "frac_reward_zero_std": 0.05, "grad_norm": 7.8125, "kl": 0.008373394550289959, "learning_rate": 7.820810935833813e-06, "loss": -0.0059, "num_tokens": 7922738.0, "reward": 13.169144535064698, "reward_std": 18.08014087677002, "rewards/wrapper/mean": 6.5845720142126085, "rewards/wrapper/std": 18.33825700432062, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 25.4, "completions/mean_length": 251.99375, "completions/mean_terminated_length": 25.4, "completions/min_length": 127.8, "completions/min_terminated_length": 25.4, "epoch": 0.17126099706744868, "frac_reward_zero_std": 0.0375, "grad_norm": 2.375, "kl": 0.006507234089076519, "learning_rate": 7.819420409960982e-06, "loss": -0.0115, "num_tokens": 7979947.0, "reward": 5.9373430728912355, "reward_std": 7.835532140731812, "rewards/wrapper/mean": 2.9686716251075267, "rewards/wrapper/std": 8.117770229279994, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.1724340175953079, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.0059782372671179475, "learning_rate": 7.818020052507503e-06, "loss": -0.0035, "num_tokens": 8033822.0, "reward": 8.168022966384887, "reward_std": 10.164891624450684, "rewards/wrapper/mean": 4.084011057019234, "rewards/wrapper/std": 12.692610569298267, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 52.2, "completions/mean_length": 252.83125, "completions/mean_terminated_length": 52.2, "completions/min_length": 154.6, "completions/min_terminated_length": 52.2, "epoch": 0.17360703812316716, "frac_reward_zero_std": 0.0125, "grad_norm": 1.6640625, "kl": 0.006373942282516509, "learning_rate": 7.816609868268123e-06, "loss": 0.0013, "num_tokens": 8085969.0, "reward": 13.514916610717773, "reward_std": 16.967267417907713, "rewards/wrapper/mean": 6.757457870990038, "rewards/wrapper/std": 20.913749350607397, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 14.6, "completions/mean_length": 250.0625, "completions/mean_terminated_length": 14.6, "completions/min_length": 117.0, "completions/min_terminated_length": 14.6, "epoch": 0.1747800586510264, "frac_reward_zero_std": 0.025, "grad_norm": 7.71875, "kl": 0.015622739831451326, "learning_rate": 7.81518986207123e-06, "loss": -0.0026, "num_tokens": 8141105.0, "reward": 9.454648804664611, "reward_std": 12.46040769815445, "rewards/wrapper/mean": 4.727324414253235, "rewards/wrapper/std": 14.982401445508003, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 30.8, "completions/mean_length": 250.56875, "completions/mean_terminated_length": 15.7, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.17595307917888564, "frac_reward_zero_std": 0.0125, "grad_norm": 1.890625, "kl": 0.023395044403150676, "learning_rate": 7.81376003877885e-06, "loss": -0.0032, "num_tokens": 8195538.0, "reward": 5.900467705726624, "reward_std": 7.061243009567261, "rewards/wrapper/mean": 2.950233814120293, "rewards/wrapper/std": 9.529572576284409, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 9.4, "completions/mean_length": 249.94375, "completions/mean_terminated_length": 6.9, "completions/min_length": 106.8, "completions/min_terminated_length": 4.4, "epoch": 0.17712609970674487, "frac_reward_zero_std": 0.025, "grad_norm": 2.734375, "kl": 0.027390728006139398, "learning_rate": 7.812320403286612e-06, "loss": -0.0116, "num_tokens": 8248127.0, "reward": 7.320503807067871, "reward_std": 7.543186902999878, "rewards/wrapper/mean": 3.660252057760954, "rewards/wrapper/std": 10.80094509869814, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 33.8, "completions/mean_length": 253.85625, "completions/mean_terminated_length": 33.8, "completions/min_length": 187.4, "completions/min_terminated_length": 33.8, "epoch": 0.17829912023460412, "frac_reward_zero_std": 0.025, "grad_norm": 5.875, "kl": 0.022175381588749588, "learning_rate": 7.810870960523749e-06, "loss": -0.0016, "num_tokens": 8301810.0, "reward": 8.605068969726563, "reward_std": 11.631190872192382, "rewards/wrapper/mean": 4.302534601837396, "rewards/wrapper/std": 15.27453635185957, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 6.8, "completions/mean_length": 253.0125, "completions/mean_terminated_length": 6.8, "completions/min_length": 160.4, "completions/min_terminated_length": 6.8, "epoch": 0.17947214076246334, "frac_reward_zero_std": 0.025, "grad_norm": 1.21875, "kl": 0.007904288393910974, "learning_rate": 7.809411715453069e-06, "loss": -0.0078, "num_tokens": 8357124.0, "reward": 9.56756021976471, "reward_std": 11.147935009002685, "rewards/wrapper/mean": 4.7837800078094, "rewards/wrapper/std": 16.28676289319992, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 252.95625, "completions/mean_terminated_length": 103.8, "completions/min_length": 203.0, "completions/min_terminated_length": 100.6, "epoch": 0.18064516129032257, "frac_reward_zero_std": 0.0375, "grad_norm": 2.328125, "kl": 0.02297303997911513, "learning_rate": 7.807942673070945e-06, "loss": -0.0041, "num_tokens": 8413391.0, "reward": 10.628111362457275, "reward_std": 12.795229434967041, "rewards/wrapper/mean": 5.314055364578962, "rewards/wrapper/std": 16.77784028351307, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 51.4, "completions/mean_length": 251.3625, "completions/mean_terminated_length": 31.8, "completions/min_length": 114.6, "completions/min_terminated_length": 12.2, "epoch": 0.18181818181818182, "frac_reward_zero_std": 0.0375, "grad_norm": 1.3046875, "kl": 0.006928546976996586, "learning_rate": 7.806463838407295e-06, "loss": -0.008, "num_tokens": 8466755.0, "reward": 13.176506042480469, "reward_std": 16.891181087493898, "rewards/wrapper/mean": 6.588253006339073, "rewards/wrapper/std": 16.821544279158115, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 31.2, "completions/mean_length": 252.18125, "completions/mean_terminated_length": 31.2, "completions/min_length": 184.8, "completions/min_terminated_length": 31.2, "epoch": 0.18299120234604105, "frac_reward_zero_std": 0.0375, "grad_norm": 2.640625, "kl": 0.07739391865034122, "learning_rate": 7.804975216525566e-06, "loss": -0.0047, "num_tokens": 8521826.0, "reward": 6.241498494148255, "reward_std": 8.379155158996582, "rewards/wrapper/mean": 3.1207491770386695, "rewards/wrapper/std": 10.579415337741375, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 252.9875, "completions/mean_terminated_length": 6.0, "completions/min_length": 159.6, "completions/min_terminated_length": 6.0, "epoch": 0.1841642228739003, "frac_reward_zero_std": 0.0625, "grad_norm": 2.046875, "kl": 0.00618872475461103, "learning_rate": 7.803476812522711e-06, "loss": -0.0075, "num_tokens": 8577354.0, "reward": 12.9086124420166, "reward_std": 16.800788116455077, "rewards/wrapper/mean": 6.454306278377771, "rewards/wrapper/std": 19.88245558142662, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 254.49375, "completions/mean_terminated_length": 3.0, "completions/min_length": 207.8, "completions/min_terminated_length": 3.0, "epoch": 0.18533724340175953, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.01572358557023108, "learning_rate": 7.801968631529187e-06, "loss": -0.0041, "num_tokens": 8631829.0, "reward": 7.105897712707519, "reward_std": 8.864458084106445, "rewards/wrapper/mean": 3.552948968857527, "rewards/wrapper/std": 12.096154929697514, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 31.2, "completions/mean_length": 255.375, "completions/mean_terminated_length": 31.2, "completions/min_length": 236.0, "completions/min_terminated_length": 31.2, "epoch": 0.18651026392961878, "frac_reward_zero_std": 0.05, "grad_norm": 5.25, "kl": 0.04917837719549425, "learning_rate": 7.800450678708914e-06, "loss": 0.0033, "num_tokens": 8687353.0, "reward": 15.010721778869629, "reward_std": 16.94651641845703, "rewards/wrapper/mean": 7.5053609274327755, "rewards/wrapper/std": 22.161951984465123, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 250.5875, "completions/mean_terminated_length": 30.7, "completions/min_length": 132.8, "completions/min_terminated_length": 30.4, "epoch": 0.187683284457478, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.005762098281411454, "learning_rate": 7.79892295925928e-06, "loss": -0.0046, "num_tokens": 8739759.0, "reward": 11.818170356750489, "reward_std": 12.876243591308594, "rewards/wrapper/mean": 5.909084708243609, "rewards/wrapper/std": 18.03155415803194, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 37.2, "completions/mean_length": 252.4875, "completions/mean_terminated_length": 32.5, "completions/min_length": 181.4, "completions/min_terminated_length": 27.8, "epoch": 0.18885630498533723, "frac_reward_zero_std": 0.0375, "grad_norm": 1.984375, "kl": 0.028124336060136555, "learning_rate": 7.797385478411107e-06, "loss": -0.002, "num_tokens": 8794821.0, "reward": 7.452442216873169, "reward_std": 8.096717083454132, "rewards/wrapper/mean": 3.7262209847569467, "rewards/wrapper/std": 10.810575023293495, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 19.6, "completions/mean_length": 247.0375, "completions/mean_terminated_length": 9.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.19002932551319648, "frac_reward_zero_std": 0.0375, "grad_norm": 524.0, "kl": 0.06574524050229229, "learning_rate": 7.795838241428644e-06, "loss": 0.0005, "num_tokens": 8849903.0, "reward": 7.610436058044433, "reward_std": 9.26673491001129, "rewards/wrapper/mean": 3.805217783153057, "rewards/wrapper/std": 13.05225038230419, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.1912023460410557, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.032787256489973514, "learning_rate": 7.794281253609542e-06, "loss": 0.0038, "num_tokens": 8903826.0, "reward": 9.780489444732666, "reward_std": 13.06036205291748, "rewards/wrapper/mean": 4.890244487673044, "rewards/wrapper/std": 16.01880385428667, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.8, "completions/mean_length": 252.10625, "completions/mean_terminated_length": 28.8, "completions/min_length": 182.4, "completions/min_terminated_length": 28.8, "epoch": 0.19237536656891496, "frac_reward_zero_std": 0.0375, "grad_norm": 3.890625, "kl": 0.007354407967068255, "learning_rate": 7.79271452028484e-06, "loss": -0.0057, "num_tokens": 8960313.0, "reward": 12.27817931175232, "reward_std": 16.556332683563234, "rewards/wrapper/mean": 6.139089624583721, "rewards/wrapper/std": 15.078392013907433, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 49.4, "completions/mean_length": 252.98125, "completions/mean_terminated_length": 37.0, "completions/min_length": 178.2, "completions/min_terminated_length": 24.6, "epoch": 0.1935483870967742, "frac_reward_zero_std": 0.0375, "grad_norm": 1.328125, "kl": 0.007571105333045125, "learning_rate": 7.791138046818944e-06, "loss": -0.0037, "num_tokens": 9014292.0, "reward": 14.692852687835693, "reward_std": 19.98781144618988, "rewards/wrapper/mean": 7.346426869183778, "rewards/wrapper/std": 20.598740892112254, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 1.6, "completions/mean_length": 251.25, "completions/mean_terminated_length": 1.6, "completions/min_length": 104.0, "completions/min_terminated_length": 1.6, "epoch": 0.19472140762463344, "frac_reward_zero_std": 0.0375, "grad_norm": 4.78125, "kl": 0.009129003703128547, "learning_rate": 7.78955183860961e-06, "loss": -0.0121, "num_tokens": 9070082.0, "reward": 9.678040361404419, "reward_std": 12.95136342048645, "rewards/wrapper/mean": 4.839020009338856, "rewards/wrapper/std": 14.712241315841675, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 249.91875, "completions/mean_terminated_length": 6.5, "completions/min_length": 105.4, "completions/min_terminated_length": 3.0, "epoch": 0.19589442815249267, "frac_reward_zero_std": 0.0125, "grad_norm": 4.71875, "kl": 0.00882287950371392, "learning_rate": 7.787955901087924e-06, "loss": -0.0139, "num_tokens": 9126347.0, "reward": 8.241892063617707, "reward_std": 11.121530401706696, "rewards/wrapper/mean": 4.120945824682712, "rewards/wrapper/std": 13.859178911149503, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.19706744868035192, "frac_reward_zero_std": 0.025, "grad_norm": 1.265625, "kl": 0.0057106147520244125, "learning_rate": 7.786350239718285e-06, "loss": -0.0104, "num_tokens": 9184344.0, "reward": 7.929042911529541, "reward_std": 10.194645261764526, "rewards/wrapper/mean": 3.9645213529467584, "rewards/wrapper/std": 12.511058503389359, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 43.2, "completions/mean_length": 255.75, "completions/mean_terminated_length": 43.2, "completions/min_length": 248.0, "completions/min_terminated_length": 43.2, "epoch": 0.19824046920821115, "frac_reward_zero_std": 0.0375, "grad_norm": 2.546875, "kl": 0.010326540493406356, "learning_rate": 7.784734859998386e-06, "loss": 0.0013, "num_tokens": 9241426.0, "reward": 12.294482326507568, "reward_std": 14.175844478607178, "rewards/wrapper/mean": 6.14724093079567, "rewards/wrapper/std": 16.098360952734946, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.19941348973607037, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.006363995838910341, "learning_rate": 7.783109767459199e-06, "loss": -0.0111, "num_tokens": 9294562.0, "reward": 12.141466617584229, "reward_std": 16.330566787719725, "rewards/wrapper/mean": 6.070733168721199, "rewards/wrapper/std": 16.94215931892395, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 54.8, "completions/mean_length": 245.30625, "completions/mean_terminated_length": 29.5, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.20058651026392962, "frac_reward_zero_std": 0.05, "grad_norm": 4.15625, "kl": 0.007616071903612465, "learning_rate": 7.781474967664944e-06, "loss": -0.018, "num_tokens": 9348237.0, "reward": 7.399131870269775, "reward_std": 9.726300144195557, "rewards/wrapper/mean": 3.699565923213959, "rewards/wrapper/std": 13.051005025207996, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94375, "completions/max_length": 256.0, "completions/max_terminated_length": 53.4, "completions/mean_length": 243.3, "completions/mean_terminated_length": 18.33333339691162, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.20175953079178885, "frac_reward_zero_std": 0.0375, "grad_norm": 3.21875, "kl": 0.01259395177476108, "learning_rate": 7.779830466213087e-06, "loss": -0.0218, "num_tokens": 9401783.0, "reward": 9.727622842788696, "reward_std": 11.499066877365113, "rewards/wrapper/mean": 4.863811122626066, "rewards/wrapper/std": 11.483076599240302, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 252.925, "completions/mean_terminated_length": 4.0, "completions/min_length": 157.6, "completions/min_terminated_length": 4.0, "epoch": 0.2029325513196481, "frac_reward_zero_std": 0.05, "grad_norm": 5.5, "kl": 0.018720851698890328, "learning_rate": 7.778176268734307e-06, "loss": -0.0095, "num_tokens": 9464335.0, "reward": 11.788486242294312, "reward_std": 14.090129899978638, "rewards/wrapper/mean": 5.894242788851261, "rewards/wrapper/std": 16.64428468346596, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 249.625, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.20410557184750733, "frac_reward_zero_std": 0.0375, "grad_norm": 3.015625, "kl": 12.23134752092883, "learning_rate": 7.776512380892478e-06, "loss": 0.4802, "num_tokens": 9518197.0, "reward": 12.99444284439087, "reward_std": 14.776373958587646, "rewards/wrapper/mean": 6.497221313416958, "rewards/wrapper/std": 17.821214818954466, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 247.7625, "completions/mean_terminated_length": 36.1, "completions/min_length": 131.6, "completions/min_terminated_length": 29.2, "epoch": 0.20527859237536658, "frac_reward_zero_std": 0.0125, "grad_norm": 1.5546875, "kl": 0.22191111339488998, "learning_rate": 7.774838808384665e-06, "loss": -0.0061, "num_tokens": 9574929.0, "reward": 8.010742235183717, "reward_std": 8.707789611816406, "rewards/wrapper/mean": 4.005371156334877, "rewards/wrapper/std": 11.873735588788985, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 249.74375, "completions/mean_terminated_length": 28.0, "completions/min_length": 130.4, "completions/min_terminated_length": 28.0, "epoch": 0.2064516129032258, "frac_reward_zero_std": 0.025, "grad_norm": 2.234375, "kl": 0.0091151007974986, "learning_rate": 7.773155556941077e-06, "loss": -0.0074, "num_tokens": 9628886.0, "reward": 6.747600078582764, "reward_std": 8.869162845611573, "rewards/wrapper/mean": 3.3737999342381952, "rewards/wrapper/std": 12.170971043407917, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 1.4, "completions/mean_length": 249.64375, "completions/mean_terminated_length": 1.4, "completions/min_length": 52.6, "completions/min_terminated_length": 1.4, "epoch": 0.20762463343108503, "frac_reward_zero_std": 0.05, "grad_norm": 4.53125, "kl": 0.008626295503927395, "learning_rate": 7.771462632325079e-06, "loss": -0.0022, "num_tokens": 9684261.0, "reward": 6.184603309631347, "reward_std": 8.200012350082398, "rewards/wrapper/mean": 3.0923014655709267, "rewards/wrapper/std": 11.65102232992649, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 249.85625, "completions/mean_terminated_length": 4.3, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.20879765395894428, "frac_reward_zero_std": 0.025, "grad_norm": 3.75, "kl": 0.006855689559597522, "learning_rate": 7.769760040333146e-06, "loss": -0.0166, "num_tokens": 9743200.0, "reward": 10.165173721313476, "reward_std": 13.681201267242432, "rewards/wrapper/mean": 5.082586967200041, "rewards/wrapper/std": 15.366864316165447, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 72.4, "completions/mean_length": 250.275, "completions/mean_terminated_length": 72.4, "completions/min_length": 174.8, "completions/min_terminated_length": 72.4, "epoch": 0.2099706744868035, "frac_reward_zero_std": 0.0125, "grad_norm": 1.765625, "kl": 0.006981910020112991, "learning_rate": 7.768047786794854e-06, "loss": -0.0108, "num_tokens": 9798840.0, "reward": 12.545556449890137, "reward_std": 16.114229202270508, "rewards/wrapper/mean": 6.272777940332889, "rewards/wrapper/std": 16.644485236704348, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 254.825, "completions/mean_terminated_length": 13.6, "completions/min_length": 218.4, "completions/min_terminated_length": 13.6, "epoch": 0.21114369501466276, "frac_reward_zero_std": 0.025, "grad_norm": 2.890625, "kl": 0.008089781797025353, "learning_rate": 7.766325877572865e-06, "loss": -0.0024, "num_tokens": 9855976.0, "reward": 8.491017055511474, "reward_std": 9.61400227546692, "rewards/wrapper/mean": 4.245508745312691, "rewards/wrapper/std": 13.363166551291943, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 12.6, "completions/mean_length": 251.6, "completions/mean_terminated_length": 6.5, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.212316715542522, "frac_reward_zero_std": 0.025, "grad_norm": 2.375, "kl": 85.43212811666308, "learning_rate": 7.764594318562897e-06, "loss": 3.401, "num_tokens": 9910584.0, "reward": 12.041954612731933, "reward_std": 12.423308753967286, "rewards/wrapper/mean": 6.020977398753166, "rewards/wrapper/std": 16.532723309099673, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 46.4, "completions/mean_length": 252.65, "completions/mean_terminated_length": 46.4, "completions/min_length": 148.8, "completions/min_terminated_length": 46.4, "epoch": 0.21348973607038124, "frac_reward_zero_std": 0.025, "grad_norm": 2.703125, "kl": 0.02620959288906306, "learning_rate": 7.76285311569371e-06, "loss": -0.0118, "num_tokens": 9969166.0, "reward": 10.540363311767578, "reward_std": 12.41097354888916, "rewards/wrapper/mean": 5.270181411504746, "rewards/wrapper/std": 16.059864945709705, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.21466275659824047, "frac_reward_zero_std": 0.0125, "grad_norm": 1.078125, "kl": 0.006863275967771187, "learning_rate": 7.761102274927087e-06, "loss": -0.006, "num_tokens": 10026009.0, "reward": 10.822275352478027, "reward_std": 12.488915252685548, "rewards/wrapper/mean": 5.411137568205595, "rewards/wrapper/std": 17.650423718988897, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 36.6, "completions/mean_length": 252.6125, "completions/mean_terminated_length": 22.7, "completions/min_length": 162.4, "completions/min_terminated_length": 8.8, "epoch": 0.21583577712609972, "frac_reward_zero_std": 0.05, "grad_norm": 2.65625, "kl": 0.012789941893424838, "learning_rate": 7.759341802257804e-06, "loss": -0.0037, "num_tokens": 10081561.0, "reward": 8.169697475433349, "reward_std": 10.355749702453613, "rewards/wrapper/mean": 4.084848717600107, "rewards/wrapper/std": 14.507414634525777, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 58.8, "completions/mean_length": 248.25625, "completions/mean_terminated_length": 47.0, "completions/min_length": 143.2, "completions/min_terminated_length": 40.8, "epoch": 0.21700879765395895, "frac_reward_zero_std": 0.025, "grad_norm": 2.421875, "kl": 0.0058027503313496705, "learning_rate": 7.75757170371362e-06, "loss": -0.0189, "num_tokens": 10138170.0, "reward": 9.064529609680175, "reward_std": 10.007425928115845, "rewards/wrapper/mean": 4.532264867424965, "rewards/wrapper/std": 15.986943626403809, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 53.2, "completions/mean_length": 251.2625, "completions/mean_terminated_length": 53.2, "completions/min_length": 104.4, "completions/min_terminated_length": 53.2, "epoch": 0.21818181818181817, "frac_reward_zero_std": 0.0125, "grad_norm": 1.8046875, "kl": 0.01003794745192863, "learning_rate": 7.755791985355252e-06, "loss": -0.0052, "num_tokens": 10188544.0, "reward": 12.584936428070069, "reward_std": 16.22059907913208, "rewards/wrapper/mean": 6.292468182742596, "rewards/wrapper/std": 17.238589255511762, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 2.6, "completions/mean_length": 252.8875, "completions/mean_terminated_length": 1.4, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.21935483870967742, "frac_reward_zero_std": 0.0375, "grad_norm": 2.328125, "kl": 0.10706454229075461, "learning_rate": 7.754002653276356e-06, "loss": -0.0062, "num_tokens": 10240576.0, "reward": 8.845070493221282, "reward_std": 11.477675139904022, "rewards/wrapper/mean": 4.422535435855389, "rewards/wrapper/std": 14.686181424558162, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 57.4, "completions/mean_length": 253.86875, "completions/mean_terminated_length": 46.0, "completions/min_length": 188.2, "completions/min_terminated_length": 34.6, "epoch": 0.22052785923753665, "frac_reward_zero_std": 0.025, "grad_norm": 2.421875, "kl": 0.012510137457866222, "learning_rate": 7.752203713603501e-06, "loss": -0.0001, "num_tokens": 10297877.0, "reward": 15.567786598205567, "reward_std": 18.3408540725708, "rewards/wrapper/mean": 7.783893074095249, "rewards/wrapper/std": 18.60699598044157, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 48.8, "completions/mean_length": 252.73125, "completions/mean_terminated_length": 24.6, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.2217008797653959, "frac_reward_zero_std": 0.0125, "grad_norm": 206.0, "kl": 0.020914323756005614, "learning_rate": 7.750395172496158e-06, "loss": -0.0082, "num_tokens": 10353394.0, "reward": 7.288831424713135, "reward_std": 9.706880664825439, "rewards/wrapper/mean": 3.6444156602025033, "rewards/wrapper/std": 11.424931126832963, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 5.2, "completions/mean_length": 248.175, "completions/mean_terminated_length": 5.2, "completions/min_length": 107.6, "completions/min_terminated_length": 5.2, "epoch": 0.22287390029325513, "frac_reward_zero_std": 0.0, "grad_norm": 6.875, "kl": 0.015235843160189689, "learning_rate": 7.748577036146666e-06, "loss": -0.0159, "num_tokens": 10408856.0, "reward": 7.371206140518188, "reward_std": 8.97930736541748, "rewards/wrapper/mean": 3.6856027841567993, "rewards/wrapper/std": 11.833311099559069, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.22404692082111438, "frac_reward_zero_std": 0.0125, "grad_norm": 11.125, "kl": 0.014451591449324041, "learning_rate": 7.746749310780223e-06, "loss": -0.0044, "num_tokens": 10463535.0, "reward": 11.889365959167481, "reward_std": 8.928557300567627, "rewards/wrapper/mean": 5.944682708382606, "rewards/wrapper/std": 17.176417842507362, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 99.6, "completions/mean_length": 251.64375, "completions/mean_terminated_length": 50.03333435058594, "completions/min_length": 170.4, "completions/min_terminated_length": 16.8, "epoch": 0.2252199413489736, "frac_reward_zero_std": 0.025, "grad_norm": 3.71875, "kl": 0.005338036478497088, "learning_rate": 7.744912002654856e-06, "loss": -0.0013, "num_tokens": 10520552.0, "reward": 7.731118607521057, "reward_std": 10.374876952171325, "rewards/wrapper/mean": 3.8655590668320654, "rewards/wrapper/std": 13.530312813818455, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 247.75625, "completions/mean_terminated_length": 41.8, "completions/min_length": 91.8, "completions/min_terminated_length": 40.6, "epoch": 0.22639296187683283, "frac_reward_zero_std": 0.0375, "grad_norm": 4.28125, "kl": 0.011531842313706875, "learning_rate": 7.743065118061405e-06, "loss": -0.0041, "num_tokens": 10573747.0, "reward": 11.806914710998536, "reward_std": 15.965612602233886, "rewards/wrapper/mean": 5.90345728546381, "rewards/wrapper/std": 17.234433594346047, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 5.2, "completions/mean_length": 254.5625, "completions/mean_terminated_length": 5.2, "completions/min_length": 210.0, "completions/min_terminated_length": 5.2, "epoch": 0.22756598240469209, "frac_reward_zero_std": 0.0375, "grad_norm": 9.25, "kl": 0.011473823472624645, "learning_rate": 7.741208663323497e-06, "loss": -0.0029, "num_tokens": 10626809.0, "reward": 11.198212456703185, "reward_std": 12.455942213535309, "rewards/wrapper/mean": 5.599106089770794, "rewards/wrapper/std": 15.173226012289524, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.8, "completions/mean_length": 252.85625, "completions/mean_terminated_length": 1.8, "completions/min_length": 155.4, "completions/min_terminated_length": 1.8, "epoch": 0.2287390029325513, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.01461629854165949, "learning_rate": 7.739342644797526e-06, "loss": 0.0007, "num_tokens": 10680226.0, "reward": 7.582841587066651, "reward_std": 10.145219755172729, "rewards/wrapper/mean": 3.7914206713438032, "rewards/wrapper/std": 12.200360830128194, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 251.5125, "completions/mean_terminated_length": 10.0, "completions/min_length": 112.4, "completions/min_terminated_length": 10.0, "epoch": 0.22991202346041056, "frac_reward_zero_std": 0.0125, "grad_norm": 1.3203125, "kl": 1.5801498372165952, "learning_rate": 7.737467068872637e-06, "loss": 0.0504, "num_tokens": 10733206.0, "reward": 11.884287261962891, "reward_std": 14.595683097839355, "rewards/wrapper/mean": 5.942143467068672, "rewards/wrapper/std": 19.32650369256735, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.2310850439882698, "frac_reward_zero_std": 0.025, "grad_norm": 4.75, "kl": 0.009276495571248234, "learning_rate": 7.735581941970693e-06, "loss": -0.0059, "num_tokens": 10785268.0, "reward": 6.786200904846192, "reward_std": 8.93930425643921, "rewards/wrapper/mean": 3.393100444227457, "rewards/wrapper/std": 10.605686566233635, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.23225806451612904, "frac_reward_zero_std": 0.0375, "grad_norm": 4.40625, "kl": 0.0071992134675383564, "learning_rate": 7.733687270546263e-06, "loss": -0.006, "num_tokens": 10839475.0, "reward": 12.208200645446777, "reward_std": 14.330049514770508, "rewards/wrapper/mean": 6.104100047051906, "rewards/wrapper/std": 16.828649199008943, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 47.2, "completions/mean_length": 246.2875, "completions/mean_terminated_length": 47.2, "completions/min_length": 47.2, "completions/min_terminated_length": 47.2, "epoch": 0.23343108504398827, "frac_reward_zero_std": 0.05, "grad_norm": 2.6875, "kl": 0.20198758316691964, "learning_rate": 7.731783061086594e-06, "loss": -0.0104, "num_tokens": 10894911.0, "reward": 9.800655174255372, "reward_std": 12.638169860839843, "rewards/wrapper/mean": 4.900327530503273, "rewards/wrapper/std": 17.312468548119067, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 12.4, "completions/mean_length": 248.39375, "completions/mean_terminated_length": 11.9, "completions/min_length": 62.6, "completions/min_terminated_length": 11.4, "epoch": 0.23460410557184752, "frac_reward_zero_std": 0.0375, "grad_norm": 3.703125, "kl": 0.029017451941035687, "learning_rate": 7.729869320111593e-06, "loss": -0.0203, "num_tokens": 10952706.0, "reward": 8.67610182762146, "reward_std": 9.548018550872802, "rewards/wrapper/mean": 4.338050843402743, "rewards/wrapper/std": 12.254481440782547, "step": 1000 }, { "epoch": 0.23460410557184752, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.82, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 69.66, "eval_completions/mean_length": 230.335, "eval_completions/mean_terminated_length": 61.85166683197021, "eval_completions/min_length": 173.91, "eval_completions/min_terminated_length": 53.59, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.009674767768010496, "eval_loss": -0.04156604781746864, "eval_num_tokens": 10952706.0, "eval_reward": 0.31490315936505797, "eval_reward_std": 0.11863522203173488, "eval_rewards/wrapper/mean": 0.1574515798687935, "eval_rewards/wrapper/std": 0.11627076880075038, "eval_runtime": 211.1908, "eval_samples_per_second": 0.947, "eval_steps_per_second": 0.237, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.23577712609970675, "frac_reward_zero_std": 0.025, "grad_norm": 2.5625, "kl": 0.011619134614011272, "learning_rate": 7.727946054173796e-06, "loss": -0.0046, "num_tokens": 11006125.0, "reward": 12.64652976989746, "reward_std": 15.572536277770997, "rewards/wrapper/mean": 6.323264981806278, "rewards/wrapper/std": 20.6168105751276, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.23695014662756597, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.006164612190332264, "learning_rate": 7.726013269858362e-06, "loss": -0.001, "num_tokens": 11061484.0, "reward": 11.502595329284668, "reward_std": 15.562606811523438, "rewards/wrapper/mean": 5.751297509670257, "rewards/wrapper/std": 18.69067438542843, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.23812316715542522, "frac_reward_zero_std": 0.0125, "grad_norm": 3.65625, "kl": 0.0124075862695463, "learning_rate": 7.724070973783033e-06, "loss": -0.0083, "num_tokens": 11116552.0, "reward": 9.195004653930663, "reward_std": 12.420144939422608, "rewards/wrapper/mean": 4.59750243127346, "rewards/wrapper/std": 16.103847907483576, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 14.8, "completions/mean_length": 248.475, "completions/mean_terminated_length": 7.7, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.23929618768328445, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.009327601094264537, "learning_rate": 7.722119172598117e-06, "loss": -0.0073, "num_tokens": 11170880.0, "reward": 8.787823486328126, "reward_std": 11.928912353515624, "rewards/wrapper/mean": 4.393911641836167, "rewards/wrapper/std": 14.354398925602435, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 35.2, "completions/mean_length": 252.3, "completions/mean_terminated_length": 35.2, "completions/min_length": 137.6, "completions/min_terminated_length": 35.2, "epoch": 0.2404692082111437, "frac_reward_zero_std": 0.0125, "grad_norm": 7.125, "kl": 0.007904788851737976, "learning_rate": 7.720157872986474e-06, "loss": -0.0067, "num_tokens": 11227748.0, "reward": 9.267306017875672, "reward_std": 12.493532657623291, "rewards/wrapper/mean": 4.633652974665165, "rewards/wrapper/std": 14.101441629230976, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 249.6625, "completions/mean_terminated_length": 2.0, "completions/min_length": 53.2, "completions/min_terminated_length": 2.0, "epoch": 0.24164222873900293, "frac_reward_zero_std": 0.025, "grad_norm": 5.03125, "kl": 0.013292332063429058, "learning_rate": 7.718187081663484e-06, "loss": -0.0012, "num_tokens": 11283032.0, "reward": 11.44096269607544, "reward_std": 15.433443355560303, "rewards/wrapper/mean": 5.72048115581274, "rewards/wrapper/std": 17.70729095637798, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.24281524926686218, "frac_reward_zero_std": 0.025, "grad_norm": 3.046875, "kl": 0.009489351080264895, "learning_rate": 7.716206805377021e-06, "loss": 0.0004, "num_tokens": 11339056.0, "reward": 11.168231201171874, "reward_std": 9.798115158081055, "rewards/wrapper/mean": 5.584115269035101, "rewards/wrapper/std": 14.987447142601013, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 111.2, "completions/mean_length": 249.9, "completions/mean_terminated_length": 107.6, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2439882697947214, "frac_reward_zero_std": 0.025, "grad_norm": 1.4765625, "kl": 0.009569767396897078, "learning_rate": 7.714217050907444e-06, "loss": -0.0182, "num_tokens": 11392794.0, "reward": 9.1754976272583, "reward_std": 11.94598445892334, "rewards/wrapper/mean": 4.587748650461435, "rewards/wrapper/std": 14.406213076412678, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.24516129032258063, "frac_reward_zero_std": 0.0125, "grad_norm": 2.609375, "kl": 0.006833912117872387, "learning_rate": 7.712217825067554e-06, "loss": -0.0123, "num_tokens": 11450883.0, "reward": 11.125387191772461, "reward_std": 14.580111122131347, "rewards/wrapper/mean": 5.562693519145251, "rewards/wrapper/std": 18.808959732949734, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 21.8, "completions/mean_length": 248.6875, "completions/mean_terminated_length": 21.8, "completions/min_length": 73.0, "completions/min_terminated_length": 21.8, "epoch": 0.24633431085043989, "frac_reward_zero_std": 0.0625, "grad_norm": 7.0625, "kl": 0.04593934025615454, "learning_rate": 7.710209134702588e-06, "loss": -0.0156, "num_tokens": 11506459.0, "reward": 9.748054599761963, "reward_std": 10.04455499649048, "rewards/wrapper/mean": 4.874027146399021, "rewards/wrapper/std": 14.737575414776803, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 249.3, "completions/mean_terminated_length": 17.333333587646486, "completions/min_length": 107.8, "completions/min_terminated_length": 5.4, "epoch": 0.2475073313782991, "frac_reward_zero_std": 0.05, "grad_norm": 2.03125, "kl": 0.011725964327342808, "learning_rate": 7.708190986690189e-06, "loss": -0.005, "num_tokens": 11560853.0, "reward": 6.436659145355224, "reward_std": 6.5048364162445065, "rewards/wrapper/mean": 3.218329684436321, "rewards/wrapper/std": 10.194329760968685, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 48.4, "completions/mean_length": 250.19375, "completions/mean_terminated_length": 39.9, "completions/min_length": 133.8, "completions/min_terminated_length": 31.4, "epoch": 0.24868035190615836, "frac_reward_zero_std": 0.0625, "grad_norm": 3.96875, "kl": 0.006699386925902218, "learning_rate": 7.706163387940381e-06, "loss": -0.0045, "num_tokens": 11617612.0, "reward": 10.259013462066651, "reward_std": 12.297688674926757, "rewards/wrapper/mean": 5.129506582021714, "rewards/wrapper/std": 16.151020860672, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 43.2, "completions/mean_length": 252.55, "completions/mean_terminated_length": 43.2, "completions/min_length": 145.6, "completions/min_terminated_length": 43.2, "epoch": 0.2498533724340176, "frac_reward_zero_std": 0.05, "grad_norm": 3.0, "kl": 0.02774180765263736, "learning_rate": 7.704126345395549e-06, "loss": -0.0038, "num_tokens": 11672532.0, "reward": 8.056447982788086, "reward_std": 8.833860492706298, "rewards/wrapper/mean": 4.02822390422225, "rewards/wrapper/std": 14.670276536047458, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 21.2, "completions/mean_length": 248.75625, "completions/mean_terminated_length": 12.2, "completions/min_length": 105.6, "completions/min_terminated_length": 3.2, "epoch": 0.25102639296187684, "frac_reward_zero_std": 0.0375, "grad_norm": 1.6953125, "kl": 0.009839186503086239, "learning_rate": 7.702079866030408e-06, "loss": 0.0011, "num_tokens": 11726633.0, "reward": 11.247284412384033, "reward_std": 15.467829513549805, "rewards/wrapper/mean": 5.623642058670521, "rewards/wrapper/std": 18.223739244043827, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.4, "completions/mean_length": 252.84375, "completions/mean_terminated_length": 1.4, "completions/min_length": 155.0, "completions/min_terminated_length": 1.4, "epoch": 0.25219941348973607, "frac_reward_zero_std": 0.025, "grad_norm": 3.453125, "kl": 0.026338630728423594, "learning_rate": 7.700023956851989e-06, "loss": -0.0101, "num_tokens": 11781894.0, "reward": 9.359408187866212, "reward_std": 10.791181874275207, "rewards/wrapper/mean": 4.679704067856074, "rewards/wrapper/std": 12.456576159596443, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 41.2, "completions/mean_length": 247.69375, "completions/mean_terminated_length": 37.9, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.2533724340175953, "frac_reward_zero_std": 0.0375, "grad_norm": 4.6875, "kl": 0.008290641068015248, "learning_rate": 7.697958624899609e-06, "loss": -0.0154, "num_tokens": 11836557.0, "reward": 4.384031456708908, "reward_std": 5.678225213289261, "rewards/wrapper/mean": 2.1920157223939896, "rewards/wrapper/std": 7.228950951993466, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 47.6, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 47.6, "completions/min_length": 150.0, "completions/min_terminated_length": 47.6, "epoch": 0.2545454545454545, "frac_reward_zero_std": 0.05, "grad_norm": 7.65625, "kl": 0.009902437112759798, "learning_rate": 7.695883877244846e-06, "loss": -0.0113, "num_tokens": 11891565.0, "reward": 9.418929600715638, "reward_std": 11.308019065856934, "rewards/wrapper/mean": 4.709464704990387, "rewards/wrapper/std": 14.645676551759243, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 253.3875, "completions/mean_terminated_length": 70.0, "completions/min_length": 172.4, "completions/min_terminated_length": 70.0, "epoch": 0.2557184750733138, "frac_reward_zero_std": 0.0375, "grad_norm": 3.203125, "kl": 0.008452445617876947, "learning_rate": 7.69379972099152e-06, "loss": -0.0044, "num_tokens": 11946271.0, "reward": 10.250184059143066, "reward_std": 13.61496181488037, "rewards/wrapper/mean": 5.125091888010502, "rewards/wrapper/std": 16.24077228009701, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 250.25625, "completions/mean_terminated_length": 30.733334350585938, "completions/min_length": 112.6, "completions/min_terminated_length": 10.2, "epoch": 0.256891495601173, "frac_reward_zero_std": 0.0125, "grad_norm": 1.625, "kl": 0.22729696487658657, "learning_rate": 7.691706163275663e-06, "loss": -0.0026, "num_tokens": 12000630.0, "reward": 6.446021175384521, "reward_std": 6.5143946528434755, "rewards/wrapper/mean": 3.223010669648647, "rewards/wrapper/std": 10.89571967869997, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.25806451612903225, "frac_reward_zero_std": 0.025, "grad_norm": 1.0078125, "kl": 0.0057296501705423, "learning_rate": 7.689603211265496e-06, "loss": -0.0061, "num_tokens": 12053895.0, "reward": 11.164654111862182, "reward_std": 14.292295026779176, "rewards/wrapper/mean": 5.58232696801424, "rewards/wrapper/std": 16.829090513288975, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.2592375366568915, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.006397956318687647, "learning_rate": 7.68749087216141e-06, "loss": -0.006, "num_tokens": 12107008.0, "reward": 9.694752669334411, "reward_std": 11.153807973861694, "rewards/wrapper/mean": 4.84737599119544, "rewards/wrapper/std": 13.409111241996289, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 6.2, "completions/mean_length": 254.59375, "completions/mean_terminated_length": 6.2, "completions/min_length": 211.0, "completions/min_terminated_length": 6.2, "epoch": 0.26041055718475076, "frac_reward_zero_std": 0.025, "grad_norm": 5.0625, "kl": 4.17301641855156, "learning_rate": 7.685369153195933e-06, "loss": 0.1615, "num_tokens": 12160781.0, "reward": 11.649996852874756, "reward_std": 15.833073997497559, "rewards/wrapper/mean": 5.824998654425144, "rewards/wrapper/std": 17.238114669919014, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 251.4875, "completions/mean_terminated_length": 20.26666717529297, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.26158357771261, "frac_reward_zero_std": 0.025, "grad_norm": 8.25, "kl": 0.009280881262384355, "learning_rate": 7.683238061633712e-06, "loss": -0.0052, "num_tokens": 12215855.0, "reward": 12.724572658538818, "reward_std": 13.146955060958863, "rewards/wrapper/mean": 6.362286276370287, "rewards/wrapper/std": 15.493781666457654, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 89.6, "completions/mean_length": 250.86875, "completions/mean_terminated_length": 68.1, "completions/min_length": 97.8, "completions/min_terminated_length": 46.6, "epoch": 0.2627565982404692, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.004430928025976754, "learning_rate": 7.68109760477148e-06, "loss": -0.0145, "num_tokens": 12269000.0, "reward": 10.06506805419922, "reward_std": 13.367911243438721, "rewards/wrapper/mean": 5.032533720880747, "rewards/wrapper/std": 14.828080916404724, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.26392961876832843, "frac_reward_zero_std": 0.0375, "grad_norm": 2.640625, "kl": 0.012790444202255457, "learning_rate": 7.678947789938045e-06, "loss": -0.0039, "num_tokens": 12323763.0, "reward": 6.5787577629089355, "reward_std": 8.753919792175292, "rewards/wrapper/mean": 3.289378835260868, "rewards/wrapper/std": 10.194012176990508, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 59.4, "completions/mean_length": 251.4625, "completions/mean_terminated_length": 46.4, "completions/min_length": 135.8, "completions/min_terminated_length": 33.4, "epoch": 0.26510263929618766, "frac_reward_zero_std": 0.0125, "grad_norm": 1.7890625, "kl": 0.010952617693692445, "learning_rate": 7.676788624494249e-06, "loss": -0.0055, "num_tokens": 12376033.0, "reward": 9.405997359752655, "reward_std": 10.985446679592133, "rewards/wrapper/mean": 4.702998787909746, "rewards/wrapper/std": 12.160600701719522, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 57.4, "completions/mean_length": 252.99375, "completions/mean_terminated_length": 57.4, "completions/min_length": 159.8, "completions/min_terminated_length": 57.4, "epoch": 0.26627565982404694, "frac_reward_zero_std": 0.025, "grad_norm": 3.4375, "kl": 0.2351893066195771, "learning_rate": 7.674620115832949e-06, "loss": 0.0148, "num_tokens": 12432798.0, "reward": 8.022018957138062, "reward_std": 10.71765694618225, "rewards/wrapper/mean": 4.011009331047535, "rewards/wrapper/std": 12.21592505723238, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 31.6, "completions/mean_length": 253.7875, "completions/mean_terminated_length": 31.6, "completions/min_length": 185.2, "completions/min_terminated_length": 31.6, "epoch": 0.26744868035190617, "frac_reward_zero_std": 0.0125, "grad_norm": 1.2578125, "kl": 0.016819122969172894, "learning_rate": 7.672442271379e-06, "loss": -0.0055, "num_tokens": 12490174.0, "reward": 8.455910956859588, "reward_std": 11.398876094818116, "rewards/wrapper/mean": 4.2279553160071375, "rewards/wrapper/std": 15.768989896774292, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 21.6, "completions/mean_length": 250.2875, "completions/mean_terminated_length": 11.0, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.2686217008797654, "frac_reward_zero_std": 0.0125, "grad_norm": 8.0, "kl": 0.02696207492845133, "learning_rate": 7.670255098589216e-06, "loss": -0.014, "num_tokens": 12543940.0, "reward": 6.251084423065185, "reward_std": 8.070647144317627, "rewards/wrapper/mean": 3.125542238354683, "rewards/wrapper/std": 10.576351188123226, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.2697947214076246, "frac_reward_zero_std": 0.0375, "grad_norm": 10.5625, "kl": 0.014508222823496907, "learning_rate": 7.668058604952354e-06, "loss": -0.0051, "num_tokens": 12600511.0, "reward": 12.813151550292968, "reward_std": 15.152927589416503, "rewards/wrapper/mean": 6.406575272977352, "rewards/wrapper/std": 19.122226648032665, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.2709677419354839, "frac_reward_zero_std": 0.025, "grad_norm": 3.21875, "kl": 0.005419670697301626, "learning_rate": 7.66585279798908e-06, "loss": -0.0048, "num_tokens": 12656646.0, "reward": 10.259989547729493, "reward_std": 10.767040920257568, "rewards/wrapper/mean": 5.129994577169418, "rewards/wrapper/std": 14.785299123823643, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.2721407624633431, "frac_reward_zero_std": 0.0375, "grad_norm": 5.28125, "kl": 0.027052150969393552, "learning_rate": 7.663637685251955e-06, "loss": -0.0109, "num_tokens": 12709572.0, "reward": 12.856212615966797, "reward_std": 17.106783866882324, "rewards/wrapper/mean": 6.42810637652874, "rewards/wrapper/std": 19.706878601014616, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 9.6, "completions/mean_length": 249.93125, "completions/mean_terminated_length": 3.666666793823242, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.27331378299120235, "frac_reward_zero_std": 0.0625, "grad_norm": 3.875, "kl": 0.01902551531093195, "learning_rate": 7.6614132743254e-06, "loss": -0.0208, "num_tokens": 12767009.0, "reward": 11.094143390655518, "reward_std": 15.04799976348877, "rewards/wrapper/mean": 5.547071680426598, "rewards/wrapper/std": 15.767345032095909, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 56.4, "completions/mean_length": 252.9625, "completions/mean_terminated_length": 56.4, "completions/min_length": 158.8, "completions/min_terminated_length": 56.4, "epoch": 0.2744868035190616, "frac_reward_zero_std": 0.025, "grad_norm": 6.59375, "kl": 3.983160498877987, "learning_rate": 7.659179572825669e-06, "loss": 0.1554, "num_tokens": 12822107.0, "reward": 12.889067268371582, "reward_std": 17.192436599731444, "rewards/wrapper/mean": 6.444533663988113, "rewards/wrapper/std": 18.103868405520917, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 76.8, "completions/mean_length": 249.7, "completions/mean_terminated_length": 67.5, "completions/min_length": 109.4, "completions/min_terminated_length": 58.2, "epoch": 0.2756598240469208, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.0451830686070025, "learning_rate": 7.65693658840083e-06, "loss": -0.0085, "num_tokens": 12878973.0, "reward": 9.611810493469239, "reward_std": 13.04067497253418, "rewards/wrapper/mean": 4.805905170738697, "rewards/wrapper/std": 17.624341449141504, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 43.8, "completions/mean_length": 255.76875, "completions/mean_terminated_length": 43.8, "completions/min_length": 248.6, "completions/min_terminated_length": 43.8, "epoch": 0.2768328445747801, "frac_reward_zero_std": 0.0375, "grad_norm": 3.765625, "kl": 0.00828095116885379, "learning_rate": 7.654684328730737e-06, "loss": -0.0003, "num_tokens": 12935770.0, "reward": 6.8795403957366945, "reward_std": 9.140692472457886, "rewards/wrapper/mean": 3.439770007133484, "rewards/wrapper/std": 13.50705413967371, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 1.2, "completions/mean_length": 251.24375, "completions/mean_terminated_length": 0.8, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.2780058651026393, "frac_reward_zero_std": 0.0125, "grad_norm": 1.328125, "kl": 0.008541014278307557, "learning_rate": 7.652422801526998e-06, "loss": -0.0184, "num_tokens": 12990883.0, "reward": 10.144163513183594, "reward_std": 13.843765115737915, "rewards/wrapper/mean": 5.072081534564495, "rewards/wrapper/std": 15.462312346696853, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 54.2, "completions/mean_length": 248.175, "completions/mean_terminated_length": 29.6, "completions/min_length": 112.6, "completions/min_terminated_length": 10.2, "epoch": 0.27917888563049853, "frac_reward_zero_std": 0.0375, "grad_norm": 0.8359375, "kl": 0.016829893505200744, "learning_rate": 7.650152014532953e-06, "loss": -0.0205, "num_tokens": 13044743.0, "reward": 9.684790706634521, "reward_std": 13.179641246795654, "rewards/wrapper/mean": 4.8423951178789135, "rewards/wrapper/std": 16.63096822053194, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 3.8, "completions/mean_length": 251.31875, "completions/mean_terminated_length": 3.8, "completions/min_length": 106.2, "completions/min_terminated_length": 3.8, "epoch": 0.28035190615835776, "frac_reward_zero_std": 0.0125, "grad_norm": 3.71875, "kl": 0.006020776781952009, "learning_rate": 7.647871975523648e-06, "loss": -0.0139, "num_tokens": 13097946.0, "reward": 6.571639347076416, "reward_std": 8.726711702346801, "rewards/wrapper/mean": 3.285819558799267, "rewards/wrapper/std": 11.842679353058339, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.28152492668621704, "frac_reward_zero_std": 0.025, "grad_norm": 10.375, "kl": 0.03795347143895924, "learning_rate": 7.645582692305809e-06, "loss": -0.006, "num_tokens": 13154027.0, "reward": 11.074889278411865, "reward_std": 9.10773811340332, "rewards/wrapper/mean": 5.537444531917572, "rewards/wrapper/std": 15.783117219805717, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 40.4, "completions/mean_length": 249.35, "completions/mean_terminated_length": 14.466667175292969, "completions/min_length": 154.8, "completions/min_terminated_length": 1.2, "epoch": 0.28269794721407626, "frac_reward_zero_std": 0.0375, "grad_norm": 2.265625, "kl": 0.006860299198888242, "learning_rate": 7.643284172717809e-06, "loss": -0.0026, "num_tokens": 13207235.0, "reward": 12.508953714370728, "reward_std": 16.71675834655762, "rewards/wrapper/mean": 6.254476898163557, "rewards/wrapper/std": 16.900203044712544, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 250.81875, "completions/mean_terminated_length": 39.0, "completions/min_length": 90.2, "completions/min_terminated_length": 39.0, "epoch": 0.2838709677419355, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.010175050923135132, "learning_rate": 7.64097642462965e-06, "loss": -0.0141, "num_tokens": 13261454.0, "reward": 10.52227783203125, "reward_std": 11.177353668212891, "rewards/wrapper/mean": 5.261138796061277, "rewards/wrapper/std": 14.039105215668679, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 82.2, "completions/mean_length": 247.41875, "completions/mean_terminated_length": 62.4, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.2850439882697947, "frac_reward_zero_std": 0.0125, "grad_norm": 5.0, "kl": 0.012304930877871812, "learning_rate": 7.638659455942934e-06, "loss": -0.0229, "num_tokens": 13315821.0, "reward": 4.5331168413162235, "reward_std": 5.74956374168396, "rewards/wrapper/mean": 2.266558450460434, "rewards/wrapper/std": 7.662119425088167, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 111.8, "completions/mean_length": 250.13125, "completions/mean_terminated_length": 79.4, "completions/min_length": 98.2, "completions/min_terminated_length": 47.0, "epoch": 0.28621700879765394, "frac_reward_zero_std": 0.0125, "grad_norm": 1.390625, "kl": 0.007477234344696626, "learning_rate": 7.636333274590826e-06, "loss": -0.009, "num_tokens": 13368678.0, "reward": 7.7060727834701535, "reward_std": 10.061940121650697, "rewards/wrapper/mean": 3.853036458790302, "rewards/wrapper/std": 10.947172378003597, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 49.2, "completions/mean_length": 253.25, "completions/mean_terminated_length": 43.6, "completions/min_length": 191.6, "completions/min_terminated_length": 38.0, "epoch": 0.2873900293255132, "frac_reward_zero_std": 0.0125, "grad_norm": 1.734375, "kl": 0.015387153357733042, "learning_rate": 7.63399788853804e-06, "loss": -0.0022, "num_tokens": 13424186.0, "reward": 10.410561656951904, "reward_std": 13.186028957366943, "rewards/wrapper/mean": 5.205280630290508, "rewards/wrapper/std": 13.38996929973364, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 39.8, "completions/mean_length": 247.6625, "completions/mean_terminated_length": 20.2, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.28856304985337244, "frac_reward_zero_std": 0.05, "grad_norm": 44.75, "kl": 0.08414683194132522, "learning_rate": 7.631653305780806e-06, "loss": -0.0171, "num_tokens": 13478848.0, "reward": 6.07905797958374, "reward_std": 7.165961527824402, "rewards/wrapper/mean": 3.039528689533472, "rewards/wrapper/std": 8.566541536152362, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.28973607038123167, "frac_reward_zero_std": 0.0125, "grad_norm": 5.09375, "kl": 0.012354453129228204, "learning_rate": 7.629299534346842e-06, "loss": 0.0103, "num_tokens": 13535398.0, "reward": 9.07586328983307, "reward_std": 10.113778376579285, "rewards/wrapper/mean": 4.53793145492673, "rewards/wrapper/std": 13.287790149450302, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 14.2, "completions/mean_length": 254.84375, "completions/mean_terminated_length": 14.2, "completions/min_length": 219.0, "completions/min_terminated_length": 14.2, "epoch": 0.2909090909090909, "frac_reward_zero_std": 0.025, "grad_norm": 1.3828125, "kl": 0.007052855577785522, "learning_rate": 7.626936582295328e-06, "loss": -0.0023, "num_tokens": 13588821.0, "reward": 11.41899070739746, "reward_std": 14.385398948192597, "rewards/wrapper/mean": 5.709495208412409, "rewards/wrapper/std": 15.28487433195114, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.2920821114369501, "frac_reward_zero_std": 0.05, "grad_norm": 1.390625, "kl": 0.06638398257782682, "learning_rate": 7.624564457716878e-06, "loss": -0.0149, "num_tokens": 13642985.0, "reward": 17.59623432159424, "reward_std": 19.738516807556152, "rewards/wrapper/mean": 8.798117038607597, "rewards/wrapper/std": 21.251633982360364, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 117.2, "completions/mean_length": 253.2625, "completions/mean_terminated_length": 117.2, "completions/min_length": 168.4, "completions/min_terminated_length": 117.2, "epoch": 0.2932551319648094, "frac_reward_zero_std": 0.0, "grad_norm": 3.328125, "kl": 0.023045004159212113, "learning_rate": 7.622183168733512e-06, "loss": -0.0004, "num_tokens": 13696105.0, "reward": 5.633255100250244, "reward_std": 7.433923816680908, "rewards/wrapper/mean": 2.8166275203227995, "rewards/wrapper/std": 11.261727234721183, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 32.6, "completions/mean_length": 253.81875, "completions/mean_terminated_length": 32.6, "completions/min_length": 186.2, "completions/min_terminated_length": 32.6, "epoch": 0.2944281524926686, "frac_reward_zero_std": 0.0125, "grad_norm": 3.15625, "kl": 0.007092752261087298, "learning_rate": 7.619792723498629e-06, "loss": 0.0075, "num_tokens": 13750444.0, "reward": 11.408452892303467, "reward_std": 14.730864334106446, "rewards/wrapper/mean": 5.7042262017726895, "rewards/wrapper/std": 17.131004671752454, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 250.0125, "completions/mean_terminated_length": 13.0, "completions/min_length": 115.4, "completions/min_terminated_length": 13.0, "epoch": 0.29560117302052785, "frac_reward_zero_std": 0.0125, "grad_norm": 4.25, "kl": 1.5145615183922927, "learning_rate": 7.617393130196977e-06, "loss": 0.0421, "num_tokens": 13803322.0, "reward": 11.319013595581055, "reward_std": 13.331906461715699, "rewards/wrapper/mean": 5.659506534039974, "rewards/wrapper/std": 16.614761224389078, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 21.2, "completions/mean_length": 250.26875, "completions/mean_terminated_length": 20.9, "completions/min_length": 123.0, "completions/min_terminated_length": 20.6, "epoch": 0.2967741935483871, "frac_reward_zero_std": 0.0625, "grad_norm": 3.59375, "kl": 0.008018044813070446, "learning_rate": 7.614984397044628e-06, "loss": -0.012, "num_tokens": 13856751.0, "reward": 7.36223726272583, "reward_std": 9.809179973602294, "rewards/wrapper/mean": 3.6811186604201795, "rewards/wrapper/std": 11.822094440460205, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 252.54375, "completions/mean_terminated_length": 43.0, "completions/min_length": 145.4, "completions/min_terminated_length": 43.0, "epoch": 0.29794721407624636, "frac_reward_zero_std": 0.0125, "grad_norm": 1.828125, "kl": 0.004893030686071142, "learning_rate": 7.6125665322889466e-06, "loss": -0.0021, "num_tokens": 13907542.0, "reward": 6.66135311126709, "reward_std": 8.785630035400391, "rewards/wrapper/mean": 3.330676446855068, "rewards/wrapper/std": 11.304787519574166, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 45.8, "completions/mean_length": 252.63125, "completions/mean_terminated_length": 45.8, "completions/min_length": 148.2, "completions/min_terminated_length": 45.8, "epoch": 0.2991202346041056, "frac_reward_zero_std": 0.025, "grad_norm": 3.609375, "kl": 5.395943081658333, "learning_rate": 7.610139544208566e-06, "loss": 0.2097, "num_tokens": 13965193.0, "reward": 10.49851016998291, "reward_std": 11.999550914764404, "rewards/wrapper/mean": 5.249254953861237, "rewards/wrapper/std": 15.486617393791676, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.8, "completions/mean_length": 252.25, "completions/mean_terminated_length": 17.1, "completions/min_length": 159.0, "completions/min_terminated_length": 5.4, "epoch": 0.3002932551319648, "frac_reward_zero_std": 0.0, "grad_norm": 3.5625, "kl": 0.012830907647730783, "learning_rate": 7.607703441113355e-06, "loss": -0.0125, "num_tokens": 14017931.0, "reward": 9.211493253707886, "reward_std": 10.461997652053833, "rewards/wrapper/mean": 4.605746623873711, "rewards/wrapper/std": 13.777990686893464, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 89.6, "completions/mean_length": 253.3875, "completions/mean_terminated_length": 86.4, "completions/min_length": 185.6, "completions/min_terminated_length": 83.2, "epoch": 0.30146627565982403, "frac_reward_zero_std": 0.0375, "grad_norm": 3.265625, "kl": 0.012887239316478371, "learning_rate": 7.605258231344392e-06, "loss": 0.0047, "num_tokens": 14074503.0, "reward": 7.647303819656372, "reward_std": 10.293930107355118, "rewards/wrapper/mean": 3.8236515186727047, "rewards/wrapper/std": 11.564179126918315, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 50.8, "completions/mean_length": 249.975, "completions/mean_terminated_length": 44.9, "completions/min_length": 90.2, "completions/min_terminated_length": 39.0, "epoch": 0.30263929618768326, "frac_reward_zero_std": 0.05, "grad_norm": 4.96875, "kl": 0.014583267294801772, "learning_rate": 7.602803923273938e-06, "loss": 0.0005, "num_tokens": 14127933.0, "reward": 13.534779834747315, "reward_std": 17.292752075195313, "rewards/wrapper/mean": 6.767389929294586, "rewards/wrapper/std": 19.772680358588694, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 3.4, "completions/mean_length": 251.30625, "completions/mean_terminated_length": 3.4, "completions/min_length": 105.8, "completions/min_terminated_length": 3.4, "epoch": 0.30381231671554254, "frac_reward_zero_std": 0.0375, "grad_norm": 1.0703125, "kl": 0.006688924302579835, "learning_rate": 7.600340525305404e-06, "loss": -0.0058, "num_tokens": 14182722.0, "reward": 8.628307437896728, "reward_std": 11.121209239959716, "rewards/wrapper/mean": 4.314153614640236, "rewards/wrapper/std": 16.781408032774927, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 8.6, "completions/mean_length": 251.46875, "completions/mean_terminated_length": 8.6, "completions/min_length": 111.0, "completions/min_terminated_length": 8.6, "epoch": 0.30498533724340177, "frac_reward_zero_std": 0.05, "grad_norm": 1.546875, "kl": 0.008998627658002079, "learning_rate": 7.5978680458733254e-06, "loss": -0.0098, "num_tokens": 14237021.0, "reward": 14.251729774475098, "reward_std": 13.466583633422852, "rewards/wrapper/mean": 7.125864551961422, "rewards/wrapper/std": 18.563405425846575, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 44.2, "completions/mean_length": 252.58125, "completions/mean_terminated_length": 44.2, "completions/min_length": 146.6, "completions/min_terminated_length": 44.2, "epoch": 0.306158357771261, "frac_reward_zero_std": 0.0125, "grad_norm": 3.078125, "kl": 0.005243840476032347, "learning_rate": 7.5953864934433305e-06, "loss": -0.0049, "num_tokens": 14289700.0, "reward": 7.625714588165283, "reward_std": 10.161770915985107, "rewards/wrapper/mean": 3.8128572389483453, "rewards/wrapper/std": 12.751371662318707, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 9.8, "completions/mean_length": 251.50625, "completions/mean_terminated_length": 9.8, "completions/min_length": 112.2, "completions/min_terminated_length": 9.8, "epoch": 0.3073313782991202, "frac_reward_zero_std": 0.0625, "grad_norm": 2.203125, "kl": 0.008733793662395328, "learning_rate": 7.592895876512114e-06, "loss": -0.011, "num_tokens": 14347077.0, "reward": 7.226803135871887, "reward_std": 9.705256414413451, "rewards/wrapper/mean": 3.6134014263749124, "rewards/wrapper/std": 12.593166868388654, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 251.96875, "completions/mean_terminated_length": 12.4, "completions/min_length": 161.4, "completions/min_terminated_length": 7.8, "epoch": 0.3085043988269795, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.008413564675720409, "learning_rate": 7.590396203607408e-06, "loss": -0.0103, "num_tokens": 14398824.0, "reward": 8.467525911331176, "reward_std": 9.760836601257324, "rewards/wrapper/mean": 4.233762781322002, "rewards/wrapper/std": 13.273331837356091, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 252.025, "completions/mean_terminated_length": 8.8, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.3096774193548387, "frac_reward_zero_std": 0.025, "grad_norm": 2.265625, "kl": 0.011639650189317763, "learning_rate": 7.58788748328795e-06, "loss": -0.0051, "num_tokens": 14454298.0, "reward": 12.13127155303955, "reward_std": 15.99749984741211, "rewards/wrapper/mean": 6.065636083483696, "rewards/wrapper/std": 19.705180183053017, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.31085043988269795, "frac_reward_zero_std": 0.025, "grad_norm": 3.609375, "kl": 0.01836782739846967, "learning_rate": 7.585369724143458e-06, "loss": -0.0068, "num_tokens": 14509264.0, "reward": 11.664790630340576, "reward_std": 13.763594150543213, "rewards/wrapper/mean": 5.832394993305206, "rewards/wrapper/std": 16.633710739016532, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 52.6, "completions/mean_length": 251.24375, "completions/mean_terminated_length": 52.6, "completions/min_length": 103.8, "completions/min_terminated_length": 52.6, "epoch": 0.3120234604105572, "frac_reward_zero_std": 0.0125, "grad_norm": 1.7109375, "kl": 0.00812934830901213, "learning_rate": 7.582842934794593e-06, "loss": -0.0138, "num_tokens": 14562739.0, "reward": 12.168029403686523, "reward_std": 13.295973205566407, "rewards/wrapper/mean": 6.084014493227005, "rewards/wrapper/std": 18.827900260686874, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 23.8, "completions/mean_length": 251.94375, "completions/mean_terminated_length": 23.8, "completions/min_length": 126.2, "completions/min_terminated_length": 23.8, "epoch": 0.3131964809384164, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.006739077181555331, "learning_rate": 7.580307123892941e-06, "loss": -0.0124, "num_tokens": 14616972.0, "reward": 11.35460147857666, "reward_std": 15.193605709075928, "rewards/wrapper/mean": 5.677300703525543, "rewards/wrapper/std": 17.427980916202067, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 251.60625, "completions/mean_terminated_length": 53.3, "completions/min_length": 145.0, "completions/min_terminated_length": 42.6, "epoch": 0.3143695014662757, "frac_reward_zero_std": 0.025, "grad_norm": 8.1875, "kl": 0.011302015569526702, "learning_rate": 7.577762300120974e-06, "loss": -0.008, "num_tokens": 14675993.0, "reward": 11.836063861846924, "reward_std": 15.175680541992188, "rewards/wrapper/mean": 5.918032126128674, "rewards/wrapper/std": 17.08586499094963, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 246.71875, "completions/mean_terminated_length": 35.53333358764648, "completions/min_length": 66.0, "completions/min_terminated_length": 14.8, "epoch": 0.3155425219941349, "frac_reward_zero_std": 0.0125, "grad_norm": 2.078125, "kl": 0.014866840979084372, "learning_rate": 7.575208472192025e-06, "loss": -0.0225, "num_tokens": 14728314.0, "reward": 8.228108072280884, "reward_std": 10.156515312194824, "rewards/wrapper/mean": 4.114054039120674, "rewards/wrapper/std": 11.845364609360695, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 25.8, "completions/mean_length": 252.00625, "completions/mean_terminated_length": 25.8, "completions/min_length": 128.2, "completions/min_terminated_length": 25.8, "epoch": 0.31671554252199413, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.11884763344423846, "learning_rate": 7.572645648850256e-06, "loss": -0.0057, "num_tokens": 14780373.0, "reward": 11.06895570755005, "reward_std": 14.749995613098145, "rewards/wrapper/mean": 5.5344778671860695, "rewards/wrapper/std": 17.256612426042558, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.4, "completions/mean_length": 252.09375, "completions/mean_terminated_length": 28.4, "completions/min_length": 182.0, "completions/min_terminated_length": 28.4, "epoch": 0.31788856304985336, "frac_reward_zero_std": 0.0375, "grad_norm": 1.984375, "kl": 0.00944446304347366, "learning_rate": 7.570073838870627e-06, "loss": -0.0071, "num_tokens": 14838402.0, "reward": 7.713986945152283, "reward_std": 8.773927760124206, "rewards/wrapper/mean": 3.8569933280348776, "rewards/wrapper/std": 12.51425680667162, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 48.2, "completions/mean_length": 252.7125, "completions/mean_terminated_length": 45.5, "completions/min_length": 196.4, "completions/min_terminated_length": 42.8, "epoch": 0.31906158357771264, "frac_reward_zero_std": 0.025, "grad_norm": 2.140625, "kl": 0.01574853319907561, "learning_rate": 7.567493051058871e-06, "loss": 0.0067, "num_tokens": 14892928.0, "reward": 15.730212306976318, "reward_std": 16.052964973449708, "rewards/wrapper/mean": 7.865105799585581, "rewards/wrapper/std": 22.611429415643215, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 21.6, "completions/mean_length": 251.89375, "completions/mean_terminated_length": 11.2, "completions/min_length": 154.4, "completions/min_terminated_length": 0.8, "epoch": 0.32023460410557186, "frac_reward_zero_std": 0.0375, "grad_norm": 1.7265625, "kl": 0.011211365048075095, "learning_rate": 7.56490329425146e-06, "loss": -0.01, "num_tokens": 14948657.0, "reward": 10.49436092376709, "reward_std": 14.247867679595947, "rewards/wrapper/mean": 5.247180543094873, "rewards/wrapper/std": 16.597414763271807, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.3214076246334311, "frac_reward_zero_std": 0.05, "grad_norm": 2.6875, "kl": 0.008569952449761331, "learning_rate": 7.562304577315573e-06, "loss": 0.0003, "num_tokens": 15007042.0, "reward": 13.28712511062622, "reward_std": 13.76039524078369, "rewards/wrapper/mean": 6.643562447279692, "rewards/wrapper/std": 18.46181525737047, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 254.26875, "completions/mean_terminated_length": 47.0, "completions/min_length": 200.6, "completions/min_terminated_length": 47.0, "epoch": 0.3225806451612903, "frac_reward_zero_std": 0.0, "grad_norm": 4.0625, "kl": 0.009434047807008027, "learning_rate": 7.559696909149068e-06, "loss": -0.0026, "num_tokens": 15068173.0, "reward": 4.01227194070816, "reward_std": 4.248266899585724, "rewards/wrapper/mean": 2.006135963648558, "rewards/wrapper/std": 6.169549755752087, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 47.6, "completions/mean_length": 254.2875, "completions/mean_terminated_length": 47.6, "completions/min_length": 201.2, "completions/min_terminated_length": 47.6, "epoch": 0.32375366568914954, "frac_reward_zero_std": 0.025, "grad_norm": 3.5, "kl": 0.008193347556516527, "learning_rate": 7.557080298680456e-06, "loss": -0.003, "num_tokens": 15127993.0, "reward": 10.501580429077148, "reward_std": 14.141244888305664, "rewards/wrapper/mean": 5.250790251791477, "rewards/wrapper/std": 16.1909792765975, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 9.8, "completions/mean_length": 254.70625, "completions/mean_terminated_length": 9.8, "completions/min_length": 214.6, "completions/min_terminated_length": 9.8, "epoch": 0.3249266862170088, "frac_reward_zero_std": 0.0375, "grad_norm": 4.875, "kl": 0.0087321916827932, "learning_rate": 7.554454754868861e-06, "loss": -0.0026, "num_tokens": 15183854.0, "reward": 9.186988854408265, "reward_std": 12.355615091323852, "rewards/wrapper/mean": 4.593494184315205, "rewards/wrapper/std": 14.244051401317119, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 26.8, "completions/mean_length": 247.25, "completions/mean_terminated_length": 26.8, "completions/min_length": 78.0, "completions/min_terminated_length": 26.8, "epoch": 0.32609970674486805, "frac_reward_zero_std": 0.0125, "grad_norm": 1.75, "kl": 0.009691889875102788, "learning_rate": 7.551820286703997e-06, "loss": -0.0212, "num_tokens": 15237238.0, "reward": 12.978730201721191, "reward_std": 17.73060188293457, "rewards/wrapper/mean": 6.489365118741989, "rewards/wrapper/std": 19.83084503412247, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 254.23125, "completions/mean_terminated_length": 97.0, "completions/min_length": 199.4, "completions/min_terminated_length": 97.0, "epoch": 0.32727272727272727, "frac_reward_zero_std": 0.0375, "grad_norm": 5.71875, "kl": 0.007169304159469902, "learning_rate": 7.549176903206133e-06, "loss": 0.004, "num_tokens": 15295067.0, "reward": 14.387743473052979, "reward_std": 19.485210800170897, "rewards/wrapper/mean": 7.193871764093638, "rewards/wrapper/std": 21.868296499550343, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 21.6, "completions/mean_length": 255.075, "completions/mean_terminated_length": 21.6, "completions/min_length": 226.4, "completions/min_terminated_length": 21.6, "epoch": 0.3284457478005865, "frac_reward_zero_std": 0.025, "grad_norm": 4.15625, "kl": 0.007469373976346105, "learning_rate": 7.546524613426066e-06, "loss": -0.0011, "num_tokens": 15353999.0, "reward": 10.911634540557861, "reward_std": 14.8988431930542, "rewards/wrapper/mean": 5.455817250907421, "rewards/wrapper/std": 19.719540111720562, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 80.6, "completions/mean_length": 250.58125, "completions/mean_terminated_length": 66.7, "completions/min_length": 104.0, "completions/min_terminated_length": 52.8, "epoch": 0.3296187683284457, "frac_reward_zero_std": 0.0125, "grad_norm": 4.8125, "kl": 0.008937957452144474, "learning_rate": 7.543863426445082e-06, "loss": -0.0028, "num_tokens": 15407088.0, "reward": 9.857917308807373, "reward_std": 11.83640947341919, "rewards/wrapper/mean": 4.928958788514137, "rewards/wrapper/std": 15.389665246009827, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 2.6, "completions/mean_length": 254.48125, "completions/mean_terminated_length": 2.6, "completions/min_length": 207.4, "completions/min_terminated_length": 2.6, "epoch": 0.330791788856305, "frac_reward_zero_std": 0.05, "grad_norm": 2.921875, "kl": 0.012071310554165392, "learning_rate": 7.5411933513749375e-06, "loss": -0.0037, "num_tokens": 15461157.0, "reward": 11.119661998748779, "reward_std": 11.121147727966308, "rewards/wrapper/mean": 5.5598307564854625, "rewards/wrapper/std": 16.329788361489772, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 59.2, "completions/mean_length": 253.05, "completions/mean_terminated_length": 59.2, "completions/min_length": 161.6, "completions/min_terminated_length": 59.2, "epoch": 0.33196480938416423, "frac_reward_zero_std": 0.025, "grad_norm": 2.125, "kl": 0.011578649946022779, "learning_rate": 7.538514397357817e-06, "loss": -0.0086, "num_tokens": 15517341.0, "reward": 9.766520977020264, "reward_std": 12.796609210968018, "rewards/wrapper/mean": 4.883260330557823, "rewards/wrapper/std": 15.593793278932571, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 33.6, "completions/mean_length": 252.25625, "completions/mean_terminated_length": 17.9, "completions/min_length": 155.8, "completions/min_terminated_length": 2.2, "epoch": 0.33313782991202345, "frac_reward_zero_std": 0.0125, "grad_norm": 5.53125, "kl": 0.004327619081595913, "learning_rate": 7.535826573566306e-06, "loss": -0.011, "num_tokens": 15569564.0, "reward": 10.319400787353516, "reward_std": 13.982878303527832, "rewards/wrapper/mean": 5.1597000800073145, "rewards/wrapper/std": 16.632020924985408, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 16.4, "completions/mean_length": 254.9125, "completions/mean_terminated_length": 16.4, "completions/min_length": 221.2, "completions/min_terminated_length": 16.4, "epoch": 0.3343108504398827, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.009104005433619023, "learning_rate": 7.533129889203364e-06, "loss": -0.0021, "num_tokens": 15622916.0, "reward": 11.106964683532714, "reward_std": 11.095128536224365, "rewards/wrapper/mean": 5.553482050448656, "rewards/wrapper/std": 14.927113994956017, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 44.6, "completions/mean_length": 252.84375, "completions/mean_terminated_length": 40.7, "completions/min_length": 190.4, "completions/min_terminated_length": 36.8, "epoch": 0.33548387096774196, "frac_reward_zero_std": 0.025, "grad_norm": 5.9375, "kl": 0.014619318360928446, "learning_rate": 7.530424353502283e-06, "loss": -0.0096, "num_tokens": 15678809.0, "reward": 13.947904205322265, "reward_std": 15.49264030456543, "rewards/wrapper/mean": 6.973952141404152, "rewards/wrapper/std": 19.970910519361496, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 49.4, "completions/mean_length": 252.75, "completions/mean_terminated_length": 33.0, "completions/min_length": 170.2, "completions/min_terminated_length": 16.6, "epoch": 0.3366568914956012, "frac_reward_zero_std": 0.0125, "grad_norm": 3.046875, "kl": 0.00951800765178632, "learning_rate": 7.527709975726663e-06, "loss": -0.0063, "num_tokens": 15737421.0, "reward": 10.923522877693177, "reward_std": 11.951661324501037, "rewards/wrapper/mean": 5.461761482059956, "rewards/wrapper/std": 15.331962569057941, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 6.2, "completions/mean_length": 254.59375, "completions/mean_terminated_length": 6.2, "completions/min_length": 211.0, "completions/min_terminated_length": 6.2, "epoch": 0.3378299120234604, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.008740646217484027, "learning_rate": 7.5249867651703825e-06, "loss": -0.0029, "num_tokens": 15790706.0, "reward": 11.986185383796691, "reward_std": 13.874305212497712, "rewards/wrapper/mean": 5.993092510849237, "rewards/wrapper/std": 15.854976122826338, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.33900293255131964, "frac_reward_zero_std": 0.025, "grad_norm": 1.3984375, "kl": 0.005966075940523297, "learning_rate": 7.522254731157557e-06, "loss": -0.011, "num_tokens": 15845562.0, "reward": 9.148483896255494, "reward_std": 11.106124210357667, "rewards/wrapper/mean": 4.574241859093308, "rewards/wrapper/std": 15.179227907955646, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.34017595307917886, "frac_reward_zero_std": 0.0375, "grad_norm": 6.84375, "kl": 0.02206381254363805, "learning_rate": 7.519513883042518e-06, "loss": -0.0016, "num_tokens": 15900385.0, "reward": 13.788352870941162, "reward_std": 17.711102962493896, "rewards/wrapper/mean": 6.89417629390955, "rewards/wrapper/std": 20.102886700630187, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 35.4, "completions/mean_length": 255.50625, "completions/mean_terminated_length": 35.4, "completions/min_length": 240.2, "completions/min_terminated_length": 35.4, "epoch": 0.34134897360703814, "frac_reward_zero_std": 0.0125, "grad_norm": 1.796875, "kl": 0.008329333225265145, "learning_rate": 7.516764230209772e-06, "loss": -0.0004, "num_tokens": 15957806.0, "reward": 10.32720980644226, "reward_std": 12.660326385498047, "rewards/wrapper/mean": 5.163604502379894, "rewards/wrapper/std": 14.365816079080105, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 253.675, "completions/mean_terminated_length": 28.0, "completions/min_length": 181.6, "completions/min_terminated_length": 28.0, "epoch": 0.34252199413489737, "frac_reward_zero_std": 0.025, "grad_norm": 1.3671875, "kl": 0.005196410208009183, "learning_rate": 7.514005782073976e-06, "loss": 0.0034, "num_tokens": 16012790.0, "reward": 11.027314805984497, "reward_std": 14.765119647979736, "rewards/wrapper/mean": 5.513657581061125, "rewards/wrapper/std": 18.939475986361504, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 22.4, "completions/mean_length": 247.1625, "completions/mean_terminated_length": 11.233333587646484, "completions/min_length": 104.6, "completions/min_terminated_length": 2.2, "epoch": 0.3436950146627566, "frac_reward_zero_std": 0.025, "grad_norm": 2.296875, "kl": 0.024585752293933182, "learning_rate": 7.5112385480799005e-06, "loss": -0.0166, "num_tokens": 16065540.0, "reward": 10.954252338409423, "reward_std": 12.776335978507996, "rewards/wrapper/mean": 5.477126209437847, "rewards/wrapper/std": 15.678541065752507, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 42.4, "completions/mean_length": 254.125, "completions/mean_terminated_length": 42.4, "completions/min_length": 196.0, "completions/min_terminated_length": 42.4, "epoch": 0.3448680351906158, "frac_reward_zero_std": 0.0, "grad_norm": 5.125, "kl": 0.009685445297509431, "learning_rate": 7.5084625377023954e-06, "loss": -0.0056, "num_tokens": 16121824.0, "reward": 7.70174765586853, "reward_std": 8.262192821502685, "rewards/wrapper/mean": 3.8508735738694666, "rewards/wrapper/std": 14.08769258260727, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.3460410557184751, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9296875, "kl": 0.1006514249893371, "learning_rate": 7.505677760446367e-06, "loss": -0.001, "num_tokens": 16178669.0, "reward": 15.888223457336426, "reward_std": 19.840636253356934, "rewards/wrapper/mean": 7.944112040102482, "rewards/wrapper/std": 21.836421263217925, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.3472140762463343, "frac_reward_zero_std": 0.0125, "grad_norm": 1.859375, "kl": 0.029327043099328876, "learning_rate": 7.502884225846729e-06, "loss": 0.0017, "num_tokens": 16230501.0, "reward": 7.629909253120422, "reward_std": 8.137647867202759, "rewards/wrapper/mean": 3.8149545326828957, "rewards/wrapper/std": 11.658829681575298, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.34838709677419355, "frac_reward_zero_std": 0.05, "grad_norm": 5.625, "kl": 0.01167833567596972, "learning_rate": 7.50008194346839e-06, "loss": -0.0121, "num_tokens": 16287069.0, "reward": 10.761643028259277, "reward_std": 12.282279825210571, "rewards/wrapper/mean": 5.380821162462235, "rewards/wrapper/std": 13.490466183423996, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 43.2, "completions/mean_length": 250.95625, "completions/mean_terminated_length": 42.9, "completions/min_length": 145.0, "completions/min_terminated_length": 42.6, "epoch": 0.3495601173020528, "frac_reward_zero_std": 0.0125, "grad_norm": 2.640625, "kl": 0.015173644432798028, "learning_rate": 7.497270922906204e-06, "loss": -0.0121, "num_tokens": 16342400.0, "reward": 7.586162424087524, "reward_std": 9.830938339233398, "rewards/wrapper/mean": 3.793080995231867, "rewards/wrapper/std": 12.029572662711143, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 2.8, "completions/mean_length": 254.4875, "completions/mean_terminated_length": 2.8, "completions/min_length": 207.6, "completions/min_terminated_length": 2.8, "epoch": 0.350733137829912, "frac_reward_zero_std": 0.025, "grad_norm": 5.25, "kl": 0.008850508206523954, "learning_rate": 7.494451173784947e-06, "loss": -0.0056, "num_tokens": 16398928.0, "reward": 10.645478534698487, "reward_std": 12.192581272125244, "rewards/wrapper/mean": 5.322738918662071, "rewards/wrapper/std": 16.82269820868969, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 63.2, "completions/mean_length": 253.175, "completions/mean_terminated_length": 63.2, "completions/min_length": 165.6, "completions/min_terminated_length": 63.2, "epoch": 0.3519061583577713, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.47186655312543735, "learning_rate": 7.491622705759279e-06, "loss": 0.0119, "num_tokens": 16452334.0, "reward": 13.618064212799073, "reward_std": 16.841291904449463, "rewards/wrapper/mean": 6.809031952917576, "rewards/wrapper/std": 19.131683690845968, "step": 1500 }, { "epoch": 0.3519061583577713, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.845, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 57.44, "eval_completions/mean_length": 233.8025, "eval_completions/mean_terminated_length": 49.33000011444092, "eval_completions/min_length": 185.48, "eval_completions/min_terminated_length": 42.12, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.011634215260855854, "eval_loss": -0.030341310426592827, "eval_num_tokens": 16452334.0, "eval_reward": 0.40629449486732483, "eval_reward_std": 0.2625969736929983, "eval_rewards/wrapper/mean": 0.2031472486257553, "eval_rewards/wrapper/std": 0.21003973964601755, "eval_runtime": 207.7988, "eval_samples_per_second": 0.962, "eval_steps_per_second": 0.241, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 249.63125, "completions/mean_terminated_length": 0.7, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.3530791788856305, "frac_reward_zero_std": 0.0125, "grad_norm": 2.8125, "kl": 0.008165108982939272, "learning_rate": 7.488785528513715e-06, "loss": -0.0143, "num_tokens": 16509875.0, "reward": 9.125647592544556, "reward_std": 12.147355389595031, "rewards/wrapper/mean": 4.562823601812124, "rewards/wrapper/std": 14.803350380063057, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.35425219941348973, "frac_reward_zero_std": 0.0125, "grad_norm": 1.8671875, "kl": 0.02630561958067119, "learning_rate": 7.485939651762588e-06, "loss": -0.0074, "num_tokens": 16568078.0, "reward": 8.876133251190186, "reward_std": 10.792211532592773, "rewards/wrapper/mean": 4.438066463172436, "rewards/wrapper/std": 12.372373120486737, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 90.6, "completions/mean_length": 250.8375, "completions/mean_terminated_length": 69.5, "completions/min_length": 99.6, "completions/min_terminated_length": 48.4, "epoch": 0.35542521994134896, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.007182114117313176, "learning_rate": 7.483085085250019e-06, "loss": -0.0057, "num_tokens": 16622066.0, "reward": 10.847808790206908, "reward_std": 13.713920974731446, "rewards/wrapper/mean": 5.423904552310705, "rewards/wrapper/std": 17.696707151830196, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.35659824046920824, "frac_reward_zero_std": 0.05, "grad_norm": 2.296875, "kl": 1.0365399254136718, "learning_rate": 7.480221838749882e-06, "loss": 0.0364, "num_tokens": 16682705.0, "reward": 7.309674024581909, "reward_std": 9.755594110488891, "rewards/wrapper/mean": 3.6548370026051997, "rewards/wrapper/std": 10.389201259613037, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 46.6, "completions/mean_length": 252.66875, "completions/mean_terminated_length": 23.6, "completions/min_length": 154.2, "completions/min_terminated_length": 0.6, "epoch": 0.35777126099706746, "frac_reward_zero_std": 0.0125, "grad_norm": 2.234375, "kl": 0.007430961437057704, "learning_rate": 7.477349922065771e-06, "loss": -0.0063, "num_tokens": 16735422.0, "reward": 14.080436992645264, "reward_std": 16.378932380676268, "rewards/wrapper/mean": 7.040218336880207, "rewards/wrapper/std": 18.554736307263376, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 250.8875, "completions/mean_terminated_length": 41.0, "completions/min_length": 143.4, "completions/min_terminated_length": 41.0, "epoch": 0.3589442815249267, "frac_reward_zero_std": 0.0125, "grad_norm": 1.953125, "kl": 0.007186479715164751, "learning_rate": 7.474469345030966e-06, "loss": -0.0064, "num_tokens": 16787668.0, "reward": 6.7414408206939695, "reward_std": 8.522413969039917, "rewards/wrapper/mean": 3.370720238983631, "rewards/wrapper/std": 10.809870810806752, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 254.8375, "completions/mean_terminated_length": 14.0, "completions/min_length": 218.8, "completions/min_terminated_length": 14.0, "epoch": 0.3601173020527859, "frac_reward_zero_std": 0.0375, "grad_norm": 5.6875, "kl": 0.015209858620073647, "learning_rate": 7.471580117508398e-06, "loss": 0.0033, "num_tokens": 16843118.0, "reward": 9.573083400726318, "reward_std": 12.79993715286255, "rewards/wrapper/mean": 4.78654208779335, "rewards/wrapper/std": 15.178947728872298, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.36129032258064514, "frac_reward_zero_std": 0.0125, "grad_norm": 4.03125, "kl": 0.06851877669105307, "learning_rate": 7.468682249390621e-06, "loss": 0.0002, "num_tokens": 16896579.0, "reward": 7.265839624404907, "reward_std": 9.448273944854737, "rewards/wrapper/mean": 3.632919803261757, "rewards/wrapper/std": 11.570318593084812, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 151.8, "completions/mean_length": 252.75, "completions/mean_terminated_length": 137.6, "completions/min_length": 174.6, "completions/min_terminated_length": 123.4, "epoch": 0.3624633431085044, "frac_reward_zero_std": 0.025, "grad_norm": 2.28125, "kl": 0.011325775182922371, "learning_rate": 7.465775750599767e-06, "loss": -0.0059, "num_tokens": 16950423.0, "reward": 11.469275760650635, "reward_std": 12.777053165435792, "rewards/wrapper/mean": 5.734637747704983, "rewards/wrapper/std": 16.157272858917715, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 33.8, "completions/mean_length": 252.25625, "completions/mean_terminated_length": 33.8, "completions/min_length": 136.2, "completions/min_terminated_length": 33.8, "epoch": 0.36363636363636365, "frac_reward_zero_std": 0.0125, "grad_norm": 2.796875, "kl": 0.007997657801024616, "learning_rate": 7.462860631087526e-06, "loss": -0.0103, "num_tokens": 17006628.0, "reward": 11.788213729858398, "reward_std": 15.076388835906982, "rewards/wrapper/mean": 5.894107177108526, "rewards/wrapper/std": 17.494142431020737, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 253.83125, "completions/mean_terminated_length": 33.0, "completions/min_length": 186.6, "completions/min_terminated_length": 33.0, "epoch": 0.36480938416422287, "frac_reward_zero_std": 0.0, "grad_norm": 0.9296875, "kl": 0.006896321999374777, "learning_rate": 7.459936900835101e-06, "loss": -0.0029, "num_tokens": 17061199.0, "reward": 15.098381996154785, "reward_std": 20.83281021118164, "rewards/wrapper/mean": 7.549191132187843, "rewards/wrapper/std": 22.724813936650754, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 255.2125, "completions/mean_terminated_length": 26.0, "completions/min_length": 230.8, "completions/min_terminated_length": 26.0, "epoch": 0.3659824046920821, "frac_reward_zero_std": 0.0375, "grad_norm": 4.75, "kl": 0.012470835470594466, "learning_rate": 7.45700456985318e-06, "loss": -0.0012, "num_tokens": 17115033.0, "reward": 12.148631858825684, "reward_std": 15.902271842956543, "rewards/wrapper/mean": 6.0743159070611, "rewards/wrapper/std": 15.158780360221863, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 51.8, "completions/mean_length": 252.81875, "completions/mean_terminated_length": 51.8, "completions/min_length": 154.2, "completions/min_terminated_length": 51.8, "epoch": 0.3671554252199413, "frac_reward_zero_std": 0.0375, "grad_norm": 1.8203125, "kl": 0.006677528831642121, "learning_rate": 7.454063648181896e-06, "loss": -0.0029, "num_tokens": 17168222.0, "reward": 13.805829715728759, "reward_std": 15.897808837890626, "rewards/wrapper/mean": 6.902914525568486, "rewards/wrapper/std": 22.55402392446995, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 29.2, "completions/mean_length": 248.925, "completions/mean_terminated_length": 14.9, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.3683284457478006, "frac_reward_zero_std": 0.025, "grad_norm": 2.71875, "kl": 0.021633249043952675, "learning_rate": 7.451114145890799e-06, "loss": -0.0192, "num_tokens": 17222840.0, "reward": 9.334974765777588, "reward_std": 12.756710720062255, "rewards/wrapper/mean": 4.667487615346909, "rewards/wrapper/std": 15.754800505936146, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 14.4, "completions/mean_length": 253.25, "completions/mean_terminated_length": 14.4, "completions/min_length": 168.0, "completions/min_terminated_length": 14.4, "epoch": 0.36950146627565983, "frac_reward_zero_std": 0.0, "grad_norm": 4.875, "kl": 0.0058940518880262974, "learning_rate": 7.448156073078817e-06, "loss": -0.0087, "num_tokens": 17275464.0, "reward": 9.2894437789917, "reward_std": 11.119479942321778, "rewards/wrapper/mean": 4.644721812009811, "rewards/wrapper/std": 12.392537288367748, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 12.6, "completions/mean_length": 253.2, "completions/mean_terminated_length": 6.4, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.37067448680351905, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.008174310147296637, "learning_rate": 7.445189439874223e-06, "loss": -0.0035, "num_tokens": 17331486.0, "reward": 9.974001216888428, "reward_std": 12.6971941947937, "rewards/wrapper/mean": 4.987000489979982, "rewards/wrapper/std": 15.624969989061356, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.3718475073313783, "frac_reward_zero_std": 0.05, "grad_norm": 8.0625, "kl": 0.010091668064706027, "learning_rate": 7.442214256434603e-06, "loss": 0.0004, "num_tokens": 17387346.0, "reward": 6.865322303771973, "reward_std": 8.876708436012269, "rewards/wrapper/mean": 3.432661159336567, "rewards/wrapper/std": 11.23374333679676, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 44.4, "completions/mean_length": 254.1875, "completions/mean_terminated_length": 44.4, "completions/min_length": 198.0, "completions/min_terminated_length": 44.4, "epoch": 0.37302052785923756, "frac_reward_zero_std": 0.025, "grad_norm": 2.28125, "kl": 0.007138772174948826, "learning_rate": 7.439230532946815e-06, "loss": -0.0055, "num_tokens": 17439276.0, "reward": 9.094755506515503, "reward_std": 11.259793186187744, "rewards/wrapper/mean": 4.5473778083920475, "rewards/wrapper/std": 13.361431784927845, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 252.81875, "completions/mean_terminated_length": 0.6, "completions/min_length": 154.2, "completions/min_terminated_length": 0.6, "epoch": 0.3741935483870968, "frac_reward_zero_std": 0.0125, "grad_norm": 4.90625, "kl": 0.006641448987647891, "learning_rate": 7.436238279626959e-06, "loss": -0.0091, "num_tokens": 17492901.0, "reward": 9.107149982452393, "reward_std": 12.01515827178955, "rewards/wrapper/mean": 4.553574965894223, "rewards/wrapper/std": 13.188546454906463, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 5.2, "completions/mean_length": 251.36875, "completions/mean_terminated_length": 2.8, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.375366568914956, "frac_reward_zero_std": 0.0125, "grad_norm": 5.21875, "kl": 0.0073716026323381815, "learning_rate": 7.433237506720342e-06, "loss": -0.0184, "num_tokens": 17546222.0, "reward": 11.634984397888184, "reward_std": 12.310375213623047, "rewards/wrapper/mean": 5.81749247610569, "rewards/wrapper/std": 17.150740154087543, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.37653958944281524, "frac_reward_zero_std": 0.0375, "grad_norm": 2.0, "kl": 0.007083708542631939, "learning_rate": 7.430228224501438e-06, "loss": -0.0058, "num_tokens": 17601677.0, "reward": 7.806136894226074, "reward_std": 8.510241031646729, "rewards/wrapper/mean": 3.903068270534277, "rewards/wrapper/std": 14.243344616889953, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 10.2, "completions/mean_length": 254.71875, "completions/mean_terminated_length": 10.2, "completions/min_length": 215.0, "completions/min_terminated_length": 10.2, "epoch": 0.37771260997067446, "frac_reward_zero_std": 0.0375, "grad_norm": 5.8125, "kl": 0.011267528822645544, "learning_rate": 7.427210443273859e-06, "loss": -0.003, "num_tokens": 17655154.0, "reward": 11.17388744354248, "reward_std": 12.301093673706054, "rewards/wrapper/mean": 5.586943505704403, "rewards/wrapper/std": 16.70587693154812, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 251.1625, "completions/mean_terminated_length": 50.0, "completions/min_length": 101.2, "completions/min_terminated_length": 50.0, "epoch": 0.37888563049853374, "frac_reward_zero_std": 0.05, "grad_norm": 1.671875, "kl": 0.007835835078731179, "learning_rate": 7.424184173370319e-06, "loss": 0.0061, "num_tokens": 17708750.0, "reward": 13.72214469909668, "reward_std": 15.858495712280273, "rewards/wrapper/mean": 6.861072225868702, "rewards/wrapper/std": 21.06146321594715, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 254.33125, "completions/mean_terminated_length": 49.0, "completions/min_length": 202.6, "completions/min_terminated_length": 49.0, "epoch": 0.38005865102639297, "frac_reward_zero_std": 0.0125, "grad_norm": 4.90625, "kl": 0.0077929800259880725, "learning_rate": 7.421149425152591e-06, "loss": -0.0013, "num_tokens": 17763259.0, "reward": 9.450750017166138, "reward_std": 9.001422214508057, "rewards/wrapper/mean": 4.725374779850244, "rewards/wrapper/std": 13.301532693952321, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 57.8, "completions/mean_length": 253.29375, "completions/mean_terminated_length": 48.1, "completions/min_length": 192.0, "completions/min_terminated_length": 38.4, "epoch": 0.3812316715542522, "frac_reward_zero_std": 0.0125, "grad_norm": 1.1640625, "kl": 0.010976305836811662, "learning_rate": 7.418106209011485e-06, "loss": 0.0035, "num_tokens": 17816420.0, "reward": 8.98981170654297, "reward_std": 12.001905918121338, "rewards/wrapper/mean": 4.494905859231949, "rewards/wrapper/std": 14.715555727481842, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 1.8, "completions/mean_length": 249.6625, "completions/mean_terminated_length": 1.3, "completions/min_length": 103.2, "completions/min_terminated_length": 0.8, "epoch": 0.3824046920821114, "frac_reward_zero_std": 0.025, "grad_norm": 4.75, "kl": 0.007970065460540354, "learning_rate": 7.415054535366797e-06, "loss": -0.0185, "num_tokens": 17871970.0, "reward": 15.410069465637207, "reward_std": 18.368908309936522, "rewards/wrapper/mean": 7.705034771561623, "rewards/wrapper/std": 21.16511830240488, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 251.19375, "completions/mean_terminated_length": 80.2, "completions/min_length": 118.6, "completions/min_terminated_length": 67.4, "epoch": 0.3835777126099707, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.03646875837584958, "learning_rate": 7.411994414667286e-06, "loss": -0.0052, "num_tokens": 17926495.0, "reward": 11.289712238311768, "reward_std": 12.078022670745849, "rewards/wrapper/mean": 5.644856164604425, "rewards/wrapper/std": 15.732601109147073, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 247.1, "completions/mean_terminated_length": 15.0, "completions/min_length": 65.2, "completions/min_terminated_length": 14.0, "epoch": 0.3847507331378299, "frac_reward_zero_std": 0.025, "grad_norm": 2.359375, "kl": 0.010182082024402916, "learning_rate": 7.4089258573906325e-06, "loss": -0.0291, "num_tokens": 17982111.0, "reward": 10.671856796741485, "reward_std": 14.430114448070526, "rewards/wrapper/mean": 5.335928474366665, "rewards/wrapper/std": 17.302095092833042, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 22.0, "completions/mean_length": 253.49375, "completions/mean_terminated_length": 11.1, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.38592375366568915, "frac_reward_zero_std": 0.0125, "grad_norm": 2.28125, "kl": 0.027646390511654316, "learning_rate": 7.4058488740434015e-06, "loss": -0.0053, "num_tokens": 18035278.0, "reward": 11.211683654785157, "reward_std": 12.046236991882324, "rewards/wrapper/mean": 5.605841771513224, "rewards/wrapper/std": 16.439643205702303, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 26.8, "completions/mean_length": 253.6375, "completions/mean_terminated_length": 26.8, "completions/min_length": 180.4, "completions/min_terminated_length": 26.8, "epoch": 0.3870967741935484, "frac_reward_zero_std": 0.0, "grad_norm": 19.125, "kl": 0.023396419017808513, "learning_rate": 7.402763475161009e-06, "loss": 0.005, "num_tokens": 18090086.0, "reward": 10.315212440490722, "reward_std": 10.757926654815673, "rewards/wrapper/mean": 5.157606067508459, "rewards/wrapper/std": 16.255353631079196, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 46.2, "completions/mean_length": 252.64375, "completions/mean_terminated_length": 46.2, "completions/min_length": 148.6, "completions/min_terminated_length": 46.2, "epoch": 0.3882697947214076, "frac_reward_zero_std": 0.0375, "grad_norm": 1.7578125, "kl": 0.012765820871572941, "learning_rate": 7.3996696713076875e-06, "loss": 0.0038, "num_tokens": 18144855.0, "reward": 9.633971977233887, "reward_std": 12.852963256835938, "rewards/wrapper/mean": 4.81698562502861, "rewards/wrapper/std": 14.805128015577793, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 41.8, "completions/mean_length": 247.71875, "completions/mean_terminated_length": 21.3, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.3894428152492669, "frac_reward_zero_std": 0.0625, "grad_norm": 1.5703125, "kl": 0.008341539406683297, "learning_rate": 7.3965674730764436e-06, "loss": -0.0171, "num_tokens": 18199476.0, "reward": 12.23201961517334, "reward_std": 14.92741813659668, "rewards/wrapper/mean": 6.116009667515755, "rewards/wrapper/std": 18.56237207353115, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.3906158357771261, "frac_reward_zero_std": 0.0375, "grad_norm": 3.09375, "kl": 0.008628571312874556, "learning_rate": 7.393456891089031e-06, "loss": -0.0047, "num_tokens": 18254951.0, "reward": 13.272594833374024, "reward_std": 16.658662605285645, "rewards/wrapper/mean": 6.636297233402729, "rewards/wrapper/std": 19.94486008733511, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 16.4, "completions/mean_length": 251.71875, "completions/mean_terminated_length": 8.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.39178885630498533, "frac_reward_zero_std": 0.0125, "grad_norm": 1.46875, "kl": 0.015108229930046947, "learning_rate": 7.3903379359959035e-06, "loss": -0.0113, "num_tokens": 18309606.0, "reward": 11.978076171875, "reward_std": 15.562600898742676, "rewards/wrapper/mean": 5.989037749916315, "rewards/wrapper/std": 17.648951482772826, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 250.7625, "completions/mean_terminated_length": 58.66666717529297, "completions/min_length": 146.4, "completions/min_terminated_length": 44.0, "epoch": 0.39296187683284456, "frac_reward_zero_std": 0.025, "grad_norm": 1.859375, "kl": 0.009045041864737868, "learning_rate": 7.387210618476187e-06, "loss": -0.0051, "num_tokens": 18364170.0, "reward": 7.711848163604737, "reward_std": 10.005695056915282, "rewards/wrapper/mean": 3.8559240214526653, "rewards/wrapper/std": 13.164579983055592, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 56.2, "completions/mean_length": 253.86875, "completions/mean_terminated_length": 55.5, "completions/min_length": 208.4, "completions/min_terminated_length": 54.8, "epoch": 0.39413489736070384, "frac_reward_zero_std": 0.0, "grad_norm": 4.25, "kl": 0.010809017776045949, "learning_rate": 7.38407494923764e-06, "loss": -0.0047, "num_tokens": 18419115.0, "reward": 11.691289234161378, "reward_std": 14.441447448730468, "rewards/wrapper/mean": 5.845644051581621, "rewards/wrapper/std": 17.613725888729096, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 28.4, "completions/mean_length": 247.5375, "completions/mean_terminated_length": 18.2, "completions/min_length": 110.4, "completions/min_terminated_length": 8.0, "epoch": 0.39530791788856307, "frac_reward_zero_std": 0.0375, "grad_norm": 1.4375, "kl": 0.006498944421764463, "learning_rate": 7.380930939016617e-06, "loss": -0.0026, "num_tokens": 18470913.0, "reward": 11.800182819366455, "reward_std": 13.530944919586181, "rewards/wrapper/mean": 5.900091470777989, "rewards/wrapper/std": 14.850279198586941, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 251.83125, "completions/mean_terminated_length": 43.5, "completions/min_length": 124.4, "completions/min_terminated_length": 22.0, "epoch": 0.3964809384164223, "frac_reward_zero_std": 0.0125, "grad_norm": 3.53125, "kl": 0.0053620882332324985, "learning_rate": 7.377778598578028e-06, "loss": -0.015, "num_tokens": 18523112.0, "reward": 11.4480149269104, "reward_std": 15.00893726348877, "rewards/wrapper/mean": 5.724007427692413, "rewards/wrapper/std": 17.372301462292672, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.3976539589442815, "frac_reward_zero_std": 0.0125, "grad_norm": 1.2734375, "kl": 0.017944533191621304, "learning_rate": 7.37461793871531e-06, "loss": 0.0032, "num_tokens": 18578531.0, "reward": 6.788930177688599, "reward_std": 8.851951217651367, "rewards/wrapper/mean": 3.3944652788341045, "rewards/wrapper/std": 11.30336948186159, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 31.4, "completions/mean_length": 255.38125, "completions/mean_terminated_length": 31.4, "completions/min_length": 236.2, "completions/min_terminated_length": 31.4, "epoch": 0.39882697947214074, "frac_reward_zero_std": 0.0125, "grad_norm": 2.015625, "kl": 0.009708100673742592, "learning_rate": 7.371448970250383e-06, "loss": 0.0022, "num_tokens": 18634028.0, "reward": 11.817505073547363, "reward_std": 14.400478649139405, "rewards/wrapper/mean": 5.908752170950175, "rewards/wrapper/std": 17.49454737752676, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 79.6, "completions/mean_length": 248.90625, "completions/mean_terminated_length": 33.93333435058594, "completions/min_length": 104.6, "completions/min_terminated_length": 2.2, "epoch": 0.4, "frac_reward_zero_std": 0.025, "grad_norm": 6.4375, "kl": 0.013681471673771739, "learning_rate": 7.368271704033615e-06, "loss": -0.0172, "num_tokens": 18687007.0, "reward": 8.214220666885376, "reward_std": 11.032522630691528, "rewards/wrapper/mean": 4.107110323756933, "rewards/wrapper/std": 13.84426678419113, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 59.4, "completions/mean_length": 251.46875, "completions/mean_terminated_length": 29.9, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.40117302052785925, "frac_reward_zero_std": 0.025, "grad_norm": 6.0, "kl": 0.0418191681150347, "learning_rate": 7.365086150943786e-06, "loss": -0.0087, "num_tokens": 18746950.0, "reward": 13.3557297706604, "reward_std": 14.804699611663818, "rewards/wrapper/mean": 6.677864947915078, "rewards/wrapper/std": 18.732959206402302, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 2.4, "completions/mean_length": 252.875, "completions/mean_terminated_length": 2.4, "completions/min_length": 156.0, "completions/min_terminated_length": 2.4, "epoch": 0.4023460410557185, "frac_reward_zero_std": 0.025, "grad_norm": 3.21875, "kl": 0.012403641873970628, "learning_rate": 7.3618923218880465e-06, "loss": -0.0037, "num_tokens": 18802882.0, "reward": 6.81656813621521, "reward_std": 8.471649742126464, "rewards/wrapper/mean": 3.408283967524767, "rewards/wrapper/std": 10.526500597596169, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 253.89375, "completions/mean_terminated_length": 35.0, "completions/min_length": 188.6, "completions/min_terminated_length": 35.0, "epoch": 0.4035190615835777, "frac_reward_zero_std": 0.0125, "grad_norm": 1.53125, "kl": 0.013926792331039906, "learning_rate": 7.35869022780189e-06, "loss": -0.0055, "num_tokens": 18857201.0, "reward": 10.40798740386963, "reward_std": 11.334413433074952, "rewards/wrapper/mean": 5.203993559628725, "rewards/wrapper/std": 17.202790300548077, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.4046920821114369, "frac_reward_zero_std": 0.05, "grad_norm": 2.609375, "kl": 0.0062289016088470815, "learning_rate": 7.355479879649102e-06, "loss": -0.0048, "num_tokens": 18911494.0, "reward": 9.848146200180054, "reward_std": 10.52330822944641, "rewards/wrapper/mean": 4.924072936177254, "rewards/wrapper/std": 12.834916192293168, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 17.2, "completions/mean_length": 253.3375, "completions/mean_terminated_length": 17.2, "completions/min_length": 170.8, "completions/min_terminated_length": 17.2, "epoch": 0.4058651026392962, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.009612555714556947, "learning_rate": 7.352261288421734e-06, "loss": 0.0099, "num_tokens": 18965412.0, "reward": 14.73887882232666, "reward_std": 18.674267578125, "rewards/wrapper/mean": 7.369439592212439, "rewards/wrapper/std": 19.213176207244395, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.40703812316715543, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.013163172616623342, "learning_rate": 7.349034465140059e-06, "loss": -0.0045, "num_tokens": 19024361.0, "reward": 12.89403257369995, "reward_std": 14.669578742980956, "rewards/wrapper/mean": 6.447016255557537, "rewards/wrapper/std": 18.42922862917185, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 12.6, "completions/mean_length": 251.59375, "completions/mean_terminated_length": 12.6, "completions/min_length": 115.0, "completions/min_terminated_length": 12.6, "epoch": 0.40821114369501466, "frac_reward_zero_std": 0.025, "grad_norm": 3.40625, "kl": 0.01098422622308135, "learning_rate": 7.345799420852538e-06, "loss": -0.0102, "num_tokens": 19081464.0, "reward": 6.755143082141876, "reward_std": 9.144136524200439, "rewards/wrapper/mean": 3.3775712579488752, "rewards/wrapper/std": 10.79396327584982, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 17.8, "completions/mean_length": 254.95625, "completions/mean_terminated_length": 17.8, "completions/min_length": 222.6, "completions/min_terminated_length": 17.8, "epoch": 0.4093841642228739, "frac_reward_zero_std": 0.0625, "grad_norm": 1.0234375, "kl": 0.016762871819082648, "learning_rate": 7.342556166635778e-06, "loss": 0.003, "num_tokens": 19139861.0, "reward": 11.431461834907532, "reward_std": 14.701215863227844, "rewards/wrapper/mean": 5.715730750560761, "rewards/wrapper/std": 18.74812933206558, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 253.26875, "completions/mean_terminated_length": 15.0, "completions/min_length": 168.6, "completions/min_terminated_length": 15.0, "epoch": 0.41055718475073316, "frac_reward_zero_std": 0.0375, "grad_norm": 3.0, "kl": 0.02420689011923969, "learning_rate": 7.3393047135944975e-06, "loss": -0.0026, "num_tokens": 19193342.0, "reward": 10.389392566680907, "reward_std": 11.804220390319824, "rewards/wrapper/mean": 5.194696100801229, "rewards/wrapper/std": 15.313809236884117, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.4, "completions/mean_length": 251.65625, "completions/mean_terminated_length": 7.5, "completions/min_length": 154.2, "completions/min_terminated_length": 0.6, "epoch": 0.4117302052785924, "frac_reward_zero_std": 0.0125, "grad_norm": 1.6484375, "kl": 0.008320258150342852, "learning_rate": 7.336045072861489e-06, "loss": -0.0107, "num_tokens": 19248133.0, "reward": 12.969413948059081, "reward_std": 17.84697332382202, "rewards/wrapper/mean": 6.484706741571427, "rewards/wrapper/std": 17.889204749464987, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 27.2, "completions/mean_length": 252.05625, "completions/mean_terminated_length": 22.5, "completions/min_length": 171.4, "completions/min_terminated_length": 17.8, "epoch": 0.4129032258064516, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.006936083594337106, "learning_rate": 7.332777255597575e-06, "loss": -0.0072, "num_tokens": 19300942.0, "reward": 9.992176389694214, "reward_std": 10.995404851436614, "rewards/wrapper/mean": 4.996087930724025, "rewards/wrapper/std": 13.14226104170084, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 45.8, "completions/mean_length": 251.03125, "completions/mean_terminated_length": 45.8, "completions/min_length": 97.0, "completions/min_terminated_length": 45.8, "epoch": 0.41407624633431084, "frac_reward_zero_std": 0.05, "grad_norm": 1.578125, "kl": 0.008393231546506286, "learning_rate": 7.3295012729915785e-06, "loss": -0.0081, "num_tokens": 19355175.0, "reward": 14.870281219482422, "reward_std": 18.440231704711913, "rewards/wrapper/mean": 7.435140260308981, "rewards/wrapper/std": 21.349429170787335, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 250.5375, "completions/mean_terminated_length": 30.0, "completions/min_length": 81.2, "completions/min_terminated_length": 30.0, "epoch": 0.41524926686217006, "frac_reward_zero_std": 0.0125, "grad_norm": 2.453125, "kl": 0.007583037635777145, "learning_rate": 7.326217136260277e-06, "loss": -0.0148, "num_tokens": 19407093.0, "reward": 12.376422429084778, "reward_std": 14.544963467121125, "rewards/wrapper/mean": 6.18821112215519, "rewards/wrapper/std": 15.661508214473724, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 253.7375, "completions/mean_terminated_length": 30.0, "completions/min_length": 183.6, "completions/min_terminated_length": 30.0, "epoch": 0.41642228739002934, "frac_reward_zero_std": 0.0125, "grad_norm": 1.8515625, "kl": 0.010185779200401156, "learning_rate": 7.322924856648371e-06, "loss": -0.0048, "num_tokens": 19461935.0, "reward": 11.91078872680664, "reward_std": 16.30748119354248, "rewards/wrapper/mean": 5.955394900590181, "rewards/wrapper/std": 19.23618437051773, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 251.625, "completions/mean_terminated_length": 13.6, "completions/min_length": 116.0, "completions/min_terminated_length": 13.6, "epoch": 0.41759530791788857, "frac_reward_zero_std": 0.0125, "grad_norm": 2.09375, "kl": 0.17002868838608265, "learning_rate": 7.319624445428436e-06, "loss": 0.005, "num_tokens": 19520653.0, "reward": 12.678552627563477, "reward_std": 16.436040306091307, "rewards/wrapper/mean": 6.339276467263699, "rewards/wrapper/std": 21.000767435133458, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 72.2, "completions/mean_length": 252.54375, "completions/mean_terminated_length": 58.6, "completions/min_length": 147.4, "completions/min_terminated_length": 45.0, "epoch": 0.4187683284457478, "frac_reward_zero_std": 0.05, "grad_norm": 1.6328125, "kl": 0.00664262983482331, "learning_rate": 7.316315913900893e-06, "loss": -0.0056, "num_tokens": 19573212.0, "reward": 7.194902086257935, "reward_std": 9.6545166015625, "rewards/wrapper/mean": 3.5974511459469793, "rewards/wrapper/std": 12.101305271685124, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 42.8, "completions/mean_length": 252.5625, "completions/mean_terminated_length": 21.9, "completions/min_length": 154.6, "completions/min_terminated_length": 1.0, "epoch": 0.419941348973607, "frac_reward_zero_std": 0.0375, "grad_norm": 6.5, "kl": 0.0394431886379607, "learning_rate": 7.312999273393968e-06, "loss": -0.0076, "num_tokens": 19627970.0, "reward": 7.889170932769775, "reward_std": 10.530238914489747, "rewards/wrapper/mean": 3.9445855379104615, "rewards/wrapper/std": 13.939225174486637, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 24.6, "completions/mean_length": 255.16875, "completions/mean_terminated_length": 24.6, "completions/min_length": 229.4, "completions/min_terminated_length": 24.6, "epoch": 0.4211143695014663, "frac_reward_zero_std": 0.0125, "grad_norm": 5.5, "kl": 0.009630634234054015, "learning_rate": 7.309674535263649e-06, "loss": -0.0015, "num_tokens": 19685435.0, "reward": 8.898407530784606, "reward_std": 11.805877017974854, "rewards/wrapper/mean": 4.4492038011550905, "rewards/wrapper/std": 14.308778963983059, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 1.4, "completions/mean_length": 254.44375, "completions/mean_terminated_length": 1.4, "completions/min_length": 206.2, "completions/min_terminated_length": 1.4, "epoch": 0.4222873900293255, "frac_reward_zero_std": 0.0375, "grad_norm": 7.90625, "kl": 0.02276097269495949, "learning_rate": 7.3063417108936525e-06, "loss": 0.0015, "num_tokens": 19742088.0, "reward": 6.068336296081543, "reward_std": 7.929187393188476, "rewards/wrapper/mean": 3.034168167412281, "rewards/wrapper/std": 11.404951086640358, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 250.9, "completions/mean_terminated_length": 14.4, "completions/min_length": 156.0, "completions/min_terminated_length": 2.4, "epoch": 0.42346041055718475, "frac_reward_zero_std": 0.0125, "grad_norm": 2.96875, "kl": 0.019500624504871666, "learning_rate": 7.3030008116953775e-06, "loss": -0.0126, "num_tokens": 19797162.0, "reward": 6.141694736480713, "reward_std": 8.104972839355469, "rewards/wrapper/mean": 3.07084731683135, "rewards/wrapper/std": 11.21379586905241, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 143.2, "completions/mean_length": 249.61875, "completions/mean_terminated_length": 112.73333435058593, "completions/min_length": 92.2, "completions/min_terminated_length": 92.2, "epoch": 0.424633431085044, "frac_reward_zero_std": 0.05, "grad_norm": 4.21875, "kl": 0.23462779039982706, "learning_rate": 7.299651849107875e-06, "loss": -0.0037, "num_tokens": 19854501.0, "reward": 8.947885489463806, "reward_std": 11.50197262763977, "rewards/wrapper/mean": 4.473942489922047, "rewards/wrapper/std": 14.429775257408618, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 251.075, "completions/mean_terminated_length": 29.3, "completions/min_length": 114.0, "completions/min_terminated_length": 11.6, "epoch": 0.4258064516129032, "frac_reward_zero_std": 0.025, "grad_norm": 7.21875, "kl": 0.012706667324528098, "learning_rate": 7.296294834597802e-06, "loss": 0.0008, "num_tokens": 19910181.0, "reward": 7.768388175964356, "reward_std": 9.072725200653077, "rewards/wrapper/mean": 3.884194038808346, "rewards/wrapper/std": 12.262552881240845, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 9.4, "completions/mean_length": 253.09375, "completions/mean_terminated_length": 9.4, "completions/min_length": 163.0, "completions/min_terminated_length": 9.4, "epoch": 0.4269794721407625, "frac_reward_zero_std": 0.0125, "grad_norm": 4.4375, "kl": 0.02032384374178946, "learning_rate": 7.292929779659388e-06, "loss": -0.0085, "num_tokens": 19966670.0, "reward": 7.567059135437011, "reward_std": 10.188446760177612, "rewards/wrapper/mean": 3.783529528230429, "rewards/wrapper/std": 12.873230685293674, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 17.0, "completions/mean_length": 251.73125, "completions/mean_terminated_length": 17.0, "completions/min_length": 119.4, "completions/min_terminated_length": 17.0, "epoch": 0.4281524926686217, "frac_reward_zero_std": 0.025, "grad_norm": 5.6875, "kl": 0.02145702773705125, "learning_rate": 7.289556695814387e-06, "loss": -0.012, "num_tokens": 20023081.0, "reward": 9.922704219818115, "reward_std": 13.515721893310547, "rewards/wrapper/mean": 4.961352105438709, "rewards/wrapper/std": 16.33309898674488, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.42932551319648093, "frac_reward_zero_std": 0.05, "grad_norm": 1.0625, "kl": 0.007976790505927056, "learning_rate": 7.28617559461205e-06, "loss": -0.0047, "num_tokens": 20079482.0, "reward": 6.184163045883179, "reward_std": 6.993026924133301, "rewards/wrapper/mean": 3.092081458866596, "rewards/wrapper/std": 8.93438842445612, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 77.4, "completions/mean_length": 249.44375, "completions/mean_terminated_length": 54.2, "completions/min_length": 82.2, "completions/min_terminated_length": 31.0, "epoch": 0.43049853372434016, "frac_reward_zero_std": 0.0125, "grad_norm": 4.09375, "kl": 0.010702864232007414, "learning_rate": 7.2827864876290725e-06, "loss": -0.0112, "num_tokens": 20133555.0, "reward": 12.384488344192505, "reward_std": 16.911638498306274, "rewards/wrapper/mean": 6.192244322597981, "rewards/wrapper/std": 19.98247754573822, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 250.1625, "completions/mean_terminated_length": 9.2, "completions/min_length": 104.8, "completions/min_terminated_length": 2.4, "epoch": 0.43167155425219944, "frac_reward_zero_std": 0.025, "grad_norm": 2.203125, "kl": 8.578012371703517, "learning_rate": 7.2793893864695675e-06, "loss": 0.3315, "num_tokens": 20187869.0, "reward": 12.463616633415223, "reward_std": 16.961518478393554, "rewards/wrapper/mean": 6.231808027625084, "rewards/wrapper/std": 20.683779430389404, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 27.6, "completions/mean_length": 250.46875, "completions/mean_terminated_length": 20.0, "completions/min_length": 114.8, "completions/min_terminated_length": 12.4, "epoch": 0.43284457478005867, "frac_reward_zero_std": 0.025, "grad_norm": 3.171875, "kl": 0.03384362782817334, "learning_rate": 7.275984302765016e-06, "loss": -0.0105, "num_tokens": 20244016.0, "reward": 11.264978408813477, "reward_std": 12.805488967895508, "rewards/wrapper/mean": 5.63248887732625, "rewards/wrapper/std": 17.072499746084212, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 33.4, "completions/mean_length": 255.44375, "completions/mean_terminated_length": 33.4, "completions/min_length": 238.2, "completions/min_terminated_length": 33.4, "epoch": 0.4340175953079179, "frac_reward_zero_std": 0.0125, "grad_norm": 2.375, "kl": 0.008770254044793546, "learning_rate": 7.272571248174231e-06, "loss": 0.0012, "num_tokens": 20297411.0, "reward": 11.228783702850341, "reward_std": 14.728189849853516, "rewards/wrapper/mean": 5.614392015337944, "rewards/wrapper/std": 16.68226896971464, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.4351906158357771, "frac_reward_zero_std": 0.025, "grad_norm": 7.5, "kl": 0.010837229958269745, "learning_rate": 7.269150234383318e-06, "loss": -0.0102, "num_tokens": 20355551.0, "reward": 11.516048622131347, "reward_std": 13.84256067276001, "rewards/wrapper/mean": 5.758024173974991, "rewards/wrapper/std": 18.53322400599718, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 40.2, "completions/mean_length": 252.4625, "completions/mean_terminated_length": 20.3, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.43636363636363634, "frac_reward_zero_std": 0.0125, "grad_norm": 2.390625, "kl": 0.007430908730020747, "learning_rate": 7.2657212731056345e-06, "loss": -0.0031, "num_tokens": 20409427.0, "reward": 9.576006889343262, "reward_std": 12.781783771514892, "rewards/wrapper/mean": 4.788003156334161, "rewards/wrapper/std": 14.093942853808404, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 247.83125, "completions/mean_terminated_length": 33.5, "completions/min_length": 119.6, "completions/min_terminated_length": 17.2, "epoch": 0.4375366568914956, "frac_reward_zero_std": 0.0375, "grad_norm": 1.578125, "kl": 0.04859507377259433, "learning_rate": 7.262284376081749e-06, "loss": -0.0194, "num_tokens": 20466058.0, "reward": 8.321262574195861, "reward_std": 5.054714918136597, "rewards/wrapper/mean": 4.160631164908409, "rewards/wrapper/std": 9.609023374319076, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 14.2, "completions/mean_length": 250.04375, "completions/mean_terminated_length": 14.2, "completions/min_length": 65.4, "completions/min_terminated_length": 14.2, "epoch": 0.43870967741935485, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.017362323612906037, "learning_rate": 7.258839555079402e-06, "loss": -0.0054, "num_tokens": 20521151.0, "reward": 5.06641993522644, "reward_std": 3.91677873134613, "rewards/wrapper/mean": 2.5332097202539443, "rewards/wrapper/std": 8.172790160775184, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 251.2, "completions/mean_terminated_length": 51.6, "completions/min_length": 132.6, "completions/min_terminated_length": 30.2, "epoch": 0.4398826979472141, "frac_reward_zero_std": 0.025, "grad_norm": 1.546875, "kl": 0.008750611578579991, "learning_rate": 7.255386821893465e-06, "loss": -0.0106, "num_tokens": 20574001.0, "reward": 8.642753672599792, "reward_std": 9.593472319841386, "rewards/wrapper/mean": 4.321376763284206, "rewards/wrapper/std": 13.84940035790205, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4410557184750733, "frac_reward_zero_std": 0.0375, "grad_norm": 3.515625, "kl": 0.008011186274234205, "learning_rate": 7.251926188345901e-06, "loss": 0.0003, "num_tokens": 20631611.0, "reward": 11.901252555847169, "reward_std": 16.12425422668457, "rewards/wrapper/mean": 5.950626049935818, "rewards/wrapper/std": 19.04661168754101, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 18.8, "completions/mean_length": 250.19375, "completions/mean_terminated_length": 10.3, "completions/min_length": 104.2, "completions/min_terminated_length": 1.8, "epoch": 0.4422287390029325, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.006020620831986889, "learning_rate": 7.248457666285724e-06, "loss": -0.0141, "num_tokens": 20683374.0, "reward": 11.824284934997559, "reward_std": 14.59802188873291, "rewards/wrapper/mean": 5.912142033874988, "rewards/wrapper/std": 18.940556921064854, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 49.4, "completions/mean_length": 248.66875, "completions/mean_terminated_length": 36.333334350585936, "completions/min_length": 69.6, "completions/min_terminated_length": 18.4, "epoch": 0.4434017595307918, "frac_reward_zero_std": 0.0125, "grad_norm": 7.21875, "kl": 0.008943629602435976, "learning_rate": 7.244981267588955e-06, "loss": -0.0154, "num_tokens": 20738729.0, "reward": 10.507330799102784, "reward_std": 10.409684324264527, "rewards/wrapper/mean": 5.253665325790644, "rewards/wrapper/std": 14.284284387528896, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.44457478005865103, "frac_reward_zero_std": 0.0, "grad_norm": 0.9921875, "kl": 0.01356151889776811, "learning_rate": 7.241497004158588e-06, "loss": 0.0005, "num_tokens": 20790895.0, "reward": 12.190250253677368, "reward_std": 14.93087453842163, "rewards/wrapper/mean": 6.095125179737806, "rewards/wrapper/std": 16.378230841457842, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 1.6, "completions/mean_length": 251.25, "completions/mean_terminated_length": 1.6, "completions/min_length": 104.0, "completions/min_terminated_length": 1.6, "epoch": 0.44574780058651026, "frac_reward_zero_std": 0.0, "grad_norm": 7.25, "kl": 0.010270661639515311, "learning_rate": 7.238004887924543e-06, "loss": -0.012, "num_tokens": 20847969.0, "reward": 12.254180431365967, "reward_std": 14.935155391693115, "rewards/wrapper/mean": 6.12709027454257, "rewards/wrapper/std": 16.741011860966683, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 246.50625, "completions/mean_terminated_length": 1.9, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.4469208211143695, "frac_reward_zero_std": 0.0, "grad_norm": 4.75, "kl": 0.031814318336546424, "learning_rate": 7.234504930843625e-06, "loss": -0.0165, "num_tokens": 20903034.0, "reward": 8.4836496591568, "reward_std": 11.438926529884338, "rewards/wrapper/mean": 4.241824831068516, "rewards/wrapper/std": 11.069314436614514, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.44809384164222876, "frac_reward_zero_std": 0.0, "grad_norm": 4.5, "kl": 0.028099820134229958, "learning_rate": 7.230997144899492e-06, "loss": -0.0089, "num_tokens": 20962770.0, "reward": 15.655290794372558, "reward_std": 21.4853853225708, "rewards/wrapper/mean": 7.827645578980446, "rewards/wrapper/std": 22.036782597005367, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 13.6, "completions/min_length": 116.0, "completions/min_terminated_length": 13.6, "epoch": 0.449266862170088, "frac_reward_zero_std": 0.0375, "grad_norm": 2.046875, "kl": 0.008698656456544995, "learning_rate": 7.227481542102603e-06, "loss": -0.0205, "num_tokens": 21015545.0, "reward": 9.024892663955688, "reward_std": 11.932024049758912, "rewards/wrapper/mean": 4.512446265667677, "rewards/wrapper/std": 15.796873818337918, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 249.29375, "completions/mean_terminated_length": 41.0, "completions/min_length": 143.4, "completions/min_terminated_length": 41.0, "epoch": 0.4504398826979472, "frac_reward_zero_std": 0.0375, "grad_norm": 2.421875, "kl": 0.008335027925204486, "learning_rate": 7.223958134490182e-06, "loss": -0.0107, "num_tokens": 21068274.0, "reward": 14.141839408874512, "reward_std": 16.80069694519043, "rewards/wrapper/mean": 7.0709196768701075, "rewards/wrapper/std": 18.76930390149355, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 252.45625, "completions/mean_terminated_length": 25.3, "completions/min_length": 164.2, "completions/min_terminated_length": 10.6, "epoch": 0.45161290322580644, "frac_reward_zero_std": 0.025, "grad_norm": 1.765625, "kl": 0.004524467344162985, "learning_rate": 7.2204269341261774e-06, "loss": -0.0035, "num_tokens": 21120481.0, "reward": 10.868282413482666, "reward_std": 11.861543273925781, "rewards/wrapper/mean": 5.434141282737255, "rewards/wrapper/std": 16.583185213804246, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 37.2, "completions/mean_length": 253.9625, "completions/mean_terminated_length": 37.2, "completions/min_length": 190.8, "completions/min_terminated_length": 37.2, "epoch": 0.45278592375366566, "frac_reward_zero_std": 0.0125, "grad_norm": 1.6328125, "kl": 0.008211625734111294, "learning_rate": 7.21688795310122e-06, "loss": -0.0075, "num_tokens": 21174425.0, "reward": 8.30243649482727, "reward_std": 11.125893306732177, "rewards/wrapper/mean": 4.1512182362377645, "rewards/wrapper/std": 15.107886047661305, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 23.6, "completions/mean_length": 255.1375, "completions/mean_terminated_length": 23.6, "completions/min_length": 228.4, "completions/min_terminated_length": 23.6, "epoch": 0.45395894428152495, "frac_reward_zero_std": 0.0125, "grad_norm": 2.5625, "kl": 0.010393868479877711, "learning_rate": 7.213341203532579e-06, "loss": -0.0012, "num_tokens": 21227715.0, "reward": 8.592939472198486, "reward_std": 11.629208850860596, "rewards/wrapper/mean": 4.296469537913799, "rewards/wrapper/std": 15.236213786900043, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 249.8, "completions/mean_terminated_length": 68.53333435058593, "completions/min_length": 99.8, "completions/min_terminated_length": 48.6, "epoch": 0.45513196480938417, "frac_reward_zero_std": 0.0375, "grad_norm": 6.9375, "kl": 0.023260063456837086, "learning_rate": 7.209786697564124e-06, "loss": -0.0113, "num_tokens": 21286759.0, "reward": 14.108087921142578, "reward_std": 14.388849067687989, "rewards/wrapper/mean": 7.054043973982334, "rewards/wrapper/std": 18.971005833148958, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 33.6, "completions/mean_length": 250.65625, "completions/mean_terminated_length": 17.1, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.4563049853372434, "frac_reward_zero_std": 0.0125, "grad_norm": 1.640625, "kl": 0.013732324563898146, "learning_rate": 7.206224447366281e-06, "loss": -0.0036, "num_tokens": 21342288.0, "reward": 15.73800368309021, "reward_std": 18.553266048431396, "rewards/wrapper/mean": 7.869001491367817, "rewards/wrapper/std": 21.538618184626102, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 253.45625, "completions/mean_terminated_length": 21.0, "completions/min_length": 174.6, "completions/min_terminated_length": 21.0, "epoch": 0.4574780058651026, "frac_reward_zero_std": 0.025, "grad_norm": 3.203125, "kl": 0.007772558298893273, "learning_rate": 7.202654465135994e-06, "loss": -0.0075, "num_tokens": 21395321.0, "reward": 10.073754501342773, "reward_std": 9.745565795898438, "rewards/wrapper/mean": 5.036876889318227, "rewards/wrapper/std": 14.711842876672744, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.4586510263929619, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.016830851417034866, "learning_rate": 7.1990767630966786e-06, "loss": 0.0007, "num_tokens": 21449241.0, "reward": 11.806681632995605, "reward_std": 16.12475757598877, "rewards/wrapper/mean": 5.903340773284436, "rewards/wrapper/std": 18.54739146232605, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 4.4, "completions/mean_length": 252.94375, "completions/mean_terminated_length": 2.3, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.45982404692082113, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.02298996753524989, "learning_rate": 7.195491353498185e-06, "loss": -0.0097, "num_tokens": 21503506.0, "reward": 9.964939308166503, "reward_std": 11.606767654418945, "rewards/wrapper/mean": 4.982469742745161, "rewards/wrapper/std": 15.793828999996185, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.46099706744868035, "frac_reward_zero_std": 0.0125, "grad_norm": 5.84375, "kl": 0.012611826619831845, "learning_rate": 7.191898248616752e-06, "loss": 0.0005, "num_tokens": 21556986.0, "reward": 9.814716625213624, "reward_std": 10.976065969467163, "rewards/wrapper/mean": 4.907358513772488, "rewards/wrapper/std": 14.037784579396249, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 38.6, "completions/mean_length": 255.60625, "completions/mean_terminated_length": 38.6, "completions/min_length": 243.4, "completions/min_terminated_length": 38.6, "epoch": 0.4621700879765396, "frac_reward_zero_std": 0.025, "grad_norm": 3.328125, "kl": 0.0076864651869982484, "learning_rate": 7.188297460754966e-06, "loss": 0.0011, "num_tokens": 21613243.0, "reward": 12.246005964279174, "reward_std": 14.45817790031433, "rewards/wrapper/mean": 6.123002929985523, "rewards/wrapper/std": 14.876671414077283, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 250.35, "completions/mean_terminated_length": 24.0, "completions/min_length": 75.2, "completions/min_terminated_length": 24.0, "epoch": 0.4633431085043988, "frac_reward_zero_std": 0.0125, "grad_norm": 1.515625, "kl": 0.007964454928878695, "learning_rate": 7.18468900224172e-06, "loss": -0.016, "num_tokens": 21667595.0, "reward": 6.403034138679504, "reward_std": 8.485719972848893, "rewards/wrapper/mean": 3.2015167769044637, "rewards/wrapper/std": 11.11855943724513, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.94375, "completions/max_length": 256.0, "completions/max_terminated_length": 65.2, "completions/mean_length": 243.6625, "completions/mean_terminated_length": 61.5, "completions/min_length": 59.6, "completions/min_terminated_length": 59.6, "epoch": 0.4645161290322581, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.007507404690841213, "learning_rate": 7.1810728854321735e-06, "loss": -0.0286, "num_tokens": 21721657.0, "reward": 16.982488250732423, "reward_std": 20.268542289733887, "rewards/wrapper/mean": 8.491244368255138, "rewards/wrapper/std": 18.553547403216363, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 24.4, "completions/mean_length": 251.9625, "completions/mean_terminated_length": 24.4, "completions/min_length": 126.8, "completions/min_terminated_length": 24.4, "epoch": 0.4656891495601173, "frac_reward_zero_std": 0.025, "grad_norm": 8.875, "kl": 0.06541569416876883, "learning_rate": 7.177449122707703e-06, "loss": -0.0055, "num_tokens": 21775327.0, "reward": 9.9188814163208, "reward_std": 12.333803415298462, "rewards/wrapper/mean": 4.95944052785635, "rewards/wrapper/std": 12.621320475637912, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 68.8, "completions/mean_length": 251.75625, "completions/mean_terminated_length": 55.2, "completions/min_length": 144.0, "completions/min_terminated_length": 41.6, "epoch": 0.46686217008797654, "frac_reward_zero_std": 0.0375, "grad_norm": 1.3046875, "kl": 0.010084632772486658, "learning_rate": 7.17381772647587e-06, "loss": -0.012, "num_tokens": 21830376.0, "reward": 8.562131118774413, "reward_std": 10.988706159591676, "rewards/wrapper/mean": 4.281065583229065, "rewards/wrapper/std": 12.385915765166283, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.46803519061583576, "frac_reward_zero_std": 0.025, "grad_norm": 5.9375, "kl": 0.010933949542231858, "learning_rate": 7.170178709170365e-06, "loss": -0.0121, "num_tokens": 21885670.0, "reward": 12.820589447021485, "reward_std": 17.374348068237303, "rewards/wrapper/mean": 6.410294429957867, "rewards/wrapper/std": 16.29390445202589, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 92.2, "completions/mean_length": 252.48125, "completions/mean_terminated_length": 92.2, "completions/min_length": 143.4, "completions/min_terminated_length": 92.2, "epoch": 0.46920821114369504, "frac_reward_zero_std": 0.0125, "grad_norm": 2.125, "kl": 0.0378818953060545, "learning_rate": 7.1665320832509805e-06, "loss": -0.0031, "num_tokens": 21938619.0, "reward": 10.933886623382568, "reward_std": 14.852296829223633, "rewards/wrapper/mean": 5.466943139582872, "rewards/wrapper/std": 16.55307368338108, "step": 2000 }, { "epoch": 0.46920821114369504, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.81, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 77.34, "eval_completions/mean_length": 231.14, "eval_completions/mean_terminated_length": 69.25166679382325, "eval_completions/min_length": 176.15, "eval_completions/min_terminated_length": 60.95, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.012026752880774438, "eval_loss": -0.026698114350438118, "eval_num_tokens": 21938619.0, "eval_reward": 0.391553550735116, "eval_reward_std": 0.2365354063967243, "eval_rewards/wrapper/mean": 0.19577677831053733, "eval_rewards/wrapper/std": 0.1914515098161064, "eval_runtime": 208.8731, "eval_samples_per_second": 0.958, "eval_steps_per_second": 0.239, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 29.4, "completions/mean_length": 255.31875, "completions/mean_terminated_length": 29.4, "completions/min_length": 234.2, "completions/min_terminated_length": 29.4, "epoch": 0.47038123167155427, "frac_reward_zero_std": 0.0, "grad_norm": 5.15625, "kl": 0.009880531148519367, "learning_rate": 7.162877861203553e-06, "loss": 0.0024, "num_tokens": 21994548.0, "reward": 9.461054754257201, "reward_std": 10.576078653335571, "rewards/wrapper/mean": 4.730527497828007, "rewards/wrapper/std": 14.311911128461361, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.4715542521994135, "frac_reward_zero_std": 0.025, "grad_norm": 3.140625, "kl": 0.010484227410051972, "learning_rate": 7.159216055539936e-06, "loss": -0.0096, "num_tokens": 22053638.0, "reward": 9.75552453994751, "reward_std": 13.13539743423462, "rewards/wrapper/mean": 4.877762029320001, "rewards/wrapper/std": 16.15574167072773, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 18.4, "completions/mean_length": 253.375, "completions/mean_terminated_length": 18.4, "completions/min_length": 172.0, "completions/min_terminated_length": 18.4, "epoch": 0.4727272727272727, "frac_reward_zero_std": 0.025, "grad_norm": 1.03125, "kl": 0.0069014692155178635, "learning_rate": 7.155546678797941e-06, "loss": -0.0071, "num_tokens": 22109996.0, "reward": 5.2139427185058596, "reward_std": 6.810544824600219, "rewards/wrapper/mean": 2.6069714561104775, "rewards/wrapper/std": 8.123496209084987, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 45.8, "completions/mean_length": 246.24375, "completions/mean_terminated_length": 31.2, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.47390029325513194, "frac_reward_zero_std": 0.0125, "grad_norm": 3.296875, "kl": 0.009175126685295253, "learning_rate": 7.1518697435413075e-06, "loss": -0.0085, "num_tokens": 22161447.0, "reward": 16.252300643920897, "reward_std": 18.46790657043457, "rewards/wrapper/mean": 8.126150195300578, "rewards/wrapper/std": 22.689970228075982, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 253.05, "completions/mean_terminated_length": 8.0, "completions/min_length": 161.6, "completions/min_terminated_length": 8.0, "epoch": 0.4750733137829912, "frac_reward_zero_std": 0.0375, "grad_norm": 4.21875, "kl": 0.013811478717252612, "learning_rate": 7.148185262359653e-06, "loss": -0.0064, "num_tokens": 22215827.0, "reward": 8.836444938182831, "reward_std": 11.80629455447197, "rewards/wrapper/mean": 4.418222548812627, "rewards/wrapper/std": 12.121624572575092, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.47624633431085045, "frac_reward_zero_std": 0.0125, "grad_norm": 4.40625, "kl": 0.011954761703964322, "learning_rate": 7.144493247868432e-06, "loss": 0.0005, "num_tokens": 22270149.0, "reward": 8.726956057548524, "reward_std": 10.781459975242615, "rewards/wrapper/mean": 4.363477950543166, "rewards/wrapper/std": 11.56369944959879, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 247.7125, "completions/mean_terminated_length": 19.56666679382324, "completions/min_length": 116.4, "completions/min_terminated_length": 14.0, "epoch": 0.4774193548387097, "frac_reward_zero_std": 0.025, "grad_norm": 2.390625, "kl": 0.017661917663645, "learning_rate": 7.140793712708894e-06, "loss": -0.02, "num_tokens": 22324755.0, "reward": 11.873594665527344, "reward_std": 16.035013008117676, "rewards/wrapper/mean": 5.936797216534615, "rewards/wrapper/std": 18.508428135514258, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.4785923753665689, "frac_reward_zero_std": 0.0125, "grad_norm": 2.53125, "kl": 0.006388617679476738, "learning_rate": 7.137086669548035e-06, "loss": -0.0167, "num_tokens": 22379342.0, "reward": 9.060737133026123, "reward_std": 10.055862641334533, "rewards/wrapper/mean": 4.530368596315384, "rewards/wrapper/std": 15.233650147914886, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.6, "completions/mean_length": 252.1, "completions/mean_terminated_length": 14.5, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.4797653958944281, "frac_reward_zero_std": 0.0125, "grad_norm": 5.375, "kl": 8129.877822359197, "learning_rate": 7.1333721310785614e-06, "loss": 325.1902, "num_tokens": 22435590.0, "reward": 9.408462858200073, "reward_std": 12.48482882976532, "rewards/wrapper/mean": 4.704231335222721, "rewards/wrapper/std": 13.56791399270296, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 1.8, "completions/mean_length": 251.25625, "completions/mean_terminated_length": 1.8, "completions/min_length": 104.2, "completions/min_terminated_length": 1.8, "epoch": 0.4809384164222874, "frac_reward_zero_std": 0.0375, "grad_norm": 2.765625, "kl": 0.010197655879892409, "learning_rate": 7.129650110018844e-06, "loss": -0.0065, "num_tokens": 22488429.0, "reward": 10.777331662178039, "reward_std": 13.09556884765625, "rewards/wrapper/mean": 5.388665563613176, "rewards/wrapper/std": 15.516864584386349, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 254.4125, "completions/mean_terminated_length": 0.4, "completions/min_length": 205.2, "completions/min_terminated_length": 0.4, "epoch": 0.48211143695014663, "frac_reward_zero_std": 0.025, "grad_norm": 16.125, "kl": 0.029901218856684862, "learning_rate": 7.12592061911287e-06, "loss": 0.0062, "num_tokens": 22540891.0, "reward": 5.5157770156860355, "reward_std": 6.8765318393707275, "rewards/wrapper/mean": 2.757888501882553, "rewards/wrapper/std": 9.093967694044114, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.48328445747800586, "frac_reward_zero_std": 0.05, "grad_norm": 1.5703125, "kl": 0.00937183212954551, "learning_rate": 7.122183671130207e-06, "loss": -0.016, "num_tokens": 22593954.0, "reward": 10.615709495544433, "reward_std": 13.608506298065185, "rewards/wrapper/mean": 5.307854762673378, "rewards/wrapper/std": 14.481095506250858, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 2.6, "completions/mean_length": 252.88125, "completions/mean_terminated_length": 2.6, "completions/min_length": 156.2, "completions/min_terminated_length": 2.6, "epoch": 0.4844574780058651, "frac_reward_zero_std": 0.0125, "grad_norm": 1.3203125, "kl": 0.009229195350781084, "learning_rate": 7.118439278865949e-06, "loss": -0.0063, "num_tokens": 22646413.0, "reward": 10.647134971618652, "reward_std": 13.339782333374023, "rewards/wrapper/mean": 5.323567350953818, "rewards/wrapper/std": 16.633514940738678, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.48563049853372436, "frac_reward_zero_std": 0.0125, "grad_norm": 2.3125, "kl": 0.017320739361457525, "learning_rate": 7.114687455140686e-06, "loss": -0.0119, "num_tokens": 22703215.0, "reward": 10.626597595214843, "reward_std": 11.47060546875, "rewards/wrapper/mean": 5.313298827409744, "rewards/wrapper/std": 14.610971334576607, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.4868035190615836, "frac_reward_zero_std": 0.025, "grad_norm": 4.625, "kl": 0.026328472554450855, "learning_rate": 7.110928212800449e-06, "loss": -0.0015, "num_tokens": 22760596.0, "reward": 8.179675102233887, "reward_std": 9.763381147384644, "rewards/wrapper/mean": 4.089837612211705, "rewards/wrapper/std": 12.722039490938187, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.4879765395894428, "frac_reward_zero_std": 0.0125, "grad_norm": 3.53125, "kl": 0.018763077515177428, "learning_rate": 7.107161564716671e-06, "loss": -0.0055, "num_tokens": 22814613.0, "reward": 7.410968685150147, "reward_std": 9.938138008117676, "rewards/wrapper/mean": 3.705484404414892, "rewards/wrapper/std": 12.76738702505827, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 253.9875, "completions/mean_terminated_length": 38.0, "completions/min_length": 191.6, "completions/min_terminated_length": 38.0, "epoch": 0.48914956011730204, "frac_reward_zero_std": 0.0375, "grad_norm": 7.21875, "kl": 0.015139080863445998, "learning_rate": 7.10338752378614e-06, "loss": -0.0032, "num_tokens": 22871085.0, "reward": 12.965310859680176, "reward_std": 14.322172927856446, "rewards/wrapper/mean": 6.482655397057533, "rewards/wrapper/std": 19.77724291831255, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.49032258064516127, "frac_reward_zero_std": 0.0125, "grad_norm": 6.03125, "kl": 0.008022710657678544, "learning_rate": 7.099606102930959e-06, "loss": -0.0072, "num_tokens": 22924535.0, "reward": 9.030936241149902, "reward_std": 11.908495712280274, "rewards/wrapper/mean": 4.515467864274979, "rewards/wrapper/std": 14.924723632633686, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 31.6, "completions/mean_length": 253.7875, "completions/mean_terminated_length": 31.6, "completions/min_length": 185.2, "completions/min_terminated_length": 31.6, "epoch": 0.49149560117302055, "frac_reward_zero_std": 0.025, "grad_norm": 5.71875, "kl": 0.006183647876605391, "learning_rate": 7.095817315098498e-06, "loss": -0.0064, "num_tokens": 22977949.0, "reward": 12.94020755290985, "reward_std": 17.86543025970459, "rewards/wrapper/mean": 6.470103675872087, "rewards/wrapper/std": 16.391293506324292, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.49266862170087977, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.008084136416437104, "learning_rate": 7.092021173261353e-06, "loss": -0.0109, "num_tokens": 23031935.0, "reward": 11.518471813201904, "reward_std": 11.456220531463623, "rewards/wrapper/mean": 5.759235548973083, "rewards/wrapper/std": 16.237758734822272, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 13.8, "completions/mean_length": 251.6375, "completions/mean_terminated_length": 7.1, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.493841642228739, "frac_reward_zero_std": 0.0375, "grad_norm": 0.9375, "kl": 0.008848378155380487, "learning_rate": 7.088217690417298e-06, "loss": -0.0107, "num_tokens": 23086489.0, "reward": 7.520121216773987, "reward_std": 10.031231105327606, "rewards/wrapper/mean": 3.7600605204701423, "rewards/wrapper/std": 12.67145141363144, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 255.4, "completions/mean_terminated_length": 32.0, "completions/min_length": 236.8, "completions/min_terminated_length": 32.0, "epoch": 0.4950146627565982, "frac_reward_zero_std": 0.025, "grad_norm": 3.65625, "kl": 0.008108193590305745, "learning_rate": 7.084406879589242e-06, "loss": -0.0012, "num_tokens": 23144123.0, "reward": 11.643420743942261, "reward_std": 13.64364709854126, "rewards/wrapper/mean": 5.82171031832695, "rewards/wrapper/std": 14.45436689555645, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 6.6, "completions/mean_length": 253.00625, "completions/mean_terminated_length": 6.6, "completions/min_length": 160.2, "completions/min_terminated_length": 6.6, "epoch": 0.4961876832844575, "frac_reward_zero_std": 0.05, "grad_norm": 2.046875, "kl": 0.007070534571539611, "learning_rate": 7.080588753825184e-06, "loss": -0.0102, "num_tokens": 23198110.0, "reward": 10.695003128051757, "reward_std": 14.557994079589843, "rewards/wrapper/mean": 5.347501567006111, "rewards/wrapper/std": 17.22459286004305, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 75.2, "completions/mean_length": 250.3625, "completions/mean_terminated_length": 53.9, "completions/min_length": 135.0, "completions/min_terminated_length": 32.6, "epoch": 0.49736070381231673, "frac_reward_zero_std": 0.0125, "grad_norm": 3.765625, "kl": 0.029336131096351893, "learning_rate": 7.076763326198173e-06, "loss": -0.0016, "num_tokens": 23253760.0, "reward": 8.677525091171265, "reward_std": 9.848081493377686, "rewards/wrapper/mean": 4.338762363791465, "rewards/wrapper/std": 13.28867315351963, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 16.6, "completions/mean_length": 247.06875, "completions/mean_terminated_length": 5.65, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.49853372434017595, "frac_reward_zero_std": 0.0125, "grad_norm": 2.578125, "kl": 0.017404579918365925, "learning_rate": 7.072930609806254e-06, "loss": -0.0171, "num_tokens": 23306149.0, "reward": 15.353384113311767, "reward_std": 19.154958724975586, "rewards/wrapper/mean": 7.676692083477974, "rewards/wrapper/std": 22.459415701031684, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 85.6, "completions/mean_length": 252.28125, "completions/mean_terminated_length": 68.3, "completions/min_length": 153.4, "completions/min_terminated_length": 51.0, "epoch": 0.4997067448680352, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.006470509018981829, "learning_rate": 7.0690906177724305e-06, "loss": 0.0063, "num_tokens": 23363424.0, "reward": 9.307399272918701, "reward_std": 12.280207061767578, "rewards/wrapper/mean": 4.653699503093958, "rewards/wrapper/std": 15.558639793097973, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 65.2, "completions/mean_length": 251.64375, "completions/mean_terminated_length": 62.9, "completions/min_length": 163.0, "completions/min_terminated_length": 60.6, "epoch": 0.5008797653958944, "frac_reward_zero_std": 0.0, "grad_norm": 2.875, "kl": 0.017257986776530742, "learning_rate": 7.065243363244619e-06, "loss": -0.0119, "num_tokens": 23418599.0, "reward": 12.853836822509766, "reward_std": 17.099227905273438, "rewards/wrapper/mean": 6.4269180707633495, "rewards/wrapper/std": 18.025887221097946, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 251.41875, "completions/mean_terminated_length": 7.0, "completions/min_length": 109.4, "completions/min_terminated_length": 7.0, "epoch": 0.5020527859237537, "frac_reward_zero_std": 0.0375, "grad_norm": 2.921875, "kl": 0.008581590990070253, "learning_rate": 7.0613888593956e-06, "loss": -0.0054, "num_tokens": 23472810.0, "reward": 8.985012340545655, "reward_std": 11.706706619262695, "rewards/wrapper/mean": 4.492506121098995, "rewards/wrapper/std": 12.430897434055804, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5032258064516129, "frac_reward_zero_std": 0.05, "grad_norm": 1.890625, "kl": 0.008975341753102838, "learning_rate": 7.057527119422977e-06, "loss": -0.0053, "num_tokens": 23526291.0, "reward": 10.154117774963378, "reward_std": 13.665915584564209, "rewards/wrapper/mean": 5.077058912813664, "rewards/wrapper/std": 15.374008457362653, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 38.8, "completions/mean_length": 252.4125, "completions/mean_terminated_length": 38.8, "completions/min_length": 141.2, "completions/min_terminated_length": 38.8, "epoch": 0.5043988269794721, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.006695700716227293, "learning_rate": 7.0536581565491265e-06, "loss": -0.0117, "num_tokens": 23577593.0, "reward": 7.732948541641235, "reward_std": 8.718576312065125, "rewards/wrapper/mean": 3.866474460810423, "rewards/wrapper/std": 11.597916722297668, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 248.03125, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.5055718475073314, "frac_reward_zero_std": 0.0375, "grad_norm": 6.21875, "kl": 0.009025942167500033, "learning_rate": 7.049781984021159e-06, "loss": -0.0187, "num_tokens": 23630638.0, "reward": 10.552233409881591, "reward_std": 13.876630878448486, "rewards/wrapper/mean": 5.276116743683815, "rewards/wrapper/std": 15.018184214830399, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 32.2, "completions/mean_length": 253.8125, "completions/mean_terminated_length": 16.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5067448680351906, "frac_reward_zero_std": 0.0125, "grad_norm": 2.140625, "kl": 0.008229665062390267, "learning_rate": 7.04589861511087e-06, "loss": -0.0064, "num_tokens": 23683908.0, "reward": 7.022137629985809, "reward_std": 9.132097482681274, "rewards/wrapper/mean": 3.511068840324879, "rewards/wrapper/std": 10.933140191435815, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 36.4, "completions/mean_length": 253.9375, "completions/mean_terminated_length": 36.4, "completions/min_length": 190.0, "completions/min_terminated_length": 36.4, "epoch": 0.5079178885630499, "frac_reward_zero_std": 0.0, "grad_norm": 9.4375, "kl": 0.014441122498828918, "learning_rate": 7.042008063114695e-06, "loss": -0.0074, "num_tokens": 23739054.0, "reward": 7.9544504404067995, "reward_std": 10.569594264030457, "rewards/wrapper/mean": 3.977225196361542, "rewards/wrapper/std": 12.171872541308403, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 22.2, "completions/mean_length": 251.9, "completions/mean_terminated_length": 22.2, "completions/min_length": 175.8, "completions/min_terminated_length": 22.2, "epoch": 0.509090909090909, "frac_reward_zero_std": 0.025, "grad_norm": 5.1875, "kl": 0.013165635778568685, "learning_rate": 7.038110341353661e-06, "loss": 0.0052, "num_tokens": 23796234.0, "reward": 11.378597354888916, "reward_std": 15.319499397277832, "rewards/wrapper/mean": 5.689298801869154, "rewards/wrapper/std": 20.10531617105007, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 248.03125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5102639296187683, "frac_reward_zero_std": 0.025, "grad_norm": 1.8984375, "kl": 0.007043931004591286, "learning_rate": 7.034205463173349e-06, "loss": -0.0194, "num_tokens": 23849139.0, "reward": 11.2071537733078, "reward_std": 12.747740030288696, "rewards/wrapper/mean": 5.6035766273736956, "rewards/wrapper/std": 14.315529163181782, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 40.2, "completions/mean_length": 252.49375, "completions/mean_terminated_length": 40.1, "completions/min_length": 193.6, "completions/min_terminated_length": 40.0, "epoch": 0.5114369501466276, "frac_reward_zero_std": 0.0125, "grad_norm": 3.984375, "kl": 0.013299105316400528, "learning_rate": 7.030293441943839e-06, "loss": -0.0089, "num_tokens": 23904442.0, "reward": 14.169266891479491, "reward_std": 15.299544715881348, "rewards/wrapper/mean": 7.084633606672287, "rewards/wrapper/std": 20.22032133191824, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 74.4, "completions/mean_length": 247.14375, "completions/mean_terminated_length": 45.2, "completions/min_length": 81.8, "completions/min_terminated_length": 30.6, "epoch": 0.5126099706744868, "frac_reward_zero_std": 0.0375, "grad_norm": 6.78125, "kl": 0.02666954748565331, "learning_rate": 7.02637429105967e-06, "loss": -0.0197, "num_tokens": 23961361.0, "reward": 13.424666404724121, "reward_std": 15.76526699066162, "rewards/wrapper/mean": 6.712333005666733, "rewards/wrapper/std": 18.18206671178341, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 3.4, "completions/mean_length": 254.50625, "completions/mean_terminated_length": 3.4, "completions/min_length": 208.2, "completions/min_terminated_length": 3.4, "epoch": 0.513782991202346, "frac_reward_zero_std": 0.0125, "grad_norm": 4.375, "kl": 0.00966759123839438, "learning_rate": 7.022448023939792e-06, "loss": -0.0031, "num_tokens": 24016330.0, "reward": 10.31696891784668, "reward_std": 13.620968246459961, "rewards/wrapper/mean": 5.158484085649252, "rewards/wrapper/std": 15.674263837933541, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 28.6, "completions/mean_length": 252.09375, "completions/mean_terminated_length": 28.6, "completions/min_length": 131.0, "completions/min_terminated_length": 28.6, "epoch": 0.5149560117302053, "frac_reward_zero_std": 0.025, "grad_norm": 1.234375, "kl": 0.008295352436834946, "learning_rate": 7.018514654027522e-06, "loss": -0.0095, "num_tokens": 24071905.0, "reward": 11.781395196914673, "reward_std": 15.11465334892273, "rewards/wrapper/mean": 5.890697306394577, "rewards/wrapper/std": 16.348299649357795, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5161290322580645, "frac_reward_zero_std": 0.0, "grad_norm": 9.6875, "kl": 0.01148320131469518, "learning_rate": 7.014574194790494e-06, "loss": -0.0071, "num_tokens": 24131471.0, "reward": 10.274917244911194, "reward_std": 13.521960470080376, "rewards/wrapper/mean": 5.137458457052707, "rewards/wrapper/std": 14.553812845051288, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 21.0, "completions/mean_length": 248.66875, "completions/mean_terminated_length": 19.7, "completions/min_length": 120.8, "completions/min_terminated_length": 18.4, "epoch": 0.5173020527859238, "frac_reward_zero_std": 0.0, "grad_norm": 4.46875, "kl": 0.008623561984859408, "learning_rate": 7.010626659720619e-06, "loss": -0.0215, "num_tokens": 24182886.0, "reward": 11.50489158630371, "reward_std": 12.356374740600586, "rewards/wrapper/mean": 5.75244573764503, "rewards/wrapper/std": 15.028051799535751, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.518475073313783, "frac_reward_zero_std": 0.0375, "grad_norm": 4.125, "kl": 0.014326384535524993, "learning_rate": 7.006672062334031e-06, "loss": -0.0095, "num_tokens": 24235856.0, "reward": 10.057607388496399, "reward_std": 10.844542121887207, "rewards/wrapper/mean": 5.028803963959217, "rewards/wrapper/std": 14.027101680636406, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 16.2, "completions/mean_length": 251.70625, "completions/mean_terminated_length": 16.2, "completions/min_length": 118.6, "completions/min_terminated_length": 16.2, "epoch": 0.5196480938416422, "frac_reward_zero_std": 0.05, "grad_norm": 1.3671875, "kl": 0.0070623870589770375, "learning_rate": 7.0027104161710485e-06, "loss": 0.0003, "num_tokens": 24289943.0, "reward": 7.508664560317993, "reward_std": 9.97425332069397, "rewards/wrapper/mean": 3.7543324276804926, "rewards/wrapper/std": 12.59233037829399, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.5208211143695015, "frac_reward_zero_std": 0.0375, "grad_norm": 3.09375, "kl": 0.011236746236681939, "learning_rate": 6.9987417347961224e-06, "loss": 0.0004, "num_tokens": 24350043.0, "reward": 10.7836181640625, "reward_std": 14.692389869689942, "rewards/wrapper/mean": 5.391809102892876, "rewards/wrapper/std": 17.6582034394145, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 40.6, "completions/mean_length": 250.875, "completions/mean_terminated_length": 36.1, "completions/min_length": 134.0, "completions/min_terminated_length": 31.6, "epoch": 0.5219941348973607, "frac_reward_zero_std": 0.05, "grad_norm": 4.375, "kl": 0.014934242086019367, "learning_rate": 6.994766031797795e-06, "loss": -0.011, "num_tokens": 24404613.0, "reward": 12.124627828598022, "reward_std": 12.525247192382812, "rewards/wrapper/mean": 6.062313592433929, "rewards/wrapper/std": 15.312794582545758, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 6.4, "completions/mean_length": 249.8375, "completions/mean_terminated_length": 6.0, "completions/min_length": 108.0, "completions/min_terminated_length": 5.6, "epoch": 0.52316715542522, "frac_reward_zero_std": 0.025, "grad_norm": 15.3125, "kl": 0.05197887100512162, "learning_rate": 6.990783320788646e-06, "loss": -0.0027, "num_tokens": 24458525.0, "reward": 10.043572521209716, "reward_std": 13.56480016708374, "rewards/wrapper/mean": 5.021785932034254, "rewards/wrapper/std": 14.084061123430729, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 45.4, "completions/mean_length": 251.025, "completions/mean_terminated_length": 45.4, "completions/min_length": 147.8, "completions/min_terminated_length": 45.4, "epoch": 0.5243401759530791, "frac_reward_zero_std": 0.025, "grad_norm": 2.953125, "kl": 0.008444994036108256, "learning_rate": 6.98679361540525e-06, "loss": -0.008, "num_tokens": 24515231.0, "reward": 9.886309909820557, "reward_std": 9.906683957576751, "rewards/wrapper/mean": 4.9431547477841375, "rewards/wrapper/std": 13.339074079692363, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 29.2, "completions/mean_length": 252.2875, "completions/mean_terminated_length": 17.5, "completions/min_length": 159.4, "completions/min_terminated_length": 5.8, "epoch": 0.5255131964809384, "frac_reward_zero_std": 0.0125, "grad_norm": 6.4375, "kl": 0.4571570298052393, "learning_rate": 6.9827969293081375e-06, "loss": 0.0155, "num_tokens": 24570455.0, "reward": 10.244660663604737, "reward_std": 11.858344125747681, "rewards/wrapper/mean": 5.122330310195684, "rewards/wrapper/std": 16.16204769462347, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 50.6, "completions/mean_length": 254.38125, "completions/mean_terminated_length": 50.6, "completions/min_length": 204.2, "completions/min_terminated_length": 50.6, "epoch": 0.5266862170087977, "frac_reward_zero_std": 0.0125, "grad_norm": 1.453125, "kl": 0.040043732104822996, "learning_rate": 6.97879327618173e-06, "loss": -0.0034, "num_tokens": 24626558.0, "reward": 10.697834300994874, "reward_std": 14.070580577850341, "rewards/wrapper/mean": 5.348917351663113, "rewards/wrapper/std": 14.375663158297538, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 41.4, "completions/mean_length": 252.5, "completions/mean_terminated_length": 27.1, "completions/min_length": 166.4, "completions/min_terminated_length": 12.8, "epoch": 0.5278592375366569, "frac_reward_zero_std": 0.025, "grad_norm": 1.046875, "kl": 0.007097694149706513, "learning_rate": 6.97478266973431e-06, "loss": -0.0077, "num_tokens": 24680370.0, "reward": 7.563847708702087, "reward_std": 9.385326147079468, "rewards/wrapper/mean": 3.7819239191710947, "rewards/wrapper/std": 11.115535339713096, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 50.6, "completions/mean_length": 251.18125, "completions/mean_terminated_length": 50.6, "completions/min_length": 101.8, "completions/min_terminated_length": 50.6, "epoch": 0.5290322580645161, "frac_reward_zero_std": 0.0375, "grad_norm": 1.46875, "kl": 0.010719807958230377, "learning_rate": 6.970765123697969e-06, "loss": -0.0095, "num_tokens": 24738315.0, "reward": 18.66574192047119, "reward_std": 23.168578147888184, "rewards/wrapper/mean": 9.332871111482381, "rewards/wrapper/std": 24.021143828332423, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 15.8, "completions/mean_length": 251.7, "completions/mean_terminated_length": 8.1, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5302052785923753, "frac_reward_zero_std": 0.025, "grad_norm": 2.546875, "kl": 0.08032823919784278, "learning_rate": 6.966740651828553e-06, "loss": -0.0097, "num_tokens": 24793259.0, "reward": 12.07797212600708, "reward_std": 13.894320297241212, "rewards/wrapper/mean": 6.038985838741064, "rewards/wrapper/std": 17.31892890483141, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 25.6, "completions/mean_length": 250.4, "completions/mean_terminated_length": 25.6, "completions/min_length": 76.8, "completions/min_terminated_length": 25.6, "epoch": 0.5313782991202346, "frac_reward_zero_std": 0.05, "grad_norm": 3.5, "kl": 0.009525609156116843, "learning_rate": 6.962709267905628e-06, "loss": -0.0004, "num_tokens": 24848745.0, "reward": 17.720832443237306, "reward_std": 21.870363998413087, "rewards/wrapper/mean": 8.860416962206363, "rewards/wrapper/std": 28.17476643770933, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 75.8, "completions/mean_length": 245.925, "completions/mean_terminated_length": 38.266667175292966, "completions/min_length": 65.4, "completions/min_terminated_length": 14.2, "epoch": 0.5325513196480939, "frac_reward_zero_std": 0.0125, "grad_norm": 4.09375, "kl": 0.008869636815506964, "learning_rate": 6.9586709857324235e-06, "loss": -0.014, "num_tokens": 24903285.0, "reward": 7.448293590545655, "reward_std": 9.88570761680603, "rewards/wrapper/mean": 3.724146793037653, "rewards/wrapper/std": 12.0461391761899, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 3.4, "completions/mean_length": 251.3125, "completions/mean_terminated_length": 1.9, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.533724340175953, "frac_reward_zero_std": 0.05, "grad_norm": 3.75, "kl": 0.006731168960686773, "learning_rate": 6.954625819135789e-06, "loss": -0.0132, "num_tokens": 24959077.0, "reward": 14.303616046905518, "reward_std": 19.654699897766115, "rewards/wrapper/mean": 7.151808172464371, "rewards/wrapper/std": 20.88992646113038, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 47.4, "completions/mean_length": 252.68125, "completions/mean_terminated_length": 47.4, "completions/min_length": 149.8, "completions/min_terminated_length": 47.4, "epoch": 0.5348973607038123, "frac_reward_zero_std": 0.025, "grad_norm": 2.625, "kl": 0.008235664875246584, "learning_rate": 6.950573781966145e-06, "loss": -0.01, "num_tokens": 25016086.0, "reward": 10.721317195892334, "reward_std": 13.564059352874756, "rewards/wrapper/mean": 5.360658337175846, "rewards/wrapper/std": 15.573051902651788, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.2, "completions/mean_length": 251.65, "completions/mean_terminated_length": 7.3, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5360703812316715, "frac_reward_zero_std": 0.0125, "grad_norm": 1.0703125, "kl": 0.016059622971806676, "learning_rate": 6.946514888097435e-06, "loss": -0.0099, "num_tokens": 25071402.0, "reward": 11.941667938232422, "reward_std": 13.991264152526856, "rewards/wrapper/mean": 5.970833889394998, "rewards/wrapper/std": 17.43187249600887, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 10.4, "completions/mean_length": 253.125, "completions/mean_terminated_length": 10.4, "completions/min_length": 164.0, "completions/min_terminated_length": 10.4, "epoch": 0.5372434017595308, "frac_reward_zero_std": 0.025, "grad_norm": 4.09375, "kl": 0.009205941716209054, "learning_rate": 6.942449151427085e-06, "loss": -0.0041, "num_tokens": 25124486.0, "reward": 9.145212745666504, "reward_std": 12.24248571395874, "rewards/wrapper/mean": 4.572606243938208, "rewards/wrapper/std": 13.682215167582035, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 11.2, "completions/mean_length": 253.2, "completions/mean_terminated_length": 6.4, "completions/min_length": 206.4, "completions/min_terminated_length": 1.6, "epoch": 0.5384164222873901, "frac_reward_zero_std": 0.0125, "grad_norm": 1.984375, "kl": 0.18185594582464545, "learning_rate": 6.9383765858759435e-06, "loss": -0.0005, "num_tokens": 25176610.0, "reward": 13.980056762695312, "reward_std": 17.20791530609131, "rewards/wrapper/mean": 6.990028128027916, "rewards/wrapper/std": 20.499118688702584, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 36.8, "completions/mean_length": 252.5375, "completions/mean_terminated_length": 21.5, "completions/min_length": 159.8, "completions/min_terminated_length": 6.2, "epoch": 0.5395894428152492, "frac_reward_zero_std": 0.0125, "grad_norm": 2.71875, "kl": 0.013887450261972845, "learning_rate": 6.9342972053882475e-06, "loss": -0.0067, "num_tokens": 25235306.0, "reward": 7.798388671875, "reward_std": 9.488762283325196, "rewards/wrapper/mean": 3.8991942696273325, "rewards/wrapper/std": 14.203517746925353, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 90.4, "completions/mean_length": 254.025, "completions/mean_terminated_length": 90.4, "completions/min_length": 192.8, "completions/min_terminated_length": 90.4, "epoch": 0.5407624633431085, "frac_reward_zero_std": 0.05, "grad_norm": 0.94921875, "kl": 0.1488804242340848, "learning_rate": 6.930211023931562e-06, "loss": 0.0008, "num_tokens": 25288734.0, "reward": 11.9146014213562, "reward_std": 12.40765314102173, "rewards/wrapper/mean": 5.957300490140915, "rewards/wrapper/std": 16.380452224612235, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 252.825, "completions/mean_terminated_length": 0.8, "completions/min_length": 154.4, "completions/min_terminated_length": 0.8, "epoch": 0.5419354838709678, "frac_reward_zero_std": 0.025, "grad_norm": 6.375, "kl": 0.009590476250741631, "learning_rate": 6.926118055496741e-06, "loss": -0.0109, "num_tokens": 25343804.0, "reward": 10.353494834899902, "reward_std": 11.441220569610596, "rewards/wrapper/mean": 5.176747385412455, "rewards/wrapper/std": 12.015816460549832, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 49.2, "completions/mean_length": 253.01875, "completions/mean_terminated_length": 37.0, "completions/min_length": 178.4, "completions/min_terminated_length": 24.8, "epoch": 0.543108504398827, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.019391293311491607, "learning_rate": 6.922018314097876e-06, "loss": -0.0031, "num_tokens": 25399647.0, "reward": 10.329414510726929, "reward_std": 13.49524603486061, "rewards/wrapper/mean": 5.164707355946303, "rewards/wrapper/std": 15.286242140829563, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 252.9875, "completions/mean_terminated_length": 6.0, "completions/min_length": 159.6, "completions/min_terminated_length": 6.0, "epoch": 0.5442815249266862, "frac_reward_zero_std": 0.025, "grad_norm": 4.09375, "kl": 0.008706731640268117, "learning_rate": 6.917911813772251e-06, "loss": 0.0019, "num_tokens": 25454231.0, "reward": 7.387438726425171, "reward_std": 9.80572988986969, "rewards/wrapper/mean": 3.6937191992998124, "rewards/wrapper/std": 12.426317961513996, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 10.4, "completions/mean_length": 248.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5454545454545454, "frac_reward_zero_std": 0.025, "grad_norm": 6.21875, "kl": 0.008474712888710201, "learning_rate": 6.913798568580287e-06, "loss": -0.0237, "num_tokens": 25510329.0, "reward": 11.03807897567749, "reward_std": 12.529253387451172, "rewards/wrapper/mean": 5.519039383530616, "rewards/wrapper/std": 15.573815928399563, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 246.59375, "completions/mean_terminated_length": 32.6, "completions/min_length": 19.2, "completions/min_terminated_length": 19.2, "epoch": 0.5466275659824047, "frac_reward_zero_std": 0.0125, "grad_norm": 12.125, "kl": 0.021075761376414447, "learning_rate": 6.909678592605505e-06, "loss": -0.016, "num_tokens": 25563666.0, "reward": 12.097493743896484, "reward_std": 16.431555366516115, "rewards/wrapper/mean": 6.048747086524964, "rewards/wrapper/std": 19.726453380286692, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 27.0, "completions/mean_length": 252.04375, "completions/mean_terminated_length": 27.0, "completions/min_length": 129.4, "completions/min_terminated_length": 27.0, "epoch": 0.547800586510264, "frac_reward_zero_std": 0.0125, "grad_norm": 1.53125, "kl": 0.010052182164508849, "learning_rate": 6.905551899954469e-06, "loss": -0.0028, "num_tokens": 25618809.0, "reward": 7.563618755340576, "reward_std": 8.234925365447998, "rewards/wrapper/mean": 3.781809412688017, "rewards/wrapper/std": 11.576200237870216, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 250.35625, "completions/mean_terminated_length": 24.0, "completions/min_length": 126.4, "completions/min_terminated_length": 24.0, "epoch": 0.5489736070381231, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.007738465024158358, "learning_rate": 6.9014185047567374e-06, "loss": -0.0186, "num_tokens": 25669328.0, "reward": 7.976767730712891, "reward_std": 9.955007457733155, "rewards/wrapper/mean": 3.9883838906884193, "rewards/wrapper/std": 11.497270411252975, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5501466275659824, "frac_reward_zero_std": 0.025, "grad_norm": 1.21875, "kl": 0.00954286667983979, "learning_rate": 6.897278421164825e-06, "loss": -0.0053, "num_tokens": 25723649.0, "reward": 10.446186733245849, "reward_std": 11.945374870300293, "rewards/wrapper/mean": 5.223093181848526, "rewards/wrapper/std": 15.632076103985309, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 53.6, "completions/mean_length": 252.88125, "completions/mean_terminated_length": 49.8, "completions/min_length": 199.6, "completions/min_terminated_length": 46.0, "epoch": 0.5513196480938416, "frac_reward_zero_std": 0.0, "grad_norm": 3.796875, "kl": 0.01686981668462977, "learning_rate": 6.893131663354141e-06, "loss": -0.0092, "num_tokens": 25778384.0, "reward": 12.496757245063781, "reward_std": 14.88165216445923, "rewards/wrapper/mean": 6.24837853461504, "rewards/wrapper/std": 19.739536590874195, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5524926686217009, "frac_reward_zero_std": 0.0375, "grad_norm": 7.84375, "kl": 0.49535912594292314, "learning_rate": 6.8889782455229516e-06, "loss": 0.0142, "num_tokens": 25834571.0, "reward": 17.93525218963623, "reward_std": 22.702994346618652, "rewards/wrapper/mean": 8.967626057565212, "rewards/wrapper/std": 25.6280776694417, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5536656891495602, "frac_reward_zero_std": 0.025, "grad_norm": 4.125, "kl": 0.052939243521541356, "learning_rate": 6.884818181892319e-06, "loss": -0.0096, "num_tokens": 25891789.0, "reward": 8.508272314071656, "reward_std": 10.7423424243927, "rewards/wrapper/mean": 4.254136118665338, "rewards/wrapper/std": 14.045457898080349, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 49.4, "completions/mean_length": 254.34375, "completions/mean_terminated_length": 49.4, "completions/min_length": 203.0, "completions/min_terminated_length": 49.4, "epoch": 0.5548387096774193, "frac_reward_zero_std": 0.0375, "grad_norm": 1.203125, "kl": 0.014628489618189633, "learning_rate": 6.8806514867060685e-06, "loss": 0.0015, "num_tokens": 25947872.0, "reward": 14.109557342529296, "reward_std": 17.481741905212402, "rewards/wrapper/mean": 7.054778654873371, "rewards/wrapper/std": 21.674049939215184, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.2, "completions/mean_length": 252.8375, "completions/mean_terminated_length": 1.2, "completions/min_length": 154.8, "completions/min_terminated_length": 1.2, "epoch": 0.5560117302052786, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.028208025731146336, "learning_rate": 6.876478174230728e-06, "loss": 0.0025, "num_tokens": 26001552.0, "reward": 12.057064437866211, "reward_std": 16.252457237243654, "rewards/wrapper/mean": 6.028532239794731, "rewards/wrapper/std": 19.421438497304916, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 42.6, "completions/mean_length": 249.34375, "completions/mean_terminated_length": 26.3, "completions/min_length": 112.4, "completions/min_terminated_length": 10.0, "epoch": 0.5571847507331378, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.008724843431264163, "learning_rate": 6.872298258755484e-06, "loss": -0.0051, "num_tokens": 26055861.0, "reward": 8.755217671394348, "reward_std": 11.814749300479889, "rewards/wrapper/mean": 4.377608657628298, "rewards/wrapper/std": 13.853392350673676, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 251.26875, "completions/mean_terminated_length": 1.2, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5583577712609971, "frac_reward_zero_std": 0.025, "grad_norm": 5.03125, "kl": 0.008417540992377326, "learning_rate": 6.868111754592126e-06, "loss": -0.0133, "num_tokens": 26108722.0, "reward": 6.4707801818847654, "reward_std": 6.961957550048828, "rewards/wrapper/mean": 3.235390084981918, "rewards/wrapper/std": 9.7925319314003, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 34.8, "completions/mean_length": 250.69375, "completions/mean_terminated_length": 26.5, "completions/min_length": 120.6, "completions/min_terminated_length": 18.2, "epoch": 0.5595307917888563, "frac_reward_zero_std": 0.0375, "grad_norm": 4.71875, "kl": 0.008799165394157172, "learning_rate": 6.863918676075011e-06, "loss": -0.0172, "num_tokens": 26163919.0, "reward": 7.298964881896973, "reward_std": 9.784769356250763, "rewards/wrapper/mean": 3.6494823902845384, "rewards/wrapper/std": 9.869220197200775, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 251.225, "completions/mean_terminated_length": 0.8, "completions/min_length": 103.2, "completions/min_terminated_length": 0.8, "epoch": 0.5607038123167155, "frac_reward_zero_std": 0.0375, "grad_norm": 3.734375, "kl": 0.010733005020301788, "learning_rate": 6.859719037561e-06, "loss": 0.0067, "num_tokens": 26217157.0, "reward": 9.287919998168945, "reward_std": 10.79558277130127, "rewards/wrapper/mean": 4.643959843367338, "rewards/wrapper/std": 12.492424213886261, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5618768328445748, "frac_reward_zero_std": 0.0125, "grad_norm": 1.609375, "kl": 0.00881088743917644, "learning_rate": 6.855512853429417e-06, "loss": -0.0047, "num_tokens": 26271704.0, "reward": 12.773279190063477, "reward_std": 16.686813735961913, "rewards/wrapper/mean": 6.386639550328255, "rewards/wrapper/std": 16.06548334211111, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 8.4, "completions/mean_length": 251.46875, "completions/mean_terminated_length": 8.4, "completions/min_length": 162.0, "completions/min_terminated_length": 8.4, "epoch": 0.5630498533724341, "frac_reward_zero_std": 0.0125, "grad_norm": 1.75, "kl": 0.008467439271043986, "learning_rate": 6.851300138081998e-06, "loss": 0.002, "num_tokens": 26324537.0, "reward": 9.383643126487732, "reward_std": 11.786495923995972, "rewards/wrapper/mean": 4.691821337491274, "rewards/wrapper/std": 10.728658132255077, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 253.14375, "completions/mean_terminated_length": 44.5, "completions/min_length": 180.6, "completions/min_terminated_length": 27.0, "epoch": 0.5642228739002932, "frac_reward_zero_std": 0.025, "grad_norm": 2.203125, "kl": 0.00841777388122864, "learning_rate": 6.847080905942841e-06, "loss": -0.0103, "num_tokens": 26379500.0, "reward": 15.23821144104004, "reward_std": 19.51053657531738, "rewards/wrapper/mean": 7.619105443358421, "rewards/wrapper/std": 20.655081064999102, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.5653958944281525, "frac_reward_zero_std": 0.0375, "grad_norm": 3.046875, "kl": 0.009163353778421878, "learning_rate": 6.84285517145836e-06, "loss": -0.0166, "num_tokens": 26431411.0, "reward": 13.655599784851074, "reward_std": 18.63932514190674, "rewards/wrapper/mean": 6.827799582481385, "rewards/wrapper/std": 20.419064237177373, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 16.4, "completions/mean_length": 248.51875, "completions/mean_terminated_length": 9.3, "completions/min_length": 53.4, "completions/min_terminated_length": 2.2, "epoch": 0.5665689149560117, "frac_reward_zero_std": 0.025, "grad_norm": 1.765625, "kl": 0.008064938080497085, "learning_rate": 6.838622949097228e-06, "loss": -0.0118, "num_tokens": 26486158.0, "reward": 7.38580174446106, "reward_std": 9.37550323009491, "rewards/wrapper/mean": 3.6929009795188903, "rewards/wrapper/std": 11.389765891432763, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 24.8, "completions/mean_length": 255.175, "completions/mean_terminated_length": 24.8, "completions/min_length": 229.6, "completions/min_terminated_length": 24.8, "epoch": 0.567741935483871, "frac_reward_zero_std": 0.025, "grad_norm": 1.484375, "kl": 0.014842483540996909, "learning_rate": 6.834384253350335e-06, "loss": 0.0, "num_tokens": 26539338.0, "reward": 9.526783275604249, "reward_std": 12.541070747375489, "rewards/wrapper/mean": 4.763391713798046, "rewards/wrapper/std": 14.62897629737854, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5689149560117303, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.008617074706126004, "learning_rate": 6.8301390987307355e-06, "loss": -0.0059, "num_tokens": 26594891.0, "reward": 9.922335815429687, "reward_std": 12.18459119796753, "rewards/wrapper/mean": 4.961168044060469, "rewards/wrapper/std": 13.656743614375591, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 77.4, "completions/mean_length": 251.225, "completions/mean_terminated_length": 72.4, "completions/min_length": 169.8, "completions/min_terminated_length": 67.4, "epoch": 0.5700879765395894, "frac_reward_zero_std": 0.0375, "grad_norm": 4.9375, "kl": 0.007735758402850479, "learning_rate": 6.8258874997735975e-06, "loss": 0.0074, "num_tokens": 26651603.0, "reward": 7.229635429382324, "reward_std": 9.8571928024292, "rewards/wrapper/mean": 3.6148177579045297, "rewards/wrapper/std": 12.254036985337734, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95625, "completions/max_length": 256.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 247.0375, "completions/mean_terminated_length": 38.3, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "epoch": 0.5712609970674487, "frac_reward_zero_std": 0.0, "grad_norm": 4.84375, "kl": 0.010122451512143017, "learning_rate": 6.821629471036154e-06, "loss": -0.0266, "num_tokens": 26705361.0, "reward": 9.642913770675658, "reward_std": 7.355159568786621, "rewards/wrapper/mean": 4.821456654369831, "rewards/wrapper/std": 13.488919001817703, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.5724340175953079, "frac_reward_zero_std": 0.0125, "grad_norm": 3152.0, "kl": 1.9757349454564974, "learning_rate": 6.817365027097655e-06, "loss": 0.0677, "num_tokens": 26757754.0, "reward": 5.890293455123901, "reward_std": 6.86049211025238, "rewards/wrapper/mean": 2.9451463639736177, "rewards/wrapper/std": 10.447056712210179, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 18.6, "completions/mean_length": 251.78125, "completions/mean_terminated_length": 18.6, "completions/min_length": 121.0, "completions/min_terminated_length": 18.6, "epoch": 0.5736070381231672, "frac_reward_zero_std": 0.025, "grad_norm": 7.875, "kl": 0.0200483936117962, "learning_rate": 6.813094182559314e-06, "loss": -0.0131, "num_tokens": 26812285.0, "reward": 17.4649169921875, "reward_std": 20.268056106567382, "rewards/wrapper/mean": 8.732458454370498, "rewards/wrapper/std": 23.30886591821909, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 252.70625, "completions/mean_terminated_length": 24.3, "completions/min_length": 154.2, "completions/min_terminated_length": 0.6, "epoch": 0.5747800586510264, "frac_reward_zero_std": 0.025, "grad_norm": 4.8125, "kl": 0.011486495216377079, "learning_rate": 6.8088169520442604e-06, "loss": -0.0067, "num_tokens": 26865718.0, "reward": 14.725182628631591, "reward_std": 19.531952953338624, "rewards/wrapper/mean": 7.362591397762299, "rewards/wrapper/std": 16.93528108596802, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.5759530791788856, "frac_reward_zero_std": 0.025, "grad_norm": 1.4296875, "kl": 0.015283348388038576, "learning_rate": 6.804533350197491e-06, "loss": -0.005, "num_tokens": 26920061.0, "reward": 12.328321695327759, "reward_std": 13.61722400188446, "rewards/wrapper/mean": 6.1641609571874145, "rewards/wrapper/std": 15.86318702250719, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 49.2, "completions/mean_length": 250.65625, "completions/mean_terminated_length": 44.2, "completions/min_length": 90.4, "completions/min_terminated_length": 39.2, "epoch": 0.5771260997067449, "frac_reward_zero_std": 0.0625, "grad_norm": 1.640625, "kl": 0.011669400706887245, "learning_rate": 6.800243391685812e-06, "loss": -0.0067, "num_tokens": 26975658.0, "reward": 7.230981612205506, "reward_std": 8.025412940979004, "rewards/wrapper/mean": 3.615490733087063, "rewards/wrapper/std": 9.987757830321788, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 48.6, "completions/mean_length": 251.5, "completions/mean_terminated_length": 30.6, "completions/min_length": 115.0, "completions/min_terminated_length": 12.6, "epoch": 0.5782991202346041, "frac_reward_zero_std": 0.075, "grad_norm": 5.0, "kl": 0.00790996109135449, "learning_rate": 6.795947091197802e-06, "loss": -0.0036, "num_tokens": 27033648.0, "reward": 12.457160663604736, "reward_std": 13.068470573425293, "rewards/wrapper/mean": 6.228580264747142, "rewards/wrapper/std": 17.349486616253852, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 250.41875, "completions/mean_terminated_length": 38.6, "completions/min_length": 185.8, "completions/min_terminated_length": 32.2, "epoch": 0.5794721407624633, "frac_reward_zero_std": 0.025, "grad_norm": 1.3828125, "kl": 0.009441101230913773, "learning_rate": 6.791644463443747e-06, "loss": -0.0107, "num_tokens": 27088861.0, "reward": 9.027909755706787, "reward_std": 11.057364964485169, "rewards/wrapper/mean": 4.513954804837704, "rewards/wrapper/std": 15.522229766845703, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.5806451612903226, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.0075355375884100795, "learning_rate": 6.787335523155603e-06, "loss": -0.0198, "num_tokens": 27142882.0, "reward": 10.263352251052856, "reward_std": 11.702549076080322, "rewards/wrapper/mean": 5.131676015257836, "rewards/wrapper/std": 14.765554384887219, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 40.8, "completions/mean_length": 252.475, "completions/mean_terminated_length": 40.8, "completions/min_length": 143.2, "completions/min_terminated_length": 40.8, "epoch": 0.5818181818181818, "frac_reward_zero_std": 0.0125, "grad_norm": 3.484375, "kl": 0.009854745410848409, "learning_rate": 6.783020285086934e-06, "loss": -0.0079, "num_tokens": 27199384.0, "reward": 12.724866485595703, "reward_std": 15.704174327850343, "rewards/wrapper/mean": 6.362433303892613, "rewards/wrapper/std": 19.918368512392043, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 55.2, "completions/mean_length": 252.24375, "completions/mean_terminated_length": 52.3, "completions/min_length": 151.8, "completions/min_terminated_length": 49.4, "epoch": 0.5829912023460411, "frac_reward_zero_std": 0.0125, "grad_norm": 3.96875, "kl": 2.5806705773225986, "learning_rate": 6.778698764012874e-06, "loss": 0.1059, "num_tokens": 27254611.0, "reward": 11.302087688446045, "reward_std": 12.61121587753296, "rewards/wrapper/mean": 5.651043940335512, "rewards/wrapper/std": 16.543170015513898, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 24.2, "completions/mean_length": 253.55625, "completions/mean_terminated_length": 24.2, "completions/min_length": 177.8, "completions/min_terminated_length": 24.2, "epoch": 0.5841642228739002, "frac_reward_zero_std": 0.0125, "grad_norm": 1.1015625, "kl": 0.005001664778683334, "learning_rate": 6.7743709747300635e-06, "loss": -0.0087, "num_tokens": 27306574.0, "reward": 17.763720893859862, "reward_std": 21.10177364349365, "rewards/wrapper/mean": 8.881860357522964, "rewards/wrapper/std": 25.570963774621486, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 52.4, "completions/mean_length": 252.8375, "completions/mean_terminated_length": 52.4, "completions/min_length": 154.8, "completions/min_terminated_length": 52.4, "epoch": 0.5853372434017595, "frac_reward_zero_std": 0.0375, "grad_norm": 5.78125, "kl": 0.08458915337687359, "learning_rate": 6.770036932056609e-06, "loss": -0.0047, "num_tokens": 27359112.0, "reward": 9.331131935119629, "reward_std": 12.55569248199463, "rewards/wrapper/mean": 4.665565884113311, "rewards/wrapper/std": 14.336715736985207, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 7.2, "completions/mean_length": 253.025, "completions/mean_terminated_length": 7.2, "completions/min_length": 160.8, "completions/min_terminated_length": 7.2, "epoch": 0.5865102639296188, "frac_reward_zero_std": 0.025, "grad_norm": 2.96875, "kl": 0.026178717741277068, "learning_rate": 6.765696650832026e-06, "loss": -0.0082, "num_tokens": 27410764.0, "reward": 10.922028636932373, "reward_std": 14.060818576812744, "rewards/wrapper/mean": 5.461014226078987, "rewards/wrapper/std": 21.613298135995866, "step": 2500 }, { "epoch": 0.5865102639296188, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.8, "eval_completions/max_length": 255.95, "eval_completions/max_terminated_length": 72.19, "eval_completions/mean_length": 227.0475, "eval_completions/mean_terminated_length": 64.31, "eval_completions/min_length": 158.85, "eval_completions/min_terminated_length": 56.45, "eval_frac_reward_zero_std": 0.015, "eval_kl": 0.015076845483854413, "eval_loss": -0.026887383311986923, "eval_num_tokens": 27410764.0, "eval_reward": 0.34842764906585216, "eval_reward_std": 0.17556372756604105, "eval_rewards/wrapper/mean": 0.17421382576227187, "eval_rewards/wrapper/std": 0.15659947301028296, "eval_runtime": 208.2404, "eval_samples_per_second": 0.96, "eval_steps_per_second": 0.24, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 249.7875, "completions/mean_terminated_length": 6.0, "completions/min_length": 57.2, "completions/min_terminated_length": 6.0, "epoch": 0.587683284457478, "frac_reward_zero_std": 0.0125, "grad_norm": 1.4375, "kl": 0.008145063684787601, "learning_rate": 6.761350145917192e-06, "loss": -0.0178, "num_tokens": 27463364.0, "reward": 5.690246820449829, "reward_std": 6.9504313468933105, "rewards/wrapper/mean": 2.845123402774334, "rewards/wrapper/std": 9.402719949185848, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 250.225, "completions/mean_terminated_length": 20.0, "completions/min_length": 71.2, "completions/min_terminated_length": 20.0, "epoch": 0.5888563049853373, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.011013640067540109, "learning_rate": 6.756997432194293e-06, "loss": -0.0043, "num_tokens": 27518224.0, "reward": 15.580005931854249, "reward_std": 20.939382457733153, "rewards/wrapper/mean": 7.790003002434969, "rewards/wrapper/std": 21.323382918536662, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 252.8875, "completions/mean_terminated_length": 54.0, "completions/min_length": 156.4, "completions/min_terminated_length": 54.0, "epoch": 0.5900293255131965, "frac_reward_zero_std": 0.0375, "grad_norm": 7.96875, "kl": 0.02926081788027659, "learning_rate": 6.752638524566773e-06, "loss": -0.0009, "num_tokens": 27572688.0, "reward": 8.543451595306397, "reward_std": 11.566871976852417, "rewards/wrapper/mean": 4.271725799143314, "rewards/wrapper/std": 16.63720283508301, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 11.6, "completions/mean_length": 249.9625, "completions/mean_terminated_length": 11.6, "completions/min_length": 62.8, "completions/min_terminated_length": 11.6, "epoch": 0.5912023460410557, "frac_reward_zero_std": 0.025, "grad_norm": 3.296875, "kl": 0.009021649765782059, "learning_rate": 6.748273437959286e-06, "loss": -0.0176, "num_tokens": 27629564.0, "reward": 14.583421611785889, "reward_std": 17.03965835571289, "rewards/wrapper/mean": 7.29171050414443, "rewards/wrapper/std": 17.830930642783642, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 249.91875, "completions/mean_terminated_length": 10.0, "completions/min_length": 112.4, "completions/min_terminated_length": 10.0, "epoch": 0.592375366568915, "frac_reward_zero_std": 0.0125, "grad_norm": 1.9140625, "kl": 0.007613116281572729, "learning_rate": 6.74390218731764e-06, "loss": -0.0146, "num_tokens": 27683597.0, "reward": 11.419633483886718, "reward_std": 15.065901374816894, "rewards/wrapper/mean": 5.709816740453244, "rewards/wrapper/std": 17.053964272141457, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.5935483870967742, "frac_reward_zero_std": 0.0625, "grad_norm": 9.875, "kl": 0.010646293580066413, "learning_rate": 6.7395247876087505e-06, "loss": -0.0128, "num_tokens": 27738890.0, "reward": 11.766228675842285, "reward_std": 13.879878425598145, "rewards/wrapper/mean": 5.883114151656628, "rewards/wrapper/std": 15.828456656634808, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 41.6, "completions/mean_length": 250.9, "completions/mean_terminated_length": 41.6, "completions/min_length": 92.8, "completions/min_terminated_length": 41.6, "epoch": 0.5947214076246334, "frac_reward_zero_std": 0.0125, "grad_norm": 5.84375, "kl": 0.009409766842145473, "learning_rate": 6.735141253820584e-06, "loss": -0.003, "num_tokens": 27793140.0, "reward": 10.182917737960816, "reward_std": 11.574919128417969, "rewards/wrapper/mean": 5.091458834707737, "rewards/wrapper/std": 16.947440457344054, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 252.2, "completions/mean_terminated_length": 32.0, "completions/min_length": 134.4, "completions/min_terminated_length": 32.0, "epoch": 0.5958944281524927, "frac_reward_zero_std": 0.0125, "grad_norm": 2.453125, "kl": 0.014364969654707238, "learning_rate": 6.730751600962113e-06, "loss": -0.0112, "num_tokens": 27845464.0, "reward": 10.166706693172454, "reward_std": 11.566449183225632, "rewards/wrapper/mean": 5.0833531498908995, "rewards/wrapper/std": 15.070533008873463, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 25.8, "completions/mean_length": 253.60625, "completions/mean_terminated_length": 25.8, "completions/min_length": 179.4, "completions/min_terminated_length": 25.8, "epoch": 0.5970674486803519, "frac_reward_zero_std": 0.025, "grad_norm": 5.125, "kl": 0.01348751963232644, "learning_rate": 6.7263558440632615e-06, "loss": 0.006, "num_tokens": 27897069.0, "reward": 6.304814243316651, "reward_std": 5.5823561668396, "rewards/wrapper/mean": 3.1524071991443634, "rewards/wrapper/std": 9.431627669930458, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 61.4, "completions/mean_length": 254.71875, "completions/mean_terminated_length": 61.4, "completions/min_length": 215.0, "completions/min_terminated_length": 61.4, "epoch": 0.5982404692082112, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.007102605036925525, "learning_rate": 6.721953998174848e-06, "loss": -0.0031, "num_tokens": 27953170.0, "reward": 10.140501308441163, "reward_std": 13.48496742248535, "rewards/wrapper/mean": 5.0702507749199865, "rewards/wrapper/std": 14.650724425911903, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 55.2, "completions/mean_length": 252.93125, "completions/mean_terminated_length": 52.5, "completions/min_length": 203.4, "completions/min_terminated_length": 49.8, "epoch": 0.5994134897360703, "frac_reward_zero_std": 0.025, "grad_norm": 4.65625, "kl": 0.027316716557834297, "learning_rate": 6.717546078368546e-06, "loss": -0.0006, "num_tokens": 28008075.0, "reward": 5.827312445640564, "reward_std": 7.616882419586181, "rewards/wrapper/mean": 2.913656205683947, "rewards/wrapper/std": 9.296160396933555, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 4.6, "completions/mean_length": 252.94375, "completions/mean_terminated_length": 4.6, "completions/min_length": 158.2, "completions/min_terminated_length": 4.6, "epoch": 0.6005865102639296, "frac_reward_zero_std": 0.025, "grad_norm": 0.90625, "kl": 0.00896136105293408, "learning_rate": 6.713132099736822e-06, "loss": -0.0105, "num_tokens": 28062328.0, "reward": 13.819943046569824, "reward_std": 18.820354843139647, "rewards/wrapper/mean": 6.9099711433053015, "rewards/wrapper/std": 20.776973666250704, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 250.0625, "completions/mean_terminated_length": 43.7, "completions/min_length": 88.6, "completions/min_terminated_length": 37.4, "epoch": 0.6017595307917889, "frac_reward_zero_std": 0.0125, "grad_norm": 3.59375, "kl": 0.012951006775256246, "learning_rate": 6.708712077392889e-06, "loss": -0.0146, "num_tokens": 28118812.0, "reward": 11.193282270431519, "reward_std": 13.171154010295869, "rewards/wrapper/mean": 5.596641125530004, "rewards/wrapper/std": 16.368498238921166, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 38.2, "completions/mean_length": 255.59375, "completions/mean_terminated_length": 38.2, "completions/min_length": 243.0, "completions/min_terminated_length": 38.2, "epoch": 0.6029325513196481, "frac_reward_zero_std": 0.0125, "grad_norm": 7.96875, "kl": 0.011676612915471197, "learning_rate": 6.704286026470651e-06, "loss": -0.0004, "num_tokens": 28174469.0, "reward": 13.969676399230957, "reward_std": 14.664949417114258, "rewards/wrapper/mean": 6.984838116168976, "rewards/wrapper/std": 19.052175915241243, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 16.8, "completions/mean_length": 253.325, "completions/mean_terminated_length": 16.8, "completions/min_length": 170.4, "completions/min_terminated_length": 16.8, "epoch": 0.6041055718475073, "frac_reward_zero_std": 0.025, "grad_norm": 5.03125, "kl": 0.033306200080551206, "learning_rate": 6.699853962124658e-06, "loss": -0.0036, "num_tokens": 28230099.0, "reward": 10.326721906661987, "reward_std": 13.107465887069703, "rewards/wrapper/mean": 5.163360907882452, "rewards/wrapper/std": 15.956426008045673, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 17.6, "completions/mean_length": 251.75625, "completions/mean_terminated_length": 10.8, "completions/min_length": 157.6, "completions/min_terminated_length": 4.0, "epoch": 0.6052785923753665, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.006026765506248921, "learning_rate": 6.695415899530045e-06, "loss": -0.0031, "num_tokens": 28282456.0, "reward": 8.02139835357666, "reward_std": 9.612114334106446, "rewards/wrapper/mean": 4.010698922723532, "rewards/wrapper/std": 13.752987106144428, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 1.6, "completions/mean_length": 251.25, "completions/mean_terminated_length": 1.6, "completions/min_length": 104.0, "completions/min_terminated_length": 1.6, "epoch": 0.6064516129032258, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.006650674215052277, "learning_rate": 6.690971853882488e-06, "loss": -0.0122, "num_tokens": 28338506.0, "reward": 11.529253673553466, "reward_std": 12.749757957458495, "rewards/wrapper/mean": 5.764626702666282, "rewards/wrapper/std": 16.920075453817844, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 15.2, "completions/mean_length": 253.275, "completions/mean_terminated_length": 15.2, "completions/min_length": 168.8, "completions/min_terminated_length": 15.2, "epoch": 0.6076246334310851, "frac_reward_zero_std": 0.025, "grad_norm": 5.6875, "kl": 0.007390762877184898, "learning_rate": 6.686521840398147e-06, "loss": -0.006, "num_tokens": 28393026.0, "reward": 8.514682960510253, "reward_std": 9.638944625854492, "rewards/wrapper/mean": 4.257341559231281, "rewards/wrapper/std": 13.977068457007409, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 4.8, "completions/mean_length": 249.81875, "completions/mean_terminated_length": 3.7, "completions/min_length": 105.0, "completions/min_terminated_length": 2.6, "epoch": 0.6087976539589443, "frac_reward_zero_std": 0.05, "grad_norm": 1.65625, "kl": 0.009915889461990447, "learning_rate": 6.682065874313614e-06, "loss": -0.0171, "num_tokens": 28448633.0, "reward": 9.647655391693116, "reward_std": 12.866382217407226, "rewards/wrapper/mean": 4.823827692866326, "rewards/wrapper/std": 14.450115689635277, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 42.2, "completions/mean_length": 244.54375, "completions/mean_terminated_length": 37.7, "completions/min_length": 84.4, "completions/min_terminated_length": 33.2, "epoch": 0.6099706744868035, "frac_reward_zero_std": 0.05, "grad_norm": 4.75, "kl": 0.03516354828607291, "learning_rate": 6.677603970885869e-06, "loss": -0.0222, "num_tokens": 28503032.0, "reward": 7.7402863025665285, "reward_std": 10.311076450347901, "rewards/wrapper/mean": 3.870143134891987, "rewards/wrapper/std": 14.02377125620842, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 250.075, "completions/mean_terminated_length": 14.3, "completions/min_length": 116.0, "completions/min_terminated_length": 13.6, "epoch": 0.6111436950146627, "frac_reward_zero_std": 0.0375, "grad_norm": 5.03125, "kl": 0.008182111865608022, "learning_rate": 6.67313614539221e-06, "loss": -0.0034, "num_tokens": 28556298.0, "reward": 8.295944690704346, "reward_std": 10.63902931213379, "rewards/wrapper/mean": 4.147972152382136, "rewards/wrapper/std": 12.825438100099564, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 4.4, "completions/mean_length": 249.84375, "completions/mean_terminated_length": 4.1, "completions/min_length": 106.2, "completions/min_terminated_length": 3.8, "epoch": 0.612316715542522, "frac_reward_zero_std": 0.025, "grad_norm": 1.90625, "kl": 0.006537263444624841, "learning_rate": 6.668662413130221e-06, "loss": -0.012, "num_tokens": 28609081.0, "reward": 11.645491218566894, "reward_std": 15.850448608398438, "rewards/wrapper/mean": 5.8227458745241165, "rewards/wrapper/std": 17.563245555758478, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 13.8, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 13.8, "completions/min_length": 65.0, "completions/min_terminated_length": 13.8, "epoch": 0.6134897360703813, "frac_reward_zero_std": 0.025, "grad_norm": 14.9375, "kl": 0.009183612850029022, "learning_rate": 6.66418278941771e-06, "loss": -0.0135, "num_tokens": 28660034.0, "reward": 8.45309820175171, "reward_std": 11.176138877868652, "rewards/wrapper/mean": 4.226549039781093, "rewards/wrapper/std": 13.238977485895157, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 22.8, "completions/mean_length": 248.725, "completions/mean_terminated_length": 22.8, "completions/min_length": 125.2, "completions/min_terminated_length": 22.8, "epoch": 0.6146627565982404, "frac_reward_zero_std": 0.025, "grad_norm": 1.6953125, "kl": 0.007062935788417235, "learning_rate": 6.659697289592652e-06, "loss": -0.0188, "num_tokens": 28711502.0, "reward": 7.796641778945923, "reward_std": 10.286471939086914, "rewards/wrapper/mean": 3.898320996761322, "rewards/wrapper/std": 12.970464818179607, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 251.325, "completions/mean_terminated_length": 4.0, "completions/min_length": 106.4, "completions/min_terminated_length": 4.0, "epoch": 0.6158357771260997, "frac_reward_zero_std": 0.0375, "grad_norm": 2.71875, "kl": 0.008498370146844536, "learning_rate": 6.655205929013143e-06, "loss": -0.0101, "num_tokens": 28765306.0, "reward": 10.583085918426514, "reward_std": 13.03041124343872, "rewards/wrapper/mean": 5.2915429577231405, "rewards/wrapper/std": 13.423381480574609, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 247.00625, "completions/mean_terminated_length": 13.13333339691162, "completions/min_length": 61.4, "completions/min_terminated_length": 10.2, "epoch": 0.617008797653959, "frac_reward_zero_std": 0.025, "grad_norm": 1.53125, "kl": 0.01540077495155856, "learning_rate": 6.650708723057348e-06, "loss": -0.0035, "num_tokens": 28822733.0, "reward": 8.799090671539307, "reward_std": 11.789759016036987, "rewards/wrapper/mean": 4.399545115232468, "rewards/wrapper/std": 12.522874061763286, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 42.8, "completions/mean_length": 249.34375, "completions/mean_terminated_length": 22.6, "completions/min_length": 53.6, "completions/min_terminated_length": 2.4, "epoch": 0.6181818181818182, "frac_reward_zero_std": 0.0125, "grad_norm": 1.8203125, "kl": 0.008354636456351728, "learning_rate": 6.6462056871234466e-06, "loss": -0.0171, "num_tokens": 28878822.0, "reward": 8.222666358947754, "reward_std": 10.734997510910034, "rewards/wrapper/mean": 4.111333182454109, "rewards/wrapper/std": 13.712842452526093, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 154.6, "completions/mean_length": 251.74375, "completions/mean_terminated_length": 154.1, "completions/min_length": 153.6, "completions/min_terminated_length": 153.6, "epoch": 0.6193548387096774, "frac_reward_zero_std": 0.0, "grad_norm": 3.765625, "kl": 0.009392125299200416, "learning_rate": 6.641696836629576e-06, "loss": -0.0076, "num_tokens": 28932785.0, "reward": 13.914123153686523, "reward_std": 18.75023546218872, "rewards/wrapper/mean": 6.957061505317688, "rewards/wrapper/std": 19.568567314743994, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 19.8, "completions/mean_length": 250.23125, "completions/mean_terminated_length": 6.866667175292969, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6205278592375366, "frac_reward_zero_std": 0.0125, "grad_norm": 2.625, "kl": 0.00819388214731589, "learning_rate": 6.637182187013788e-06, "loss": -0.0055, "num_tokens": 28984320.0, "reward": 5.599798417091369, "reward_std": 7.3513828158378605, "rewards/wrapper/mean": 2.7998990304768085, "rewards/wrapper/std": 8.210212644934654, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 25.0, "completions/mean_length": 250.3875, "completions/mean_terminated_length": 25.0, "completions/min_length": 127.4, "completions/min_terminated_length": 25.0, "epoch": 0.6217008797653959, "frac_reward_zero_std": 0.0125, "grad_norm": 1.375, "kl": 0.0097636858234182, "learning_rate": 6.632661753733982e-06, "loss": -0.004, "num_tokens": 29040122.0, "reward": 10.327516424655915, "reward_std": 14.078668093681335, "rewards/wrapper/mean": 5.16375821903348, "rewards/wrapper/std": 15.739317643642426, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 249.6375, "completions/mean_terminated_length": 0.5333333492279053, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6228739002932552, "frac_reward_zero_std": 0.0375, "grad_norm": 1.765625, "kl": 0.017314812494441868, "learning_rate": 6.628135552267869e-06, "loss": -0.0056, "num_tokens": 29099046.0, "reward": 8.878870403766632, "reward_std": 9.9872851729393, "rewards/wrapper/mean": 4.439434761554002, "rewards/wrapper/std": 10.966332286596298, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 41.2, "completions/mean_length": 255.6875, "completions/mean_terminated_length": 41.2, "completions/min_length": 246.0, "completions/min_terminated_length": 41.2, "epoch": 0.6240469208211143, "frac_reward_zero_std": 0.0125, "grad_norm": 3.25, "kl": 0.016126143047586083, "learning_rate": 6.6236035981129045e-06, "loss": 0.0013, "num_tokens": 29156898.0, "reward": 9.977191162109374, "reward_std": 12.882474327087403, "rewards/wrapper/mean": 4.988595449924469, "rewards/wrapper/std": 15.649570155143739, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 250.41875, "completions/mean_terminated_length": 26.0, "completions/min_length": 128.4, "completions/min_terminated_length": 26.0, "epoch": 0.6252199413489736, "frac_reward_zero_std": 0.0125, "grad_norm": 5.71875, "kl": 0.013472359464503824, "learning_rate": 6.6190659067862444e-06, "loss": -0.0151, "num_tokens": 29213789.0, "reward": 12.702929973602295, "reward_std": 15.42300910949707, "rewards/wrapper/mean": 6.351464556157589, "rewards/wrapper/std": 16.331206111609937, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 18.6, "completions/mean_length": 253.38125, "completions/mean_terminated_length": 18.6, "completions/min_length": 172.2, "completions/min_terminated_length": 18.6, "epoch": 0.6263929618768328, "frac_reward_zero_std": 0.0125, "grad_norm": 2.125, "kl": 0.00791556949261576, "learning_rate": 6.614522493824686e-06, "loss": -0.0066, "num_tokens": 29268354.0, "reward": 7.424413585662842, "reward_std": 9.934688711166382, "rewards/wrapper/mean": 3.7122067116200923, "rewards/wrapper/std": 11.696235999464989, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 19.4, "completions/mean_length": 250.21875, "completions/mean_terminated_length": 9.9, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6275659824046921, "frac_reward_zero_std": 0.025, "grad_norm": 1.40625, "kl": 0.005988685146439821, "learning_rate": 6.609973374784615e-06, "loss": -0.0126, "num_tokens": 29321065.0, "reward": 11.359949398040772, "reward_std": 15.09793529510498, "rewards/wrapper/mean": 5.679974632710218, "rewards/wrapper/std": 18.337996226549148, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 26.2, "completions/mean_length": 250.425, "completions/mean_terminated_length": 13.4, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.6287390029325514, "frac_reward_zero_std": 0.025, "grad_norm": 0.95703125, "kl": 0.008262872393243016, "learning_rate": 6.605418565241957e-06, "loss": -0.012, "num_tokens": 29376085.0, "reward": 20.049555778503418, "reward_std": 23.516221237182616, "rewards/wrapper/mean": 10.024778033792973, "rewards/wrapper/std": 26.077826090157032, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 42.8, "completions/mean_length": 252.65625, "completions/mean_terminated_length": 36.3, "completions/min_length": 183.4, "completions/min_terminated_length": 29.8, "epoch": 0.6299120234604105, "frac_reward_zero_std": 0.0125, "grad_norm": 2.53125, "kl": 0.00597726086853072, "learning_rate": 6.600858080792127e-06, "loss": -0.0033, "num_tokens": 29430674.0, "reward": 11.03137435913086, "reward_std": 14.728109073638915, "rewards/wrapper/mean": 5.515687373280525, "rewards/wrapper/std": 18.126582558453084, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 39.6, "completions/mean_length": 254.0375, "completions/mean_terminated_length": 39.6, "completions/min_length": 193.2, "completions/min_terminated_length": 39.6, "epoch": 0.6310850439882698, "frac_reward_zero_std": 0.05, "grad_norm": 1.34375, "kl": 0.008559392578899861, "learning_rate": 6.596291937049959e-06, "loss": 0.0071, "num_tokens": 29483068.0, "reward": 17.887986612319946, "reward_std": 21.92697615623474, "rewards/wrapper/mean": 8.943993638455868, "rewards/wrapper/std": 18.718515367805956, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 251.36875, "completions/mean_terminated_length": 33.5, "completions/min_length": 134.4, "completions/min_terminated_length": 32.0, "epoch": 0.632258064516129, "frac_reward_zero_std": 0.0125, "grad_norm": 3.453125, "kl": 0.014107034134212881, "learning_rate": 6.5917201496496735e-06, "loss": -0.0028, "num_tokens": 29536873.0, "reward": 8.071206760406493, "reward_std": 9.532784819602966, "rewards/wrapper/mean": 4.035603339970112, "rewards/wrapper/std": 13.035564199090004, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 94.2, "completions/mean_length": 251.6125, "completions/mean_terminated_length": 63.4, "completions/min_length": 135.0, "completions/min_terminated_length": 32.6, "epoch": 0.6334310850439883, "frac_reward_zero_std": 0.0125, "grad_norm": 3.34375, "kl": 0.007457331806654111, "learning_rate": 6.5871427342448105e-06, "loss": -0.0076, "num_tokens": 29588683.0, "reward": 8.361358261108398, "reward_std": 10.859839820861817, "rewards/wrapper/mean": 4.180679216235876, "rewards/wrapper/std": 13.797844186425209, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 48.6, "completions/mean_length": 249.525, "completions/mean_terminated_length": 47.2, "completions/min_length": 97.0, "completions/min_terminated_length": 45.8, "epoch": 0.6346041055718475, "frac_reward_zero_std": 0.0375, "grad_norm": 7.3125, "kl": 0.011775065213441849, "learning_rate": 6.58255970650818e-06, "loss": -0.0114, "num_tokens": 29642651.0, "reward": 12.164882373809814, "reward_std": 10.960504055023193, "rewards/wrapper/mean": 6.082441242039204, "rewards/wrapper/std": 16.81056024134159, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 22.2, "completions/mean_length": 253.49375, "completions/mean_terminated_length": 22.2, "completions/min_length": 175.8, "completions/min_terminated_length": 22.2, "epoch": 0.6357771260997067, "frac_reward_zero_std": 0.0125, "grad_norm": 6.0625, "kl": 0.012871231907047332, "learning_rate": 6.5779710821318105e-06, "loss": -0.0079, "num_tokens": 29698884.0, "reward": 8.56117124557495, "reward_std": 11.631469535827637, "rewards/wrapper/mean": 4.28058585524559, "rewards/wrapper/std": 12.213414934277534, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 17.8, "completions/mean_length": 253.35625, "completions/mean_terminated_length": 17.8, "completions/min_length": 171.4, "completions/min_terminated_length": 17.8, "epoch": 0.636950146627566, "frac_reward_zero_std": 0.05, "grad_norm": 1.640625, "kl": 0.00925441893050447, "learning_rate": 6.573376876826891e-06, "loss": -0.0078, "num_tokens": 29753909.0, "reward": 12.767994022369384, "reward_std": 17.376163291931153, "rewards/wrapper/mean": 6.38399690464139, "rewards/wrapper/std": 19.173024424910544, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 2.4, "completions/mean_length": 254.475, "completions/mean_terminated_length": 2.4, "completions/min_length": 207.2, "completions/min_terminated_length": 2.4, "epoch": 0.6381231671554253, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.007609774707816541, "learning_rate": 6.568777106323721e-06, "loss": -0.0044, "num_tokens": 29806425.0, "reward": 15.84351224899292, "reward_std": 19.228125762939452, "rewards/wrapper/mean": 7.9217560440301895, "rewards/wrapper/std": 18.802449291944505, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 26.6, "completions/mean_length": 252.0625, "completions/mean_terminated_length": 24.5, "completions/min_length": 176.0, "completions/min_terminated_length": 22.4, "epoch": 0.6392961876832844, "frac_reward_zero_std": 0.0375, "grad_norm": 0.9453125, "kl": 0.00825467858230695, "learning_rate": 6.5641717863716515e-06, "loss": -0.0112, "num_tokens": 29860335.0, "reward": 11.479648876190186, "reward_std": 13.858703422546387, "rewards/wrapper/mean": 5.739824234694242, "rewards/wrapper/std": 15.419359780848026, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 17.2, "completions/mean_length": 250.425, "completions/mean_terminated_length": 13.2, "completions/min_length": 162.8, "completions/min_terminated_length": 9.2, "epoch": 0.6404692082111437, "frac_reward_zero_std": 0.025, "grad_norm": 3.9375, "kl": 0.02650681862141937, "learning_rate": 6.559560932739037e-06, "loss": -0.0188, "num_tokens": 29916441.0, "reward": 14.781854248046875, "reward_std": 15.919013786315919, "rewards/wrapper/mean": 7.390926908701658, "rewards/wrapper/std": 21.141387024521826, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 33.6, "completions/mean_length": 250.76875, "completions/mean_terminated_length": 20.5, "completions/min_length": 109.8, "completions/min_terminated_length": 7.4, "epoch": 0.6416422287390029, "frac_reward_zero_std": 0.025, "grad_norm": 2.796875, "kl": 0.13726555263856427, "learning_rate": 6.554944561213182e-06, "loss": -0.0118, "num_tokens": 29974952.0, "reward": 10.810014820098877, "reward_std": 14.398545169830323, "rewards/wrapper/mean": 5.405007231235504, "rewards/wrapper/std": 16.08275884240866, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6428152492668622, "frac_reward_zero_std": 0.0625, "grad_norm": 3.625, "kl": 0.009235573734622448, "learning_rate": 6.550322687600278e-06, "loss": -0.0009, "num_tokens": 30030699.0, "reward": 13.479443836212159, "reward_std": 14.499934005737305, "rewards/wrapper/mean": 6.739722138643264, "rewards/wrapper/std": 16.795974485576153, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 30.0, "completions/mean_length": 248.95, "completions/mean_terminated_length": 15.3, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.6439882697947215, "frac_reward_zero_std": 0.025, "grad_norm": 2.703125, "kl": 0.00881982856662944, "learning_rate": 6.54569532772536e-06, "loss": -0.0089, "num_tokens": 30086385.0, "reward": 14.04744815826416, "reward_std": 17.4063777923584, "rewards/wrapper/mean": 7.023723734170199, "rewards/wrapper/std": 19.03030771613121, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 46.4, "completions/mean_length": 252.65625, "completions/mean_terminated_length": 30.5, "completions/min_length": 168.2, "completions/min_terminated_length": 14.6, "epoch": 0.6451612903225806, "frac_reward_zero_std": 0.0125, "grad_norm": 2.359375, "kl": 0.07983345453976654, "learning_rate": 6.541062497432242e-06, "loss": 0.0084, "num_tokens": 30140448.0, "reward": 5.343929433822632, "reward_std": 6.9545831203460695, "rewards/wrapper/mean": 2.671964705735445, "rewards/wrapper/std": 10.572839736938477, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 16.6, "completions/mean_length": 254.91875, "completions/mean_terminated_length": 16.6, "completions/min_length": 221.4, "completions/min_terminated_length": 16.6, "epoch": 0.6463343108504399, "frac_reward_zero_std": 0.0375, "grad_norm": 2.984375, "kl": 0.01167663314845413, "learning_rate": 6.536424212583478e-06, "loss": 0.0045, "num_tokens": 30199283.0, "reward": 12.16322021484375, "reward_std": 12.57860621213913, "rewards/wrapper/mean": 6.081610155105591, "rewards/wrapper/std": 19.400627340376378, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 0.8, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 0.8, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.6475073313782991, "frac_reward_zero_std": 0.025, "grad_norm": 4.28125, "kl": 0.011256003042217345, "learning_rate": 6.531780489060287e-06, "loss": -0.0291, "num_tokens": 30253005.0, "reward": 10.376868152618409, "reward_std": 13.655181932449342, "rewards/wrapper/mean": 5.18843387439847, "rewards/wrapper/std": 16.041875714063643, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 41.2, "completions/mean_length": 252.49375, "completions/mean_terminated_length": 20.8, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6486803519061584, "frac_reward_zero_std": 0.0, "grad_norm": 3.296875, "kl": 0.006306805438362062, "learning_rate": 6.527131342762519e-06, "loss": -0.0122, "num_tokens": 30305632.0, "reward": 14.130721759796142, "reward_std": 17.248423194885255, "rewards/wrapper/mean": 7.065360965207219, "rewards/wrapper/std": 18.418144088238478, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 46.4, "completions/mean_length": 251.05625, "completions/mean_terminated_length": 24.3, "completions/min_length": 104.6, "completions/min_terminated_length": 2.2, "epoch": 0.6498533724340176, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.007100276905111969, "learning_rate": 6.522476789608584e-06, "loss": -0.004, "num_tokens": 30359495.0, "reward": 7.962774801254272, "reward_std": 10.023509168624878, "rewards/wrapper/mean": 3.98138741850853, "rewards/wrapper/std": 12.435541369020939, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 10.6, "completions/mean_length": 251.53125, "completions/mean_terminated_length": 10.6, "completions/min_length": 113.0, "completions/min_terminated_length": 10.6, "epoch": 0.6510263929618768, "frac_reward_zero_std": 0.025, "grad_norm": 1.6484375, "kl": 0.018682882434222847, "learning_rate": 6.517816845535409e-06, "loss": -0.0133, "num_tokens": 30415146.0, "reward": 11.046603441238403, "reward_std": 14.976951122283936, "rewards/wrapper/mean": 5.523301954567432, "rewards/wrapper/std": 16.57356094866991, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 251.6375, "completions/mean_terminated_length": 14.0, "completions/min_length": 116.4, "completions/min_terminated_length": 14.0, "epoch": 0.6521994134897361, "frac_reward_zero_std": 0.025, "grad_norm": 0.9765625, "kl": 0.15719263151986523, "learning_rate": 6.513151526498379e-06, "loss": 0.0002, "num_tokens": 30469236.0, "reward": 11.077020359039306, "reward_std": 14.02384262084961, "rewards/wrapper/mean": 5.538510248064995, "rewards/wrapper/std": 14.277467794716358, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 252.35, "completions/mean_terminated_length": 88.0, "completions/min_length": 139.2, "completions/min_terminated_length": 88.0, "epoch": 0.6533724340175953, "frac_reward_zero_std": 0.025, "grad_norm": 1.21875, "kl": 2.1136296992423014, "learning_rate": 6.508480848471282e-06, "loss": 0.0867, "num_tokens": 30524428.0, "reward": 8.025190353393555, "reward_std": 10.717722511291504, "rewards/wrapper/mean": 4.012595250457525, "rewards/wrapper/std": 15.221745024621487, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.2, "completions/mean_length": 250.36875, "completions/mean_terminated_length": 12.3, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6545454545454545, "frac_reward_zero_std": 0.075, "grad_norm": 3.140625, "kl": 0.030153904797043652, "learning_rate": 6.503804827446254e-06, "loss": -0.008, "num_tokens": 30579919.0, "reward": 10.410851192474365, "reward_std": 13.341778182983399, "rewards/wrapper/mean": 5.2054255366325375, "rewards/wrapper/std": 16.092943432927132, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 248.08125, "completions/mean_terminated_length": 1.2, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6557184750733138, "frac_reward_zero_std": 0.0375, "grad_norm": 2.59375, "kl": 0.007615790021372959, "learning_rate": 6.499123479433728e-06, "loss": -0.0049, "num_tokens": 30633020.0, "reward": 6.076622056961059, "reward_std": 7.86311776638031, "rewards/wrapper/mean": 3.038311021029949, "rewards/wrapper/std": 8.92110146433115, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 24.6, "completions/mean_length": 253.56875, "completions/mean_terminated_length": 24.6, "completions/min_length": 178.2, "completions/min_terminated_length": 24.6, "epoch": 0.656891495601173, "frac_reward_zero_std": 0.0375, "grad_norm": 3.59375, "kl": 0.005625285796122625, "learning_rate": 6.494436820462371e-06, "loss": -0.0074, "num_tokens": 30685167.0, "reward": 11.33204174041748, "reward_std": 14.154102516174316, "rewards/wrapper/mean": 5.666020432859659, "rewards/wrapper/std": 16.62561158388853, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 56.6, "completions/mean_length": 248.575, "completions/mean_terminated_length": 37.9, "completions/min_length": 70.4, "completions/min_terminated_length": 19.2, "epoch": 0.6580645161290323, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.0125009736395441, "learning_rate": 6.489744866579038e-06, "loss": -0.0202, "num_tokens": 30740001.0, "reward": 7.082417011260986, "reward_std": 8.872758960723877, "rewards/wrapper/mean": 3.5412084154784678, "rewards/wrapper/std": 11.814745858311653, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.6592375366568914, "frac_reward_zero_std": 0.0125, "grad_norm": 34.75, "kl": 0.07425311693223194, "learning_rate": 6.4850476338487135e-06, "loss": 0.003, "num_tokens": 30795647.0, "reward": 12.623469734191895, "reward_std": 16.824228286743164, "rewards/wrapper/mean": 6.311734789609909, "rewards/wrapper/std": 18.106909097731112, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.6604105571847507, "frac_reward_zero_std": 0.0125, "grad_norm": 2.046875, "kl": 0.013532165088690817, "learning_rate": 6.480345138354457e-06, "loss": -0.0032, "num_tokens": 30850124.0, "reward": 12.670428943634032, "reward_std": 16.832879066467285, "rewards/wrapper/mean": 6.335214430838823, "rewards/wrapper/std": 17.93000815808773, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 41.4, "completions/mean_length": 249.30625, "completions/mean_terminated_length": 14.2, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.66158357771261, "frac_reward_zero_std": 0.0125, "grad_norm": 1.4765625, "kl": 0.011149391869548709, "learning_rate": 6.475637396197346e-06, "loss": -0.01, "num_tokens": 30902123.0, "reward": 10.397428131103515, "reward_std": 12.043423748016357, "rewards/wrapper/mean": 5.198714216798544, "rewards/wrapper/std": 14.887701985239982, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 96.2, "completions/mean_length": 247.48125, "completions/mean_terminated_length": 68.5, "completions/min_length": 92.0, "completions/min_terminated_length": 40.8, "epoch": 0.6627565982404692, "frac_reward_zero_std": 0.0, "grad_norm": 6.46875, "kl": 0.012524871563073248, "learning_rate": 6.470924423496421e-06, "loss": -0.0267, "num_tokens": 30955618.0, "reward": 14.662075805664063, "reward_std": 17.22254581451416, "rewards/wrapper/mean": 7.331037894636393, "rewards/wrapper/std": 17.588551034033298, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 21.4, "completions/mean_length": 251.875, "completions/mean_terminated_length": 21.4, "completions/min_length": 175.0, "completions/min_terminated_length": 21.4, "epoch": 0.6639296187683285, "frac_reward_zero_std": 0.025, "grad_norm": 2.890625, "kl": 0.007945716701215133, "learning_rate": 6.466206236388636e-06, "loss": -0.0116, "num_tokens": 31009472.0, "reward": 13.417535018920898, "reward_std": 17.303636741638183, "rewards/wrapper/mean": 6.708767288178206, "rewards/wrapper/std": 20.54477540552616, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 31.6, "completions/mean_length": 250.59375, "completions/mean_terminated_length": 24.7, "completions/min_length": 120.2, "completions/min_terminated_length": 17.8, "epoch": 0.6651026392961877, "frac_reward_zero_std": 0.0125, "grad_norm": 2.40625, "kl": 0.007562464033253491, "learning_rate": 6.461482851028794e-06, "loss": -0.0107, "num_tokens": 31066489.0, "reward": 10.607298517227173, "reward_std": 11.593081283569337, "rewards/wrapper/mean": 5.30364919602871, "rewards/wrapper/std": 17.242631320655345, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 23.2, "completions/mean_length": 250.3375, "completions/mean_terminated_length": 23.2, "completions/min_length": 176.8, "completions/min_terminated_length": 23.2, "epoch": 0.6662756598240469, "frac_reward_zero_std": 0.025, "grad_norm": 3.84375, "kl": 0.008082286594435573, "learning_rate": 6.4567542835894985e-06, "loss": -0.0186, "num_tokens": 31119453.0, "reward": 11.698405933380126, "reward_std": 15.217379951477051, "rewards/wrapper/mean": 5.849203032255173, "rewards/wrapper/std": 18.357003271579742, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.6674486803519062, "frac_reward_zero_std": 0.0, "grad_norm": 28.75, "kl": 0.03912512751994655, "learning_rate": 6.452020550261098e-06, "loss": 0.0016, "num_tokens": 31171969.0, "reward": 12.70831356048584, "reward_std": 14.184439086914063, "rewards/wrapper/mean": 6.3541566789150234, "rewards/wrapper/std": 18.155715675652026, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 49.8, "completions/mean_length": 251.1625, "completions/mean_terminated_length": 48.8, "completions/min_length": 150.2, "completions/min_terminated_length": 47.8, "epoch": 0.6686217008797654, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.021190780331380665, "learning_rate": 6.447281667251626e-06, "loss": -0.0041, "num_tokens": 31227067.0, "reward": 13.802499532699585, "reward_std": 16.760237789154054, "rewards/wrapper/mean": 6.901249774545431, "rewards/wrapper/std": 17.009926618635653, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.6697947214076246, "frac_reward_zero_std": 0.0375, "grad_norm": 4.53125, "kl": 0.020001567632425575, "learning_rate": 6.4425376507867485e-06, "loss": 0.0008, "num_tokens": 31280919.0, "reward": 5.958142876625061, "reward_std": 7.7278544187545775, "rewards/wrapper/mean": 2.9790713407099245, "rewards/wrapper/std": 10.234820060431957, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6709677419354839, "frac_reward_zero_std": 0.025, "grad_norm": 5.71875, "kl": 0.05707511976361275, "learning_rate": 6.4377885171097104e-06, "loss": -0.0053, "num_tokens": 31337926.0, "reward": 11.579971981048583, "reward_std": 15.459444427490235, "rewards/wrapper/mean": 5.789985730499029, "rewards/wrapper/std": 16.77395656108856, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 62.2, "completions/mean_length": 249.95, "completions/mean_terminated_length": 57.2, "completions/min_length": 103.4, "completions/min_terminated_length": 52.2, "epoch": 0.6721407624633431, "frac_reward_zero_std": 0.05, "grad_norm": 1.859375, "kl": 0.007303468359168619, "learning_rate": 6.4330342824812745e-06, "loss": -0.0168, "num_tokens": 31394784.0, "reward": 6.7388733386993405, "reward_std": 8.405215740203857, "rewards/wrapper/mean": 3.3694365844130516, "rewards/wrapper/std": 12.588712561130524, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 7.8, "completions/mean_length": 249.95, "completions/mean_terminated_length": 7.5, "completions/min_length": 109.6, "completions/min_terminated_length": 7.2, "epoch": 0.6733137829912024, "frac_reward_zero_std": 0.025, "grad_norm": 9.5625, "kl": 0.20785174979828297, "learning_rate": 6.4282749631796725e-06, "loss": -0.0062, "num_tokens": 31450594.0, "reward": 12.838958740234375, "reward_std": 16.73229236602783, "rewards/wrapper/mean": 6.41947939991951, "rewards/wrapper/std": 16.986108617484568, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 253.1, "completions/mean_terminated_length": 81.6, "completions/min_length": 165.6, "completions/min_terminated_length": 63.2, "epoch": 0.6744868035190615, "frac_reward_zero_std": 0.0375, "grad_norm": 3.046875, "kl": 0.014510875556152315, "learning_rate": 6.42351057550054e-06, "loss": 0.0028, "num_tokens": 31504968.0, "reward": 5.3186607837677, "reward_std": 7.012968826293945, "rewards/wrapper/mean": 2.6593304432928564, "rewards/wrapper/std": 10.752986335754395, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6756598240469208, "frac_reward_zero_std": 0.0375, "grad_norm": 6.1875, "kl": 0.017193890095222743, "learning_rate": 6.418741135756875e-06, "loss": -0.0069, "num_tokens": 31557632.0, "reward": 13.5635555267334, "reward_std": 16.37178087234497, "rewards/wrapper/mean": 6.781777499616146, "rewards/wrapper/std": 18.871901808679105, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 10.8, "completions/mean_length": 254.7375, "completions/mean_terminated_length": 10.8, "completions/min_length": 215.6, "completions/min_terminated_length": 10.8, "epoch": 0.6768328445747801, "frac_reward_zero_std": 0.0125, "grad_norm": 4.34375, "kl": 0.00942248246865347, "learning_rate": 6.413966660278967e-06, "loss": -0.0015, "num_tokens": 31614106.0, "reward": 10.58498935699463, "reward_std": 11.666668128967284, "rewards/wrapper/mean": 5.292494739592075, "rewards/wrapper/std": 15.16362506300211, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 252.79375, "completions/mean_terminated_length": 51.0, "completions/min_length": 153.4, "completions/min_terminated_length": 51.0, "epoch": 0.6780058651026393, "frac_reward_zero_std": 0.05, "grad_norm": 2.84375, "kl": 0.012383062462322413, "learning_rate": 6.409187165414346e-06, "loss": -0.0014, "num_tokens": 31669557.0, "reward": 10.968196487426757, "reward_std": 14.180962562561035, "rewards/wrapper/mean": 5.484098115563393, "rewards/wrapper/std": 18.82552878111601, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 11.4, "completions/mean_length": 248.49375, "completions/mean_terminated_length": 8.0, "completions/min_length": 107.0, "completions/min_terminated_length": 4.6, "epoch": 0.6791788856304986, "frac_reward_zero_std": 0.05, "grad_norm": 3.3125, "kl": 0.009327814332209527, "learning_rate": 6.404402667527736e-06, "loss": -0.0163, "num_tokens": 31726770.0, "reward": 10.455389595031738, "reward_std": 14.166728591918945, "rewards/wrapper/mean": 5.2276949137449265, "rewards/wrapper/std": 15.657369413971901, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 253.95625, "completions/mean_terminated_length": 37.0, "completions/min_length": 190.6, "completions/min_terminated_length": 37.0, "epoch": 0.6803519061583577, "frac_reward_zero_std": 0.0, "grad_norm": 3.453125, "kl": 0.00922186360694468, "learning_rate": 6.399613183000983e-06, "loss": -0.0063, "num_tokens": 31780625.0, "reward": 5.889966726303101, "reward_std": 5.738338303565979, "rewards/wrapper/mean": 2.9449833787977697, "rewards/wrapper/std": 10.102134810388089, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 8.6, "completions/mean_length": 251.46875, "completions/mean_terminated_length": 8.6, "completions/min_length": 111.0, "completions/min_terminated_length": 8.6, "epoch": 0.681524926686217, "frac_reward_zero_std": 0.05, "grad_norm": 2.1875, "kl": 0.011204652744345367, "learning_rate": 6.394818728233014e-06, "loss": -0.0103, "num_tokens": 31838018.0, "reward": 11.55151720046997, "reward_std": 15.863847255706787, "rewards/wrapper/mean": 5.775758402049542, "rewards/wrapper/std": 17.470771422982217, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 3.8, "completions/mean_length": 252.91875, "completions/mean_terminated_length": 3.8, "completions/min_length": 157.4, "completions/min_terminated_length": 3.8, "epoch": 0.6826979472140763, "frac_reward_zero_std": 0.025, "grad_norm": 4.84375, "kl": 0.028127940092235803, "learning_rate": 6.3900193196397675e-06, "loss": -0.0086, "num_tokens": 31894569.0, "reward": 11.12969675064087, "reward_std": 15.122749328613281, "rewards/wrapper/mean": 5.564848321676254, "rewards/wrapper/std": 15.775805968046189, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 248.1, "completions/mean_terminated_length": 42.5, "completions/min_length": 86.2, "completions/min_terminated_length": 35.0, "epoch": 0.6838709677419355, "frac_reward_zero_std": 0.025, "grad_norm": 1.96875, "kl": 4.824623215675819, "learning_rate": 6.385214973654147e-06, "loss": 0.1788, "num_tokens": 31949579.0, "reward": 9.268371200561523, "reward_std": 12.328631019592285, "rewards/wrapper/mean": 4.6341855227947235, "rewards/wrapper/std": 14.82547686547041, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.6850439882697947, "frac_reward_zero_std": 0.0125, "grad_norm": 6.1875, "kl": 0.04323632463347167, "learning_rate": 6.380405706725961e-06, "loss": -0.0057, "num_tokens": 32002908.0, "reward": 10.425172328948975, "reward_std": 14.20661792755127, "rewards/wrapper/mean": 5.212585891783237, "rewards/wrapper/std": 17.869454950094223, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 40.8, "completions/mean_length": 255.675, "completions/mean_terminated_length": 40.8, "completions/min_length": 245.6, "completions/min_terminated_length": 40.8, "epoch": 0.6862170087976539, "frac_reward_zero_std": 0.0, "grad_norm": 4.59375, "kl": 0.010727729741483926, "learning_rate": 6.375591535321866e-06, "loss": 0.0011, "num_tokens": 32058924.0, "reward": 7.594053506851196, "reward_std": 9.99413022994995, "rewards/wrapper/mean": 3.7970266461372377, "rewards/wrapper/std": 12.754188150167465, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.6873900293255132, "frac_reward_zero_std": 0.0125, "grad_norm": 2.03125, "kl": 0.0054875939211342485, "learning_rate": 6.370772475925312e-06, "loss": -0.0048, "num_tokens": 32113721.0, "reward": 12.846275234222412, "reward_std": 15.05757179260254, "rewards/wrapper/mean": 6.423137576878071, "rewards/wrapper/std": 18.93209269195795, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6885630498533725, "frac_reward_zero_std": 0.0375, "grad_norm": 4.28125, "kl": 0.012189660430885851, "learning_rate": 6.365948545036486e-06, "loss": -0.0078, "num_tokens": 32171397.0, "reward": 13.03080415725708, "reward_std": 15.789403009414674, "rewards/wrapper/mean": 6.515402068197727, "rewards/wrapper/std": 20.668898472189902, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 42.6, "completions/mean_length": 254.13125, "completions/mean_terminated_length": 42.6, "completions/min_length": 196.2, "completions/min_terminated_length": 42.6, "epoch": 0.6897360703812316, "frac_reward_zero_std": 0.025, "grad_norm": 1.640625, "kl": 0.007035130372969434, "learning_rate": 6.361119759172254e-06, "loss": -0.0039, "num_tokens": 32225176.0, "reward": 11.792992210388183, "reward_std": 15.923147010803223, "rewards/wrapper/mean": 5.896495893597603, "rewards/wrapper/std": 16.489003255963326, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 252.01875, "completions/mean_terminated_length": 13.2, "completions/min_length": 166.4, "completions/min_terminated_length": 12.8, "epoch": 0.6909090909090909, "frac_reward_zero_std": 0.0125, "grad_norm": 9.5, "kl": 0.011763886036351322, "learning_rate": 6.3562861348661025e-06, "loss": -0.005, "num_tokens": 32280057.0, "reward": 12.035215187072755, "reward_std": 14.319508934020996, "rewards/wrapper/mean": 6.017607763409615, "rewards/wrapper/std": 17.326738145947456, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 252.725, "completions/mean_terminated_length": 100.0, "completions/min_length": 151.2, "completions/min_terminated_length": 100.0, "epoch": 0.6920821114369502, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.008140194357838481, "learning_rate": 6.351447688668089e-06, "loss": -0.0017, "num_tokens": 32330873.0, "reward": 7.528527927398682, "reward_std": 8.575776290893554, "rewards/wrapper/mean": 3.764264015108347, "rewards/wrapper/std": 12.360572703182697, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 251.10625, "completions/mean_terminated_length": 16.2, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.6932551319648094, "frac_reward_zero_std": 0.0125, "grad_norm": 6.53125, "kl": 0.011117254314012826, "learning_rate": 6.34660443714478e-06, "loss": -0.0064, "num_tokens": 32385796.0, "reward": 13.417228507995606, "reward_std": 14.974496126174927, "rewards/wrapper/mean": 6.708614060282708, "rewards/wrapper/std": 16.140495674312113, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 10.8, "completions/mean_length": 248.34375, "completions/mean_terminated_length": 10.8, "completions/min_length": 62.0, "completions/min_terminated_length": 10.8, "epoch": 0.6944281524926686, "frac_reward_zero_std": 0.0125, "grad_norm": 0.875, "kl": 0.011824999994132668, "learning_rate": 6.341756396879192e-06, "loss": -0.0234, "num_tokens": 32439957.0, "reward": 12.452883625030518, "reward_std": 15.025022745132446, "rewards/wrapper/mean": 6.226441939175129, "rewards/wrapper/std": 16.66017941981554, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 54.2, "completions/mean_length": 252.89375, "completions/mean_terminated_length": 54.2, "completions/min_length": 156.6, "completions/min_terminated_length": 54.2, "epoch": 0.6956011730205278, "frac_reward_zero_std": 0.0125, "grad_norm": 3.828125, "kl": 0.009212367539294063, "learning_rate": 6.33690358447074e-06, "loss": -0.0097, "num_tokens": 32494362.0, "reward": 13.592586326599122, "reward_std": 18.491394233703613, "rewards/wrapper/mean": 6.796293315291405, "rewards/wrapper/std": 20.610531900823116, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 93.6, "completions/mean_length": 252.525, "completions/mean_terminated_length": 93.6, "completions/min_length": 144.8, "completions/min_terminated_length": 93.6, "epoch": 0.6967741935483871, "frac_reward_zero_std": 0.0375, "grad_norm": 1.21875, "kl": 0.009233022644184529, "learning_rate": 6.33204601653518e-06, "loss": -0.0024, "num_tokens": 32549066.0, "reward": 8.715370655059814, "reward_std": 9.768978691101074, "rewards/wrapper/mean": 4.357685124874115, "rewards/wrapper/std": 16.73828110843897, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 250.35, "completions/mean_terminated_length": 100.8, "completions/min_length": 148.8, "completions/min_terminated_length": 97.6, "epoch": 0.6979472140762464, "frac_reward_zero_std": 0.0375, "grad_norm": 0.9296875, "kl": 0.007445700804237276, "learning_rate": 6.327183709704547e-06, "loss": -0.0089, "num_tokens": 32601098.0, "reward": 8.694724130630494, "reward_std": 11.72841203212738, "rewards/wrapper/mean": 4.347361895442009, "rewards/wrapper/std": 12.822503638267516, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 3.4, "completions/mean_length": 251.34375, "completions/mean_terminated_length": 2.4, "completions/min_length": 155.0, "completions/min_terminated_length": 1.4, "epoch": 0.6991202346041056, "frac_reward_zero_std": 0.05, "grad_norm": 4.1875, "kl": 0.029423248756211252, "learning_rate": 6.322316680627107e-06, "loss": -0.0098, "num_tokens": 32657423.0, "reward": 15.263641691207885, "reward_std": 20.547141981124877, "rewards/wrapper/mean": 7.6318209052085875, "rewards/wrapper/std": 18.74337030798197, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 248.48125, "completions/mean_terminated_length": 9.4, "completions/min_length": 58.0, "completions/min_terminated_length": 6.8, "epoch": 0.7002932551319648, "frac_reward_zero_std": 0.0125, "grad_norm": 3.421875, "kl": 0.009029289579484612, "learning_rate": 6.317444945967288e-06, "loss": -0.0006, "num_tokens": 32714476.0, "reward": 13.580448102951049, "reward_std": 15.056785011291504, "rewards/wrapper/mean": 6.790223602950573, "rewards/wrapper/std": 16.98922117650509, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 249.85625, "completions/mean_terminated_length": 6.2, "completions/min_length": 106.8, "completions/min_terminated_length": 4.4, "epoch": 0.701466275659824, "frac_reward_zero_std": 0.025, "grad_norm": 1.4609375, "kl": 0.010367827408481389, "learning_rate": 6.312568522405635e-06, "loss": -0.012, "num_tokens": 32768535.0, "reward": 8.727212715148926, "reward_std": 9.299742698669434, "rewards/wrapper/mean": 4.363606084138155, "rewards/wrapper/std": 11.761549571156502, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 8.2, "completions/mean_length": 253.0625, "completions/mean_terminated_length": 4.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7026392961876833, "frac_reward_zero_std": 0.0, "grad_norm": 5.34375, "kl": 0.010755813180003316, "learning_rate": 6.307687426638746e-06, "loss": -0.0087, "num_tokens": 32822397.0, "reward": 10.127997446060181, "reward_std": 12.69483847618103, "rewards/wrapper/mean": 5.0639987081289295, "rewards/wrapper/std": 14.818000476062299, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 11.4, "completions/mean_length": 251.55625, "completions/mean_terminated_length": 11.4, "completions/min_length": 113.8, "completions/min_terminated_length": 11.4, "epoch": 0.7038123167155426, "frac_reward_zero_std": 0.025, "grad_norm": 1.125, "kl": 0.007638441288145259, "learning_rate": 6.302801675379216e-06, "loss": -0.0098, "num_tokens": 32875248.0, "reward": 14.06427435874939, "reward_std": 18.11653337478638, "rewards/wrapper/mean": 7.032136972993612, "rewards/wrapper/std": 19.8682512819767, "step": 3000 }, { "epoch": 0.7038123167155426, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.79, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 71.57, "eval_completions/mean_length": 222.775, "eval_completions/mean_terminated_length": 60.508333358764645, "eval_completions/min_length": 146.37, "eval_completions/min_terminated_length": 49.09, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.013644674248062073, "eval_loss": -0.05559740215539932, "eval_num_tokens": 32875248.0, "eval_reward": 0.34767198249697684, "eval_reward_std": 0.19411193696781992, "eval_rewards/wrapper/mean": 0.1738359948247671, "eval_rewards/wrapper/std": 0.1692263395804912, "eval_runtime": 208.5757, "eval_samples_per_second": 0.959, "eval_steps_per_second": 0.24, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 20.0, "completions/mean_length": 250.23125, "completions/mean_terminated_length": 20.0, "completions/min_length": 122.4, "completions/min_terminated_length": 20.0, "epoch": 0.7049853372434017, "frac_reward_zero_std": 0.0375, "grad_norm": 4.75, "kl": 0.012254834035411477, "learning_rate": 6.297911285355579e-06, "loss": 0.0123, "num_tokens": 32930043.0, "reward": 10.28910961151123, "reward_std": 12.967611694335938, "rewards/wrapper/mean": 5.144554616510868, "rewards/wrapper/std": 18.04752763658762, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.4, "completions/mean_length": 252.84375, "completions/mean_terminated_length": 1.4, "completions/min_length": 155.0, "completions/min_terminated_length": 1.4, "epoch": 0.706158357771261, "frac_reward_zero_std": 0.0375, "grad_norm": 2.015625, "kl": 0.012843809789046645, "learning_rate": 6.293016273312254e-06, "loss": -0.0044, "num_tokens": 32985718.0, "reward": 15.941922378540038, "reward_std": 19.729792022705077, "rewards/wrapper/mean": 7.970961252599954, "rewards/wrapper/std": 22.57847531288862, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 254.41875, "completions/mean_terminated_length": 0.6, "completions/min_length": 205.4, "completions/min_terminated_length": 0.6, "epoch": 0.7073313782991202, "frac_reward_zero_std": 0.0375, "grad_norm": 1.1171875, "kl": 0.011954522877931594, "learning_rate": 6.288116656009485e-06, "loss": 0.0036, "num_tokens": 33040551.0, "reward": 14.066415405273437, "reward_std": 16.348633670806883, "rewards/wrapper/mean": 7.033207412064075, "rewards/wrapper/std": 19.8112138196826, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 251.35625, "completions/mean_terminated_length": 5.0, "completions/min_length": 107.4, "completions/min_terminated_length": 5.0, "epoch": 0.7085043988269795, "frac_reward_zero_std": 0.025, "grad_norm": 2.75, "kl": 0.013446322188246995, "learning_rate": 6.283212450223284e-06, "loss": -0.0117, "num_tokens": 33094550.0, "reward": 8.66706485748291, "reward_std": 11.540789794921874, "rewards/wrapper/mean": 4.333532364666462, "rewards/wrapper/std": 15.913649466633796, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 50.2, "completions/mean_length": 253.8375, "completions/mean_terminated_length": 42.3, "completions/min_length": 188.0, "completions/min_terminated_length": 34.4, "epoch": 0.7096774193548387, "frac_reward_zero_std": 0.0625, "grad_norm": 1.7109375, "kl": 0.007783821260090917, "learning_rate": 6.278303672745373e-06, "loss": -0.002, "num_tokens": 33149128.0, "reward": 8.587579703330993, "reward_std": 10.724381399154662, "rewards/wrapper/mean": 4.293789640069008, "rewards/wrapper/std": 12.531948786973953, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 45.8, "completions/mean_length": 254.23125, "completions/mean_terminated_length": 45.8, "completions/min_length": 199.4, "completions/min_terminated_length": 45.8, "epoch": 0.7108504398826979, "frac_reward_zero_std": 0.025, "grad_norm": 5.0, "kl": 0.010787489754147828, "learning_rate": 6.2733903403831275e-06, "loss": -0.0075, "num_tokens": 33206507.0, "reward": 8.99125509262085, "reward_std": 11.77661190032959, "rewards/wrapper/mean": 4.495627209544182, "rewards/wrapper/std": 14.157334826886654, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 256.0, "completions/max_terminated_length": 87.6, "completions/mean_length": 242.9375, "completions/mean_terminated_length": 48.46666717529297, "completions/min_length": 26.6, "completions/min_terminated_length": 26.6, "epoch": 0.7120234604105572, "frac_reward_zero_std": 0.0375, "grad_norm": 4.4375, "kl": 0.00714592015137896, "learning_rate": 6.268472469959519e-06, "loss": -0.0185, "num_tokens": 33261703.0, "reward": 13.6625545501709, "reward_std": 13.896155834197998, "rewards/wrapper/mean": 6.831276829540729, "rewards/wrapper/std": 19.921477034687996, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 19.2, "completions/mean_length": 250.20625, "completions/mean_terminated_length": 10.7, "completions/min_length": 104.6, "completions/min_terminated_length": 2.2, "epoch": 0.7131964809384165, "frac_reward_zero_std": 0.0375, "grad_norm": 3.203125, "kl": 0.009536017797654495, "learning_rate": 6.263550078313057e-06, "loss": -0.0123, "num_tokens": 33317868.0, "reward": 11.820788192749024, "reward_std": 13.19154167175293, "rewards/wrapper/mean": 5.910393899679184, "rewards/wrapper/std": 16.527128563821314, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.7143695014662756, "frac_reward_zero_std": 0.0125, "grad_norm": 1.6171875, "kl": 0.006540574179962277, "learning_rate": 6.2586231822977305e-06, "loss": -0.0106, "num_tokens": 33371771.0, "reward": 7.129058766365051, "reward_std": 9.376532649993896, "rewards/wrapper/mean": 3.5645292200148107, "rewards/wrapper/std": 11.052740156650543, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 16.4, "completions/mean_length": 253.3125, "completions/mean_terminated_length": 16.4, "completions/min_length": 170.0, "completions/min_terminated_length": 16.4, "epoch": 0.7155425219941349, "frac_reward_zero_std": 0.025, "grad_norm": 1.1484375, "kl": 0.016403516015270726, "learning_rate": 6.253691798782954e-06, "loss": 0.001, "num_tokens": 33423625.0, "reward": 16.984063339233398, "reward_std": 21.145125770568846, "rewards/wrapper/mean": 8.492031678557396, "rewards/wrapper/std": 25.235761691629886, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 54.4, "completions/mean_length": 251.675, "completions/mean_terminated_length": 54.3, "completions/min_length": 156.6, "completions/min_terminated_length": 54.2, "epoch": 0.7167155425219941, "frac_reward_zero_std": 0.0375, "grad_norm": 1.671875, "kl": 0.010424506437266246, "learning_rate": 6.248755944653503e-06, "loss": -0.0033, "num_tokens": 33481279.0, "reward": 9.209318351745605, "reward_std": 11.9292555809021, "rewards/wrapper/mean": 4.604659250378608, "rewards/wrapper/std": 14.66360622793436, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 40.2, "completions/mean_length": 252.4625, "completions/mean_terminated_length": 40.2, "completions/min_length": 193.8, "completions/min_terminated_length": 40.2, "epoch": 0.7178885630498534, "frac_reward_zero_std": 0.025, "grad_norm": 6.90625, "kl": 0.01565061039291322, "learning_rate": 6.243815636809464e-06, "loss": -0.0126, "num_tokens": 33539275.0, "reward": 12.543126440048217, "reward_std": 15.04892885684967, "rewards/wrapper/mean": 6.271563523262739, "rewards/wrapper/std": 18.085418404638766, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 249.95625, "completions/mean_terminated_length": 27.8, "completions/min_length": 170.0, "completions/min_terminated_length": 16.4, "epoch": 0.7190615835777127, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.00790031259530224, "learning_rate": 6.238870892166168e-06, "loss": -0.0126, "num_tokens": 33590184.0, "reward": 9.49119300842285, "reward_std": 11.98473072052002, "rewards/wrapper/mean": 4.745596365630627, "rewards/wrapper/std": 14.232426093518734, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.7202346041055718, "frac_reward_zero_std": 0.0125, "grad_norm": 1.9296875, "kl": 0.011212410021107644, "learning_rate": 6.233921727654144e-06, "loss": -0.0058, "num_tokens": 33646111.0, "reward": 7.471917772293091, "reward_std": 9.847779417037964, "rewards/wrapper/mean": 3.735959121584892, "rewards/wrapper/std": 13.612964145839214, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 45.6, "completions/mean_length": 254.225, "completions/mean_terminated_length": 45.6, "completions/min_length": 199.2, "completions/min_terminated_length": 45.6, "epoch": 0.7214076246334311, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.019665966625325382, "learning_rate": 6.2289681602190475e-06, "loss": -0.0001, "num_tokens": 33701441.0, "reward": 9.508408069610596, "reward_std": 12.26361608505249, "rewards/wrapper/mean": 4.754203618317843, "rewards/wrapper/std": 16.59772346019745, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 34.2, "completions/mean_length": 252.45625, "completions/mean_terminated_length": 13.4, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7225806451612903, "frac_reward_zero_std": 0.0875, "grad_norm": 2.546875, "kl": 0.014641248155385256, "learning_rate": 6.224010206821615e-06, "loss": -0.0111, "num_tokens": 33757582.0, "reward": 7.073202276229859, "reward_std": 9.425168991088867, "rewards/wrapper/mean": 3.536601182818413, "rewards/wrapper/std": 10.393733787536622, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 48.4, "completions/mean_length": 251.11875, "completions/mean_terminated_length": 24.5, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.7237536656891496, "frac_reward_zero_std": 0.0125, "grad_norm": 3.375, "kl": 0.008687148883473128, "learning_rate": 6.219047884437596e-06, "loss": -0.0055, "num_tokens": 33812913.0, "reward": 10.549963283538819, "reward_std": 14.273748397827148, "rewards/wrapper/mean": 5.27498158365488, "rewards/wrapper/std": 18.366497644782065, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 7.2, "completions/mean_length": 251.43125, "completions/mean_terminated_length": 7.2, "completions/min_length": 160.8, "completions/min_terminated_length": 7.2, "epoch": 0.7249266862170088, "frac_reward_zero_std": 0.025, "grad_norm": 2.359375, "kl": 0.009205997944809497, "learning_rate": 6.214081210057702e-06, "loss": -0.0055, "num_tokens": 33870604.0, "reward": 15.315609312057495, "reward_std": 20.723115539550783, "rewards/wrapper/mean": 7.657804708182812, "rewards/wrapper/std": 18.553164108097555, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.726099706744868, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.012098168022930622, "learning_rate": 6.209110200687543e-06, "loss": -0.0096, "num_tokens": 33923694.0, "reward": 8.973456811904907, "reward_std": 10.326069641113282, "rewards/wrapper/mean": 4.486728381365538, "rewards/wrapper/std": 13.67483127862215, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 250.9875, "completions/mean_terminated_length": 34.0, "completions/min_length": 128.4, "completions/min_terminated_length": 26.0, "epoch": 0.7272727272727273, "frac_reward_zero_std": 0.05, "grad_norm": 1.7578125, "kl": 0.020142507256241515, "learning_rate": 6.2041348733475726e-06, "loss": -0.013, "num_tokens": 33976212.0, "reward": 12.24562635421753, "reward_std": 11.961190509796143, "rewards/wrapper/mean": 6.122812962532043, "rewards/wrapper/std": 17.585172924399377, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.7284457478005865, "frac_reward_zero_std": 0.05, "grad_norm": 3.703125, "kl": 6.63561298425775, "learning_rate": 6.199155245073034e-06, "loss": 0.2629, "num_tokens": 34031094.0, "reward": 7.917291593551636, "reward_std": 9.857232570648193, "rewards/wrapper/mean": 3.9586454682052135, "rewards/wrapper/std": 12.87554216235876, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 32.0, "completions/mean_length": 255.4, "completions/mean_terminated_length": 32.0, "completions/min_length": 236.8, "completions/min_terminated_length": 32.0, "epoch": 0.7296187683284457, "frac_reward_zero_std": 0.0125, "grad_norm": 1.296875, "kl": 0.007416438066866249, "learning_rate": 6.194171332913887e-06, "loss": -0.0023, "num_tokens": 34084466.0, "reward": 9.664254093170166, "reward_std": 12.404686951637268, "rewards/wrapper/mean": 4.832127270102501, "rewards/wrapper/std": 12.02934721559286, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 54.2, "completions/mean_length": 251.29375, "completions/mean_terminated_length": 54.2, "completions/min_length": 105.4, "completions/min_terminated_length": 54.2, "epoch": 0.730791788856305, "frac_reward_zero_std": 0.025, "grad_norm": 5.21875, "kl": 0.04753340142779052, "learning_rate": 6.189183153934767e-06, "loss": -0.0032, "num_tokens": 34139469.0, "reward": 10.3969295501709, "reward_std": 10.79515585899353, "rewards/wrapper/mean": 5.19846440255642, "rewards/wrapper/std": 16.67407398223877, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 9.4, "completions/mean_length": 251.49375, "completions/mean_terminated_length": 9.4, "completions/min_length": 111.8, "completions/min_terminated_length": 9.4, "epoch": 0.7319648093841642, "frac_reward_zero_std": 0.025, "grad_norm": 8.375, "kl": 0.03143579112365842, "learning_rate": 6.1841907252149144e-06, "loss": -0.0128, "num_tokens": 34194524.0, "reward": 11.688684749603272, "reward_std": 15.52239112854004, "rewards/wrapper/mean": 5.844342230260372, "rewards/wrapper/std": 17.19078350365162, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 19.6, "completions/mean_length": 251.81875, "completions/mean_terminated_length": 19.6, "completions/min_length": 173.2, "completions/min_terminated_length": 19.6, "epoch": 0.7331378299120235, "frac_reward_zero_std": 0.0125, "grad_norm": 2.296875, "kl": 0.006614711100701243, "learning_rate": 6.1791940638481225e-06, "loss": -0.0056, "num_tokens": 34247523.0, "reward": 10.99114260673523, "reward_std": 12.279700326919556, "rewards/wrapper/mean": 5.495571257919073, "rewards/wrapper/std": 14.978180499374867, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 10.8, "completions/mean_length": 251.5375, "completions/mean_terminated_length": 10.8, "completions/min_length": 113.2, "completions/min_terminated_length": 10.8, "epoch": 0.7343108504398826, "frac_reward_zero_std": 0.0125, "grad_norm": 2.5, "kl": 0.011815444775857031, "learning_rate": 6.174193186942678e-06, "loss": -0.0111, "num_tokens": 34305905.0, "reward": 10.515061807632446, "reward_std": 14.016519451141358, "rewards/wrapper/mean": 5.257530699670315, "rewards/wrapper/std": 16.910930271446706, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 56.8, "completions/mean_length": 249.91875, "completions/mean_terminated_length": 54.5, "completions/min_length": 103.4, "completions/min_terminated_length": 52.2, "epoch": 0.7354838709677419, "frac_reward_zero_std": 0.0375, "grad_norm": 1.53125, "kl": 0.020315185002982617, "learning_rate": 6.169188111621298e-06, "loss": -0.0097, "num_tokens": 34360016.0, "reward": 16.148542070388793, "reward_std": 16.03440327644348, "rewards/wrapper/mean": 8.074271266907454, "rewards/wrapper/std": 18.308212214708327, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 49.2, "completions/mean_length": 251.55625, "completions/mean_terminated_length": 48.2, "completions/min_length": 149.6, "completions/min_terminated_length": 47.2, "epoch": 0.7366568914956012, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.08753183353692293, "learning_rate": 6.164178855021075e-06, "loss": -0.0093, "num_tokens": 34412503.0, "reward": 8.106660556793212, "reward_std": 10.737511825561523, "rewards/wrapper/mean": 4.053329988569021, "rewards/wrapper/std": 13.285885770618915, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 252.925, "completions/mean_terminated_length": 4.0, "completions/min_length": 157.6, "completions/min_terminated_length": 4.0, "epoch": 0.7378299120234604, "frac_reward_zero_std": 0.05, "grad_norm": 1.875, "kl": 0.008428220392670483, "learning_rate": 6.159165434293425e-06, "loss": -0.0124, "num_tokens": 34467477.0, "reward": 12.283936882019043, "reward_std": 15.909385299682617, "rewards/wrapper/mean": 6.141968539357185, "rewards/wrapper/std": 19.165711463987826, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7390029325513197, "frac_reward_zero_std": 0.025, "grad_norm": 3.25, "kl": 0.008187936479225754, "learning_rate": 6.154147866604011e-06, "loss": -0.0047, "num_tokens": 34521474.0, "reward": 10.439709758758545, "reward_std": 13.946194648742676, "rewards/wrapper/mean": 5.219854548573494, "rewards/wrapper/std": 16.24859150648117, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 73.2, "completions/mean_length": 251.9, "completions/mean_terminated_length": 36.8, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.7401759530791789, "frac_reward_zero_std": 0.025, "grad_norm": 1.484375, "kl": 0.0075267312116920945, "learning_rate": 6.149126169132701e-06, "loss": 0.0008, "num_tokens": 34574796.0, "reward": 9.501663303375244, "reward_std": 11.618663597106934, "rewards/wrapper/mean": 4.750831536203623, "rewards/wrapper/std": 14.094446489214898, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.7413489736070381, "frac_reward_zero_std": 0.025, "grad_norm": 2.453125, "kl": 0.009569109929725528, "learning_rate": 6.144100359073504e-06, "loss": 0.0004, "num_tokens": 34630316.0, "reward": 10.490760707855225, "reward_std": 12.09967861175537, "rewards/wrapper/mean": 5.245380634069443, "rewards/wrapper/std": 14.713596984744072, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 41.2, "completions/mean_length": 249.3, "completions/mean_terminated_length": 26.1, "completions/min_length": 113.4, "completions/min_terminated_length": 11.0, "epoch": 0.7425219941348974, "frac_reward_zero_std": 0.0, "grad_norm": 3.546875, "kl": 0.009689736948348581, "learning_rate": 6.139070453634509e-06, "loss": -0.0111, "num_tokens": 34685220.0, "reward": 13.131814670562743, "reward_std": 15.438633823394776, "rewards/wrapper/mean": 6.5659067839384075, "rewards/wrapper/std": 19.385744975507258, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 250.38125, "completions/mean_terminated_length": 70.73333358764648, "completions/min_length": 149.2, "completions/min_terminated_length": 46.8, "epoch": 0.7436950146627566, "frac_reward_zero_std": 0.0, "grad_norm": 3.375, "kl": 0.33361702859401704, "learning_rate": 6.1340364700378255e-06, "loss": 0.0094, "num_tokens": 34739987.0, "reward": 14.167833518981933, "reward_std": 19.107415008544923, "rewards/wrapper/mean": 7.083916249871254, "rewards/wrapper/std": 18.601954208314417, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 88.2, "completions/mean_length": 253.9625, "completions/mean_terminated_length": 63.3, "completions/min_length": 192.0, "completions/min_terminated_length": 38.4, "epoch": 0.7448680351906158, "frac_reward_zero_std": 0.025, "grad_norm": 6.78125, "kl": 0.008973815105855465, "learning_rate": 6.128998425519528e-06, "loss": -0.0039, "num_tokens": 34797083.0, "reward": 9.621114826202392, "reward_std": 10.426493692398072, "rewards/wrapper/mean": 4.8105573311448095, "rewards/wrapper/std": 14.826824332773686, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 14.4, "completions/mean_length": 251.65, "completions/mean_terminated_length": 14.4, "completions/min_length": 116.8, "completions/min_terminated_length": 14.4, "epoch": 0.7460410557184751, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.009251125366427004, "learning_rate": 6.123956337329597e-06, "loss": -0.0111, "num_tokens": 34848269.0, "reward": 12.609267926216125, "reward_std": 16.96615676879883, "rewards/wrapper/mean": 6.304634357988834, "rewards/wrapper/std": 16.025322619080544, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 60.6, "completions/mean_length": 251.54375, "completions/mean_terminated_length": 53.3, "completions/min_length": 148.4, "completions/min_terminated_length": 46.0, "epoch": 0.7472140762463343, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.010146375722251832, "learning_rate": 6.118910222731853e-06, "loss": -0.0057, "num_tokens": 34903840.0, "reward": 18.351296615600585, "reward_std": 22.918200492858887, "rewards/wrapper/mean": 9.175648310780526, "rewards/wrapper/std": 21.03637299388647, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 9.2, "completions/mean_length": 251.49375, "completions/mean_terminated_length": 4.8, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.7483870967741936, "frac_reward_zero_std": 0.025, "grad_norm": 1.8515625, "kl": 0.1860305307782255, "learning_rate": 6.113860099003909e-06, "loss": -0.0054, "num_tokens": 34960899.0, "reward": 8.806917667388916, "reward_std": 10.08819055557251, "rewards/wrapper/mean": 4.403458857536316, "rewards/wrapper/std": 14.91646645218134, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 4.2, "completions/mean_length": 251.3375, "completions/mean_terminated_length": 4.2, "completions/min_length": 157.8, "completions/min_terminated_length": 4.2, "epoch": 0.7495601173020527, "frac_reward_zero_std": 0.025, "grad_norm": 1.1171875, "kl": 0.00893011859152466, "learning_rate": 6.108805983437102e-06, "loss": -0.0037, "num_tokens": 35014629.0, "reward": 13.187407493591309, "reward_std": 18.10890293121338, "rewards/wrapper/mean": 6.593703691661358, "rewards/wrapper/std": 21.82013604640961, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 34.8, "completions/mean_length": 253.8875, "completions/mean_terminated_length": 34.8, "completions/min_length": 188.4, "completions/min_terminated_length": 34.8, "epoch": 0.750733137829912, "frac_reward_zero_std": 0.025, "grad_norm": 2.859375, "kl": 0.010938762326259167, "learning_rate": 6.103747893336437e-06, "loss": -0.0027, "num_tokens": 35069609.0, "reward": 9.006208515167236, "reward_std": 10.240087795257569, "rewards/wrapper/mean": 4.503104318678379, "rewards/wrapper/std": 12.582425367832183, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 52.2, "completions/mean_length": 254.43125, "completions/mean_terminated_length": 52.2, "completions/min_length": 205.8, "completions/min_terminated_length": 52.2, "epoch": 0.7519061583577713, "frac_reward_zero_std": 0.0, "grad_norm": 3.734375, "kl": 0.010828335920814424, "learning_rate": 6.098685846020526e-06, "loss": -0.0045, "num_tokens": 35125844.0, "reward": 12.905813598632813, "reward_std": 17.641766929626463, "rewards/wrapper/mean": 6.4529064983129505, "rewards/wrapper/std": 19.059850125014783, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 43.0, "completions/mean_length": 252.55, "completions/mean_terminated_length": 36.3, "completions/min_length": 183.2, "completions/min_terminated_length": 29.6, "epoch": 0.7530791788856305, "frac_reward_zero_std": 0.0125, "grad_norm": 4.96875, "kl": 0.009136547101661563, "learning_rate": 6.093619858821535e-06, "loss": -0.0074, "num_tokens": 35180934.0, "reward": 9.771417665481568, "reward_std": 13.003374195098877, "rewards/wrapper/mean": 4.885708878934383, "rewards/wrapper/std": 16.47006680816412, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 13.6, "completions/mean_length": 253.225, "completions/mean_terminated_length": 13.6, "completions/min_length": 167.2, "completions/min_terminated_length": 13.6, "epoch": 0.7542521994134898, "frac_reward_zero_std": 0.0125, "grad_norm": 6.6875, "kl": 0.014397739351261407, "learning_rate": 6.088549949085114e-06, "loss": 0.0026, "num_tokens": 35239758.0, "reward": 7.511144304275513, "reward_std": 9.265942859649659, "rewards/wrapper/mean": 3.7555719554424285, "rewards/wrapper/std": 12.816774183511734, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.7554252199413489, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.010587751294951885, "learning_rate": 6.083476134170349e-06, "loss": 0.0004, "num_tokens": 35293992.0, "reward": 11.967586421966553, "reward_std": 16.125496673583985, "rewards/wrapper/mean": 5.983793088048697, "rewards/wrapper/std": 17.478325541317464, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 36.4, "completions/mean_length": 248.24375, "completions/mean_terminated_length": 31.5, "completions/min_length": 26.6, "completions/min_terminated_length": 26.6, "epoch": 0.7565982404692082, "frac_reward_zero_std": 0.0375, "grad_norm": 1.875, "kl": 0.01313765674130991, "learning_rate": 6.078398431449692e-06, "loss": -0.0266, "num_tokens": 35348929.0, "reward": 10.895433044433593, "reward_std": 13.076961612701416, "rewards/wrapper/mean": 5.44771647900343, "rewards/wrapper/std": 17.260293766856194, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 54.6, "completions/mean_length": 252.90625, "completions/mean_terminated_length": 54.6, "completions/min_length": 157.0, "completions/min_terminated_length": 54.6, "epoch": 0.7577712609970675, "frac_reward_zero_std": 0.0375, "grad_norm": 3.015625, "kl": 0.008978944190312177, "learning_rate": 6.073316858308911e-06, "loss": -0.0062, "num_tokens": 35403446.0, "reward": 5.560769200325012, "reward_std": 7.3834045171737674, "rewards/wrapper/mean": 2.7803845427930356, "rewards/wrapper/std": 9.4280636459589, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 251.41875, "completions/mean_terminated_length": 7.0, "completions/min_length": 109.4, "completions/min_terminated_length": 7.0, "epoch": 0.7589442815249267, "frac_reward_zero_std": 0.0125, "grad_norm": 2.078125, "kl": 0.01288509035948664, "learning_rate": 6.068231432147023e-06, "loss": -0.0164, "num_tokens": 35458751.0, "reward": 10.370946216583253, "reward_std": 13.006755352020264, "rewards/wrapper/mean": 5.185473144054413, "rewards/wrapper/std": 16.655578370392323, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 30.2, "completions/mean_length": 252.14375, "completions/mean_terminated_length": 30.2, "completions/min_length": 132.6, "completions/min_terminated_length": 30.2, "epoch": 0.7601173020527859, "frac_reward_zero_std": 0.025, "grad_norm": 2.21875, "kl": 1.4038491782092024, "learning_rate": 6.063142170376238e-06, "loss": 0.0486, "num_tokens": 35516916.0, "reward": 8.923083782196045, "reward_std": 10.14984426498413, "rewards/wrapper/mean": 4.461541792750358, "rewards/wrapper/std": 11.562783433496952, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 12.6, "completions/mean_length": 253.19375, "completions/mean_terminated_length": 12.6, "completions/min_length": 166.2, "completions/min_terminated_length": 12.6, "epoch": 0.7612903225806451, "frac_reward_zero_std": 0.025, "grad_norm": 2.59375, "kl": 0.007983066863380373, "learning_rate": 6.058049090421904e-06, "loss": -0.0074, "num_tokens": 35568979.0, "reward": 15.244392204284669, "reward_std": 18.322880840301515, "rewards/wrapper/mean": 7.622196093201637, "rewards/wrapper/std": 19.97906378209591, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 26.2, "completions/mean_length": 252.025, "completions/mean_terminated_length": 19.7, "completions/min_length": 166.8, "completions/min_terminated_length": 13.2, "epoch": 0.7624633431085044, "frac_reward_zero_std": 0.0, "grad_norm": 9.625, "kl": 0.008099046605639159, "learning_rate": 6.052952209722434e-06, "loss": -0.012, "num_tokens": 35622339.0, "reward": 14.022188472747803, "reward_std": 12.63869276046753, "rewards/wrapper/mean": 7.011093850433826, "rewards/wrapper/std": 20.01946667730808, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 97.8, "completions/mean_length": 251.06875, "completions/mean_terminated_length": 62.13333435058594, "completions/min_length": 122.0, "completions/min_terminated_length": 19.6, "epoch": 0.7636363636363637, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.3200004163081758, "learning_rate": 6.047851545729257e-06, "loss": -0.0037, "num_tokens": 35675784.0, "reward": 11.79908652305603, "reward_std": 15.175419282913207, "rewards/wrapper/mean": 5.899543111026287, "rewards/wrapper/std": 17.951800467073916, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.7648093841642228, "frac_reward_zero_std": 0.025, "grad_norm": 3.859375, "kl": 0.015531188854947687, "learning_rate": 6.042747115906762e-06, "loss": -0.0132, "num_tokens": 35730971.0, "reward": 16.25852451324463, "reward_std": 20.682630634307863, "rewards/wrapper/mean": 8.12926201224327, "rewards/wrapper/std": 20.433372582495213, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 249.73125, "completions/mean_terminated_length": 39.4, "completions/min_length": 134.0, "completions/min_terminated_length": 31.6, "epoch": 0.7659824046920821, "frac_reward_zero_std": 0.0375, "grad_norm": 1.0390625, "kl": 0.009887203492689877, "learning_rate": 6.037638937732224e-06, "loss": -0.0043, "num_tokens": 35783812.0, "reward": 10.08267617225647, "reward_std": 13.750806903839111, "rewards/wrapper/mean": 5.041338118910789, "rewards/wrapper/std": 13.372113381326198, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.7671554252199414, "frac_reward_zero_std": 0.0125, "grad_norm": 1.796875, "kl": 0.01692812864203006, "learning_rate": 6.0325270286957576e-06, "loss": 0.0007, "num_tokens": 35836060.0, "reward": 6.83661425113678, "reward_std": 9.127959537506104, "rewards/wrapper/mean": 3.4183069966733455, "rewards/wrapper/std": 13.14266570582986, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 43.6, "completions/mean_length": 252.5625, "completions/mean_terminated_length": 43.6, "completions/min_length": 146.0, "completions/min_terminated_length": 43.6, "epoch": 0.7683284457478006, "frac_reward_zero_std": 0.025, "grad_norm": 4.03125, "kl": 0.21960210915422068, "learning_rate": 6.027411406300248e-06, "loss": 0.0001, "num_tokens": 35891990.0, "reward": 12.126427268981933, "reward_std": 16.43520584106445, "rewards/wrapper/mean": 6.063213557004929, "rewards/wrapper/std": 18.251873682439328, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 10.8, "completions/mean_length": 251.54375, "completions/mean_terminated_length": 9.1, "completions/min_length": 161.0, "completions/min_terminated_length": 7.4, "epoch": 0.7695014662756599, "frac_reward_zero_std": 0.025, "grad_norm": 3.640625, "kl": 0.013229875633260234, "learning_rate": 6.022292088061295e-06, "loss": -0.0091, "num_tokens": 35944685.0, "reward": 10.55277452468872, "reward_std": 12.219483661651612, "rewards/wrapper/mean": 5.276387079060077, "rewards/wrapper/std": 15.1108578145504, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 251.075, "completions/mean_terminated_length": 46.9, "completions/min_length": 149.2, "completions/min_terminated_length": 46.8, "epoch": 0.770674486803519, "frac_reward_zero_std": 0.025, "grad_norm": 1.6328125, "kl": 0.012836833647452296, "learning_rate": 6.0171690915071554e-06, "loss": -0.0138, "num_tokens": 36000061.0, "reward": 13.690535640716552, "reward_std": 13.481866836547852, "rewards/wrapper/mean": 6.84526747316122, "rewards/wrapper/std": 17.245069521665574, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 43.6, "completions/mean_length": 254.16875, "completions/mean_terminated_length": 21.9, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7718475073313783, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.009838975500315428, "learning_rate": 6.01204243417868e-06, "loss": -0.0024, "num_tokens": 36055320.0, "reward": 11.147965097427369, "reward_std": 14.937340641021729, "rewards/wrapper/mean": 5.573982398211956, "rewards/wrapper/std": 14.91929092258215, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 26.8, "completions/mean_length": 252.04375, "completions/mean_terminated_length": 13.6, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.7730205278592376, "frac_reward_zero_std": 0.025, "grad_norm": 2.640625, "kl": 0.008335669967345893, "learning_rate": 6.0069121336292505e-06, "loss": 0.0033, "num_tokens": 36109309.0, "reward": 14.027770709991454, "reward_std": 14.90116481781006, "rewards/wrapper/mean": 7.0138855308294294, "rewards/wrapper/std": 21.48350759744644, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 8.4, "completions/min_length": 154.4, "completions/min_terminated_length": 0.8, "epoch": 0.7741935483870968, "frac_reward_zero_std": 0.0, "grad_norm": 4.78125, "kl": 0.007277981494553387, "learning_rate": 6.001778207424726e-06, "loss": -0.0179, "num_tokens": 36161223.0, "reward": 7.435875916481018, "reward_std": 9.074062395095826, "rewards/wrapper/mean": 3.717937920242548, "rewards/wrapper/std": 13.181463140249253, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 71.2, "completions/mean_length": 248.64375, "completions/mean_terminated_length": 51.4, "completions/min_length": 82.8, "completions/min_terminated_length": 31.6, "epoch": 0.775366568914956, "frac_reward_zero_std": 0.025, "grad_norm": 2.96875, "kl": 0.01733542055590078, "learning_rate": 5.996640673143379e-06, "loss": -0.0158, "num_tokens": 36214038.0, "reward": 11.63428020477295, "reward_std": 13.726638221740723, "rewards/wrapper/mean": 5.81714008525014, "rewards/wrapper/std": 17.69838539212942, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7765395894428152, "frac_reward_zero_std": 0.0125, "grad_norm": 12.9375, "kl": 0.12717605533543974, "learning_rate": 5.991499548375836e-06, "loss": -0.0012, "num_tokens": 36271697.0, "reward": 10.735355949401855, "reward_std": 14.319485092163086, "rewards/wrapper/mean": 5.367678099870682, "rewards/wrapper/std": 15.712982186675072, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 69.6, "completions/mean_length": 250.18125, "completions/mean_terminated_length": 53.0, "completions/min_length": 87.6, "completions/min_terminated_length": 36.4, "epoch": 0.7777126099706745, "frac_reward_zero_std": 0.0375, "grad_norm": 2.921875, "kl": 0.1200019514246378, "learning_rate": 5.986354850725015e-06, "loss": 0.0005, "num_tokens": 36326110.0, "reward": 15.179692125320434, "reward_std": 20.558689689636232, "rewards/wrapper/mean": 7.589845579862595, "rewards/wrapper/std": 19.174075277149676, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 75.6, "completions/mean_length": 251.975, "completions/mean_terminated_length": 45.6, "completions/min_length": 184.2, "completions/min_terminated_length": 30.6, "epoch": 0.7788856304985338, "frac_reward_zero_std": 0.0125, "grad_norm": 3.625, "kl": 0.012654828454833478, "learning_rate": 5.98120659780607e-06, "loss": -0.0074, "num_tokens": 36381048.0, "reward": 20.327030181884766, "reward_std": 27.906171798706055, "rewards/wrapper/mean": 10.16351497322321, "rewards/wrapper/std": 27.119664253294467, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 9.4, "completions/mean_length": 249.9, "completions/mean_terminated_length": 6.0, "completions/min_length": 105.0, "completions/min_terminated_length": 2.6, "epoch": 0.7800586510263929, "frac_reward_zero_std": 0.025, "grad_norm": 2.984375, "kl": 0.007662113837432117, "learning_rate": 5.976054807246328e-06, "loss": -0.0185, "num_tokens": 36433154.0, "reward": 9.926510620117188, "reward_std": 10.952439212799073, "rewards/wrapper/mean": 4.963255329430103, "rewards/wrapper/std": 13.317852970957755, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 29.8, "completions/mean_length": 253.73125, "completions/mean_terminated_length": 29.8, "completions/min_length": 183.4, "completions/min_terminated_length": 29.8, "epoch": 0.7812316715542522, "frac_reward_zero_std": 0.025, "grad_norm": 2.8125, "kl": 0.007791096775326878, "learning_rate": 5.970899496685225e-06, "loss": -0.0042, "num_tokens": 36487157.0, "reward": 18.77137498855591, "reward_std": 24.687060356140137, "rewards/wrapper/mean": 9.385687156021595, "rewards/wrapper/std": 25.24630133062601, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7824046920821114, "frac_reward_zero_std": 0.0125, "grad_norm": 4.3125, "kl": 0.10288106517400593, "learning_rate": 5.965740683774254e-06, "loss": -0.0034, "num_tokens": 36544046.0, "reward": 7.03903284072876, "reward_std": 9.311002731323242, "rewards/wrapper/mean": 3.519516411423683, "rewards/wrapper/std": 11.18458695858717, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.7835777126099707, "frac_reward_zero_std": 0.025, "grad_norm": 3.015625, "kl": 0.036584659735672174, "learning_rate": 5.960578386176898e-06, "loss": 0.0015, "num_tokens": 36599552.0, "reward": 14.876870918273926, "reward_std": 19.170709991455077, "rewards/wrapper/mean": 7.438435123860836, "rewards/wrapper/std": 18.200231629610062, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 22.2, "completions/mean_length": 253.49375, "completions/mean_terminated_length": 22.2, "completions/min_length": 175.8, "completions/min_terminated_length": 22.2, "epoch": 0.78475073313783, "frac_reward_zero_std": 0.025, "grad_norm": 2.859375, "kl": 0.025927690404932945, "learning_rate": 5.955412621568571e-06, "loss": -0.0057, "num_tokens": 36653269.0, "reward": 8.59150498509407, "reward_std": 11.231444156169891, "rewards/wrapper/mean": 4.2957524582743645, "rewards/wrapper/std": 13.615456952154636, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.7859237536656891, "frac_reward_zero_std": 0.0, "grad_norm": 4.125, "kl": 0.007560189696960151, "learning_rate": 5.950243407636558e-06, "loss": 0.0003, "num_tokens": 36706603.0, "reward": 8.770905303955079, "reward_std": 11.600796127319336, "rewards/wrapper/mean": 4.385452452301979, "rewards/wrapper/std": 13.816292996704579, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 251.275, "completions/mean_terminated_length": 20.960000610351564, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.7870967741935484, "frac_reward_zero_std": 0.025, "grad_norm": 4.34375, "kl": 0.00841328235110268, "learning_rate": 5.945070762079953e-06, "loss": -0.0035, "num_tokens": 36760247.0, "reward": 10.318192052841187, "reward_std": 13.998966562747956, "rewards/wrapper/mean": 5.159096036851406, "rewards/wrapper/std": 14.412682954967021, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 9.6, "completions/mean_length": 254.7, "completions/mean_terminated_length": 9.6, "completions/min_length": 214.4, "completions/min_terminated_length": 9.6, "epoch": 0.7882697947214077, "frac_reward_zero_std": 0.0375, "grad_norm": 3.046875, "kl": 0.01900848246878013, "learning_rate": 5.939894702609604e-06, "loss": -0.0022, "num_tokens": 36817131.0, "reward": 8.248554515838624, "reward_std": 11.029405975341797, "rewards/wrapper/mean": 4.124277160316706, "rewards/wrapper/std": 11.294475804269315, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 70.8, "completions/mean_length": 251.8125, "completions/mean_terminated_length": 70.8, "completions/min_length": 122.0, "completions/min_terminated_length": 70.8, "epoch": 0.7894428152492668, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.00594492913223803, "learning_rate": 5.934715246948042e-06, "loss": -0.0115, "num_tokens": 36870919.0, "reward": 14.39435272216797, "reward_std": 17.458001327514648, "rewards/wrapper/mean": 7.197176413238049, "rewards/wrapper/std": 20.577864629030227, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 20.8, "completions/mean_length": 250.25, "completions/mean_terminated_length": 20.8, "completions/min_length": 72.0, "completions/min_terminated_length": 20.8, "epoch": 0.7906158357771261, "frac_reward_zero_std": 0.025, "grad_norm": 5.90625, "kl": 0.009505171538330614, "learning_rate": 5.929532412829432e-06, "loss": -0.0091, "num_tokens": 36928641.0, "reward": 8.956029534339905, "reward_std": 9.786428260803223, "rewards/wrapper/mean": 4.4780147187411785, "rewards/wrapper/std": 14.131121622025967, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 23.6, "completions/mean_length": 251.9375, "completions/mean_terminated_length": 23.6, "completions/min_length": 126.0, "completions/min_terminated_length": 23.6, "epoch": 0.7917888563049853, "frac_reward_zero_std": 0.0125, "grad_norm": 3.609375, "kl": 0.007368017232511193, "learning_rate": 5.924346217999501e-06, "loss": -0.0148, "num_tokens": 36983527.0, "reward": 13.003273391723633, "reward_std": 17.144739818572997, "rewards/wrapper/mean": 6.50163644105196, "rewards/wrapper/std": 18.4751975864172, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9625, "completions/max_length": 256.0, "completions/max_terminated_length": 15.6, "completions/mean_length": 246.90625, "completions/mean_terminated_length": 10.95, "completions/min_length": 111.8, "completions/min_terminated_length": 9.4, "epoch": 0.7929618768328446, "frac_reward_zero_std": 0.025, "grad_norm": 1.46875, "kl": 0.0075474835990462456, "learning_rate": 5.919156680215489e-06, "loss": -0.0109, "num_tokens": 37036544.0, "reward": 8.331317377090453, "reward_std": 9.312832927703857, "rewards/wrapper/mean": 4.165658417344093, "rewards/wrapper/std": 11.342847776412963, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.4, "completions/mean_length": 252.84375, "completions/mean_terminated_length": 1.4, "completions/min_length": 155.0, "completions/min_terminated_length": 1.4, "epoch": 0.7941348973607039, "frac_reward_zero_std": 0.0125, "grad_norm": 2.296875, "kl": 0.01361710149794817, "learning_rate": 5.913963817246078e-06, "loss": -0.0069, "num_tokens": 37094437.0, "reward": 7.812090110778809, "reward_std": 10.394651508331298, "rewards/wrapper/mean": 3.906045150756836, "rewards/wrapper/std": 12.286448706686496, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 28.2, "completions/mean_length": 253.68125, "completions/mean_terminated_length": 28.2, "completions/min_length": 181.8, "completions/min_terminated_length": 28.2, "epoch": 0.795307917888563, "frac_reward_zero_std": 0.0125, "grad_norm": 2.109375, "kl": 0.08241370252799243, "learning_rate": 5.908767646871337e-06, "loss": -0.0068, "num_tokens": 37148160.0, "reward": 8.166846323013306, "reward_std": 10.215306043624878, "rewards/wrapper/mean": 4.083422873914242, "rewards/wrapper/std": 12.841757401823997, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 24.0, "completions/mean_length": 250.35, "completions/mean_terminated_length": 24.0, "completions/min_length": 75.2, "completions/min_terminated_length": 24.0, "epoch": 0.7964809384164223, "frac_reward_zero_std": 0.0375, "grad_norm": 1.4140625, "kl": 0.00970684234634973, "learning_rate": 5.903568186882657e-06, "loss": 0.0066, "num_tokens": 37203162.0, "reward": 5.611361646652222, "reward_std": 6.9950720310211185, "rewards/wrapper/mean": 2.805680803209543, "rewards/wrapper/std": 8.580831627547742, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.7976539589442815, "frac_reward_zero_std": 0.0375, "grad_norm": 1.703125, "kl": 0.009019218035973608, "learning_rate": 5.898365455082694e-06, "loss": -0.0022, "num_tokens": 37259433.0, "reward": 15.188376426696777, "reward_std": 19.650272560119628, "rewards/wrapper/mean": 7.594188006222248, "rewards/wrapper/std": 20.71901829689741, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 9.6, "completions/mean_length": 254.7, "completions/mean_terminated_length": 9.6, "completions/min_length": 214.4, "completions/min_terminated_length": 9.6, "epoch": 0.7988269794721408, "frac_reward_zero_std": 0.0, "grad_norm": 5.0625, "kl": 0.4169622597051784, "learning_rate": 5.8931594692853095e-06, "loss": 0.0127, "num_tokens": 37313919.0, "reward": 8.489449882507325, "reward_std": 9.544210720062257, "rewards/wrapper/mean": 4.244724971055985, "rewards/wrapper/std": 11.45752448141575, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 21.2, "completions/mean_length": 251.8625, "completions/mean_terminated_length": 21.2, "completions/min_length": 123.6, "completions/min_terminated_length": 21.2, "epoch": 0.8, "frac_reward_zero_std": 0.0375, "grad_norm": 3.390625, "kl": 0.009147522901184858, "learning_rate": 5.887950247315501e-06, "loss": -0.0074, "num_tokens": 37368831.0, "reward": 9.47968680858612, "reward_std": 11.7788712143898, "rewards/wrapper/mean": 4.739843459427357, "rewards/wrapper/std": 12.508141206204892, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 82.6, "completions/mean_length": 253.78125, "completions/mean_terminated_length": 82.6, "completions/min_length": 185.0, "completions/min_terminated_length": 82.6, "epoch": 0.8011730205278592, "frac_reward_zero_std": 0.025, "grad_norm": 2.734375, "kl": 0.008823846664745361, "learning_rate": 5.88273780700935e-06, "loss": -0.0047, "num_tokens": 37423584.0, "reward": 12.176618194580078, "reward_std": 15.400644683837891, "rewards/wrapper/mean": 6.088309331983328, "rewards/wrapper/std": 16.16872684061527, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 256.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 256.0, "completions/min_terminated_length": 0.0, "epoch": 0.8023460410557185, "frac_reward_zero_std": 0.0375, "grad_norm": 8.1875, "kl": 0.012897691410034894, "learning_rate": 5.8775221662139565e-06, "loss": 0.0005, "num_tokens": 37481854.0, "reward": 13.41852263212204, "reward_std": 14.628087675571441, "rewards/wrapper/mean": 6.709261292219162, "rewards/wrapper/std": 21.448935608565808, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.99375, "completions/max_length": 256.0, "completions/max_terminated_length": 0.2, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 0.2, "completions/min_length": 205.0, "completions/min_terminated_length": 0.2, "epoch": 0.8035190615835777, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.008465129404794425, "learning_rate": 5.87230334278738e-06, "loss": -0.0059, "num_tokens": 37537499.0, "reward": 12.369258069992066, "reward_std": 10.207852268218994, "rewards/wrapper/mean": 6.184629016369581, "rewards/wrapper/std": 15.72431526631117, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 45.6, "completions/mean_length": 252.625, "completions/mean_terminated_length": 45.6, "completions/min_length": 148.0, "completions/min_terminated_length": 45.6, "epoch": 0.804692082111437, "frac_reward_zero_std": 0.025, "grad_norm": 3.796875, "kl": 0.010761903249658644, "learning_rate": 5.867081354598574e-06, "loss": -0.0016, "num_tokens": 37593419.0, "reward": 8.310170364379882, "reward_std": 11.01772403717041, "rewards/wrapper/mean": 4.155085320025682, "rewards/wrapper/std": 13.078730754554272, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 47.2, "completions/mean_length": 251.4875, "completions/mean_terminated_length": 47.0, "completions/min_length": 149.2, "completions/min_terminated_length": 46.8, "epoch": 0.8058651026392962, "frac_reward_zero_std": 0.025, "grad_norm": 3.390625, "kl": 0.00788930271519348, "learning_rate": 5.861856219527331e-06, "loss": -0.0151, "num_tokens": 37646569.0, "reward": 7.208945274353027, "reward_std": 9.432900524139404, "rewards/wrapper/mean": 3.6044726580381394, "rewards/wrapper/std": 11.457335326075555, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 33.4, "completions/mean_length": 250.9875, "completions/mean_terminated_length": 22.4, "completions/min_length": 113.8, "completions/min_terminated_length": 11.4, "epoch": 0.8070381231671554, "frac_reward_zero_std": 0.0, "grad_norm": 6651904.0, "kl": 507.7494637601543, "learning_rate": 5.856627955464216e-06, "loss": 20.3053, "num_tokens": 37702253.0, "reward": 11.832038593292236, "reward_std": 15.661580467224121, "rewards/wrapper/mean": 5.916019389778375, "rewards/wrapper/std": 17.451789928972723, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 42.2, "completions/mean_length": 253.0875, "completions/mean_terminated_length": 39.4, "completions/min_length": 190.2, "completions/min_terminated_length": 36.6, "epoch": 0.8082111436950147, "frac_reward_zero_std": 0.0125, "grad_norm": 4.1875, "kl": 0.014341074018739164, "learning_rate": 5.851396580310511e-06, "loss": -0.0004, "num_tokens": 37756311.0, "reward": 9.03423089981079, "reward_std": 12.168891334533692, "rewards/wrapper/mean": 4.517115272581577, "rewards/wrapper/std": 13.540437084436416, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 62.4, "completions/mean_length": 253.15, "completions/mean_terminated_length": 62.4, "completions/min_length": 164.8, "completions/min_terminated_length": 62.4, "epoch": 0.8093841642228738, "frac_reward_zero_std": 0.025, "grad_norm": 3.5, "kl": 0.010428384994156659, "learning_rate": 5.846162111978145e-06, "loss": -0.0056, "num_tokens": 37814377.0, "reward": 5.7462818145751955, "reward_std": 6.842691504955292, "rewards/wrapper/mean": 2.8731407657265664, "rewards/wrapper/std": 9.179096294939518, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 30.6, "completions/mean_length": 252.1625, "completions/mean_terminated_length": 15.5, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.8105571847507331, "frac_reward_zero_std": 0.025, "grad_norm": 1.9296875, "kl": 0.010554290283471346, "learning_rate": 5.8409245683896415e-06, "loss": -0.0107, "num_tokens": 37871469.0, "reward": 8.885381197929382, "reward_std": 11.384269547462463, "rewards/wrapper/mean": 4.4426903769373896, "rewards/wrapper/std": 12.253298838436603, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 48.8, "completions/mean_length": 252.725, "completions/mean_terminated_length": 48.8, "completions/min_length": 151.2, "completions/min_terminated_length": 48.8, "epoch": 0.8117302052785924, "frac_reward_zero_std": 0.0375, "grad_norm": 1.234375, "kl": 0.009817326813936234, "learning_rate": 5.835683967478055e-06, "loss": -0.0082, "num_tokens": 37926255.0, "reward": 10.214480829238891, "reward_std": 11.895071125030517, "rewards/wrapper/mean": 5.107240244746208, "rewards/wrapper/std": 17.879739120602608, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 253.29375, "completions/mean_terminated_length": 67.0, "completions/min_length": 169.4, "completions/min_terminated_length": 67.0, "epoch": 0.8129032258064516, "frac_reward_zero_std": 0.025, "grad_norm": 2.734375, "kl": 0.411629791429732, "learning_rate": 5.830440327186903e-06, "loss": 0.0107, "num_tokens": 37983936.0, "reward": 8.380476808547973, "reward_std": 11.220604085922242, "rewards/wrapper/mean": 4.190238507837057, "rewards/wrapper/std": 13.17393459379673, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 53.4, "completions/mean_length": 249.68125, "completions/mean_terminated_length": 51.93333339691162, "completions/min_length": 153.6, "completions/min_terminated_length": 51.2, "epoch": 0.8140762463343109, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.01218703945633024, "learning_rate": 5.825193665470114e-06, "loss": -0.0192, "num_tokens": 38035703.0, "reward": 9.604938888549805, "reward_std": 12.615773582458496, "rewards/wrapper/mean": 4.802469128370285, "rewards/wrapper/std": 14.279499669373035, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.98125, "completions/max_length": 256.0, "completions/max_terminated_length": 0.6, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 0.6, "completions/min_length": 103.0, "completions/min_terminated_length": 0.6, "epoch": 0.8152492668621701, "frac_reward_zero_std": 0.025, "grad_norm": 1.3203125, "kl": 0.00477030526380986, "learning_rate": 5.819944000291961e-06, "loss": -0.0143, "num_tokens": 38089328.0, "reward": 10.100382041931152, "reward_std": 11.479272651672364, "rewards/wrapper/mean": 5.050190765410662, "rewards/wrapper/std": 14.452794567495584, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.96875, "completions/max_length": 256.0, "completions/max_terminated_length": 1.2, "completions/mean_length": 248.04375, "completions/mean_terminated_length": 1.1, "completions/min_length": 52.2, "completions/min_terminated_length": 1.0, "epoch": 0.8164222873900293, "frac_reward_zero_std": 0.0375, "grad_norm": 12.6875, "kl": 0.010600641707424075, "learning_rate": 5.814691349626997e-06, "loss": -0.024, "num_tokens": 38142733.0, "reward": 9.5110595703125, "reward_std": 11.635570430755616, "rewards/wrapper/mean": 4.755529856681823, "rewards/wrapper/std": 13.565742841362953, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 38.6, "completions/mean_length": 250.8125, "completions/mean_terminated_length": 38.6, "completions/min_length": 141.0, "completions/min_terminated_length": 38.6, "epoch": 0.8175953079178886, "frac_reward_zero_std": 0.025, "grad_norm": 3.515625, "kl": 0.008213794010225683, "learning_rate": 5.809435731460002e-06, "loss": -0.0053, "num_tokens": 38197519.0, "reward": 8.368210363388062, "reward_std": 11.415078401565552, "rewards/wrapper/mean": 4.1841050907969475, "rewards/wrapper/std": 12.334974782168866, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9875, "completions/max_length": 256.0, "completions/max_terminated_length": 0.4, "completions/mean_length": 252.8125, "completions/mean_terminated_length": 0.4, "completions/min_length": 154.0, "completions/min_terminated_length": 0.4, "epoch": 0.8187683284457478, "frac_reward_zero_std": 0.0125, "grad_norm": 1.2734375, "kl": 0.007487597275758162, "learning_rate": 5.804177163785915e-06, "loss": 0.0116, "num_tokens": 38251175.0, "reward": 6.888301229476928, "reward_std": 7.135924625396728, "rewards/wrapper/mean": 3.4441506803035735, "rewards/wrapper/std": 10.814507488906383, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.975, "completions/max_length": 256.0, "completions/max_terminated_length": 3.2, "completions/mean_length": 249.7, "completions/mean_terminated_length": 3.2, "completions/min_length": 54.4, "completions/min_terminated_length": 3.2, "epoch": 0.819941348973607, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609375, "kl": 0.01530810545082204, "learning_rate": 5.798915664609773e-06, "loss": -0.018, "num_tokens": 38303053.0, "reward": 10.540728664398193, "reward_std": 12.398472499847411, "rewards/wrapper/mean": 5.2703643180429935, "rewards/wrapper/std": 13.671444369852543, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.95, "completions/max_length": 256.0, "completions/max_terminated_length": 27.8, "completions/mean_length": 244.19375, "completions/mean_terminated_length": 13.6, "completions/min_length": 52.0, "completions/min_terminated_length": 0.8, "epoch": 0.8211143695014663, "frac_reward_zero_std": 0.0125, "grad_norm": 2.21875, "kl": 0.007007872418034822, "learning_rate": 5.7936512519466495e-06, "loss": -0.0243, "num_tokens": 38354696.0, "reward": 12.547660064697265, "reward_std": 15.266628074645997, "rewards/wrapper/mean": 6.273830074816942, "rewards/wrapper/std": 18.49634841531515, "step": 3500 }, { "epoch": 0.8211143695014663, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.7825, "eval_completions/max_length": 256.0, "eval_completions/max_terminated_length": 75.64, "eval_completions/mean_length": 224.9175, "eval_completions/mean_terminated_length": 61.223333587646486, "eval_completions/min_length": 155.16, "eval_completions/min_terminated_length": 45.08, "eval_frac_reward_zero_std": 0.005, "eval_kl": 0.013251524628140032, "eval_loss": -0.06226690113544464, "eval_num_tokens": 38354696.0, "eval_reward": 0.31256932340562343, "eval_reward_std": 0.15810628833714874, "eval_rewards/wrapper/mean": 0.15628466337919236, "eval_rewards/wrapper/std": 0.14292447288986296, "eval_runtime": 208.8173, "eval_samples_per_second": 0.958, "eval_steps_per_second": 0.239, "step": 3500 } ], "logging_steps": 5, "max_steps": 8524, "num_input_tokens_seen": 38354696, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }