{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8403361344537815, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2947.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3156.0, "completions/max_terminated_length": 3156.0, "completions/mean_length": 2947.5, "completions/mean_terminated_length": 2947.5, "completions/min_length": 2764.0, "completions/min_terminated_length": 2764.0, "epoch": 0.0016806722689075631, "frac_reward_zero_std": 0.0, "grad_norm": 0.22864551842212677, "kl": 0.41864094138145447, "learning_rate": 0.0, "loss": 0.0004, "num_tokens": 22462.0, "reward": 8.499990463256836, "reward_std": 0.3535599112510681, "rewards/reward_model/mean": 8.499990463256836, "rewards/reward_model/std": 0.3535599112510681, "step": 1 }, { "completion_length": 3534.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4533.0, "completions/max_terminated_length": 4533.0, "completions/mean_length": 3534.0, "completions/mean_terminated_length": 3534.0, "completions/min_length": 2558.0, "completions/min_terminated_length": 2558.0, "epoch": 0.0033613445378151263, "frac_reward_zero_std": 0.0, "grad_norm": 0.24098172783851624, "kl": 0.3827270567417145, "learning_rate": 5.0000000000000004e-08, "loss": 0.0004, "num_tokens": 47146.0, "reward": 7.874920845031738, "reward_std": 0.5204211473464966, "rewards/reward_model/mean": 7.874920845031738, "rewards/reward_model/std": 0.5204212665557861, "step": 2 }, { "completion_length": 2755.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3132.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 2755.75, "completions/mean_terminated_length": 2755.75, "completions/min_length": 2379.0, "completions/min_terminated_length": 2379.0, "epoch": 0.005042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.2558765113353729, "kl": 0.8935418128967285, "learning_rate": 1.0000000000000001e-07, "loss": 0.0009, "num_tokens": 66937.0, "reward": 9.562366485595703, "reward_std": 0.5153685212135315, "rewards/reward_model/mean": 9.562366485595703, "rewards/reward_model/std": 0.5153685212135315, "step": 3 }, { "completion_length": 3009.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3186.0, "completions/max_terminated_length": 3186.0, "completions/mean_length": 3009.0, "completions/mean_terminated_length": 3009.0, "completions/min_length": 2896.0, "completions/min_terminated_length": 2896.0, "epoch": 0.0067226890756302525, "frac_reward_zero_std": 0.0, "grad_norm": 0.23344005644321442, "kl": 0.4426734447479248, "learning_rate": 1.5000000000000002e-07, "loss": 0.0004, "num_tokens": 88921.0, "reward": 9.624762535095215, "reward_std": 0.43301600217819214, "rewards/reward_model/mean": 9.624762535095215, "rewards/reward_model/std": 0.43301627039909363, "step": 4 }, { "completion_length": 883.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 883.0, "completions/mean_terminated_length": 883.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.008403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 0.6023623943328857, "kl": 0.8841147422790527, "learning_rate": 2.0000000000000002e-07, "loss": 0.0009, "num_tokens": 102045.0, "reward": 6.124550819396973, "reward_std": 1.6142183542251587, "rewards/reward_model/mean": 6.124550819396973, "rewards/reward_model/std": 1.6142184734344482, "step": 5 }, { "completion_length": 3231.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4057.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 3231.0, "completions/mean_terminated_length": 3231.0, "completions/min_length": 2769.0, "completions/min_terminated_length": 2769.0, "epoch": 0.010084033613445379, "frac_reward_zero_std": 0.0, "grad_norm": 0.25101199746131897, "kl": 0.42379599809646606, "learning_rate": 2.5000000000000004e-07, "loss": 0.0004, "num_tokens": 124149.0, "reward": 8.374448776245117, "reward_std": 0.6291409134864807, "rewards/reward_model/mean": 8.374448776245117, "rewards/reward_model/std": 0.6291410326957703, "step": 6 }, { "completion_length": 2049.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 2049.25, "completions/mean_terminated_length": 2049.25, "completions/min_length": 1558.0, "completions/min_terminated_length": 1558.0, "epoch": 0.011764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.34592828154563904, "kl": 0.3436116874217987, "learning_rate": 3.0000000000000004e-07, "loss": 0.0003, "num_tokens": 143242.0, "reward": 7.25, "reward_std": 3.4034295082092285, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 3.4034297466278076, "step": 7 }, { "completion_length": 3440.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3849.0, "completions/max_terminated_length": 3849.0, "completions/mean_length": 3440.5, "completions/mean_terminated_length": 3440.5, "completions/min_length": 2937.0, "completions/min_terminated_length": 2937.0, "epoch": 0.013445378151260505, "frac_reward_zero_std": 0.0, "grad_norm": 0.231065571308136, "kl": 0.4249846041202545, "learning_rate": 3.5000000000000004e-07, "loss": 0.0004, "num_tokens": 167764.0, "reward": 6.87348747253418, "reward_std": 3.0664312839508057, "rewards/reward_model/mean": 6.87348747253418, "rewards/reward_model/std": 3.0664312839508057, "step": 8 }, { "completion_length": 1771.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 1771.0, "completions/mean_terminated_length": 1771.0, "completions/min_length": 1367.0, "completions/min_terminated_length": 1367.0, "epoch": 0.015126050420168067, "frac_reward_zero_std": 0.0, "grad_norm": 0.41485831141471863, "kl": 0.40487462282180786, "learning_rate": 4.0000000000000003e-07, "loss": 0.0004, "num_tokens": 183464.0, "reward": 6.125, "reward_std": 1.973786473274231, "rewards/reward_model/mean": 6.125, "rewards/reward_model/std": 1.9737865924835205, "step": 9 }, { "completion_length": 3359.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3600.0, "completions/max_terminated_length": 3600.0, "completions/mean_length": 3359.5, "completions/mean_terminated_length": 3359.5, "completions/min_length": 3058.0, "completions/min_terminated_length": 3058.0, "epoch": 0.01680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.2300678789615631, "kl": 0.40493836998939514, "learning_rate": 4.5000000000000003e-07, "loss": 0.0004, "num_tokens": 206786.0, "reward": 8.87358283996582, "reward_std": 0.14423945546150208, "rewards/reward_model/mean": 8.87358283996582, "rewards/reward_model/std": 0.14423947036266327, "step": 10 }, { "completion_length": 1176.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 1176.0, "completions/mean_terminated_length": 1176.0, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.018487394957983194, "frac_reward_zero_std": 0.0, "grad_norm": 0.5758852958679199, "kl": 0.7856903076171875, "learning_rate": 5.000000000000001e-07, "loss": 0.0008, "num_tokens": 220790.0, "reward": 7.059416770935059, "reward_std": 1.1248167753219604, "rewards/reward_model/mean": 7.059416770935059, "rewards/reward_model/std": 1.12481689453125, "step": 11 }, { "completion_length": 197.25, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.020168067226890758, "frac_reward_zero_std": 0.0, "grad_norm": 10.455142974853516, "kl": 1.7632454633712769, "learning_rate": 5.5e-07, "loss": 0.0018, "num_tokens": 229683.0, "reward": 6.749574661254883, "reward_std": 3.7753303050994873, "rewards/reward_model/mean": 6.749574661254883, "rewards/reward_model/std": 3.7753303050994873, "step": 12 }, { "completion_length": 3293.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3623.0, "completions/max_terminated_length": 3623.0, "completions/mean_length": 3293.5, "completions/mean_terminated_length": 3293.5, "completions/min_length": 2867.0, "completions/min_terminated_length": 2867.0, "epoch": 0.021848739495798318, "frac_reward_zero_std": 0.0, "grad_norm": 0.19915665686130524, "kl": 0.462253212928772, "learning_rate": 6.000000000000001e-07, "loss": 0.0005, "num_tokens": 251365.0, "reward": 7.060825347900391, "reward_std": 3.0564966201782227, "rewards/reward_model/mean": 7.060825347900391, "rewards/reward_model/std": 3.0564968585968018, "step": 13 }, { "completion_length": 3526.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3798.0, "completions/max_terminated_length": 3798.0, "completions/mean_length": 3526.5, "completions/mean_terminated_length": 3526.5, "completions/min_length": 3032.0, "completions/min_terminated_length": 3032.0, "epoch": 0.023529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.23769311606884003, "kl": 0.39918115735054016, "learning_rate": 6.5e-07, "loss": 0.0004, "num_tokens": 275555.0, "reward": 7.184149742126465, "reward_std": 1.26445472240448, "rewards/reward_model/mean": 7.184149742126465, "rewards/reward_model/std": 1.26445472240448, "step": 14 }, { "completion_length": 3020.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3207.0, "completions/max_terminated_length": 3207.0, "completions/mean_length": 3020.25, "completions/mean_terminated_length": 3020.25, "completions/min_length": 2788.0, "completions/min_terminated_length": 2788.0, "epoch": 0.025210084033613446, "frac_reward_zero_std": 0.0, "grad_norm": 0.21623943746089935, "kl": 0.46406587958335876, "learning_rate": 7.000000000000001e-07, "loss": 0.0005, "num_tokens": 297480.0, "reward": 8.49774169921875, "reward_std": 1.0792028903961182, "rewards/reward_model/mean": 8.49774169921875, "rewards/reward_model/std": 1.0792030096054077, "step": 15 }, { "completion_length": 3226.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3620.0, "completions/max_terminated_length": 3620.0, "completions/mean_length": 3226.75, "completions/mean_terminated_length": 3226.75, "completions/min_length": 2546.0, "completions/min_terminated_length": 2546.0, "epoch": 0.02689075630252101, "frac_reward_zero_std": 0.0, "grad_norm": 0.24463801085948944, "kl": 0.49360811710357666, "learning_rate": 7.5e-07, "loss": 0.0005, "num_tokens": 321115.0, "reward": 6.808682441711426, "reward_std": 1.6747063398361206, "rewards/reward_model/mean": 6.808682441711426, "rewards/reward_model/std": 1.6747063398361206, "step": 16 }, { "completion_length": 3048.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 3048.75, "completions/mean_terminated_length": 3048.75, "completions/min_length": 2805.0, "completions/min_terminated_length": 2805.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2717757225036621, "kl": 0.4575347304344177, "learning_rate": 8.000000000000001e-07, "loss": 0.0005, "num_tokens": 344562.0, "reward": 8.246984481811523, "reward_std": 0.8661016821861267, "rewards/reward_model/mean": 8.246984481811523, "rewards/reward_model/std": 0.8661016821861267, "step": 17 }, { "completion_length": 2945.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3917.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 2945.0, "completions/mean_terminated_length": 2945.0, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "epoch": 0.030252100840336135, "frac_reward_zero_std": 0.0, "grad_norm": 0.30351686477661133, "kl": 0.33760935068130493, "learning_rate": 8.500000000000001e-07, "loss": 0.0003, "num_tokens": 365458.0, "reward": 3.12009334564209, "reward_std": 3.325429677963257, "rewards/reward_model/mean": 3.12009334564209, "rewards/reward_model/std": 3.325429677963257, "step": 18 }, { "completion_length": 3071.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3501.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 3071.25, "completions/mean_terminated_length": 3071.25, "completions/min_length": 2716.0, "completions/min_terminated_length": 2716.0, "epoch": 0.031932773109243695, "frac_reward_zero_std": 0.0, "grad_norm": 0.21365933120250702, "kl": 0.44693294167518616, "learning_rate": 9.000000000000001e-07, "loss": 0.0004, "num_tokens": 388303.0, "reward": 7.433669567108154, "reward_std": 2.9736855030059814, "rewards/reward_model/mean": 7.433669567108154, "rewards/reward_model/std": 2.9736855030059814, "step": 19 }, { "completion_length": 3026.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3183.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 3026.0, "completions/mean_terminated_length": 3026.0, "completions/min_length": 2747.0, "completions/min_terminated_length": 2747.0, "epoch": 0.03361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.2290022075176239, "kl": 0.42688068747520447, "learning_rate": 9.500000000000001e-07, "loss": 0.0004, "num_tokens": 408863.0, "reward": 7.873417377471924, "reward_std": 1.7840297222137451, "rewards/reward_model/mean": 7.873417377471924, "rewards/reward_model/std": 1.7840297222137451, "step": 20 }, { "completion_length": 1102.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 1102.25, "completions/mean_terminated_length": 1102.25, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.03529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7857298851013184, "kl": 0.8577286005020142, "learning_rate": 1.0000000000000002e-06, "loss": 0.0009, "num_tokens": 423604.0, "reward": 6.423410415649414, "reward_std": 4.296779632568359, "rewards/reward_model/mean": 6.423410415649414, "rewards/reward_model/std": 4.296780109405518, "step": 21 }, { "completion_length": 3086.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3459.0, "completions/max_terminated_length": 3459.0, "completions/mean_length": 3086.75, "completions/mean_terminated_length": 3086.75, "completions/min_length": 2556.0, "completions/min_terminated_length": 2556.0, "epoch": 0.03697478991596639, "frac_reward_zero_std": 0.0, "grad_norm": 0.2390928417444229, "kl": 0.5277106761932373, "learning_rate": 1.0500000000000001e-06, "loss": 0.0005, "num_tokens": 446255.0, "reward": 7.993368148803711, "reward_std": 0.8412674069404602, "rewards/reward_model/mean": 7.993368148803711, "rewards/reward_model/std": 0.8412673473358154, "step": 22 }, { "completion_length": 1777.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1777.75, "completions/mean_terminated_length": 1777.75, "completions/min_length": 1416.0, "completions/min_terminated_length": 1416.0, "epoch": 0.03865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.3648889660835266, "kl": 0.41789889335632324, "learning_rate": 1.1e-06, "loss": 0.0004, "num_tokens": 462262.0, "reward": 4.125, "reward_std": 1.4930394887924194, "rewards/reward_model/mean": 4.125, "rewards/reward_model/std": 1.4930394887924194, "step": 23 }, { "completion_length": 3006.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3322.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 3006.75, "completions/mean_terminated_length": 3006.75, "completions/min_length": 2838.0, "completions/min_terminated_length": 2838.0, "epoch": 0.040336134453781515, "frac_reward_zero_std": 0.0, "grad_norm": 0.24086372554302216, "kl": 0.45793622732162476, "learning_rate": 1.1500000000000002e-06, "loss": 0.0005, "num_tokens": 483125.0, "reward": 9.804398536682129, "reward_std": 0.12509065866470337, "rewards/reward_model/mean": 9.804398536682129, "rewards/reward_model/std": 0.12509065866470337, "step": 24 }, { "completion_length": 1440.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 1440.25, "completions/mean_terminated_length": 1440.25, "completions/min_length": 1091.0, "completions/min_terminated_length": 1091.0, "epoch": 0.04201680672268908, "frac_reward_zero_std": 0.0, "grad_norm": 0.49449586868286133, "kl": 0.7333781123161316, "learning_rate": 1.2000000000000002e-06, "loss": 0.0007, "num_tokens": 499202.0, "reward": 6.603575229644775, "reward_std": 2.1134190559387207, "rewards/reward_model/mean": 6.603575229644775, "rewards/reward_model/std": 2.1134190559387207, "step": 25 }, { "completion_length": 2997.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3468.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2997.5, "completions/mean_terminated_length": 2997.5, "completions/min_length": 2723.0, "completions/min_terminated_length": 2723.0, "epoch": 0.043697478991596636, "frac_reward_zero_std": 0.0, "grad_norm": 0.22256870567798615, "kl": 0.4614196717739105, "learning_rate": 1.25e-06, "loss": 0.0005, "num_tokens": 520276.0, "reward": 7.620273590087891, "reward_std": 2.245389461517334, "rewards/reward_model/mean": 7.620273590087891, "rewards/reward_model/std": 2.245389461517334, "step": 26 }, { "completion_length": 1711.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 1711.5, "completions/mean_terminated_length": 1711.5, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.0453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.5169827938079834, "kl": 0.6935045123100281, "learning_rate": 1.3e-06, "loss": 0.0007, "num_tokens": 536426.0, "reward": 8.411685943603516, "reward_std": 0.920927107334137, "rewards/reward_model/mean": 8.411685943603516, "rewards/reward_model/std": 0.920927107334137, "step": 27 }, { "completion_length": 2390.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3224.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 2390.75, "completions/mean_terminated_length": 2390.75, "completions/min_length": 1817.0, "completions/min_terminated_length": 1817.0, "epoch": 0.047058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.4216252863407135, "kl": 0.3447871506214142, "learning_rate": 1.3500000000000002e-06, "loss": 0.0003, "num_tokens": 557113.0, "reward": 6.5, "reward_std": 2.798809289932251, "rewards/reward_model/mean": 6.5, "rewards/reward_model/std": 2.798809289932251, "step": 28 }, { "completion_length": 3297.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3448.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 3297.0, "completions/mean_terminated_length": 3297.0, "completions/min_length": 3118.0, "completions/min_terminated_length": 3118.0, "epoch": 0.04873949579831933, "frac_reward_zero_std": 0.0, "grad_norm": 0.20905952155590057, "kl": 0.44418153166770935, "learning_rate": 1.4000000000000001e-06, "loss": 0.0004, "num_tokens": 581145.0, "reward": 7.86611270904541, "reward_std": 0.774669885635376, "rewards/reward_model/mean": 7.86611270904541, "rewards/reward_model/std": 0.7746700644493103, "step": 29 }, { "completion_length": 2944.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3491.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 2944.5, "completions/mean_terminated_length": 2944.5, "completions/min_length": 2070.0, "completions/min_terminated_length": 2070.0, "epoch": 0.05042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.21313747763633728, "kl": 0.4106956720352173, "learning_rate": 1.45e-06, "loss": 0.0004, "num_tokens": 603227.0, "reward": 8.306060791015625, "reward_std": 0.893286943435669, "rewards/reward_model/mean": 8.306060791015625, "rewards/reward_model/std": 0.893286943435669, "step": 30 }, { "completion_length": 2035.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2720.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 2035.25, "completions/mean_terminated_length": 2035.25, "completions/min_length": 1470.0, "completions/min_terminated_length": 1470.0, "epoch": 0.052100840336134456, "frac_reward_zero_std": 0.0, "grad_norm": 0.327402800321579, "kl": 0.3633970618247986, "learning_rate": 1.5e-06, "loss": 0.0004, "num_tokens": 620532.0, "reward": 6.75, "reward_std": 2.3273732662200928, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.327373504638672, "step": 31 }, { "completion_length": 2783.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3118.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2783.25, "completions/mean_terminated_length": 2783.25, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.05378151260504202, "frac_reward_zero_std": 0.0, "grad_norm": 0.2371380627155304, "kl": 0.5408622622489929, "learning_rate": 1.5500000000000002e-06, "loss": 0.0005, "num_tokens": 640929.0, "reward": 8.676490783691406, "reward_std": 0.23656955361366272, "rewards/reward_model/mean": 8.676490783691406, "rewards/reward_model/std": 0.23656955361366272, "step": 32 }, { "completion_length": 3145.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3334.0, "completions/max_terminated_length": 3334.0, "completions/mean_length": 3145.5, "completions/mean_terminated_length": 3145.5, "completions/min_length": 2748.0, "completions/min_terminated_length": 2748.0, "epoch": 0.05546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.23712949454784393, "kl": 0.4347061812877655, "learning_rate": 1.6000000000000001e-06, "loss": 0.0004, "num_tokens": 664079.0, "reward": -0.014399351552128792, "reward_std": 0.01094669010490179, "rewards/reward_model/mean": -0.014399351552128792, "rewards/reward_model/std": 0.01094669010490179, "step": 33 }, { "completion_length": 1896.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1896.25, "completions/mean_terminated_length": 1896.25, "completions/min_length": 1386.0, "completions/min_terminated_length": 1386.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.48681309819221497, "kl": 0.40498724579811096, "learning_rate": 1.6500000000000003e-06, "loss": 0.0004, "num_tokens": 681572.0, "reward": 5.930726051330566, "reward_std": 4.415777206420898, "rewards/reward_model/mean": 5.930726051330566, "rewards/reward_model/std": 4.415777206420898, "step": 34 }, { "completion_length": 2924.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 2924.5, "completions/mean_terminated_length": 2924.5, "completions/min_length": 2808.0, "completions/min_terminated_length": 2808.0, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.22173462808132172, "kl": 0.3978763818740845, "learning_rate": 1.7000000000000002e-06, "loss": 0.0004, "num_tokens": 702998.0, "reward": 8.046117782592773, "reward_std": 1.2211781740188599, "rewards/reward_model/mean": 8.046117782592773, "rewards/reward_model/std": 1.2211780548095703, "step": 35 }, { "completion_length": 3348.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3742.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 3348.5, "completions/mean_terminated_length": 3348.5, "completions/min_length": 3066.0, "completions/min_terminated_length": 3066.0, "epoch": 0.06050420168067227, "frac_reward_zero_std": 0.0, "grad_norm": 0.2196238934993744, "kl": 0.41847947239875793, "learning_rate": 1.75e-06, "loss": 0.0004, "num_tokens": 727604.0, "reward": 8.480682373046875, "reward_std": 0.6139712929725647, "rewards/reward_model/mean": 8.480682373046875, "rewards/reward_model/std": 0.6139712929725647, "step": 36 }, { "completion_length": 7.25, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 7.25, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.06218487394957983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05685713142156601, "kl": 4.287390232086182, "learning_rate": 1.8000000000000001e-06, "loss": 0.0043, "num_tokens": 736097.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 37 }, { "completion_length": 3168.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4236.0, "completions/max_terminated_length": 4236.0, "completions/mean_length": 3168.25, "completions/mean_terminated_length": 3168.25, "completions/min_length": 2421.0, "completions/min_terminated_length": 2421.0, "epoch": 0.06386554621848739, "frac_reward_zero_std": 0.0, "grad_norm": 0.30552753806114197, "kl": 0.274600625038147, "learning_rate": 1.85e-06, "loss": 0.0003, "num_tokens": 758170.0, "reward": 7.125, "reward_std": 1.5478479862213135, "rewards/reward_model/mean": 7.125, "rewards/reward_model/std": 1.5478479862213135, "step": 38 }, { "completion_length": 3031.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3205.0, "completions/max_terminated_length": 3205.0, "completions/mean_length": 3031.0, "completions/mean_terminated_length": 3031.0, "completions/min_length": 2839.0, "completions/min_terminated_length": 2839.0, "epoch": 0.06554621848739496, "frac_reward_zero_std": 0.0, "grad_norm": 0.2312554270029068, "kl": 0.4626840054988861, "learning_rate": 1.9000000000000002e-06, "loss": 0.0005, "num_tokens": 780238.0, "reward": 8.927217483520508, "reward_std": 1.1756690740585327, "rewards/reward_model/mean": 8.927217483520508, "rewards/reward_model/std": 1.1756691932678223, "step": 39 }, { "completion_length": 3206.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 3206.0, "completions/mean_terminated_length": 3206.0, "completions/min_length": 2840.0, "completions/min_terminated_length": 2840.0, "epoch": 0.06722689075630252, "frac_reward_zero_std": 0.0, "grad_norm": 0.27287694811820984, "kl": 0.42922842502593994, "learning_rate": 1.9500000000000004e-06, "loss": 0.0004, "num_tokens": 803578.0, "reward": 8.357658386230469, "reward_std": 1.2403758764266968, "rewards/reward_model/mean": 8.357658386230469, "rewards/reward_model/std": 1.2403758764266968, "step": 40 }, { "completion_length": 3249.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3387.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 3249.0, "completions/mean_terminated_length": 3249.0, "completions/min_length": 3084.0, "completions/min_terminated_length": 3084.0, "epoch": 0.06890756302521009, "frac_reward_zero_std": 0.0, "grad_norm": 0.21696928143501282, "kl": 0.43103882670402527, "learning_rate": 2.0000000000000003e-06, "loss": 0.0004, "num_tokens": 826502.0, "reward": 7.731931209564209, "reward_std": 0.6359418034553528, "rewards/reward_model/mean": 7.731931209564209, "rewards/reward_model/std": 0.635941743850708, "step": 41 }, { "completion_length": 3166.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3637.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 3166.0, "completions/mean_terminated_length": 3166.0, "completions/min_length": 2995.0, "completions/min_terminated_length": 2995.0, "epoch": 0.07058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.2448631376028061, "kl": 0.4528976082801819, "learning_rate": 2.05e-06, "loss": 0.0005, "num_tokens": 849846.0, "reward": 8.172100067138672, "reward_std": 1.5137546062469482, "rewards/reward_model/mean": 8.172100067138672, "rewards/reward_model/std": 1.5137547254562378, "step": 42 }, { "completion_length": 1082.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 1082.5, "completions/mean_terminated_length": 1082.5, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.07226890756302522, "frac_reward_zero_std": 0.0, "grad_norm": 0.614413321018219, "kl": 0.9997281432151794, "learning_rate": 2.1000000000000002e-06, "loss": 0.001, "num_tokens": 865128.0, "reward": 5.758825302124023, "reward_std": 2.153346061706543, "rewards/reward_model/mean": 5.758825302124023, "rewards/reward_model/std": 2.153346061706543, "step": 43 }, { "completion_length": 3139.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3744.0, "completions/max_terminated_length": 3744.0, "completions/mean_length": 3139.25, "completions/mean_terminated_length": 3139.25, "completions/min_length": 2570.0, "completions/min_terminated_length": 2570.0, "epoch": 0.07394957983193277, "frac_reward_zero_std": 0.0, "grad_norm": 0.23637787997722626, "kl": 0.4812513291835785, "learning_rate": 2.15e-06, "loss": 0.0005, "num_tokens": 888913.0, "reward": 7.538500785827637, "reward_std": 1.2191288471221924, "rewards/reward_model/mean": 7.538500785827637, "rewards/reward_model/std": 1.2191288471221924, "step": 44 }, { "completion_length": 3443.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3698.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 3443.75, "completions/mean_terminated_length": 3443.75, "completions/min_length": 3105.0, "completions/min_terminated_length": 3105.0, "epoch": 0.07563025210084033, "frac_reward_zero_std": 0.0, "grad_norm": 0.19623233377933502, "kl": 0.3900257647037506, "learning_rate": 2.2e-06, "loss": 0.0004, "num_tokens": 912152.0, "reward": 8.34599781036377, "reward_std": 1.126673698425293, "rewards/reward_model/mean": 8.34599781036377, "rewards/reward_model/std": 1.1266734600067139, "step": 45 }, { "completion_length": 57.5, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0773109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 5.90267276763916, "kl": 3.3936893939971924, "learning_rate": 2.25e-06, "loss": 0.0034, "num_tokens": 921438.0, "reward": 8.0, "reward_std": 4.0, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 4.0, "step": 46 }, { "completion_length": 1558.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1558.75, "completions/mean_terminated_length": 1558.75, "completions/min_length": 1377.0, "completions/min_terminated_length": 1377.0, "epoch": 0.07899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.5147756338119507, "kl": 0.6282870173454285, "learning_rate": 2.3000000000000004e-06, "loss": 0.0006, "num_tokens": 937877.0, "reward": 7.551456451416016, "reward_std": 1.0817488431930542, "rewards/reward_model/mean": 7.551456451416016, "rewards/reward_model/std": 1.0817489624023438, "step": 47 }, { "completion_length": 3149.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3280.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 3149.0, "completions/mean_terminated_length": 3149.0, "completions/min_length": 3075.0, "completions/min_terminated_length": 3075.0, "epoch": 0.08067226890756303, "frac_reward_zero_std": 0.0, "grad_norm": 0.21285971999168396, "kl": 0.4472464621067047, "learning_rate": 2.35e-06, "loss": 0.0004, "num_tokens": 960593.0, "reward": 8.15469741821289, "reward_std": 0.9880263209342957, "rewards/reward_model/mean": 8.15469741821289, "rewards/reward_model/std": 0.9880266189575195, "step": 48 }, { "completion_length": 2261.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3393.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 2261.5, "completions/mean_terminated_length": 2261.5, "completions/min_length": 1702.0, "completions/min_terminated_length": 1702.0, "epoch": 0.08235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.4225335419178009, "kl": 0.3637184798717499, "learning_rate": 2.4000000000000003e-06, "loss": 0.0004, "num_tokens": 979171.0, "reward": 7.375, "reward_std": 2.780137777328491, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 2.7801380157470703, "step": 49 }, { "completion_length": 3035.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3395.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 3035.0, "completions/mean_terminated_length": 3035.0, "completions/min_length": 2758.0, "completions/min_terminated_length": 2758.0, "epoch": 0.08403361344537816, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415667027235031, "kl": 0.4912819564342499, "learning_rate": 2.4500000000000003e-06, "loss": 0.0005, "num_tokens": 999851.0, "reward": 8.715576171875, "reward_std": 0.45521172881126404, "rewards/reward_model/mean": 8.715576171875, "rewards/reward_model/std": 0.45521196722984314, "step": 50 }, { "completion_length": 4185.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 4185.75, "completions/mean_terminated_length": 3170.33349609375, "completions/min_length": 2558.0, "completions/min_terminated_length": 2558.0, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.16737323999404907, "kl": 0.3826863169670105, "learning_rate": 2.5e-06, "loss": 0.0004, "num_tokens": 1026650.0, "reward": 7.64158821105957, "reward_std": 1.6326850652694702, "rewards/reward_model/mean": 7.64158821105957, "rewards/reward_model/std": 1.6326849460601807, "step": 51 }, { "completion_length": 1089.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 1089.25, "completions/mean_terminated_length": 1089.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.08739495798319327, "frac_reward_zero_std": 0.0, "grad_norm": 0.49092772603034973, "kl": 0.8803520798683167, "learning_rate": 2.55e-06, "loss": 0.0009, "num_tokens": 1041415.0, "reward": 7.310797691345215, "reward_std": 1.0026116371154785, "rewards/reward_model/mean": 7.310797691345215, "rewards/reward_model/std": 1.0026116371154785, "step": 52 }, { "completion_length": 3123.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3517.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 3123.25, "completions/mean_terminated_length": 3123.25, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.08907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.21318843960762024, "kl": 0.42238688468933105, "learning_rate": 2.6e-06, "loss": 0.0004, "num_tokens": 1063012.0, "reward": 9.053631782531738, "reward_std": 0.7274901270866394, "rewards/reward_model/mean": 9.053631782531738, "rewards/reward_model/std": 0.7274901270866394, "step": 53 }, { "completion_length": 2926.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3444.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 2926.0, "completions/mean_terminated_length": 2926.0, "completions/min_length": 2523.0, "completions/min_terminated_length": 2523.0, "epoch": 0.0907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.2534274160861969, "kl": 0.5139771103858948, "learning_rate": 2.6500000000000005e-06, "loss": 0.0005, "num_tokens": 1086172.0, "reward": 8.154108047485352, "reward_std": 0.9382317066192627, "rewards/reward_model/mean": 8.154108047485352, "rewards/reward_model/std": 0.9382315874099731, "step": 54 }, { "completion_length": 3260.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 3260.0, "completions/mean_terminated_length": 3260.0, "completions/min_length": 3041.0, "completions/min_terminated_length": 3041.0, "epoch": 0.09243697478991597, "frac_reward_zero_std": 0.0, "grad_norm": 0.219038188457489, "kl": 0.7034112215042114, "learning_rate": 2.7000000000000004e-06, "loss": 0.0007, "num_tokens": 1109548.0, "reward": 8.521720886230469, "reward_std": 1.0076866149902344, "rewards/reward_model/mean": 8.521720886230469, "rewards/reward_model/std": 1.0076866149902344, "step": 55 }, { "completion_length": 3062.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3473.0, "completions/max_terminated_length": 3473.0, "completions/mean_length": 3062.25, "completions/mean_terminated_length": 3062.25, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.09411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.23915456235408783, "kl": 0.4395737946033478, "learning_rate": 2.7500000000000004e-06, "loss": 0.0004, "num_tokens": 1130981.0, "reward": 8.267683029174805, "reward_std": 0.7767911553382874, "rewards/reward_model/mean": 8.267683029174805, "rewards/reward_model/std": 0.7767910957336426, "step": 56 }, { "completion_length": 3322.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3483.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 3322.0, "completions/mean_terminated_length": 3322.0, "completions/min_length": 2998.0, "completions/min_terminated_length": 2998.0, "epoch": 0.0957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.20130489766597748, "kl": 0.4055849015712738, "learning_rate": 2.8000000000000003e-06, "loss": 0.0004, "num_tokens": 1154809.0, "reward": 8.213647842407227, "reward_std": 0.8999168276786804, "rewards/reward_model/mean": 8.213647842407227, "rewards/reward_model/std": 0.8999167680740356, "step": 57 }, { "completion_length": 1292.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1292.25, "completions/mean_terminated_length": 1292.25, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.09747899159663866, "frac_reward_zero_std": 0.0, "grad_norm": 0.5870078802108765, "kl": 0.7187660932540894, "learning_rate": 2.85e-06, "loss": 0.0007, "num_tokens": 1169350.0, "reward": 7.023124694824219, "reward_std": 1.6435866355895996, "rewards/reward_model/mean": 7.023124694824219, "rewards/reward_model/std": 1.6435868740081787, "step": 58 }, { "completion_length": 2549.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3263.0, "completions/max_terminated_length": 3263.0, "completions/mean_length": 2549.25, "completions/mean_terminated_length": 2549.25, "completions/min_length": 2124.0, "completions/min_terminated_length": 2124.0, "epoch": 0.09915966386554621, "frac_reward_zero_std": 0.0, "grad_norm": 0.2910903990268707, "kl": 0.29700058698654175, "learning_rate": 2.9e-06, "loss": 0.0003, "num_tokens": 1189963.0, "reward": 5.5, "reward_std": 1.5811388492584229, "rewards/reward_model/mean": 5.5, "rewards/reward_model/std": 1.5811388492584229, "step": 59 }, { "completion_length": 2904.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3014.0, "completions/max_terminated_length": 3014.0, "completions/mean_length": 2904.0, "completions/mean_terminated_length": 2904.0, "completions/min_length": 2822.0, "completions/min_terminated_length": 2822.0, "epoch": 0.10084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.2662099599838257, "kl": 0.45383381843566895, "learning_rate": 2.95e-06, "loss": 0.0005, "num_tokens": 1212511.0, "reward": 8.575969696044922, "reward_std": 1.0328577756881714, "rewards/reward_model/mean": 8.575969696044922, "rewards/reward_model/std": 1.0328580141067505, "step": 60 }, { "completion_length": 2814.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3206.0, "completions/max_terminated_length": 3206.0, "completions/mean_length": 2814.75, "completions/mean_terminated_length": 2814.75, "completions/min_length": 2580.0, "completions/min_terminated_length": 2580.0, "epoch": 0.10252100840336134, "frac_reward_zero_std": 0.0, "grad_norm": 0.24718832969665527, "kl": 0.4411505460739136, "learning_rate": 3e-06, "loss": 0.0004, "num_tokens": 1234434.0, "reward": 8.411513328552246, "reward_std": 0.8315414786338806, "rewards/reward_model/mean": 8.411513328552246, "rewards/reward_model/std": 0.8315416574478149, "step": 61 }, { "completion_length": 3244.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3895.0, "completions/max_terminated_length": 3895.0, "completions/mean_length": 3244.5, "completions/mean_terminated_length": 3244.5, "completions/min_length": 2591.0, "completions/min_terminated_length": 2591.0, "epoch": 0.10420168067226891, "frac_reward_zero_std": 0.0, "grad_norm": 0.2538319230079651, "kl": 0.27741584181785583, "learning_rate": 3.05e-06, "loss": 0.0003, "num_tokens": 1256916.0, "reward": 6.0, "reward_std": 3.9370038509368896, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.9370038509368896, "step": 62 }, { "completion_length": 2728.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2981.0, "completions/max_terminated_length": 2981.0, "completions/mean_length": 2728.75, "completions/mean_terminated_length": 2728.75, "completions/min_length": 2376.0, "completions/min_terminated_length": 2376.0, "epoch": 0.10588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.25504007935523987, "kl": 0.46872344613075256, "learning_rate": 3.1000000000000004e-06, "loss": 0.0005, "num_tokens": 1276531.0, "reward": 8.205828666687012, "reward_std": 0.682498037815094, "rewards/reward_model/mean": 8.205828666687012, "rewards/reward_model/std": 0.6824979782104492, "step": 63 }, { "completion_length": 2017.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 2017.0, "completions/mean_terminated_length": 2017.0, "completions/min_length": 1666.0, "completions/min_terminated_length": 1666.0, "epoch": 0.10756302521008404, "frac_reward_zero_std": 0.0, "grad_norm": 0.3398452401161194, "kl": 0.3937748372554779, "learning_rate": 3.1500000000000003e-06, "loss": 0.0004, "num_tokens": 1294551.0, "reward": 5.75, "reward_std": 1.7078251838684082, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 1.7078251838684082, "step": 64 }, { "completion_length": 3087.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3985.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 3087.0, "completions/mean_terminated_length": 3087.0, "completions/min_length": 2602.0, "completions/min_terminated_length": 2602.0, "epoch": 0.1092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.22506201267242432, "kl": 0.4327456057071686, "learning_rate": 3.2000000000000003e-06, "loss": 0.0004, "num_tokens": 1317007.0, "reward": 8.593668937683105, "reward_std": 1.6437653303146362, "rewards/reward_model/mean": 8.593668937683105, "rewards/reward_model/std": 1.6437653303146362, "step": 65 }, { "completion_length": 828.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1600.0, "completions/max_terminated_length": 1600.0, "completions/mean_length": 828.5, "completions/mean_terminated_length": 828.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.11092436974789915, "frac_reward_zero_std": 0.0, "grad_norm": 0.5670727491378784, "kl": 0.9830642342567444, "learning_rate": 3.2500000000000002e-06, "loss": 0.001, "num_tokens": 1330901.0, "reward": 6.8028154373168945, "reward_std": 1.10745370388031, "rewards/reward_model/mean": 6.8028154373168945, "rewards/reward_model/std": 1.10745370388031, "step": 66 }, { "completion_length": 2075.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2810.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 2075.25, "completions/mean_terminated_length": 2075.25, "completions/min_length": 1383.0, "completions/min_terminated_length": 1383.0, "epoch": 0.11260504201680673, "frac_reward_zero_std": 0.0, "grad_norm": 0.29757460951805115, "kl": 0.35559019446372986, "learning_rate": 3.3000000000000006e-06, "loss": 0.0004, "num_tokens": 1348938.0, "reward": 6.351768493652344, "reward_std": 4.791616439819336, "rewards/reward_model/mean": 6.351768493652344, "rewards/reward_model/std": 4.791616916656494, "step": 67 }, { "completion_length": 1801.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1801.25, "completions/mean_terminated_length": 1801.25, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.4349398910999298, "kl": 0.4391127824783325, "learning_rate": 3.3500000000000005e-06, "loss": 0.0004, "num_tokens": 1366391.0, "reward": 6.5406036376953125, "reward_std": 4.524318695068359, "rewards/reward_model/mean": 6.5406036376953125, "rewards/reward_model/std": 4.524318695068359, "step": 68 }, { "completion_length": 3342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3956.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 3342.0, "completions/mean_terminated_length": 3342.0, "completions/min_length": 2952.0, "completions/min_terminated_length": 2952.0, "epoch": 0.11596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.20729507505893707, "kl": 0.4406834542751312, "learning_rate": 3.4000000000000005e-06, "loss": 0.0004, "num_tokens": 1388627.0, "reward": 9.244726181030273, "reward_std": 0.9476140141487122, "rewards/reward_model/mean": 9.244726181030273, "rewards/reward_model/std": 0.9476140141487122, "step": 69 }, { "completion_length": 2394.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2394.0, "completions/mean_terminated_length": 2394.0, "completions/min_length": 1955.0, "completions/min_terminated_length": 1955.0, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.3415325880050659, "kl": 0.7586065530776978, "learning_rate": 3.45e-06, "loss": 0.0008, "num_tokens": 1407775.0, "reward": 7.75, "reward_std": 2.629955530166626, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 2.629955768585205, "step": 70 }, { "completion_length": 3043.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3499.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 3043.75, "completions/mean_terminated_length": 3043.75, "completions/min_length": 2632.0, "completions/min_terminated_length": 2632.0, "epoch": 0.11932773109243698, "frac_reward_zero_std": 0.0, "grad_norm": 0.2264113426208496, "kl": 0.43621689081192017, "learning_rate": 3.5e-06, "loss": 0.0004, "num_tokens": 1429114.0, "reward": 7.379914283752441, "reward_std": 1.2877236604690552, "rewards/reward_model/mean": 7.379914283752441, "rewards/reward_model/std": 1.2877237796783447, "step": 71 }, { "completion_length": 2935.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3168.0, "completions/max_terminated_length": 3168.0, "completions/mean_length": 2935.5, "completions/mean_terminated_length": 2935.5, "completions/min_length": 2622.0, "completions/min_terminated_length": 2622.0, "epoch": 0.12100840336134454, "frac_reward_zero_std": 0.0, "grad_norm": 0.21893373131752014, "kl": 0.5037636756896973, "learning_rate": 3.5500000000000003e-06, "loss": 0.0006, "num_tokens": 1450920.0, "reward": 8.926493644714355, "reward_std": 0.007464637514203787, "rewards/reward_model/mean": 8.926493644714355, "rewards/reward_model/std": 0.007464637979865074, "step": 72 }, { "completion_length": 909.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 909.25, "completions/mean_terminated_length": 909.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.6668774485588074, "kl": 1.02503502368927, "learning_rate": 3.6000000000000003e-06, "loss": 0.001, "num_tokens": 1464057.0, "reward": 6.830574989318848, "reward_std": 1.5993455648422241, "rewards/reward_model/mean": 6.830574989318848, "rewards/reward_model/std": 1.5993454456329346, "step": 73 }, { "completion_length": 1187.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1714.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1187.25, "completions/mean_terminated_length": 1187.25, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.12436974789915967, "frac_reward_zero_std": 0.0, "grad_norm": 2.6230356693267822, "kl": 7.192581653594971, "learning_rate": 3.65e-06, "loss": 0.0072, "num_tokens": 1478994.0, "reward": 7.468416213989258, "reward_std": 0.5226073265075684, "rewards/reward_model/mean": 7.468416213989258, "rewards/reward_model/std": 0.5226073861122131, "step": 74 }, { "completion_length": 2722.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3341.0, "completions/max_terminated_length": 3341.0, "completions/mean_length": 2722.0, "completions/mean_terminated_length": 2722.0, "completions/min_length": 2369.0, "completions/min_terminated_length": 2369.0, "epoch": 0.12605042016806722, "frac_reward_zero_std": 0.0, "grad_norm": 0.2930119037628174, "kl": 0.525174081325531, "learning_rate": 3.7e-06, "loss": 0.0005, "num_tokens": 1498790.0, "reward": 9.866546630859375, "reward_std": 0.13535748422145844, "rewards/reward_model/mean": 9.866546630859375, "rewards/reward_model/std": 0.13535748422145844, "step": 75 }, { "completion_length": 2678.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 2678.25, "completions/mean_terminated_length": 2678.25, "completions/min_length": 2529.0, "completions/min_terminated_length": 2529.0, "epoch": 0.12773109243697478, "frac_reward_zero_std": 0.0, "grad_norm": 0.24343159794807434, "kl": 0.47763705253601074, "learning_rate": 3.7500000000000005e-06, "loss": 0.0005, "num_tokens": 1520331.0, "reward": 9.23454761505127, "reward_std": 0.8037624359130859, "rewards/reward_model/mean": 9.23454761505127, "rewards/reward_model/std": 0.8037623763084412, "step": 76 }, { "completion_length": 2961.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3192.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 2961.0, "completions/mean_terminated_length": 2961.0, "completions/min_length": 2866.0, "completions/min_terminated_length": 2866.0, "epoch": 0.12941176470588237, "frac_reward_zero_std": 0.0, "grad_norm": 0.27070072293281555, "kl": 0.4094032049179077, "learning_rate": 3.8000000000000005e-06, "loss": 0.0004, "num_tokens": 1541335.0, "reward": 8.187192916870117, "reward_std": 1.4074355363845825, "rewards/reward_model/mean": 8.187192916870117, "rewards/reward_model/std": 1.407435655593872, "step": 77 }, { "completion_length": 2904.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 2904.5, "completions/mean_terminated_length": 2904.5, "completions/min_length": 2665.0, "completions/min_terminated_length": 2665.0, "epoch": 0.13109243697478992, "frac_reward_zero_std": 0.0, "grad_norm": 0.22602877020835876, "kl": 0.4318145513534546, "learning_rate": 3.85e-06, "loss": 0.0004, "num_tokens": 1562753.0, "reward": 7.521336555480957, "reward_std": 2.3426802158355713, "rewards/reward_model/mean": 7.521336555480957, "rewards/reward_model/std": 2.3426802158355713, "step": 78 }, { "completion_length": 3219.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3297.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 3219.5, "completions/mean_terminated_length": 3219.5, "completions/min_length": 3121.0, "completions/min_terminated_length": 3121.0, "epoch": 0.13277310924369748, "frac_reward_zero_std": 0.0, "grad_norm": 0.20466728508472443, "kl": 0.40577566623687744, "learning_rate": 3.900000000000001e-06, "loss": 0.0004, "num_tokens": 1586335.0, "reward": 8.53989315032959, "reward_std": 1.2545560598373413, "rewards/reward_model/mean": 8.53989315032959, "rewards/reward_model/std": 1.2545561790466309, "step": 79 }, { "completion_length": 806.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 806.75, "completions/mean_terminated_length": 806.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.13445378151260504, "frac_reward_zero_std": 0.0, "grad_norm": 0.6143341660499573, "kl": 1.0517674684524536, "learning_rate": 3.95e-06, "loss": 0.0011, "num_tokens": 1600666.0, "reward": 7.633878707885742, "reward_std": 0.37605300545692444, "rewards/reward_model/mean": 7.633878707885742, "rewards/reward_model/std": 0.3760528862476349, "step": 80 }, { "completion_length": 3203.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4340.0, "completions/max_terminated_length": 4340.0, "completions/mean_length": 3203.5, "completions/mean_terminated_length": 3203.5, "completions/min_length": 2409.0, "completions/min_terminated_length": 2409.0, "epoch": 0.1361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.23666132986545563, "kl": 0.42324742674827576, "learning_rate": 4.000000000000001e-06, "loss": 0.0004, "num_tokens": 1622448.0, "reward": 7.318637847900391, "reward_std": 2.778326988220215, "rewards/reward_model/mean": 7.318637847900391, "rewards/reward_model/std": 2.778327226638794, "step": 81 }, { "completion_length": 3323.75, "completions/clipped_ratio": 0.0, "completions/max_length": 4202.0, "completions/max_terminated_length": 4202.0, "completions/mean_length": 3323.75, "completions/mean_terminated_length": 3323.75, "completions/min_length": 2512.0, "completions/min_terminated_length": 2512.0, "epoch": 0.13781512605042018, "frac_reward_zero_std": 0.0, "grad_norm": 0.24035634100437164, "kl": 0.4106868803501129, "learning_rate": 4.05e-06, "loss": 0.0004, "num_tokens": 1646043.0, "reward": 6.966048240661621, "reward_std": 1.1899865865707397, "rewards/reward_model/mean": 6.966048240661621, "rewards/reward_model/std": 1.1899867057800293, "step": 82 }, { "completion_length": 3115.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3221.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 3115.75, "completions/mean_terminated_length": 3115.75, "completions/min_length": 3007.0, "completions/min_terminated_length": 3007.0, "epoch": 0.13949579831932774, "frac_reward_zero_std": 0.0, "grad_norm": 0.21992234885692596, "kl": 0.44762954115867615, "learning_rate": 4.1e-06, "loss": 0.0004, "num_tokens": 1667814.0, "reward": 9.537565231323242, "reward_std": 0.4324139952659607, "rewards/reward_model/mean": 9.537565231323242, "rewards/reward_model/std": 0.4324139654636383, "step": 83 }, { "completion_length": 2971.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3360.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 2971.0, "completions/mean_terminated_length": 2971.0, "completions/min_length": 2768.0, "completions/min_terminated_length": 2768.0, "epoch": 0.1411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.26085975766181946, "kl": 0.43445736169815063, "learning_rate": 4.15e-06, "loss": 0.0004, "num_tokens": 1690270.0, "reward": 8.511381149291992, "reward_std": 0.6435398459434509, "rewards/reward_model/mean": 8.511381149291992, "rewards/reward_model/std": 0.6435400247573853, "step": 84 }, { "completion_length": 2111.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 2111.0, "completions/mean_terminated_length": 2111.0, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.735354483127594, "kl": 1.131089687347412, "learning_rate": 4.2000000000000004e-06, "loss": 0.0011, "num_tokens": 1708290.0, "reward": 6.888094902038574, "reward_std": 4.362208366394043, "rewards/reward_model/mean": 6.888094902038574, "rewards/reward_model/std": 4.362208366394043, "step": 85 }, { "completion_length": 402.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 402.75, "completions/mean_terminated_length": 402.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.14453781512605043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6006290316581726, "kl": 1.258028268814087, "learning_rate": 4.25e-06, "loss": 0.0013, "num_tokens": 1721241.0, "reward": 6.638233184814453, "reward_std": 2.5938193798065186, "rewards/reward_model/mean": 6.638233184814453, "rewards/reward_model/std": 2.5938196182250977, "step": 86 }, { "completion_length": 2114.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 2114.0, "completions/mean_terminated_length": 2114.0, "completions/min_length": 1862.0, "completions/min_terminated_length": 1862.0, "epoch": 0.146218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.34364503622055054, "kl": 1.398358941078186, "learning_rate": 4.3e-06, "loss": 0.0014, "num_tokens": 1740013.0, "reward": 8.375, "reward_std": 1.3768926858901978, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.3768926858901978, "step": 87 }, { "completion_length": 2752.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2924.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 2752.25, "completions/mean_terminated_length": 2752.25, "completions/min_length": 2575.0, "completions/min_terminated_length": 2575.0, "epoch": 0.14789915966386555, "frac_reward_zero_std": 0.0, "grad_norm": 0.27950361371040344, "kl": 0.49790701270103455, "learning_rate": 4.350000000000001e-06, "loss": 0.0005, "num_tokens": 1761918.0, "reward": 7.676429748535156, "reward_std": 1.4302712678909302, "rewards/reward_model/mean": 7.676429748535156, "rewards/reward_model/std": 1.4302712678909302, "step": 88 }, { "completion_length": 2853.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3089.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 2853.0, "completions/mean_terminated_length": 2853.0, "completions/min_length": 2605.0, "completions/min_terminated_length": 2605.0, "epoch": 0.1495798319327731, "frac_reward_zero_std": 0.0, "grad_norm": 0.2263125628232956, "kl": 0.4167507588863373, "learning_rate": 4.4e-06, "loss": 0.0004, "num_tokens": 1784366.0, "reward": 7.4489288330078125, "reward_std": 2.177258014678955, "rewards/reward_model/mean": 7.4489288330078125, "rewards/reward_model/std": 2.177258253097534, "step": 89 }, { "completion_length": 2805.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 2805.25, "completions/mean_terminated_length": 2805.25, "completions/min_length": 2340.0, "completions/min_terminated_length": 2340.0, "epoch": 0.15126050420168066, "frac_reward_zero_std": 0.0, "grad_norm": 0.25741341710090637, "kl": 0.43744996190071106, "learning_rate": 4.450000000000001e-06, "loss": 0.0004, "num_tokens": 1806855.0, "reward": 7.539546966552734, "reward_std": 3.0199856758117676, "rewards/reward_model/mean": 7.539546966552734, "rewards/reward_model/std": 3.0199856758117676, "step": 90 }, { "completion_length": 2687.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 2687.5, "completions/mean_terminated_length": 2687.5, "completions/min_length": 2398.0, "completions/min_terminated_length": 2398.0, "epoch": 0.15294117647058825, "frac_reward_zero_std": 0.0, "grad_norm": 0.24347051978111267, "kl": 0.4858957827091217, "learning_rate": 4.5e-06, "loss": 0.0005, "num_tokens": 1827277.0, "reward": 8.389686584472656, "reward_std": 0.6794203519821167, "rewards/reward_model/mean": 8.389686584472656, "rewards/reward_model/std": 0.679420530796051, "step": 91 }, { "completion_length": 2798.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3086.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 2798.25, "completions/mean_terminated_length": 2798.25, "completions/min_length": 2591.0, "completions/min_terminated_length": 2591.0, "epoch": 0.1546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.24916556477546692, "kl": 0.48482340574264526, "learning_rate": 4.5500000000000005e-06, "loss": 0.0005, "num_tokens": 1847102.0, "reward": 7.97796630859375, "reward_std": 0.8365663290023804, "rewards/reward_model/mean": 7.97796630859375, "rewards/reward_model/std": 0.8365663886070251, "step": 92 }, { "completion_length": 3056.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3159.0, "completions/max_terminated_length": 3159.0, "completions/mean_length": 3056.25, "completions/mean_terminated_length": 3056.25, "completions/min_length": 2902.0, "completions/min_terminated_length": 2902.0, "epoch": 0.15630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.22196516394615173, "kl": 0.4709712564945221, "learning_rate": 4.600000000000001e-06, "loss": 0.0005, "num_tokens": 1869175.0, "reward": 9.879180908203125, "reward_std": 0.16843675076961517, "rewards/reward_model/mean": 9.879180908203125, "rewards/reward_model/std": 0.16843685507774353, "step": 93 }, { "completion_length": 2674.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 2674.25, "completions/mean_terminated_length": 2674.25, "completions/min_length": 2532.0, "completions/min_terminated_length": 2532.0, "epoch": 0.15798319327731092, "frac_reward_zero_std": 0.0, "grad_norm": 0.3054737448692322, "kl": 0.7604443430900574, "learning_rate": 4.65e-06, "loss": 0.0008, "num_tokens": 1889516.0, "reward": 7.914120674133301, "reward_std": 1.0475715398788452, "rewards/reward_model/mean": 7.914120674133301, "rewards/reward_model/std": 1.0475715398788452, "step": 94 }, { "completion_length": 2791.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3596.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 2791.0, "completions/mean_terminated_length": 2791.0, "completions/min_length": 2346.0, "completions/min_terminated_length": 2346.0, "epoch": 0.15966386554621848, "frac_reward_zero_std": 0.0, "grad_norm": 0.240616574883461, "kl": 0.4514558017253876, "learning_rate": 4.7e-06, "loss": 0.0005, "num_tokens": 1911380.0, "reward": 8.27052116394043, "reward_std": 0.7082969546318054, "rewards/reward_model/mean": 8.27052116394043, "rewards/reward_model/std": 0.7082969546318054, "step": 95 }, { "completion_length": 3014.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3382.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 3014.25, "completions/mean_terminated_length": 3014.25, "completions/min_length": 2798.0, "completions/min_terminated_length": 2798.0, "epoch": 0.16134453781512606, "frac_reward_zero_std": 0.0, "grad_norm": 0.22918841242790222, "kl": 0.4786817729473114, "learning_rate": 4.75e-06, "loss": 0.0005, "num_tokens": 1933981.0, "reward": 7.595390319824219, "reward_std": 1.2992151975631714, "rewards/reward_model/mean": 7.595390319824219, "rewards/reward_model/std": 1.2992151975631714, "step": 96 }, { "completion_length": 3266.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3377.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 3266.5, "completions/mean_terminated_length": 3266.5, "completions/min_length": 3109.0, "completions/min_terminated_length": 3109.0, "epoch": 0.16302521008403362, "frac_reward_zero_std": 0.0, "grad_norm": 0.2017071694135666, "kl": 0.42354616522789, "learning_rate": 4.800000000000001e-06, "loss": 0.0004, "num_tokens": 1957023.0, "reward": 9.349321365356445, "reward_std": 0.6180247664451599, "rewards/reward_model/mean": 9.349321365356445, "rewards/reward_model/std": 0.6180248260498047, "step": 97 }, { "completion_length": 199.75, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16470588235294117, "frac_reward_zero_std": 0.0, "grad_norm": 2.11334228515625, "kl": 1.4292675256729126, "learning_rate": 4.85e-06, "loss": 0.0014, "num_tokens": 1965326.0, "reward": 8.5, "reward_std": 1.0, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.0, "step": 98 }, { "completion_length": 2075.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 2075.0, "completions/mean_terminated_length": 2075.0, "completions/min_length": 1511.0, "completions/min_terminated_length": 1511.0, "epoch": 0.16638655462184873, "frac_reward_zero_std": 0.0, "grad_norm": 0.44360974431037903, "kl": 0.38844701647758484, "learning_rate": 4.9000000000000005e-06, "loss": 0.0004, "num_tokens": 1984098.0, "reward": 6.0625, "reward_std": 3.6307425498962402, "rewards/reward_model/mean": 6.0625, "rewards/reward_model/std": 3.6307425498962402, "step": 99 }, { "completion_length": 3006.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3300.0, "completions/max_terminated_length": 3300.0, "completions/mean_length": 3006.5, "completions/mean_terminated_length": 3006.5, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.16806722689075632, "frac_reward_zero_std": 0.0, "grad_norm": 0.21265709400177002, "kl": 0.4319321811199188, "learning_rate": 4.95e-06, "loss": 0.0004, "num_tokens": 2006040.0, "reward": 9.181880950927734, "reward_std": 0.6058098673820496, "rewards/reward_model/mean": 9.181880950927734, "rewards/reward_model/std": 0.6058098673820496, "step": 100 }, { "completion_length": 2721.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 2721.5, "completions/mean_terminated_length": 2721.5, "completions/min_length": 2631.0, "completions/min_terminated_length": 2631.0, "epoch": 0.16974789915966387, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661150097846985, "kl": 0.49838021397590637, "learning_rate": 5e-06, "loss": 0.0005, "num_tokens": 2026218.0, "reward": 7.413733959197998, "reward_std": 1.5620050430297852, "rewards/reward_model/mean": 7.413733959197998, "rewards/reward_model/std": 1.5620051622390747, "step": 101 }, { "completion_length": 926.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 926.5, "completions/mean_terminated_length": 926.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.6446244120597839, "kl": 0.8280232548713684, "learning_rate": 4.994444444444445e-06, "loss": 0.0008, "num_tokens": 2039944.0, "reward": 6.553905487060547, "reward_std": 1.4130481481552124, "rewards/reward_model/mean": 6.553905487060547, "rewards/reward_model/std": 1.413048267364502, "step": 102 }, { "completion_length": 2172.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 2172.0, "completions/mean_terminated_length": 2172.0, "completions/min_length": 1860.0, "completions/min_terminated_length": 1860.0, "epoch": 0.173109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.27262404561042786, "kl": 0.3252059817314148, "learning_rate": 4.988888888888889e-06, "loss": 0.0003, "num_tokens": 2058512.0, "reward": 8.375, "reward_std": 1.7969882488250732, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.7969882488250732, "step": 103 }, { "completion_length": 2038.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 2038.25, "completions/mean_terminated_length": 2038.25, "completions/min_length": 1499.0, "completions/min_terminated_length": 1499.0, "epoch": 0.17478991596638654, "frac_reward_zero_std": 0.0, "grad_norm": 0.37813693284988403, "kl": 0.3322860896587372, "learning_rate": 4.983333333333334e-06, "loss": 0.0003, "num_tokens": 2077113.0, "reward": 9.625, "reward_std": 0.75, "rewards/reward_model/mean": 9.625, "rewards/reward_model/std": 0.75, "step": 104 }, { "completion_length": 1085.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 1085.0, "completions/mean_terminated_length": 1085.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.0, "grad_norm": 0.5497007966041565, "kl": 0.861971914768219, "learning_rate": 4.977777777777778e-06, "loss": 0.0009, "num_tokens": 2090677.0, "reward": 7.113149642944336, "reward_std": 1.5167663097381592, "rewards/reward_model/mean": 7.113149642944336, "rewards/reward_model/std": 1.5167663097381592, "step": 105 }, { "completion_length": 141.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.1781512605042017, "frac_reward_zero_std": 0.0, "grad_norm": 2.3327019214630127, "kl": 1.9802045822143555, "learning_rate": 4.9722222222222224e-06, "loss": 0.002, "num_tokens": 2098745.0, "reward": 9.25, "reward_std": 1.5, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.5, "step": 106 }, { "completion_length": 2973.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3368.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 2973.5, "completions/mean_terminated_length": 2973.5, "completions/min_length": 2667.0, "completions/min_terminated_length": 2667.0, "epoch": 0.17983193277310924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2415865957736969, "kl": 0.5100703835487366, "learning_rate": 4.966666666666667e-06, "loss": 0.0005, "num_tokens": 2120403.0, "reward": 7.290048599243164, "reward_std": 1.7958564758300781, "rewards/reward_model/mean": 7.290048599243164, "rewards/reward_model/std": 1.7958564758300781, "step": 107 }, { "completion_length": 3186.25, "completions/clipped_ratio": 0.0, "completions/max_length": 4318.0, "completions/max_terminated_length": 4318.0, "completions/mean_length": 3186.25, "completions/mean_terminated_length": 3186.25, "completions/min_length": 2722.0, "completions/min_terminated_length": 2722.0, "epoch": 0.1815126050420168, "frac_reward_zero_std": 0.0, "grad_norm": 0.2218141257762909, "kl": 0.4428662061691284, "learning_rate": 4.961111111111111e-06, "loss": 0.0004, "num_tokens": 2143188.0, "reward": 7.9481964111328125, "reward_std": 2.0744946002960205, "rewards/reward_model/mean": 7.9481964111328125, "rewards/reward_model/std": 2.0744946002960205, "step": 108 }, { "completion_length": 2968.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 2968.0, "completions/mean_terminated_length": 2968.0, "completions/min_length": 2650.0, "completions/min_terminated_length": 2650.0, "epoch": 0.18319327731092436, "frac_reward_zero_std": 0.0, "grad_norm": 0.23803484439849854, "kl": 0.48616155982017517, "learning_rate": 4.9555555555555565e-06, "loss": 0.0005, "num_tokens": 2164028.0, "reward": 8.134922981262207, "reward_std": 0.7836208343505859, "rewards/reward_model/mean": 8.134922981262207, "rewards/reward_model/std": 0.7836208939552307, "step": 109 }, { "completion_length": 2895.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3122.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 2895.0, "completions/mean_terminated_length": 2895.0, "completions/min_length": 2736.0, "completions/min_terminated_length": 2736.0, "epoch": 0.18487394957983194, "frac_reward_zero_std": 0.0, "grad_norm": 0.23772816359996796, "kl": 0.7086710929870605, "learning_rate": 4.95e-06, "loss": 0.0007, "num_tokens": 2184752.0, "reward": 9.1290922164917, "reward_std": 1.3770978450775146, "rewards/reward_model/mean": 9.1290922164917, "rewards/reward_model/std": 1.3770978450775146, "step": 110 }, { "completion_length": 3044.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3504.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 3044.0, "completions/mean_terminated_length": 3044.0, "completions/min_length": 2646.0, "completions/min_terminated_length": 2646.0, "epoch": 0.1865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.23825815320014954, "kl": 0.4379551112651825, "learning_rate": 4.944444444444445e-06, "loss": 0.0004, "num_tokens": 2208396.0, "reward": 8.048235893249512, "reward_std": 1.601423740386963, "rewards/reward_model/mean": 8.048235893249512, "rewards/reward_model/std": 1.601423740386963, "step": 111 }, { "completion_length": 2520.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2852.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 2520.0, "completions/mean_terminated_length": 2520.0, "completions/min_length": 2203.0, "completions/min_terminated_length": 2203.0, "epoch": 0.18823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2840467691421509, "kl": 0.4571273624897003, "learning_rate": 4.938888888888889e-06, "loss": 0.0005, "num_tokens": 2227528.0, "reward": 7.234651565551758, "reward_std": 4.369300842285156, "rewards/reward_model/mean": 7.234651565551758, "rewards/reward_model/std": 4.3693013191223145, "step": 112 }, { "completion_length": 2831.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3357.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 2831.75, "completions/mean_terminated_length": 2831.75, "completions/min_length": 2390.0, "completions/min_terminated_length": 2390.0, "epoch": 0.1899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.25559303164482117, "kl": 0.5501706004142761, "learning_rate": 4.933333333333334e-06, "loss": 0.0006, "num_tokens": 2249951.0, "reward": 8.183422088623047, "reward_std": 0.45960310101509094, "rewards/reward_model/mean": 8.183422088623047, "rewards/reward_model/std": 0.4596029818058014, "step": 113 }, { "completion_length": 2031.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 2031.0, "completions/mean_terminated_length": 2031.0, "completions/min_length": 1647.0, "completions/min_terminated_length": 1647.0, "epoch": 0.1915966386554622, "frac_reward_zero_std": 0.0, "grad_norm": 0.33318275213241577, "kl": 0.3922136127948761, "learning_rate": 4.927777777777778e-06, "loss": 0.0004, "num_tokens": 2267395.0, "reward": 6.75, "reward_std": 2.7537853717803955, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.7537853717803955, "step": 114 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.19327731092436976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008913876954466105, "kl": 3.7003390789031982, "learning_rate": 4.922222222222223e-06, "loss": 0.0037, "num_tokens": 2275843.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 115 }, { "completion_length": 2963.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3328.0, "completions/max_terminated_length": 3328.0, "completions/mean_length": 2963.75, "completions/mean_terminated_length": 2963.75, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.1949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.28050631284713745, "kl": 0.5140309929847717, "learning_rate": 4.9166666666666665e-06, "loss": 0.0005, "num_tokens": 2298438.0, "reward": 8.040239334106445, "reward_std": 1.173266887664795, "rewards/reward_model/mean": 8.040239334106445, "rewards/reward_model/std": 1.1732672452926636, "step": 116 }, { "completion_length": 2095.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2667.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 2095.75, "completions/mean_terminated_length": 2095.75, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "epoch": 0.19663865546218487, "frac_reward_zero_std": 0.0, "grad_norm": 0.352792888879776, "kl": 0.41261500120162964, "learning_rate": 4.911111111111112e-06, "loss": 0.0004, "num_tokens": 2316933.0, "reward": 8.625, "reward_std": 1.8874585628509521, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 1.8874585628509521, "step": 117 }, { "completion_length": 1953.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 1953.25, "completions/mean_terminated_length": 1953.25, "completions/min_length": 1792.0, "completions/min_terminated_length": 1792.0, "epoch": 0.19831932773109243, "frac_reward_zero_std": 0.0, "grad_norm": 0.3218609094619751, "kl": 0.3694440424442291, "learning_rate": 4.905555555555556e-06, "loss": 0.0004, "num_tokens": 2334030.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 118 }, { "completion_length": 1927.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 1927.0, "completions/mean_terminated_length": 1927.0, "completions/min_length": 1555.0, "completions/min_terminated_length": 1555.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.36837753653526306, "kl": 0.3546859323978424, "learning_rate": 4.9000000000000005e-06, "loss": 0.0004, "num_tokens": 2351874.0, "reward": 7.625, "reward_std": 1.7017147541046143, "rewards/reward_model/mean": 7.625, "rewards/reward_model/std": 1.7017148733139038, "step": 119 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.20168067226890757, "frac_reward_zero_std": 1.0, "grad_norm": 0.005828255787491798, "kl": 3.5667221546173096, "learning_rate": 4.894444444444445e-06, "loss": 0.0036, "num_tokens": 2359778.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 120 }, { "completion_length": 1054.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1255.0, "completions/max_terminated_length": 1255.0, "completions/mean_length": 1054.25, "completions/mean_terminated_length": 1054.25, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.20336134453781513, "frac_reward_zero_std": 0.0, "grad_norm": 0.6091054677963257, "kl": 0.7709091305732727, "learning_rate": 4.888888888888889e-06, "loss": 0.0008, "num_tokens": 2375299.0, "reward": 7.903931617736816, "reward_std": 0.7135262489318848, "rewards/reward_model/mean": 7.903931617736816, "rewards/reward_model/std": 0.7135262489318848, "step": 121 }, { "completion_length": 2617.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2759.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 2617.5, "completions/mean_terminated_length": 2617.5, "completions/min_length": 2503.0, "completions/min_terminated_length": 2503.0, "epoch": 0.20504201680672268, "frac_reward_zero_std": 0.0, "grad_norm": 0.2652413547039032, "kl": 0.5015753507614136, "learning_rate": 4.883333333333334e-06, "loss": 0.0005, "num_tokens": 2395317.0, "reward": 8.609277725219727, "reward_std": 1.2459663152694702, "rewards/reward_model/mean": 8.609277725219727, "rewards/reward_model/std": 1.2459663152694702, "step": 122 }, { "completion_length": 1957.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1957.25, "completions/mean_terminated_length": 1957.25, "completions/min_length": 1258.0, "completions/min_terminated_length": 1258.0, "epoch": 0.20672268907563024, "frac_reward_zero_std": 0.0, "grad_norm": 0.4272514283657074, "kl": 0.40108439326286316, "learning_rate": 4.877777777777778e-06, "loss": 0.0004, "num_tokens": 2414138.0, "reward": 6.875, "reward_std": 2.3228933811187744, "rewards/reward_model/mean": 6.875, "rewards/reward_model/std": 2.3228933811187744, "step": 123 }, { "completion_length": 1083.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 1083.0, "completions/mean_terminated_length": 1083.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.20840336134453782, "frac_reward_zero_std": 0.0, "grad_norm": 0.726239800453186, "kl": 1.0261199474334717, "learning_rate": 4.8722222222222225e-06, "loss": 0.001, "num_tokens": 2428382.0, "reward": 6.302136421203613, "reward_std": 2.2844040393829346, "rewards/reward_model/mean": 6.302136421203613, "rewards/reward_model/std": 2.2844040393829346, "step": 124 }, { "completion_length": 2840.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2960.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 2840.25, "completions/mean_terminated_length": 2840.25, "completions/min_length": 2633.0, "completions/min_terminated_length": 2633.0, "epoch": 0.21008403361344538, "frac_reward_zero_std": 0.0, "grad_norm": 0.25508490204811096, "kl": 0.4664878845214844, "learning_rate": 4.866666666666667e-06, "loss": 0.0005, "num_tokens": 2449003.0, "reward": 8.282108306884766, "reward_std": 0.6589659452438354, "rewards/reward_model/mean": 8.282108306884766, "rewards/reward_model/std": 0.6589656472206116, "step": 125 }, { "completion_length": 1021.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1632.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1021.75, "completions/mean_terminated_length": 1021.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.21176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.5321233868598938, "kl": 0.9480860829353333, "learning_rate": 4.861111111111111e-06, "loss": 0.0009, "num_tokens": 2462086.0, "reward": 7.718278884887695, "reward_std": 0.9998491406440735, "rewards/reward_model/mean": 7.718278884887695, "rewards/reward_model/std": 0.999849259853363, "step": 126 }, { "completion_length": 1045.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 1045.25, "completions/mean_terminated_length": 1045.25, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.2134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.5783733129501343, "kl": 0.9520083069801331, "learning_rate": 4.855555555555556e-06, "loss": 0.001, "num_tokens": 2475995.0, "reward": 7.649256706237793, "reward_std": 0.715004563331604, "rewards/reward_model/mean": 7.649256706237793, "rewards/reward_model/std": 0.7150046229362488, "step": 127 }, { "completion_length": 1338.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1338.75, "completions/mean_terminated_length": 1338.75, "completions/min_length": 1130.0, "completions/min_terminated_length": 1130.0, "epoch": 0.21512605042016808, "frac_reward_zero_std": 0.0, "grad_norm": 0.6013649702072144, "kl": 0.7449133992195129, "learning_rate": 4.85e-06, "loss": 0.0007, "num_tokens": 2492062.0, "reward": 7.158843994140625, "reward_std": 0.819972574710846, "rewards/reward_model/mean": 7.158843994140625, "rewards/reward_model/std": 0.819972574710846, "step": 128 }, { "completion_length": 910.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 910.25, "completions/mean_terminated_length": 910.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.21680672268907564, "frac_reward_zero_std": 0.0, "grad_norm": 0.5794121026992798, "kl": 0.9318625926971436, "learning_rate": 4.8444444444444446e-06, "loss": 0.0009, "num_tokens": 2506207.0, "reward": 6.760138988494873, "reward_std": 0.8559876680374146, "rewards/reward_model/mean": 6.760138988494873, "rewards/reward_model/std": 0.8559876084327698, "step": 129 }, { "completion_length": 2844.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2844.5, "completions/mean_terminated_length": 2844.5, "completions/min_length": 2625.0, "completions/min_terminated_length": 2625.0, "epoch": 0.2184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.23493915796279907, "kl": 0.46635761857032776, "learning_rate": 4.838888888888889e-06, "loss": 0.0005, "num_tokens": 2526885.0, "reward": 8.46530532836914, "reward_std": 1.7754273414611816, "rewards/reward_model/mean": 8.46530532836914, "rewards/reward_model/std": 1.7754271030426025, "step": 130 }, { "completion_length": 2762.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2971.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 2762.25, "completions/mean_terminated_length": 2762.25, "completions/min_length": 2651.0, "completions/min_terminated_length": 2651.0, "epoch": 0.22016806722689075, "frac_reward_zero_std": 0.0, "grad_norm": 0.24335390329360962, "kl": 0.43510404229164124, "learning_rate": 4.833333333333333e-06, "loss": 0.0004, "num_tokens": 2547110.0, "reward": 7.015050888061523, "reward_std": 0.8529961705207825, "rewards/reward_model/mean": 7.015050888061523, "rewards/reward_model/std": 0.8529962301254272, "step": 131 }, { "completion_length": 2942.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3428.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 2942.5, "completions/mean_terminated_length": 2942.5, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "epoch": 0.2218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.22997471690177917, "kl": 0.461277037858963, "learning_rate": 4.827777777777778e-06, "loss": 0.0005, "num_tokens": 2568532.0, "reward": 7.938364028930664, "reward_std": 1.288348913192749, "rewards/reward_model/mean": 7.938364028930664, "rewards/reward_model/std": 1.2883487939834595, "step": 132 }, { "completion_length": 1835.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2406.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 1835.5, "completions/mean_terminated_length": 1835.5, "completions/min_length": 1467.0, "completions/min_terminated_length": 1467.0, "epoch": 0.2235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.3291085660457611, "kl": 0.31088417768478394, "learning_rate": 4.822222222222222e-06, "loss": 0.0003, "num_tokens": 2584746.0, "reward": 9.25, "reward_std": 1.5, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.5, "step": 133 }, { "completion_length": 2998.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3211.0, "completions/max_terminated_length": 3211.0, "completions/mean_length": 2998.5, "completions/mean_terminated_length": 2998.5, "completions/min_length": 2824.0, "completions/min_terminated_length": 2824.0, "epoch": 0.22521008403361345, "frac_reward_zero_std": 0.0, "grad_norm": 0.23152707517147064, "kl": 0.46097418665885925, "learning_rate": 4.816666666666667e-06, "loss": 0.0005, "num_tokens": 2606156.0, "reward": 8.750370025634766, "reward_std": 0.4992596507072449, "rewards/reward_model/mean": 8.750370025634766, "rewards/reward_model/std": 0.49925950169563293, "step": 134 }, { "completion_length": 1982.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 1982.75, "completions/mean_terminated_length": 1982.75, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "epoch": 0.226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.3510342240333557, "kl": 0.34647998213768005, "learning_rate": 4.811111111111111e-06, "loss": 0.0003, "num_tokens": 2624427.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 135 }, { "completion_length": 1114.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 1114.25, "completions/mean_terminated_length": 1114.25, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.5375041961669922, "kl": 0.9781880974769592, "learning_rate": 4.805555555555556e-06, "loss": 0.001, "num_tokens": 2638572.0, "reward": 6.138072490692139, "reward_std": 1.0945528745651245, "rewards/reward_model/mean": 6.138072490692139, "rewards/reward_model/std": 1.094552993774414, "step": 136 }, { "completion_length": 2201.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 2201.5, "completions/mean_terminated_length": 2201.5, "completions/min_length": 1675.0, "completions/min_terminated_length": 1675.0, "epoch": 0.23025210084033612, "frac_reward_zero_std": 0.0, "grad_norm": 0.3466634750366211, "kl": 0.33992475271224976, "learning_rate": 4.800000000000001e-06, "loss": 0.0003, "num_tokens": 2656674.0, "reward": 5.375, "reward_std": 2.2867372035980225, "rewards/reward_model/mean": 5.375, "rewards/reward_model/std": 2.2867372035980225, "step": 137 }, { "completion_length": 11.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2319327731092437, "frac_reward_zero_std": 1.0, "grad_norm": 0.02911699190735817, "kl": 1.452804446220398, "learning_rate": 4.794444444444445e-06, "loss": 0.0015, "num_tokens": 2664726.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 138 }, { "completion_length": 769.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 769.75, "completions/mean_terminated_length": 769.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.23361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.9029837846755981, "kl": 1.0116665363311768, "learning_rate": 4.7888888888888894e-06, "loss": 0.001, "num_tokens": 2677333.0, "reward": 7.218898773193359, "reward_std": 1.1574432849884033, "rewards/reward_model/mean": 7.218898773193359, "rewards/reward_model/std": 1.1574432849884033, "step": 139 }, { "completion_length": 3052.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4165.0, "completions/max_terminated_length": 4165.0, "completions/mean_length": 3052.5, "completions/mean_terminated_length": 3052.5, "completions/min_length": 2442.0, "completions/min_terminated_length": 2442.0, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.25393086671829224, "kl": 0.4295288622379303, "learning_rate": 4.783333333333334e-06, "loss": 0.0004, "num_tokens": 2700003.0, "reward": 8.008160591125488, "reward_std": 0.7072951197624207, "rewards/reward_model/mean": 8.008160591125488, "rewards/reward_model/std": 0.7072951793670654, "step": 140 }, { "completion_length": 2434.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3146.0, "completions/max_terminated_length": 3146.0, "completions/mean_length": 2434.5, "completions/mean_terminated_length": 2434.5, "completions/min_length": 1822.0, "completions/min_terminated_length": 1822.0, "epoch": 0.23697478991596638, "frac_reward_zero_std": 0.0, "grad_norm": 0.3451744318008423, "kl": 0.36107468605041504, "learning_rate": 4.777777777777778e-06, "loss": 0.0004, "num_tokens": 2719029.0, "reward": 6.0, "reward_std": 1.7320507764816284, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 1.7320507764816284, "step": 141 }, { "completion_length": 2533.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3518.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2533.5, "completions/mean_terminated_length": 2533.5, "completions/min_length": 1936.0, "completions/min_terminated_length": 1936.0, "epoch": 0.23865546218487396, "frac_reward_zero_std": 0.0, "grad_norm": 0.22997406125068665, "kl": 0.27732178568840027, "learning_rate": 4.772222222222223e-06, "loss": 0.0003, "num_tokens": 2738123.0, "reward": 6.75, "reward_std": 2.362907886505127, "rewards/reward_model/mean": 6.75, "rewards/reward_model/std": 2.362907886505127, "step": 142 }, { "completion_length": 486.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 486.0, "completions/mean_terminated_length": 486.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.24033613445378152, "frac_reward_zero_std": 0.0, "grad_norm": 0.7206143736839294, "kl": 1.2204090356826782, "learning_rate": 4.766666666666667e-06, "loss": 0.0012, "num_tokens": 2749363.0, "reward": 7.29646110534668, "reward_std": 0.6988224983215332, "rewards/reward_model/mean": 7.29646110534668, "rewards/reward_model/std": 0.698822557926178, "step": 143 }, { "completion_length": 3001.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3150.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 3001.5, "completions/mean_terminated_length": 3001.5, "completions/min_length": 2846.0, "completions/min_terminated_length": 2846.0, "epoch": 0.24201680672268908, "frac_reward_zero_std": 0.0, "grad_norm": 0.23584291338920593, "kl": 0.4656028151512146, "learning_rate": 4.7611111111111115e-06, "loss": 0.0005, "num_tokens": 2771829.0, "reward": 7.873873710632324, "reward_std": 1.058119297027588, "rewards/reward_model/mean": 7.873873710632324, "rewards/reward_model/std": 1.0581194162368774, "step": 144 }, { "completion_length": 2899.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3358.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 2899.5, "completions/mean_terminated_length": 2899.5, "completions/min_length": 2521.0, "completions/min_terminated_length": 2521.0, "epoch": 0.24369747899159663, "frac_reward_zero_std": 0.0, "grad_norm": 0.222762793302536, "kl": 0.45958980917930603, "learning_rate": 4.755555555555556e-06, "loss": 0.0005, "num_tokens": 2792747.0, "reward": 8.856507301330566, "reward_std": 0.8436880707740784, "rewards/reward_model/mean": 8.856507301330566, "rewards/reward_model/std": 0.8436882495880127, "step": 145 }, { "completion_length": 2922.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3402.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 2922.5, "completions/mean_terminated_length": 2922.5, "completions/min_length": 2679.0, "completions/min_terminated_length": 2679.0, "epoch": 0.2453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.21581555902957916, "kl": 0.44578394293785095, "learning_rate": 4.75e-06, "loss": 0.0004, "num_tokens": 2813509.0, "reward": 8.549777030944824, "reward_std": 0.6061685681343079, "rewards/reward_model/mean": 8.549777030944824, "rewards/reward_model/std": 0.6061685085296631, "step": 146 }, { "completion_length": 1648.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1648.25, "completions/mean_terminated_length": 1648.25, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "epoch": 0.24705882352941178, "frac_reward_zero_std": 0.0, "grad_norm": 0.3748073875904083, "kl": 0.39842212200164795, "learning_rate": 4.744444444444445e-06, "loss": 0.0004, "num_tokens": 2829714.0, "reward": 8.25, "reward_std": 2.0615527629852295, "rewards/reward_model/mean": 8.25, "rewards/reward_model/std": 2.0615527629852295, "step": 147 }, { "completion_length": 2712.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 2712.25, "completions/mean_terminated_length": 2712.25, "completions/min_length": 2541.0, "completions/min_terminated_length": 2541.0, "epoch": 0.24873949579831933, "frac_reward_zero_std": 0.0, "grad_norm": 0.23967885971069336, "kl": 0.4646434783935547, "learning_rate": 4.73888888888889e-06, "loss": 0.0005, "num_tokens": 2849275.0, "reward": 8.54823112487793, "reward_std": 0.7377526760101318, "rewards/reward_model/mean": 8.54823112487793, "rewards/reward_model/std": 0.7377527952194214, "step": 148 }, { "completion_length": 2780.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3075.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 2780.0, "completions/mean_terminated_length": 2780.0, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.2504201680672269, "frac_reward_zero_std": 0.0, "grad_norm": 0.2209819257259369, "kl": 0.4562610387802124, "learning_rate": 4.7333333333333335e-06, "loss": 0.0005, "num_tokens": 2869639.0, "reward": 7.664841175079346, "reward_std": 1.110151767730713, "rewards/reward_model/mean": 7.664841175079346, "rewards/reward_model/std": 1.1101515293121338, "step": 149 }, { "completion_length": 1810.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1810.0, "completions/mean_terminated_length": 1810.0, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "epoch": 0.25210084033613445, "frac_reward_zero_std": 0.0, "grad_norm": 0.3472149968147278, "kl": 0.4088854193687439, "learning_rate": 4.727777777777779e-06, "loss": 0.0004, "num_tokens": 2886107.0, "reward": 6.125, "reward_std": 4.269562721252441, "rewards/reward_model/mean": 6.125, "rewards/reward_model/std": 4.2695631980896, "step": 150 }, { "completion_length": 2084.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 2084.75, "completions/mean_terminated_length": 2084.75, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "epoch": 0.253781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.462090402841568, "kl": 0.37683919072151184, "learning_rate": 4.722222222222222e-06, "loss": 0.0004, "num_tokens": 2904326.0, "reward": 5.75, "reward_std": 4.193248748779297, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 4.193248748779297, "step": 151 }, { "completion_length": 3218.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3679.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 3218.25, "completions/mean_terminated_length": 3218.25, "completions/min_length": 2463.0, "completions/min_terminated_length": 2463.0, "epoch": 0.25546218487394956, "frac_reward_zero_std": 0.0, "grad_norm": 0.22532090544700623, "kl": 0.4902237057685852, "learning_rate": 4.7166666666666675e-06, "loss": 0.0005, "num_tokens": 2926371.0, "reward": 7.50351619720459, "reward_std": 1.5778533220291138, "rewards/reward_model/mean": 7.50351619720459, "rewards/reward_model/std": 1.5778533220291138, "step": 152 }, { "completion_length": 2763.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2868.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 2763.5, "completions/mean_terminated_length": 2763.5, "completions/min_length": 2580.0, "completions/min_terminated_length": 2580.0, "epoch": 0.2571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22987692058086395, "kl": 0.5656647682189941, "learning_rate": 4.711111111111111e-06, "loss": 0.0006, "num_tokens": 2946173.0, "reward": 8.71992301940918, "reward_std": 0.3809349834918976, "rewards/reward_model/mean": 8.71992301940918, "rewards/reward_model/std": 0.38093510270118713, "step": 153 }, { "completion_length": 527.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 527.25, "completions/mean_terminated_length": 527.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.25882352941176473, "frac_reward_zero_std": 0.0, "grad_norm": 0.5611518621444702, "kl": 1.0655821561813354, "learning_rate": 4.705555555555556e-06, "loss": 0.0011, "num_tokens": 2958790.0, "reward": 7.388204574584961, "reward_std": 0.7416150569915771, "rewards/reward_model/mean": 7.388204574584961, "rewards/reward_model/std": 0.7416151762008667, "step": 154 }, { "completion_length": 2629.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 2629.25, "completions/mean_terminated_length": 2629.25, "completions/min_length": 2321.0, "completions/min_terminated_length": 2321.0, "epoch": 0.2605042016806723, "frac_reward_zero_std": 0.0, "grad_norm": 0.21701310575008392, "kl": 0.4524945318698883, "learning_rate": 4.7e-06, "loss": 0.0005, "num_tokens": 2979667.0, "reward": 9.411394119262695, "reward_std": 0.45068293809890747, "rewards/reward_model/mean": 9.411394119262695, "rewards/reward_model/std": 0.450682669878006, "step": 155 }, { "completion_length": 806.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 806.5, "completions/mean_terminated_length": 806.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.26218487394957984, "frac_reward_zero_std": 0.0, "grad_norm": 1.1670176982879639, "kl": 1.115036129951477, "learning_rate": 4.694444444444445e-06, "loss": 0.0011, "num_tokens": 2992293.0, "reward": 7.309442043304443, "reward_std": 1.4336081743240356, "rewards/reward_model/mean": 7.309442043304443, "rewards/reward_model/std": 1.4336081743240356, "step": 156 }, { "completion_length": 8.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2638655462184874, "frac_reward_zero_std": 1.0, "grad_norm": 3.568586544133723e-05, "kl": 3.766537666320801, "learning_rate": 4.6888888888888895e-06, "loss": 0.0038, "num_tokens": 3001013.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 157 }, { "completion_length": 2871.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3163.0, "completions/max_terminated_length": 3163.0, "completions/mean_length": 2871.5, "completions/mean_terminated_length": 2871.5, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.26554621848739496, "frac_reward_zero_std": 0.0, "grad_norm": 0.20554177463054657, "kl": 0.474880188703537, "learning_rate": 4.683333333333334e-06, "loss": 0.0005, "num_tokens": 3022867.0, "reward": 7.569443702697754, "reward_std": 2.278564214706421, "rewards/reward_model/mean": 7.569443702697754, "rewards/reward_model/std": 2.278564214706421, "step": 158 }, { "completion_length": 2923.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3323.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 2923.0, "completions/mean_terminated_length": 2923.0, "completions/min_length": 2266.0, "completions/min_terminated_length": 2266.0, "epoch": 0.2672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.220077246427536, "kl": 0.43240219354629517, "learning_rate": 4.677777777777778e-06, "loss": 0.0004, "num_tokens": 3045187.0, "reward": 8.473838806152344, "reward_std": 1.3453700542449951, "rewards/reward_model/mean": 8.473838806152344, "rewards/reward_model/std": 1.3453701734542847, "step": 159 }, { "completion_length": 2675.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 2675.25, "completions/mean_terminated_length": 2675.25, "completions/min_length": 2503.0, "completions/min_terminated_length": 2503.0, "epoch": 0.2689075630252101, "frac_reward_zero_std": 0.0, "grad_norm": 0.24588997662067413, "kl": 0.5831613540649414, "learning_rate": 4.672222222222223e-06, "loss": 0.0006, "num_tokens": 3064848.0, "reward": 9.232027053833008, "reward_std": 0.19261541962623596, "rewards/reward_model/mean": 9.232027053833008, "rewards/reward_model/std": 0.1926155984401703, "step": 160 }, { "completion_length": 2873.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 2873.5, "completions/mean_terminated_length": 2873.5, "completions/min_length": 2782.0, "completions/min_terminated_length": 2782.0, "epoch": 0.27058823529411763, "frac_reward_zero_std": 0.0, "grad_norm": 0.22928255796432495, "kl": 0.7358444333076477, "learning_rate": 4.666666666666667e-06, "loss": 0.0007, "num_tokens": 3085830.0, "reward": 8.405526161193848, "reward_std": 1.475093960762024, "rewards/reward_model/mean": 8.405526161193848, "rewards/reward_model/std": 1.4750943183898926, "step": 161 }, { "completion_length": 3030.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3525.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 3030.0, "completions/mean_terminated_length": 3030.0, "completions/min_length": 2763.0, "completions/min_terminated_length": 2763.0, "epoch": 0.2722689075630252, "frac_reward_zero_std": 0.0, "grad_norm": 0.22029252350330353, "kl": 0.47707507014274597, "learning_rate": 4.6611111111111116e-06, "loss": 0.0005, "num_tokens": 3108074.0, "reward": 8.712586402893066, "reward_std": 0.9264523983001709, "rewards/reward_model/mean": 8.712586402893066, "rewards/reward_model/std": 0.9264521598815918, "step": 162 }, { "completion_length": 2786.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3248.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 2786.0, "completions/mean_terminated_length": 2786.0, "completions/min_length": 2233.0, "completions/min_terminated_length": 2233.0, "epoch": 0.2739495798319328, "frac_reward_zero_std": 0.0, "grad_norm": 0.29195111989974976, "kl": 0.4199967384338379, "learning_rate": 4.655555555555556e-06, "loss": 0.0004, "num_tokens": 3129658.0, "reward": 7.955666542053223, "reward_std": 2.5929222106933594, "rewards/reward_model/mean": 7.955666542053223, "rewards/reward_model/std": 2.5929222106933594, "step": 163 }, { "completion_length": 2653.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 2653.0, "completions/mean_terminated_length": 2653.0, "completions/min_length": 2464.0, "completions/min_terminated_length": 2464.0, "epoch": 0.27563025210084036, "frac_reward_zero_std": 0.0, "grad_norm": 0.2398880124092102, "kl": 0.6989083886146545, "learning_rate": 4.65e-06, "loss": 0.0007, "num_tokens": 3149762.0, "reward": 8.421257019042969, "reward_std": 2.498462677001953, "rewards/reward_model/mean": 8.421257019042969, "rewards/reward_model/std": 2.498462677001953, "step": 164 }, { "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.2773109243697479, "frac_reward_zero_std": 1.0, "grad_norm": 0.11003076285123825, "kl": 3.142202377319336, "learning_rate": 4.644444444444445e-06, "loss": 0.0031, "num_tokens": 3158262.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 165 }, { "completion_length": 3022.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3399.0, "completions/max_terminated_length": 3399.0, "completions/mean_length": 3022.75, "completions/mean_terminated_length": 3022.75, "completions/min_length": 2646.0, "completions/min_terminated_length": 2646.0, "epoch": 0.27899159663865547, "frac_reward_zero_std": 0.0, "grad_norm": 0.22422586381435394, "kl": 0.44220367074012756, "learning_rate": 4.638888888888889e-06, "loss": 0.0004, "num_tokens": 3181801.0, "reward": 7.756802558898926, "reward_std": 1.6295182704925537, "rewards/reward_model/mean": 7.756802558898926, "rewards/reward_model/std": 1.6295182704925537, "step": 166 }, { "completion_length": 9.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.0, "completions/mean_terminated_length": 9.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.280672268907563, "frac_reward_zero_std": 1.0, "grad_norm": 0.094392791390419, "kl": 3.158621072769165, "learning_rate": 4.633333333333334e-06, "loss": 0.0032, "num_tokens": 3190213.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 167 }, { "completion_length": 2663.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2663.5, "completions/mean_terminated_length": 2663.5, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.2823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.2424425184726715, "kl": 0.5139824151992798, "learning_rate": 4.627777777777778e-06, "loss": 0.0005, "num_tokens": 3210791.0, "reward": 8.023183822631836, "reward_std": 0.60601407289505, "rewards/reward_model/mean": 8.023183822631836, "rewards/reward_model/std": 0.6060142517089844, "step": 168 }, { "completion_length": 2033.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2274.0, "completions/max_terminated_length": 2274.0, "completions/mean_length": 2033.25, "completions/mean_terminated_length": 2033.25, "completions/min_length": 1597.0, "completions/min_terminated_length": 1597.0, "epoch": 0.28403361344537814, "frac_reward_zero_std": 0.0, "grad_norm": 0.33141204714775085, "kl": 0.3702349364757538, "learning_rate": 4.622222222222222e-06, "loss": 0.0004, "num_tokens": 3228656.0, "reward": 6.625, "reward_std": 2.462214469909668, "rewards/reward_model/mean": 6.625, "rewards/reward_model/std": 2.462214469909668, "step": 169 }, { "completion_length": 2960.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3061.0, "completions/max_terminated_length": 3061.0, "completions/mean_length": 2960.75, "completions/mean_terminated_length": 2960.75, "completions/min_length": 2787.0, "completions/min_terminated_length": 2787.0, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.24120405316352844, "kl": 0.4731678366661072, "learning_rate": 4.616666666666667e-06, "loss": 0.0005, "num_tokens": 3250495.0, "reward": 8.096492767333984, "reward_std": 2.1864511966705322, "rewards/reward_model/mean": 8.096492767333984, "rewards/reward_model/std": 2.186450958251953, "step": 170 }, { "completion_length": 1669.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1669.5, "completions/mean_terminated_length": 1669.5, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "epoch": 0.28739495798319326, "frac_reward_zero_std": 0.0, "grad_norm": 0.3854559063911438, "kl": 0.4447607100009918, "learning_rate": 4.611111111111112e-06, "loss": 0.0004, "num_tokens": 3267873.0, "reward": 9.125, "reward_std": 1.4361406564712524, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.4361406564712524, "step": 171 }, { "completion_length": 1177.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1740.0, "completions/max_terminated_length": 1740.0, "completions/mean_length": 1177.0, "completions/mean_terminated_length": 1177.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.28907563025210087, "frac_reward_zero_std": 0.0, "grad_norm": 0.5531060695648193, "kl": 0.856512725353241, "learning_rate": 4.605555555555556e-06, "loss": 0.0009, "num_tokens": 3282981.0, "reward": 5.437447547912598, "reward_std": 2.498600721359253, "rewards/reward_model/mean": 5.437447547912598, "rewards/reward_model/std": 2.498600721359253, "step": 172 }, { "completion_length": 35.5, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 35.5, "completions/mean_terminated_length": 35.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2907563025210084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045571294613182545, "kl": 2.7890782356262207, "learning_rate": 4.600000000000001e-06, "loss": 0.0028, "num_tokens": 3290631.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 173 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.292436974789916, "frac_reward_zero_std": 1.0, "grad_norm": 0.07329612970352173, "kl": 0.8356922268867493, "learning_rate": 4.594444444444444e-06, "loss": 0.0008, "num_tokens": 3298979.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 174 }, { "completion_length": 1129.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 1129.25, "completions/mean_terminated_length": 1129.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.29411764705882354, "frac_reward_zero_std": 0.0, "grad_norm": 0.6764624118804932, "kl": 0.7676644921302795, "learning_rate": 4.58888888888889e-06, "loss": 0.0008, "num_tokens": 3313872.0, "reward": 6.710842132568359, "reward_std": 1.0217556953430176, "rewards/reward_model/mean": 6.710842132568359, "rewards/reward_model/std": 1.0217556953430176, "step": 175 }, { "completion_length": 1741.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2025.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1741.0, "completions/mean_terminated_length": 1741.0, "completions/min_length": 1512.0, "completions/min_terminated_length": 1512.0, "epoch": 0.2957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.3833245038986206, "kl": 0.408497154712677, "learning_rate": 4.583333333333333e-06, "loss": 0.0004, "num_tokens": 3329552.0, "reward": 4.75, "reward_std": 2.901149272918701, "rewards/reward_model/mean": 4.75, "rewards/reward_model/std": 2.901149272918701, "step": 176 }, { "completion_length": 822.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 822.5, "completions/mean_terminated_length": 822.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.29747899159663865, "frac_reward_zero_std": 0.0, "grad_norm": 0.7895044088363647, "kl": 0.9263789057731628, "learning_rate": 4.5777777777777785e-06, "loss": 0.0009, "num_tokens": 3342470.0, "reward": 6.625405311584473, "reward_std": 0.6329342126846313, "rewards/reward_model/mean": 6.625405311584473, "rewards/reward_model/std": 0.6329342126846313, "step": 177 }, { "completion_length": 2721.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2721.25, "completions/mean_terminated_length": 2721.25, "completions/min_length": 2488.0, "completions/min_terminated_length": 2488.0, "epoch": 0.2991596638655462, "frac_reward_zero_std": 0.0, "grad_norm": 0.23519279062747955, "kl": 0.47075846791267395, "learning_rate": 4.572222222222222e-06, "loss": 0.0005, "num_tokens": 3362955.0, "reward": 8.75, "reward_std": 1.3994046449661255, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.3994046449661255, "step": 178 }, { "completion_length": 2845.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 2845.5, "completions/mean_terminated_length": 2845.5, "completions/min_length": 2671.0, "completions/min_terminated_length": 2671.0, "epoch": 0.30084033613445377, "frac_reward_zero_std": 0.0, "grad_norm": 0.22595342993736267, "kl": 0.4554749131202698, "learning_rate": 4.566666666666667e-06, "loss": 0.0005, "num_tokens": 3383937.0, "reward": 8.375, "reward_std": 1.5612494945526123, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.5612494945526123, "step": 179 }, { "completion_length": 2462.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 2462.25, "completions/mean_terminated_length": 2462.25, "completions/min_length": 2137.0, "completions/min_terminated_length": 2137.0, "epoch": 0.3025210084033613, "frac_reward_zero_std": 0.0, "grad_norm": 0.3311174511909485, "kl": 0.3128574788570404, "learning_rate": 4.561111111111112e-06, "loss": 0.0003, "num_tokens": 3403302.0, "reward": 8.5, "reward_std": 2.3804759979248047, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 2.380476236343384, "step": 180 }, { "completion_length": 3018.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3157.0, "completions/max_terminated_length": 3157.0, "completions/mean_length": 3018.25, "completions/mean_terminated_length": 3018.25, "completions/min_length": 2856.0, "completions/min_terminated_length": 2856.0, "epoch": 0.3042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.23110154271125793, "kl": 0.42669227719306946, "learning_rate": 4.555555555555556e-06, "loss": 0.0004, "num_tokens": 3424899.0, "reward": 7.908246040344238, "reward_std": 1.5839347839355469, "rewards/reward_model/mean": 7.908246040344238, "rewards/reward_model/std": 1.5839349031448364, "step": 181 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 5.533947387448279e-06, "kl": 0.6830087304115295, "learning_rate": 4.5500000000000005e-06, "loss": 0.0007, "num_tokens": 3433059.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 182 }, { "completion_length": 2384.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 2384.75, "completions/mean_terminated_length": 769.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.30756302521008405, "frac_reward_zero_std": 0.0, "grad_norm": 0.3204618990421295, "kl": 0.7881150841712952, "learning_rate": 4.544444444444445e-06, "loss": 0.0008, "num_tokens": 3453478.0, "reward": 4.037667274475098, "reward_std": 3.387129306793213, "rewards/reward_model/mean": 4.037667274475098, "rewards/reward_model/std": 3.387129068374634, "step": 183 }, { "completion_length": 2932.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 2932.75, "completions/mean_terminated_length": 1499.666748046875, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "epoch": 0.3092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.22691690921783447, "kl": 0.32286691665649414, "learning_rate": 4.538888888888889e-06, "loss": 0.0003, "num_tokens": 3474185.0, "reward": 6.0, "reward_std": 3.535533905029297, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.535533905029297, "step": 184 }, { "completion_length": 1219.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1704.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 1219.5, "completions/mean_terminated_length": 1219.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.31092436974789917, "frac_reward_zero_std": 0.0, "grad_norm": 12.985053062438965, "kl": 1.6388633251190186, "learning_rate": 4.533333333333334e-06, "loss": 0.0016, "num_tokens": 3489371.0, "reward": 5.125, "reward_std": 3.6142079830169678, "rewards/reward_model/mean": 5.125, "rewards/reward_model/std": 3.6142079830169678, "step": 185 }, { "completion_length": 2768.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 2768.75, "completions/mean_terminated_length": 2768.75, "completions/min_length": 2353.0, "completions/min_terminated_length": 2353.0, "epoch": 0.3126050420168067, "frac_reward_zero_std": 0.0, "grad_norm": 0.22867242991924286, "kl": 0.47679272294044495, "learning_rate": 4.527777777777778e-06, "loss": 0.0005, "num_tokens": 3509922.0, "reward": 8.5, "reward_std": 1.0801234245300293, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.0801235437393188, "step": 186 }, { "completion_length": 515.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1392.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 515.5, "completions/mean_terminated_length": 515.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.3142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.772415816783905, "kl": 1.1174834966659546, "learning_rate": 4.5222222222222225e-06, "loss": 0.0011, "num_tokens": 3521276.0, "reward": 6.782945156097412, "reward_std": 1.331932783126831, "rewards/reward_model/mean": 6.782945156097412, "rewards/reward_model/std": 1.3319326639175415, "step": 187 }, { "completion_length": 3300.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 3300.5, "completions/mean_terminated_length": 3300.5, "completions/min_length": 2707.0, "completions/min_terminated_length": 2707.0, "epoch": 0.31596638655462184, "frac_reward_zero_std": 0.0, "grad_norm": 0.2294078916311264, "kl": 0.47780218720436096, "learning_rate": 4.516666666666667e-06, "loss": 0.0005, "num_tokens": 3544978.0, "reward": 8.62745475769043, "reward_std": 0.3291955590248108, "rewards/reward_model/mean": 8.62745475769043, "rewards/reward_model/std": 0.3291955590248108, "step": 188 }, { "completion_length": 1702.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1702.5, "completions/mean_terminated_length": 1702.5, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "epoch": 0.3176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.3584645092487335, "kl": 0.38060808181762695, "learning_rate": 4.511111111111111e-06, "loss": 0.0004, "num_tokens": 3561500.0, "reward": 9.125, "reward_std": 1.75, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.75, "step": 189 }, { "completion_length": 1480.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 1480.5, "completions/mean_terminated_length": 1480.5, "completions/min_length": 1308.0, "completions/min_terminated_length": 1308.0, "epoch": 0.31932773109243695, "frac_reward_zero_std": 0.0, "grad_norm": 0.4364623427391052, "kl": 0.4817902445793152, "learning_rate": 4.505555555555556e-06, "loss": 0.0005, "num_tokens": 3576722.0, "reward": 7.875, "reward_std": 2.3935678005218506, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 2.3935678005218506, "step": 190 }, { "completion_length": 2473.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2725.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 2473.25, "completions/mean_terminated_length": 2473.25, "completions/min_length": 1974.0, "completions/min_terminated_length": 1974.0, "epoch": 0.32100840336134456, "frac_reward_zero_std": 0.0, "grad_norm": 0.25333359837532043, "kl": 0.5164632201194763, "learning_rate": 4.5e-06, "loss": 0.0005, "num_tokens": 3597439.0, "reward": 8.10063362121582, "reward_std": 0.573161780834198, "rewards/reward_model/mean": 8.10063362121582, "rewards/reward_model/std": 0.573161780834198, "step": 191 }, { "completion_length": 2716.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 2716.5, "completions/mean_terminated_length": 2716.5, "completions/min_length": 2586.0, "completions/min_terminated_length": 2586.0, "epoch": 0.3226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.22468847036361694, "kl": 0.7855070233345032, "learning_rate": 4.4944444444444445e-06, "loss": 0.0008, "num_tokens": 3618181.0, "reward": 8.293278694152832, "reward_std": 1.2478078603744507, "rewards/reward_model/mean": 8.293278694152832, "rewards/reward_model/std": 1.2478079795837402, "step": 192 }, { "completion_length": 1787.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1787.25, "completions/mean_terminated_length": 1787.25, "completions/min_length": 1577.0, "completions/min_terminated_length": 1577.0, "epoch": 0.3243697478991597, "frac_reward_zero_std": 0.0, "grad_norm": 0.44078683853149414, "kl": 0.3985818028450012, "learning_rate": 4.488888888888889e-06, "loss": 0.0004, "num_tokens": 3635362.0, "reward": 8.875, "reward_std": 1.6520190238952637, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.6520190238952637, "step": 193 }, { "completion_length": 2601.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 2601.0, "completions/mean_terminated_length": 2601.0, "completions/min_length": 2142.0, "completions/min_terminated_length": 2142.0, "epoch": 0.32605042016806723, "frac_reward_zero_std": 0.0, "grad_norm": 0.27554836869239807, "kl": 0.5283735394477844, "learning_rate": 4.483333333333333e-06, "loss": 0.0005, "num_tokens": 3655434.0, "reward": 8.392618179321289, "reward_std": 0.2633104622364044, "rewards/reward_model/mean": 8.392618179321289, "rewards/reward_model/std": 0.2633104622364044, "step": 194 }, { "completion_length": 1337.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 1337.25, "completions/mean_terminated_length": 1337.25, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "epoch": 0.3277310924369748, "frac_reward_zero_std": 0.0, "grad_norm": 0.5729833841323853, "kl": 0.7496541142463684, "learning_rate": 4.477777777777778e-06, "loss": 0.0007, "num_tokens": 3670847.0, "reward": 5.385059833526611, "reward_std": 2.0218420028686523, "rewards/reward_model/mean": 5.385059833526611, "rewards/reward_model/std": 2.0218420028686523, "step": 195 }, { "completion_length": 1217.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 1217.5, "completions/mean_terminated_length": 1217.5, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.32941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.54954594373703, "kl": 0.9338546395301819, "learning_rate": 4.472222222222223e-06, "loss": 0.0009, "num_tokens": 3685053.0, "reward": 6.616114139556885, "reward_std": 0.3764911890029907, "rewards/reward_model/mean": 6.616114139556885, "rewards/reward_model/std": 0.37649112939834595, "step": 196 }, { "completion_length": 1339.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 1339.75, "completions/mean_terminated_length": 1339.75, "completions/min_length": 1274.0, "completions/min_terminated_length": 1274.0, "epoch": 0.3310924369747899, "frac_reward_zero_std": 0.0, "grad_norm": 0.49502500891685486, "kl": 0.6899818181991577, "learning_rate": 4.4666666666666665e-06, "loss": 0.0007, "num_tokens": 3700456.0, "reward": 6.938961029052734, "reward_std": 0.47081485390663147, "rewards/reward_model/mean": 6.938961029052734, "rewards/reward_model/std": 0.470814973115921, "step": 197 }, { "completion_length": 2196.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 2196.25, "completions/mean_terminated_length": 2196.25, "completions/min_length": 1713.0, "completions/min_terminated_length": 1713.0, "epoch": 0.33277310924369746, "frac_reward_zero_std": 0.0, "grad_norm": 0.33635690808296204, "kl": 0.3528163731098175, "learning_rate": 4.461111111111112e-06, "loss": 0.0004, "num_tokens": 3717853.0, "reward": 8.75, "reward_std": 1.5545631647109985, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.5545631647109985, "step": 198 }, { "completion_length": 1193.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 1193.75, "completions/mean_terminated_length": 1193.75, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.334453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.6599255800247192, "kl": 0.8738446235656738, "learning_rate": 4.455555555555555e-06, "loss": 0.0009, "num_tokens": 3731124.0, "reward": 6.023212909698486, "reward_std": 1.539479374885559, "rewards/reward_model/mean": 6.023212909698486, "rewards/reward_model/std": 1.539479374885559, "step": 199 }, { "completion_length": 1154.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 1154.5, "completions/mean_terminated_length": 1154.5, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "epoch": 0.33613445378151263, "frac_reward_zero_std": 0.0, "grad_norm": 0.628311812877655, "kl": 0.8193744421005249, "learning_rate": 4.450000000000001e-06, "loss": 0.0008, "num_tokens": 3745958.0, "reward": 6.5886006355285645, "reward_std": 1.5735419988632202, "rewards/reward_model/mean": 6.5886006355285645, "rewards/reward_model/std": 1.5735421180725098, "step": 200 }, { "completion_length": 2125.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 2125.25, "completions/mean_terminated_length": 2125.25, "completions/min_length": 1687.0, "completions/min_terminated_length": 1687.0, "epoch": 0.3378151260504202, "frac_reward_zero_std": 0.0, "grad_norm": 0.3583378195762634, "kl": 0.3288140296936035, "learning_rate": 4.444444444444444e-06, "loss": 0.0003, "num_tokens": 3763167.0, "reward": 6.5, "reward_std": 0.9128709435462952, "rewards/reward_model/mean": 6.5, "rewards/reward_model/std": 0.9128709435462952, "step": 201 }, { "completion_length": 2559.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 2559.25, "completions/mean_terminated_length": 2559.25, "completions/min_length": 2192.0, "completions/min_terminated_length": 2192.0, "epoch": 0.33949579831932775, "frac_reward_zero_std": 0.0, "grad_norm": 0.23530146479606628, "kl": 0.4598729908466339, "learning_rate": 4.438888888888889e-06, "loss": 0.0005, "num_tokens": 3784792.0, "reward": 8.130855560302734, "reward_std": 0.7685312628746033, "rewards/reward_model/mean": 8.130855560302734, "rewards/reward_model/std": 0.7685312628746033, "step": 202 }, { "completion_length": 2495.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 2495.25, "completions/mean_terminated_length": 2495.25, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "epoch": 0.3411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.23678502440452576, "kl": 0.4983557164669037, "learning_rate": 4.433333333333334e-06, "loss": 0.0005, "num_tokens": 3803773.0, "reward": 7.749725341796875, "reward_std": 1.274611234664917, "rewards/reward_model/mean": 7.749725341796875, "rewards/reward_model/std": 1.274611234664917, "step": 203 }, { "completion_length": 2991.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3169.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 2991.5, "completions/mean_terminated_length": 2991.5, "completions/min_length": 2786.0, "completions/min_terminated_length": 2786.0, "epoch": 0.34285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.21996529400348663, "kl": 0.44032400846481323, "learning_rate": 4.427777777777778e-06, "loss": 0.0004, "num_tokens": 3826407.0, "reward": 9.06251049041748, "reward_std": 0.13361211121082306, "rewards/reward_model/mean": 9.06251049041748, "rewards/reward_model/std": 0.13361230492591858, "step": 204 }, { "completion_length": 3041.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3546.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 3041.25, "completions/mean_terminated_length": 3041.25, "completions/min_length": 2768.0, "completions/min_terminated_length": 2768.0, "epoch": 0.3445378151260504, "frac_reward_zero_std": 0.0, "grad_norm": 0.22607754170894623, "kl": 0.4181973338127136, "learning_rate": 4.422222222222223e-06, "loss": 0.0004, "num_tokens": 3848652.0, "reward": 8.249590873718262, "reward_std": 1.1901236772537231, "rewards/reward_model/mean": 8.249590873718262, "rewards/reward_model/std": 1.1901236772537231, "step": 205 }, { "completion_length": 1240.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1490.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 1240.0, "completions/mean_terminated_length": 1240.0, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.346218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.6906178593635559, "kl": 0.7297955751419067, "learning_rate": 4.416666666666667e-06, "loss": 0.0007, "num_tokens": 3864228.0, "reward": 6.978236198425293, "reward_std": 0.2300529032945633, "rewards/reward_model/mean": 6.978236198425293, "rewards/reward_model/std": 0.2300529032945633, "step": 206 }, { "completion_length": 2016.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 2016.25, "completions/mean_terminated_length": 2016.25, "completions/min_length": 1466.0, "completions/min_terminated_length": 1466.0, "epoch": 0.34789915966386553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021035869140177965, "kl": 0.3609980344772339, "learning_rate": 4.411111111111111e-06, "loss": 0.0004, "num_tokens": 3881761.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 207 }, { "completion_length": 2906.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3030.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 2906.5, "completions/mean_terminated_length": 2906.5, "completions/min_length": 2797.0, "completions/min_terminated_length": 2797.0, "epoch": 0.3495798319327731, "frac_reward_zero_std": 0.0, "grad_norm": 0.2264556735754013, "kl": 0.431278795003891, "learning_rate": 4.405555555555556e-06, "loss": 0.0004, "num_tokens": 3903959.0, "reward": 9.686347007751465, "reward_std": 0.37577515840530396, "rewards/reward_model/mean": 9.686347007751465, "rewards/reward_model/std": 0.3757750988006592, "step": 208 }, { "completion_length": 2715.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2963.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 2715.5, "completions/mean_terminated_length": 2715.5, "completions/min_length": 2502.0, "completions/min_terminated_length": 2502.0, "epoch": 0.35126050420168065, "frac_reward_zero_std": 0.0, "grad_norm": 0.23651069402694702, "kl": 0.4715757966041565, "learning_rate": 4.4e-06, "loss": 0.0005, "num_tokens": 3924701.0, "reward": 9.56467056274414, "reward_std": 0.5904367566108704, "rewards/reward_model/mean": 9.56467056274414, "rewards/reward_model/std": 0.5904366970062256, "step": 209 }, { "completion_length": 1060.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 1060.5, "completions/mean_terminated_length": 1060.5, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.35294117647058826, "frac_reward_zero_std": 0.0, "grad_norm": 0.6156778931617737, "kl": 0.8022011518478394, "learning_rate": 4.3944444444444455e-06, "loss": 0.0008, "num_tokens": 3939579.0, "reward": 5.080005645751953, "reward_std": 3.199277877807617, "rewards/reward_model/mean": 5.080005645751953, "rewards/reward_model/std": 3.199277877807617, "step": 210 }, { "completion_length": 2800.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2800.0, "completions/mean_terminated_length": 2800.0, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "epoch": 0.3546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.2301899492740631, "kl": 0.4365086853504181, "learning_rate": 4.388888888888889e-06, "loss": 0.0004, "num_tokens": 3959559.0, "reward": 8.273713111877441, "reward_std": 1.4226092100143433, "rewards/reward_model/mean": 8.273713111877441, "rewards/reward_model/std": 1.4226092100143433, "step": 211 }, { "completion_length": 1130.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1130.25, "completions/mean_terminated_length": 1130.25, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.3563025210084034, "frac_reward_zero_std": 0.0, "grad_norm": 0.6039474606513977, "kl": 1.3054817914962769, "learning_rate": 4.383333333333334e-06, "loss": 0.0013, "num_tokens": 3974288.0, "reward": 6.615111827850342, "reward_std": 1.1467223167419434, "rewards/reward_model/mean": 6.615111827850342, "rewards/reward_model/std": 1.1467223167419434, "step": 212 }, { "completion_length": 1182.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1182.75, "completions/mean_terminated_length": 1182.75, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.35798319327731093, "frac_reward_zero_std": 0.0, "grad_norm": 0.563639223575592, "kl": 0.8061200976371765, "learning_rate": 4.377777777777778e-06, "loss": 0.0008, "num_tokens": 3989879.0, "reward": 7.473617076873779, "reward_std": 0.6156018376350403, "rewards/reward_model/mean": 7.473617076873779, "rewards/reward_model/std": 0.6156017184257507, "step": 213 }, { "completion_length": 2827.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3099.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 2827.25, "completions/mean_terminated_length": 2827.25, "completions/min_length": 2574.0, "completions/min_terminated_length": 2574.0, "epoch": 0.3596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.2129405289888382, "kl": 0.4732570946216583, "learning_rate": 4.372222222222223e-06, "loss": 0.0005, "num_tokens": 4010588.0, "reward": 8.651857376098633, "reward_std": 0.4093300700187683, "rewards/reward_model/mean": 8.651857376098633, "rewards/reward_model/std": 0.4093301296234131, "step": 214 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.36134453781512604, "frac_reward_zero_std": 1.0, "grad_norm": 3.541159685482853e-06, "kl": 0.7164430022239685, "learning_rate": 4.366666666666667e-06, "loss": 0.0007, "num_tokens": 4018988.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 215 }, { "completion_length": 2901.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3227.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 2901.25, "completions/mean_terminated_length": 2901.25, "completions/min_length": 2564.0, "completions/min_terminated_length": 2564.0, "epoch": 0.3630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.22036711871623993, "kl": 0.44254857301712036, "learning_rate": 4.361111111111112e-06, "loss": 0.0004, "num_tokens": 4040389.0, "reward": 8.36063003540039, "reward_std": 0.6233159303665161, "rewards/reward_model/mean": 8.36063003540039, "rewards/reward_model/std": 0.6233160495758057, "step": 216 }, { "completion_length": 498.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 498.75, "completions/mean_terminated_length": 498.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.36470588235294116, "frac_reward_zero_std": 0.0, "grad_norm": 7.46598482131958, "kl": 1.1445415019989014, "learning_rate": 4.3555555555555555e-06, "loss": 0.0011, "num_tokens": 4050540.0, "reward": 2.5, "reward_std": 2.886751174926758, "rewards/reward_model/mean": 2.5, "rewards/reward_model/std": 2.886751413345337, "step": 217 }, { "completion_length": 2650.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2959.0, "completions/max_terminated_length": 2959.0, "completions/mean_length": 2650.75, "completions/mean_terminated_length": 2650.75, "completions/min_length": 2241.0, "completions/min_terminated_length": 2241.0, "epoch": 0.3663865546218487, "frac_reward_zero_std": 0.0, "grad_norm": 0.2419486939907074, "kl": 0.4856666922569275, "learning_rate": 4.350000000000001e-06, "loss": 0.0005, "num_tokens": 4070671.0, "reward": 6.481883525848389, "reward_std": 1.6037975549697876, "rewards/reward_model/mean": 6.481883525848389, "rewards/reward_model/std": 1.6037975549697876, "step": 218 }, { "completion_length": 1812.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1812.0, "completions/mean_terminated_length": 1812.0, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "epoch": 0.3680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.3586513102054596, "kl": 0.4191110134124756, "learning_rate": 4.344444444444445e-06, "loss": 0.0004, "num_tokens": 4087115.0, "reward": 9.125, "reward_std": 0.6291528940200806, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 0.6291528940200806, "step": 219 }, { "completion_length": 1374.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1374.5, "completions/mean_terminated_length": 1374.5, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 0.3697478991596639, "frac_reward_zero_std": 0.0, "grad_norm": 0.5102481842041016, "kl": 0.7383792400360107, "learning_rate": 4.3388888888888895e-06, "loss": 0.0007, "num_tokens": 4101217.0, "reward": 6.679230213165283, "reward_std": 0.7687322497367859, "rewards/reward_model/mean": 6.679230213165283, "rewards/reward_model/std": 0.768732488155365, "step": 220 }, { "completion_length": 1784.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1784.5, "completions/mean_terminated_length": 1784.5, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "epoch": 0.37142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.3449091911315918, "kl": 0.3592361807823181, "learning_rate": 4.333333333333334e-06, "loss": 0.0004, "num_tokens": 4118411.0, "reward": 6.0, "reward_std": 3.240370273590088, "rewards/reward_model/mean": 6.0, "rewards/reward_model/std": 3.240370273590088, "step": 221 }, { "completion_length": 1657.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1657.0, "completions/mean_terminated_length": 1657.0, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "epoch": 0.373109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.3586861491203308, "kl": 0.44242116808891296, "learning_rate": 4.327777777777778e-06, "loss": 0.0004, "num_tokens": 4133891.0, "reward": 8.75, "reward_std": 1.8929693698883057, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.8929694890975952, "step": 222 }, { "completion_length": 2416.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3223.0, "completions/max_terminated_length": 3223.0, "completions/mean_length": 2416.0, "completions/mean_terminated_length": 2416.0, "completions/min_length": 1638.0, "completions/min_terminated_length": 1638.0, "epoch": 0.37478991596638656, "frac_reward_zero_std": 0.0, "grad_norm": 0.31728434562683105, "kl": 0.3470199704170227, "learning_rate": 4.322222222222223e-06, "loss": 0.0003, "num_tokens": 4152903.0, "reward": 7.0, "reward_std": 3.488075017929077, "rewards/reward_model/mean": 7.0, "rewards/reward_model/std": 3.488075017929077, "step": 223 }, { "completion_length": 1478.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1478.25, "completions/mean_terminated_length": 1478.25, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "epoch": 0.3764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.4029053747653961, "kl": 0.3796059489250183, "learning_rate": 4.316666666666667e-06, "loss": 0.0004, "num_tokens": 4168224.0, "reward": 8.375, "reward_std": 1.108677864074707, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.1086779832839966, "step": 224 }, { "completion_length": 2548.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2548.25, "completions/mean_terminated_length": 2548.25, "completions/min_length": 2442.0, "completions/min_terminated_length": 2442.0, "epoch": 0.37815126050420167, "frac_reward_zero_std": 0.0, "grad_norm": 0.23842692375183105, "kl": 0.5093043446540833, "learning_rate": 4.3111111111111115e-06, "loss": 0.0005, "num_tokens": 4187045.0, "reward": 8.74769115447998, "reward_std": 0.6377931833267212, "rewards/reward_model/mean": 8.74769115447998, "rewards/reward_model/std": 0.6377933025360107, "step": 225 }, { "completion_length": 789.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 789.75, "completions/mean_terminated_length": 789.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3798319327731092, "frac_reward_zero_std": 0.0, "grad_norm": 0.6549173593521118, "kl": 1.0279666185379028, "learning_rate": 4.305555555555556e-06, "loss": 0.001, "num_tokens": 4199356.0, "reward": 7.8533034324646, "reward_std": 0.7746135592460632, "rewards/reward_model/mean": 7.8533034324646, "rewards/reward_model/std": 0.7746136784553528, "step": 226 }, { "completion_length": 1766.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1766.5, "completions/mean_terminated_length": 1766.5, "completions/min_length": 1432.0, "completions/min_terminated_length": 1432.0, "epoch": 0.3815126050420168, "frac_reward_zero_std": 0.0, "grad_norm": 0.3704773783683777, "kl": 0.37874269485473633, "learning_rate": 4.3e-06, "loss": 0.0004, "num_tokens": 4215918.0, "reward": 5.75, "reward_std": 2.217355728149414, "rewards/reward_model/mean": 5.75, "rewards/reward_model/std": 2.217355966567993, "step": 227 }, { "completion_length": 3109.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3472.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 3109.5, "completions/mean_terminated_length": 3109.5, "completions/min_length": 2585.0, "completions/min_terminated_length": 2585.0, "epoch": 0.3831932773109244, "frac_reward_zero_std": 0.0, "grad_norm": 0.23839071393013, "kl": 0.47008052468299866, "learning_rate": 4.294444444444445e-06, "loss": 0.0005, "num_tokens": 4239080.0, "reward": 7.694772243499756, "reward_std": 1.1449562311172485, "rewards/reward_model/mean": 7.694772243499756, "rewards/reward_model/std": 1.1449562311172485, "step": 228 }, { "completion_length": 1801.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 1801.0, "completions/mean_terminated_length": 1801.0, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "epoch": 0.38487394957983195, "frac_reward_zero_std": 0.0, "grad_norm": 0.3541783094406128, "kl": 0.4009455740451813, "learning_rate": 4.288888888888889e-06, "loss": 0.0004, "num_tokens": 4255632.0, "reward": 8.0, "reward_std": 0.8164966106414795, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 0.8164966106414795, "step": 229 }, { "completion_length": 1486.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1486.25, "completions/mean_terminated_length": 1486.25, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 0.3865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6752094626426697, "kl": 0.42911961674690247, "learning_rate": 4.2833333333333335e-06, "loss": 0.0004, "num_tokens": 4270469.0, "reward": 6.8125, "reward_std": 4.160203456878662, "rewards/reward_model/mean": 6.8125, "rewards/reward_model/std": 4.160203456878662, "step": 230 }, { "completion_length": 2945.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3142.0, "completions/max_terminated_length": 3142.0, "completions/mean_length": 2945.5, "completions/mean_terminated_length": 2945.5, "completions/min_length": 2688.0, "completions/min_terminated_length": 2688.0, "epoch": 0.38823529411764707, "frac_reward_zero_std": 0.0, "grad_norm": 0.2245272994041443, "kl": 0.5455446839332581, "learning_rate": 4.277777777777778e-06, "loss": 0.0005, "num_tokens": 4292147.0, "reward": 7.7876482009887695, "reward_std": 1.4192252159118652, "rewards/reward_model/mean": 7.7876482009887695, "rewards/reward_model/std": 1.4192253351211548, "step": 231 }, { "completion_length": 1662.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1662.0, "completions/mean_terminated_length": 1662.0, "completions/min_length": 1529.0, "completions/min_terminated_length": 1529.0, "epoch": 0.3899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.3757660388946533, "kl": 0.42322346568107605, "learning_rate": 4.272222222222222e-06, "loss": 0.0004, "num_tokens": 4307431.0, "reward": 7.25, "reward_std": 0.8660253882408142, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 0.8660253882408142, "step": 232 }, { "completion_length": 2795.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3066.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 2795.75, "completions/mean_terminated_length": 2795.75, "completions/min_length": 2561.0, "completions/min_terminated_length": 2561.0, "epoch": 0.3915966386554622, "frac_reward_zero_std": 0.0, "grad_norm": 0.22974319756031036, "kl": 0.4618850648403168, "learning_rate": 4.266666666666668e-06, "loss": 0.0005, "num_tokens": 4328122.0, "reward": 7.258394241333008, "reward_std": 0.5371370911598206, "rewards/reward_model/mean": 7.258394241333008, "rewards/reward_model/std": 0.5371370911598206, "step": 233 }, { "completion_length": 1628.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1628.25, "completions/mean_terminated_length": 1628.25, "completions/min_length": 1252.0, "completions/min_terminated_length": 1252.0, "epoch": 0.39327731092436974, "frac_reward_zero_std": 0.0, "grad_norm": 0.34619399905204773, "kl": 0.46718090772628784, "learning_rate": 4.261111111111111e-06, "loss": 0.0005, "num_tokens": 4344839.0, "reward": 9.125, "reward_std": 1.1814539432525635, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 1.1814539432525635, "step": 234 }, { "completion_length": 1105.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 1105.5, "completions/mean_terminated_length": 1105.5, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.3949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.6756569743156433, "kl": 0.8630207180976868, "learning_rate": 4.255555555555556e-06, "loss": 0.0009, "num_tokens": 4358041.0, "reward": 5.431639671325684, "reward_std": 1.4380779266357422, "rewards/reward_model/mean": 5.431639671325684, "rewards/reward_model/std": 1.4380780458450317, "step": 235 }, { "completion_length": 2920.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3273.0, "completions/max_terminated_length": 3273.0, "completions/mean_length": 2920.75, "completions/mean_terminated_length": 2920.75, "completions/min_length": 2707.0, "completions/min_terminated_length": 2707.0, "epoch": 0.39663865546218485, "frac_reward_zero_std": 0.0, "grad_norm": 0.21619361639022827, "kl": 0.4627906084060669, "learning_rate": 4.25e-06, "loss": 0.0005, "num_tokens": 4381184.0, "reward": 8.5, "reward_std": 0.6123724579811096, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 0.6123724579811096, "step": 236 }, { "completion_length": 3077.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3252.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 3077.0, "completions/mean_terminated_length": 3077.0, "completions/min_length": 2915.0, "completions/min_terminated_length": 2915.0, "epoch": 0.3983193277310924, "frac_reward_zero_std": 0.0, "grad_norm": 0.2036733329296112, "kl": 0.4635200798511505, "learning_rate": 4.244444444444445e-06, "loss": 0.0005, "num_tokens": 4402932.0, "reward": 9.433502197265625, "reward_std": 0.46453145146369934, "rewards/reward_model/mean": 9.433502197265625, "rewards/reward_model/std": 0.46453163027763367, "step": 237 }, { "completion_length": 2770.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 2770.25, "completions/mean_terminated_length": 2770.25, "completions/min_length": 2621.0, "completions/min_terminated_length": 2621.0, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 0.24756017327308655, "kl": 0.4903179109096527, "learning_rate": 4.238888888888889e-06, "loss": 0.0005, "num_tokens": 4424409.0, "reward": 8.543237686157227, "reward_std": 1.2686576843261719, "rewards/reward_model/mean": 8.543237686157227, "rewards/reward_model/std": 1.268657922744751, "step": 238 }, { "completion_length": 320.25, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.4016806722689076, "frac_reward_zero_std": 0.0, "grad_norm": 0.5235669016838074, "kl": 0.9405966997146606, "learning_rate": 4.233333333333334e-06, "loss": 0.0009, "num_tokens": 4436042.0, "reward": 8.375, "reward_std": 0.25, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.25, "step": 239 }, { "completion_length": 3967.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 3967.75, "completions/mean_terminated_length": 2879.666748046875, "completions/min_length": 2402.0, "completions/min_terminated_length": 2402.0, "epoch": 0.40336134453781514, "frac_reward_zero_std": 0.0, "grad_norm": 0.15598049759864807, "kl": 0.3935525715351105, "learning_rate": 4.227777777777778e-06, "loss": 0.0004, "num_tokens": 4462557.0, "reward": 7.573015213012695, "reward_std": 2.0017759799957275, "rewards/reward_model/mean": 7.573015213012695, "rewards/reward_model/std": 2.0017759799957275, "step": 240 }, { "completion_length": 796.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 796.75, "completions/mean_terminated_length": 796.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.4050420168067227, "frac_reward_zero_std": 0.0, "grad_norm": 0.5736768841743469, "kl": 0.8851484656333923, "learning_rate": 4.222222222222223e-06, "loss": 0.0009, "num_tokens": 4474704.0, "reward": 6.755120754241943, "reward_std": 2.419196128845215, "rewards/reward_model/mean": 6.755120754241943, "rewards/reward_model/std": 2.419196128845215, "step": 241 }, { "completion_length": 1553.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1553.75, "completions/mean_terminated_length": 1553.75, "completions/min_length": 1263.0, "completions/min_terminated_length": 1263.0, "epoch": 0.40672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.40061724185943604, "kl": 0.36847835779190063, "learning_rate": 4.216666666666667e-06, "loss": 0.0004, "num_tokens": 4490207.0, "reward": 9.25, "reward_std": 0.9574271440505981, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 0.9574271440505981, "step": 242 }, { "completion_length": 2808.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3275.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 2808.0, "completions/mean_terminated_length": 2808.0, "completions/min_length": 2385.0, "completions/min_terminated_length": 2385.0, "epoch": 0.4084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.2375236451625824, "kl": 0.5209259986877441, "learning_rate": 4.211111111111112e-06, "loss": 0.0005, "num_tokens": 4511323.0, "reward": 8.539529800415039, "reward_std": 1.2907954454421997, "rewards/reward_model/mean": 8.539529800415039, "rewards/reward_model/std": 1.2907954454421997, "step": 243 }, { "completion_length": 2845.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 2845.5, "completions/mean_terminated_length": 2845.5, "completions/min_length": 2438.0, "completions/min_terminated_length": 2438.0, "epoch": 0.41008403361344536, "frac_reward_zero_std": 0.0, "grad_norm": 0.26784712076187134, "kl": 0.5034636855125427, "learning_rate": 4.205555555555556e-06, "loss": 0.0005, "num_tokens": 4533309.0, "reward": 8.882152557373047, "reward_std": 0.8950045704841614, "rewards/reward_model/mean": 8.882152557373047, "rewards/reward_model/std": 0.8950047492980957, "step": 244 }, { "completion_length": 471.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 471.0, "completions/mean_terminated_length": 471.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 1.0807291269302368, "kl": 0.9026118516921997, "learning_rate": 4.2000000000000004e-06, "loss": 0.0009, "num_tokens": 4545037.0, "reward": 5.626047134399414, "reward_std": 4.62348747253418, "rewards/reward_model/mean": 5.626047134399414, "rewards/reward_model/std": 4.62348747253418, "step": 245 }, { "completion_length": 3169.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3654.0, "completions/max_terminated_length": 3654.0, "completions/mean_length": 3169.5, "completions/mean_terminated_length": 3169.5, "completions/min_length": 2712.0, "completions/min_terminated_length": 2712.0, "epoch": 0.4134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.22497378289699554, "kl": 0.4505349397659302, "learning_rate": 4.194444444444445e-06, "loss": 0.0005, "num_tokens": 4568679.0, "reward": 7.199137210845947, "reward_std": 1.1360905170440674, "rewards/reward_model/mean": 7.199137210845947, "rewards/reward_model/std": 1.1360902786254883, "step": 246 }, { "completion_length": 2979.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3353.0, "completions/max_terminated_length": 3353.0, "completions/mean_length": 2979.0, "completions/mean_terminated_length": 2979.0, "completions/min_length": 2461.0, "completions/min_terminated_length": 2461.0, "epoch": 0.4151260504201681, "frac_reward_zero_std": 0.0, "grad_norm": 0.1978806108236313, "kl": 0.4465804100036621, "learning_rate": 4.188888888888889e-06, "loss": 0.0004, "num_tokens": 4591319.0, "reward": 9.045299530029297, "reward_std": 0.32406604290008545, "rewards/reward_model/mean": 9.045299530029297, "rewards/reward_model/std": 0.32406583428382874, "step": 247 }, { "completion_length": 2383.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2678.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 2383.25, "completions/mean_terminated_length": 2383.25, "completions/min_length": 1829.0, "completions/min_terminated_length": 1829.0, "epoch": 0.41680672268907565, "frac_reward_zero_std": 0.0, "grad_norm": 0.2815382778644562, "kl": 0.8800508379936218, "learning_rate": 4.183333333333334e-06, "loss": 0.0009, "num_tokens": 4609576.0, "reward": 7.960483074188232, "reward_std": 1.1536815166473389, "rewards/reward_model/mean": 7.960483074188232, "rewards/reward_model/std": 1.1536816358566284, "step": 248 }, { "completion_length": 1926.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 1926.0, "completions/min_length": 1749.0, "completions/min_terminated_length": 1749.0, "epoch": 0.4184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.33348986506462097, "kl": 0.42607924342155457, "learning_rate": 4.177777777777778e-06, "loss": 0.0004, "num_tokens": 4627812.0, "reward": 6.625, "reward_std": 2.4958298206329346, "rewards/reward_model/mean": 6.625, "rewards/reward_model/std": 2.4958298206329346, "step": 249 }, { "completion_length": 635.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 635.0, "completions/mean_terminated_length": 635.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.42016806722689076, "frac_reward_zero_std": 0.0, "grad_norm": 0.5603328347206116, "kl": 1.105303406715393, "learning_rate": 4.1722222222222225e-06, "loss": 0.0011, "num_tokens": 4639708.0, "reward": 7.454676151275635, "reward_std": 1.3379886150360107, "rewards/reward_model/mean": 7.454676151275635, "rewards/reward_model/std": 1.3379884958267212, "step": 250 }, { "completion_length": 2519.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 2519.0, "completions/mean_terminated_length": 2519.0, "completions/min_length": 2101.0, "completions/min_terminated_length": 2101.0, "epoch": 0.4218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.23313941061496735, "kl": 0.5020186305046082, "learning_rate": 4.166666666666667e-06, "loss": 0.0005, "num_tokens": 4659764.0, "reward": 7.716615676879883, "reward_std": 2.483187198638916, "rewards/reward_model/mean": 7.716615676879883, "rewards/reward_model/std": 2.483187198638916, "step": 251 }, { "completion_length": 239.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.4235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006444022292271256, "kl": 1.3417521715164185, "learning_rate": 4.161111111111111e-06, "loss": 0.0013, "num_tokens": 4670236.0, "reward": 9.25, "reward_std": 0.0, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 0.0, "step": 252 }, { "completion_length": 2820.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3126.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 2820.75, "completions/mean_terminated_length": 2820.75, "completions/min_length": 2446.0, "completions/min_terminated_length": 2446.0, "epoch": 0.42521008403361343, "frac_reward_zero_std": 0.0, "grad_norm": 0.23993265628814697, "kl": 0.4522344172000885, "learning_rate": 4.155555555555556e-06, "loss": 0.0005, "num_tokens": 4691631.0, "reward": 8.778319358825684, "reward_std": 0.8046744465827942, "rewards/reward_model/mean": 8.778319358825684, "rewards/reward_model/std": 0.8046746253967285, "step": 253 }, { "completion_length": 1701.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1701.0, "completions/mean_terminated_length": 1701.0, "completions/min_length": 1419.0, "completions/min_terminated_length": 1419.0, "epoch": 0.426890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.3598572015762329, "kl": 0.4365740120410919, "learning_rate": 4.15e-06, "loss": 0.0004, "num_tokens": 4707115.0, "reward": 7.375, "reward_std": 1.6520190238952637, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 1.6520190238952637, "step": 254 }, { "completion_length": 255.5, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.5156990885734558, "kl": 1.2908028364181519, "learning_rate": 4.1444444444444445e-06, "loss": 0.0013, "num_tokens": 4716985.0, "reward": 8.8125, "reward_std": 0.3145764470100403, "rewards/reward_model/mean": 8.8125, "rewards/reward_model/std": 0.3145764470100403, "step": 255 }, { "completion_length": 2111.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 2111.5, "completions/mean_terminated_length": 2111.5, "completions/min_length": 1738.0, "completions/min_terminated_length": 1738.0, "epoch": 0.43025210084033616, "frac_reward_zero_std": 0.0, "grad_norm": 0.3176546096801758, "kl": 0.31987547874450684, "learning_rate": 4.138888888888889e-06, "loss": 0.0003, "num_tokens": 4735275.0, "reward": 9.375, "reward_std": 0.9464846849441528, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 0.9464847445487976, "step": 256 }, { "completion_length": 1851.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 1851.25, "completions/mean_terminated_length": 1851.25, "completions/min_length": 1591.0, "completions/min_terminated_length": 1591.0, "epoch": 0.4319327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.349301278591156, "kl": 0.343892365694046, "learning_rate": 4.133333333333333e-06, "loss": 0.0003, "num_tokens": 4752660.0, "reward": 7.625, "reward_std": 2.3228933811187744, "rewards/reward_model/mean": 7.625, "rewards/reward_model/std": 2.3228933811187744, "step": 257 }, { "completion_length": 2430.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2653.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 2430.5, "completions/mean_terminated_length": 2430.5, "completions/min_length": 2252.0, "completions/min_terminated_length": 2252.0, "epoch": 0.4336134453781513, "frac_reward_zero_std": 0.0, "grad_norm": 0.26822173595428467, "kl": 0.5344579219818115, "learning_rate": 4.1277777777777785e-06, "loss": 0.0005, "num_tokens": 4771654.0, "reward": -0.3647910952568054, "reward_std": 0.4221853017807007, "rewards/reward_model/mean": -0.3647910952568054, "rewards/reward_model/std": 0.42218533158302307, "step": 258 }, { "completion_length": 1812.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2814.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 1812.5, "completions/mean_terminated_length": 1812.5, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "epoch": 0.43529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.346161425113678, "kl": 0.40465983748435974, "learning_rate": 4.122222222222222e-06, "loss": 0.0004, "num_tokens": 4787404.0, "reward": 6.094305038452148, "reward_std": 5.060488700866699, "rewards/reward_model/mean": 6.094305038452148, "rewards/reward_model/std": 5.060488700866699, "step": 259 }, { "completion_length": 1989.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1989.25, "completions/mean_terminated_length": 1989.25, "completions/min_length": 1700.0, "completions/min_terminated_length": 1700.0, "epoch": 0.4369747899159664, "frac_reward_zero_std": 0.0, "grad_norm": 0.3061082065105438, "kl": 0.36512595415115356, "learning_rate": 4.116666666666667e-06, "loss": 0.0004, "num_tokens": 4804709.0, "reward": 9.875, "reward_std": 0.25, "rewards/reward_model/mean": 9.875, "rewards/reward_model/std": 0.25, "step": 260 }, { "completion_length": 321.5, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.43865546218487395, "frac_reward_zero_std": 0.0, "grad_norm": 0.7300800085067749, "kl": 1.1826317310333252, "learning_rate": 4.111111111111111e-06, "loss": 0.0012, "num_tokens": 4816375.0, "reward": 6.4375, "reward_std": 3.4723610877990723, "rewards/reward_model/mean": 6.4375, "rewards/reward_model/std": 3.4723610877990723, "step": 261 }, { "completion_length": 1722.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1722.25, "completions/mean_terminated_length": 1722.25, "completions/min_length": 1374.0, "completions/min_terminated_length": 1374.0, "epoch": 0.4403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 0.39571529626846313, "kl": 0.4361860156059265, "learning_rate": 4.105555555555556e-06, "loss": 0.0004, "num_tokens": 4832856.0, "reward": 9.75, "reward_std": 0.5, "rewards/reward_model/mean": 9.75, "rewards/reward_model/std": 0.5, "step": 262 }, { "completion_length": 2824.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3707.0, "completions/max_terminated_length": 3707.0, "completions/mean_length": 2824.75, "completions/mean_terminated_length": 2824.75, "completions/min_length": 2491.0, "completions/min_terminated_length": 2491.0, "epoch": 0.44201680672268906, "frac_reward_zero_std": 0.0, "grad_norm": 0.27337905764579773, "kl": 0.4943571388721466, "learning_rate": 4.1e-06, "loss": 0.0005, "num_tokens": 4852655.0, "reward": 7.6584672927856445, "reward_std": 1.708861231803894, "rewards/reward_model/mean": 7.6584672927856445, "rewards/reward_model/std": 1.708861231803894, "step": 263 }, { "completion_length": 2824.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2824.5, "completions/mean_terminated_length": 2824.5, "completions/min_length": 2615.0, "completions/min_terminated_length": 2615.0, "epoch": 0.4436974789915966, "frac_reward_zero_std": 0.0, "grad_norm": 0.22636935114860535, "kl": 0.45819365978240967, "learning_rate": 4.094444444444445e-06, "loss": 0.0005, "num_tokens": 4873149.0, "reward": 9.9375, "reward_std": 0.125, "rewards/reward_model/mean": 9.9375, "rewards/reward_model/std": 0.125, "step": 264 }, { "completion_length": 239.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.44537815126050423, "frac_reward_zero_std": 0.0, "grad_norm": 0.68837571144104, "kl": 1.2865989208221436, "learning_rate": 4.088888888888889e-06, "loss": 0.0013, "num_tokens": 4883801.0, "reward": 8.875, "reward_std": 0.14433756470680237, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.14433756470680237, "step": 265 }, { "completion_length": 2764.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 2764.5, "completions/mean_terminated_length": 2764.5, "completions/min_length": 2553.0, "completions/min_terminated_length": 2553.0, "epoch": 0.4470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.22983042895793915, "kl": 0.4933811128139496, "learning_rate": 4.083333333333334e-06, "loss": 0.0005, "num_tokens": 4905047.0, "reward": 8.157556533813477, "reward_std": 1.219398856163025, "rewards/reward_model/mean": 8.157556533813477, "rewards/reward_model/std": 1.2193987369537354, "step": 266 }, { "completion_length": 1804.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 1804.25, "completions/mean_terminated_length": 1804.25, "completions/min_length": 1470.0, "completions/min_terminated_length": 1470.0, "epoch": 0.44873949579831934, "frac_reward_zero_std": 0.0, "grad_norm": 0.3584739863872528, "kl": 0.4173256754875183, "learning_rate": 4.077777777777778e-06, "loss": 0.0004, "num_tokens": 4922016.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 267 }, { "completion_length": 4010.5, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 4010.5, "completions/mean_terminated_length": 2936.666748046875, "completions/min_length": 2793.0, "completions/min_terminated_length": 2793.0, "epoch": 0.4504201680672269, "frac_reward_zero_std": 0.0, "grad_norm": 0.14479389786720276, "kl": 0.3683866262435913, "learning_rate": 4.0722222222222226e-06, "loss": 0.0004, "num_tokens": 4948374.0, "reward": 7.302102088928223, "reward_std": 1.0420950651168823, "rewards/reward_model/mean": 7.302102088928223, "rewards/reward_model/std": 1.0420950651168823, "step": 268 }, { "completion_length": 2756.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 2756.25, "completions/mean_terminated_length": 2756.25, "completions/min_length": 2539.0, "completions/min_terminated_length": 2539.0, "epoch": 0.45210084033613446, "frac_reward_zero_std": 0.0, "grad_norm": 0.23952485620975494, "kl": 0.4850582480430603, "learning_rate": 4.066666666666667e-06, "loss": 0.0005, "num_tokens": 4969463.0, "reward": 8.543432235717773, "reward_std": 0.3966461420059204, "rewards/reward_model/mean": 8.543432235717773, "rewards/reward_model/std": 0.3966458737850189, "step": 269 }, { "completion_length": 3161.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3328.0, "completions/max_terminated_length": 3328.0, "completions/mean_length": 3161.0, "completions/mean_terminated_length": 3161.0, "completions/min_length": 3065.0, "completions/min_terminated_length": 3065.0, "epoch": 0.453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.24282827973365784, "kl": 0.41256698966026306, "learning_rate": 4.061111111111111e-06, "loss": 0.0004, "num_tokens": 4992971.0, "reward": 8.257098197937012, "reward_std": 0.5260062217712402, "rewards/reward_model/mean": 8.257098197937012, "rewards/reward_model/std": 0.5260062217712402, "step": 270 }, { "completion_length": 2835.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 2835.25, "completions/mean_terminated_length": 2835.25, "completions/min_length": 2086.0, "completions/min_terminated_length": 2086.0, "epoch": 0.45546218487394957, "frac_reward_zero_std": 0.0, "grad_norm": 0.20633547008037567, "kl": 0.43844982981681824, "learning_rate": 4.055555555555556e-06, "loss": 0.0004, "num_tokens": 5014960.0, "reward": 8.574216842651367, "reward_std": 0.6181183457374573, "rewards/reward_model/mean": 8.574216842651367, "rewards/reward_model/std": 0.6181180477142334, "step": 271 }, { "completion_length": 2729.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2855.0, "completions/max_terminated_length": 2855.0, "completions/mean_length": 2729.75, "completions/mean_terminated_length": 2729.75, "completions/min_length": 2629.0, "completions/min_terminated_length": 2629.0, "epoch": 0.45714285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.21822984516620636, "kl": 0.4951835870742798, "learning_rate": 4.05e-06, "loss": 0.0005, "num_tokens": 5034899.0, "reward": 8.276845932006836, "reward_std": 0.7162977457046509, "rewards/reward_model/mean": 8.276845932006836, "rewards/reward_model/std": 0.7162977457046509, "step": 272 }, { "completion_length": 2538.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2836.0, "completions/max_terminated_length": 2836.0, "completions/mean_length": 2538.0, "completions/mean_terminated_length": 2538.0, "completions/min_length": 2010.0, "completions/min_terminated_length": 2010.0, "epoch": 0.4588235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.2506416440010071, "kl": 0.5022124648094177, "learning_rate": 4.044444444444445e-06, "loss": 0.0005, "num_tokens": 5054727.0, "reward": 8.645195960998535, "reward_std": 1.5822590589523315, "rewards/reward_model/mean": 8.645195960998535, "rewards/reward_model/std": 1.582259178161621, "step": 273 }, { "completion_length": 3165.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3940.0, "completions/max_terminated_length": 3940.0, "completions/mean_length": 3165.0, "completions/mean_terminated_length": 3165.0, "completions/min_length": 2730.0, "completions/min_terminated_length": 2730.0, "epoch": 0.46050420168067224, "frac_reward_zero_std": 0.0, "grad_norm": 0.19147326052188873, "kl": 0.3794291317462921, "learning_rate": 4.038888888888889e-06, "loss": 0.0004, "num_tokens": 5077259.0, "reward": 7.875, "reward_std": 1.299038052558899, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 1.299038052558899, "step": 274 }, { "completion_length": 1519.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1519.25, "completions/mean_terminated_length": 1519.25, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.46218487394957986, "frac_reward_zero_std": 0.0, "grad_norm": 0.35595157742500305, "kl": 0.3893285095691681, "learning_rate": 4.033333333333333e-06, "loss": 0.0004, "num_tokens": 5093876.0, "reward": 8.375, "reward_std": 1.7969882488250732, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.7969882488250732, "step": 275 }, { "completion_length": 2896.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3184.0, "completions/max_terminated_length": 3184.0, "completions/mean_length": 2896.25, "completions/mean_terminated_length": 2896.25, "completions/min_length": 2684.0, "completions/min_terminated_length": 2684.0, "epoch": 0.4638655462184874, "frac_reward_zero_std": 0.0, "grad_norm": 0.26196521520614624, "kl": 0.4277169704437256, "learning_rate": 4.027777777777779e-06, "loss": 0.0004, "num_tokens": 5114701.0, "reward": 8.341163635253906, "reward_std": 0.8709584474563599, "rewards/reward_model/mean": 8.341163635253906, "rewards/reward_model/std": 0.870958685874939, "step": 276 }, { "completion_length": 1570.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1570.0, "completions/mean_terminated_length": 1570.0, "completions/min_length": 1266.0, "completions/min_terminated_length": 1266.0, "epoch": 0.46554621848739497, "frac_reward_zero_std": 0.0, "grad_norm": 0.36992964148521423, "kl": 0.4556547999382019, "learning_rate": 4.022222222222222e-06, "loss": 0.0005, "num_tokens": 5130557.0, "reward": 9.875, "reward_std": 0.25, "rewards/reward_model/mean": 9.875, "rewards/reward_model/std": 0.25, "step": 277 }, { "completion_length": 2570.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 2570.0, "completions/mean_terminated_length": 2570.0, "completions/min_length": 2186.0, "completions/min_terminated_length": 2186.0, "epoch": 0.4672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.23997019231319427, "kl": 0.49078354239463806, "learning_rate": 4.0166666666666675e-06, "loss": 0.0005, "num_tokens": 5149629.0, "reward": 9.368803024291992, "reward_std": 0.7542216181755066, "rewards/reward_model/mean": 9.368803024291992, "rewards/reward_model/std": 0.7542216777801514, "step": 278 }, { "completion_length": 2426.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 2426.0, "completions/mean_terminated_length": 2426.0, "completions/min_length": 1657.0, "completions/min_terminated_length": 1657.0, "epoch": 0.4689075630252101, "frac_reward_zero_std": 0.0, "grad_norm": 0.3054017424583435, "kl": 0.6035206913948059, "learning_rate": 4.011111111111111e-06, "loss": 0.0006, "num_tokens": 5168745.0, "reward": 8.16610050201416, "reward_std": 3.1355741024017334, "rewards/reward_model/mean": 8.16610050201416, "rewards/reward_model/std": 3.1355741024017334, "step": 279 }, { "completion_length": 3688.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 3688.75, "completions/mean_terminated_length": 2507.666748046875, "completions/min_length": 2447.0, "completions/min_terminated_length": 2447.0, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.14645691215991974, "kl": 0.44846615195274353, "learning_rate": 4.005555555555556e-06, "loss": 0.0004, "num_tokens": 5192512.0, "reward": 7.706606388092041, "reward_std": 1.8789076805114746, "rewards/reward_model/mean": 7.706606388092041, "rewards/reward_model/std": 1.8789079189300537, "step": 280 }, { "completion_length": 535.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 535.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4722689075630252, "frac_reward_zero_std": 0.0, "grad_norm": 0.7318718433380127, "kl": 1.2159335613250732, "learning_rate": 4.000000000000001e-06, "loss": 0.0012, "num_tokens": 5203460.0, "reward": 7.814440727233887, "reward_std": 1.3730045557022095, "rewards/reward_model/mean": 7.814440727233887, "rewards/reward_model/std": 1.373004674911499, "step": 281 }, { "completion_length": 427.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 427.0, "completions/mean_terminated_length": 427.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.47394957983193275, "frac_reward_zero_std": 0.0, "grad_norm": 0.7804542183876038, "kl": 1.2701330184936523, "learning_rate": 3.994444444444445e-06, "loss": 0.0013, "num_tokens": 5215088.0, "reward": 7.464520454406738, "reward_std": 0.819574236869812, "rewards/reward_model/mean": 7.464520454406738, "rewards/reward_model/std": 0.8195742964744568, "step": 282 }, { "completion_length": 1212.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1617.0, "completions/max_terminated_length": 1617.0, "completions/mean_length": 1212.25, "completions/mean_terminated_length": 1212.25, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 0.4756302521008403, "frac_reward_zero_std": 0.0, "grad_norm": 0.6198026537895203, "kl": 0.797105073928833, "learning_rate": 3.9888888888888895e-06, "loss": 0.0008, "num_tokens": 5230677.0, "reward": 4.329617500305176, "reward_std": 1.2353250980377197, "rewards/reward_model/mean": 4.329617500305176, "rewards/reward_model/std": 1.2353249788284302, "step": 283 }, { "completion_length": 1515.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1515.75, "completions/mean_terminated_length": 1515.75, "completions/min_length": 1103.0, "completions/min_terminated_length": 1103.0, "epoch": 0.4773109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.40149950981140137, "kl": 0.4432225823402405, "learning_rate": 3.983333333333334e-06, "loss": 0.0004, "num_tokens": 5245444.0, "reward": 9.5, "reward_std": 0.7071067690849304, "rewards/reward_model/mean": 9.5, "rewards/reward_model/std": 0.7071067690849304, "step": 284 }, { "completion_length": 1580.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 1580.75, "completions/mean_terminated_length": 1580.75, "completions/min_length": 1356.0, "completions/min_terminated_length": 1356.0, "epoch": 0.4789915966386555, "frac_reward_zero_std": 0.0, "grad_norm": 0.3850942552089691, "kl": 0.34461867809295654, "learning_rate": 3.977777777777778e-06, "loss": 0.0003, "num_tokens": 5262243.0, "reward": 5.375, "reward_std": 2.212653160095215, "rewards/reward_model/mean": 5.375, "rewards/reward_model/std": 2.212653160095215, "step": 285 }, { "completion_length": 2878.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3174.0, "completions/max_terminated_length": 3174.0, "completions/mean_length": 2878.0, "completions/mean_terminated_length": 2878.0, "completions/min_length": 2668.0, "completions/min_terminated_length": 2668.0, "epoch": 0.48067226890756304, "frac_reward_zero_std": 0.0, "grad_norm": 0.2235092669725418, "kl": 0.462818443775177, "learning_rate": 3.972222222222223e-06, "loss": 0.0005, "num_tokens": 5283991.0, "reward": 7.139934539794922, "reward_std": 0.8488904237747192, "rewards/reward_model/mean": 7.139934539794922, "rewards/reward_model/std": 0.8488903045654297, "step": 286 }, { "completion_length": 1628.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1628.0, "completions/mean_terminated_length": 1628.0, "completions/min_length": 1432.0, "completions/min_terminated_length": 1432.0, "epoch": 0.4823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.3521459996700287, "kl": 0.4344547986984253, "learning_rate": 3.966666666666667e-06, "loss": 0.0004, "num_tokens": 5299663.0, "reward": 7.875, "reward_std": 1.5478479862213135, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 1.5478479862213135, "step": 287 }, { "completion_length": 729.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 729.5, "completions/mean_terminated_length": 729.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.48403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 0.8145760297775269, "kl": 1.7744853496551514, "learning_rate": 3.9611111111111115e-06, "loss": 0.0018, "num_tokens": 5313841.0, "reward": 5.823897838592529, "reward_std": 1.4017596244812012, "rewards/reward_model/mean": 5.823897838592529, "rewards/reward_model/std": 1.4017595052719116, "step": 288 }, { "completion_length": 2374.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2677.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 2374.25, "completions/mean_terminated_length": 2374.25, "completions/min_length": 2022.0, "completions/min_terminated_length": 2022.0, "epoch": 0.4857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.25775057077407837, "kl": 0.5824540257453918, "learning_rate": 3.955555555555556e-06, "loss": 0.0006, "num_tokens": 5332054.0, "reward": 7.965119361877441, "reward_std": 0.4358280897140503, "rewards/reward_model/mean": 7.965119361877441, "rewards/reward_model/std": 0.4358280897140503, "step": 289 }, { "completion_length": 438.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.48739495798319327, "frac_reward_zero_std": 0.0, "grad_norm": 0.5862885117530823, "kl": 1.2424085140228271, "learning_rate": 3.95e-06, "loss": 0.0012, "num_tokens": 5343442.0, "reward": 8.441064834594727, "reward_std": 0.9799709320068359, "rewards/reward_model/mean": 8.441064834594727, "rewards/reward_model/std": 0.9799709320068359, "step": 290 }, { "completion_length": 2556.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2831.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 2556.25, "completions/mean_terminated_length": 2556.25, "completions/min_length": 2407.0, "completions/min_terminated_length": 2407.0, "epoch": 0.4890756302521008, "frac_reward_zero_std": 0.0, "grad_norm": 0.22921150922775269, "kl": 0.47927939891815186, "learning_rate": 3.944444444444445e-06, "loss": 0.0005, "num_tokens": 5362683.0, "reward": 8.625, "reward_std": 0.25, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 0.25, "step": 291 }, { "completion_length": 1449.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1449.75, "completions/mean_terminated_length": 1449.75, "completions/min_length": 1314.0, "completions/min_terminated_length": 1314.0, "epoch": 0.4907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.41298845410346985, "kl": 0.4478920102119446, "learning_rate": 3.938888888888889e-06, "loss": 0.0004, "num_tokens": 5377562.0, "reward": 9.5, "reward_std": 1.0, "rewards/reward_model/mean": 9.5, "rewards/reward_model/std": 1.0, "step": 292 }, { "completion_length": 2849.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3386.0, "completions/max_terminated_length": 3386.0, "completions/mean_length": 2849.0, "completions/mean_terminated_length": 2849.0, "completions/min_length": 2592.0, "completions/min_terminated_length": 2592.0, "epoch": 0.492436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.23692262172698975, "kl": 0.438249796628952, "learning_rate": 3.9333333333333335e-06, "loss": 0.0004, "num_tokens": 5400362.0, "reward": 8.102673530578613, "reward_std": 0.5822569727897644, "rewards/reward_model/mean": 8.102673530578613, "rewards/reward_model/std": 0.5822569727897644, "step": 293 }, { "completion_length": 305.75, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.49411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048225149512290955, "kl": 1.199939489364624, "learning_rate": 3.927777777777778e-06, "loss": 0.0012, "num_tokens": 5410885.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_model/mean": 0.0, "rewards/reward_model/std": 0.0, "step": 294 }, { "completion_length": 1803.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1803.5, "completions/mean_terminated_length": 1803.5, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "epoch": 0.4957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.30101513862609863, "kl": 0.33720529079437256, "learning_rate": 3.922222222222223e-06, "loss": 0.0003, "num_tokens": 5427571.0, "reward": 8.375, "reward_std": 2.3584952354431152, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 2.3584952354431152, "step": 295 }, { "completion_length": 34.5, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.49747899159663866, "frac_reward_zero_std": 1.0, "grad_norm": 0.013268837705254555, "kl": 1.9172415733337402, "learning_rate": 3.916666666666667e-06, "loss": 0.0019, "num_tokens": 5435201.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 296 }, { "completion_length": 2705.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3151.0, "completions/max_terminated_length": 3151.0, "completions/mean_length": 2705.25, "completions/mean_terminated_length": 2705.25, "completions/min_length": 2438.0, "completions/min_terminated_length": 2438.0, "epoch": 0.4991596638655462, "frac_reward_zero_std": 0.0, "grad_norm": 0.2240026444196701, "kl": 0.4789201617240906, "learning_rate": 3.911111111111112e-06, "loss": 0.0005, "num_tokens": 5456130.0, "reward": 8.752599716186523, "reward_std": 0.19990156590938568, "rewards/reward_model/mean": 8.752599716186523, "rewards/reward_model/std": 0.1999017596244812, "step": 297 }, { "completion_length": 2657.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 2657.5, "completions/mean_terminated_length": 2657.5, "completions/min_length": 2547.0, "completions/min_terminated_length": 2547.0, "epoch": 0.5008403361344538, "frac_reward_zero_std": 0.0, "grad_norm": 0.24701523780822754, "kl": 0.5096070766448975, "learning_rate": 3.9055555555555555e-06, "loss": 0.0005, "num_tokens": 5476632.0, "reward": 8.631172180175781, "reward_std": 0.9608593583106995, "rewards/reward_model/mean": 8.631172180175781, "rewards/reward_model/std": 0.9608596563339233, "step": 298 }, { "completion_length": 1691.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 1691.5, "completions/mean_terminated_length": 1691.5, "completions/min_length": 1491.0, "completions/min_terminated_length": 1491.0, "epoch": 0.5025210084033613, "frac_reward_zero_std": 0.0, "grad_norm": 0.3567356467247009, "kl": 0.3661032021045685, "learning_rate": 3.900000000000001e-06, "loss": 0.0004, "num_tokens": 5493650.0, "reward": 8.625, "reward_std": 2.75, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 2.75, "step": 299 }, { "completion_length": 2468.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2731.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 2468.0, "completions/mean_terminated_length": 2468.0, "completions/min_length": 2304.0, "completions/min_terminated_length": 2304.0, "epoch": 0.5042016806722689, "frac_reward_zero_std": 0.0, "grad_norm": 0.25644633173942566, "kl": 0.526447057723999, "learning_rate": 3.894444444444444e-06, "loss": 0.0005, "num_tokens": 5514310.0, "reward": 9.125, "reward_std": 0.75, "rewards/reward_model/mean": 9.125, "rewards/reward_model/std": 0.75, "step": 300 }, { "completion_length": 2511.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 2511.25, "completions/mean_terminated_length": 2511.25, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "epoch": 0.5058823529411764, "frac_reward_zero_std": 0.0, "grad_norm": 0.2634853720664978, "kl": 0.47203025221824646, "learning_rate": 3.88888888888889e-06, "loss": 0.0005, "num_tokens": 5533947.0, "reward": 9.0, "reward_std": 1.541103482246399, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 1.541103482246399, "step": 301 }, { "completion_length": 2958.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3329.0, "completions/max_terminated_length": 3329.0, "completions/mean_length": 2958.0, "completions/mean_terminated_length": 2958.0, "completions/min_length": 2740.0, "completions/min_terminated_length": 2740.0, "epoch": 0.507563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.2197744995355606, "kl": 0.4193590581417084, "learning_rate": 3.883333333333333e-06, "loss": 0.0004, "num_tokens": 5555755.0, "reward": 9.359548568725586, "reward_std": 0.6383318901062012, "rewards/reward_model/mean": 9.359548568725586, "rewards/reward_model/std": 0.6383317708969116, "step": 302 }, { "completion_length": 2577.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 2577.75, "completions/mean_terminated_length": 2577.75, "completions/min_length": 2023.0, "completions/min_terminated_length": 2023.0, "epoch": 0.5092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.2394646853208542, "kl": 0.2628512382507324, "learning_rate": 3.877777777777778e-06, "loss": 0.0003, "num_tokens": 5576774.0, "reward": 8.0, "reward_std": 2.041241407394409, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 2.0412416458129883, "step": 303 }, { "completion_length": 2622.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 2622.75, "completions/mean_terminated_length": 2622.75, "completions/min_length": 2491.0, "completions/min_terminated_length": 2491.0, "epoch": 0.5109243697478991, "frac_reward_zero_std": 0.0, "grad_norm": 0.24940021336078644, "kl": 0.730096161365509, "learning_rate": 3.872222222222223e-06, "loss": 0.0007, "num_tokens": 5596185.0, "reward": 7.863737106323242, "reward_std": 1.3137425184249878, "rewards/reward_model/mean": 7.863737106323242, "rewards/reward_model/std": 1.3137426376342773, "step": 304 }, { "completion_length": 1545.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1545.0, "completions/mean_terminated_length": 1545.0, "completions/min_length": 1211.0, "completions/min_terminated_length": 1211.0, "epoch": 0.5126050420168067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022401062306016684, "kl": 0.41902345418930054, "learning_rate": 3.866666666666667e-06, "loss": 0.0004, "num_tokens": 5612637.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 305 }, { "completion_length": 2731.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2862.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 2731.5, "completions/mean_terminated_length": 2731.5, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.5142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.20936322212219238, "kl": 0.49068236351013184, "learning_rate": 3.861111111111112e-06, "loss": 0.0005, "num_tokens": 5632823.0, "reward": 9.07536792755127, "reward_std": 0.4592762589454651, "rewards/reward_model/mean": 9.07536792755127, "rewards/reward_model/std": 0.4592762589454651, "step": 306 }, { "completion_length": 2622.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 2622.5, "completions/mean_terminated_length": 2622.5, "completions/min_length": 2400.0, "completions/min_terminated_length": 2400.0, "epoch": 0.5159663865546219, "frac_reward_zero_std": 0.0, "grad_norm": 0.21886466443538666, "kl": 0.4906158149242401, "learning_rate": 3.855555555555556e-06, "loss": 0.0005, "num_tokens": 5653029.0, "reward": 7.288498401641846, "reward_std": 0.5875564813613892, "rewards/reward_model/mean": 7.288498401641846, "rewards/reward_model/std": 0.5875565409660339, "step": 307 }, { "completion_length": 2366.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 2366.5, "completions/mean_terminated_length": 2366.5, "completions/min_length": 1856.0, "completions/min_terminated_length": 1856.0, "epoch": 0.5176470588235295, "frac_reward_zero_std": 0.0, "grad_norm": 0.2796529233455658, "kl": 0.5184450745582581, "learning_rate": 3.85e-06, "loss": 0.0005, "num_tokens": 5671635.0, "reward": 8.75, "reward_std": 0.8660253882408142, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 0.8660253882408142, "step": 308 }, { "completion_length": 235.5, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.519327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.7424680590629578, "kl": 1.3006591796875, "learning_rate": 3.844444444444445e-06, "loss": 0.0013, "num_tokens": 5681553.0, "reward": 8.0625, "reward_std": 0.23935678601264954, "rewards/reward_model/mean": 8.0625, "rewards/reward_model/std": 0.23935678601264954, "step": 309 }, { "completion_length": 1788.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1788.5, "completions/mean_terminated_length": 1788.5, "completions/min_length": 1468.0, "completions/min_terminated_length": 1468.0, "epoch": 0.5210084033613446, "frac_reward_zero_std": 0.0, "grad_norm": 0.3830660581588745, "kl": 0.3712735176086426, "learning_rate": 3.838888888888889e-06, "loss": 0.0004, "num_tokens": 5699143.0, "reward": 8.875, "reward_std": 2.25, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 2.25, "step": 310 }, { "completion_length": 1929.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 1929.25, "completions/mean_terminated_length": 1929.25, "completions/min_length": 1706.0, "completions/min_terminated_length": 1706.0, "epoch": 0.5226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.2874327301979065, "kl": 0.3179646134376526, "learning_rate": 3.833333333333334e-06, "loss": 0.0003, "num_tokens": 5717908.0, "reward": 4.375, "reward_std": 2.9261748790740967, "rewards/reward_model/mean": 4.375, "rewards/reward_model/std": 2.9261748790740967, "step": 311 }, { "completion_length": 2580.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 2580.0, "completions/mean_terminated_length": 2580.0, "completions/min_length": 2394.0, "completions/min_terminated_length": 2394.0, "epoch": 0.5243697478991597, "frac_reward_zero_std": 0.0, "grad_norm": 0.23839616775512695, "kl": 0.513839840888977, "learning_rate": 3.827777777777778e-06, "loss": 0.0005, "num_tokens": 5737376.0, "reward": 8.6875, "reward_std": 0.23935678601264954, "rewards/reward_model/mean": 8.6875, "rewards/reward_model/std": 0.23935678601264954, "step": 312 }, { "completion_length": 2636.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 2636.75, "completions/mean_terminated_length": 2636.75, "completions/min_length": 2323.0, "completions/min_terminated_length": 2323.0, "epoch": 0.5260504201680672, "frac_reward_zero_std": 0.0, "grad_norm": 0.23319679498672485, "kl": 0.5761256217956543, "learning_rate": 3.8222222222222224e-06, "loss": 0.0006, "num_tokens": 5757067.0, "reward": 7.948423385620117, "reward_std": 1.3213917016983032, "rewards/reward_model/mean": 7.948423385620117, "rewards/reward_model/std": 1.3213918209075928, "step": 313 }, { "completion_length": 463.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 463.5, "completions/mean_terminated_length": 463.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5277310924369748, "frac_reward_zero_std": 0.0, "grad_norm": 0.5147403478622437, "kl": 1.200799822807312, "learning_rate": 3.816666666666667e-06, "loss": 0.0012, "num_tokens": 5768681.0, "reward": 7.955634117126465, "reward_std": 0.6738820672035217, "rewards/reward_model/mean": 7.955634117126465, "rewards/reward_model/std": 0.673882007598877, "step": 314 }, { "completion_length": 1520.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1888.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1520.25, "completions/mean_terminated_length": 1520.25, "completions/min_length": 1262.0, "completions/min_terminated_length": 1262.0, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.0, "grad_norm": 0.3914318382740021, "kl": 0.37652626633644104, "learning_rate": 3.8111111111111117e-06, "loss": 0.0004, "num_tokens": 5783306.0, "reward": 9.25, "reward_std": 1.5, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.5, "step": 315 }, { "completion_length": 449.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5310924369747899, "frac_reward_zero_std": 0.0, "grad_norm": 0.5969287753105164, "kl": 1.1334108114242554, "learning_rate": 3.8055555555555556e-06, "loss": 0.0011, "num_tokens": 5793736.0, "reward": 8.24909782409668, "reward_std": 1.001805067062378, "rewards/reward_model/mean": 8.24909782409668, "rewards/reward_model/std": 1.001805067062378, "step": 316 }, { "completion_length": 1441.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1441.0, "completions/mean_terminated_length": 1441.0, "completions/min_length": 1213.0, "completions/min_terminated_length": 1213.0, "epoch": 0.5327731092436975, "frac_reward_zero_std": 0.0, "grad_norm": 0.3987216353416443, "kl": 0.4196941554546356, "learning_rate": 3.8000000000000005e-06, "loss": 0.0004, "num_tokens": 5808852.0, "reward": 7.75, "reward_std": 1.8484227657318115, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 1.8484227657318115, "step": 317 }, { "completion_length": 2525.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 2525.0, "completions/mean_terminated_length": 2525.0, "completions/min_length": 2074.0, "completions/min_terminated_length": 2074.0, "epoch": 0.534453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.22196711599826813, "kl": 0.4962569773197174, "learning_rate": 3.7944444444444444e-06, "loss": 0.0005, "num_tokens": 5828168.0, "reward": 8.102570533752441, "reward_std": 0.7247011065483093, "rewards/reward_model/mean": 8.102570533752441, "rewards/reward_model/std": 0.7247012853622437, "step": 318 }, { "completion_length": 2988.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3275.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 2988.5, "completions/mean_terminated_length": 2988.5, "completions/min_length": 2697.0, "completions/min_terminated_length": 2697.0, "epoch": 0.5361344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.20434461534023285, "kl": 0.4289247989654541, "learning_rate": 3.7888888888888893e-06, "loss": 0.0004, "num_tokens": 5850246.0, "reward": 8.078916549682617, "reward_std": 2.560884475708008, "rewards/reward_model/mean": 8.078916549682617, "rewards/reward_model/std": 2.5608842372894287, "step": 319 }, { "completion_length": 2367.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2514.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 2367.75, "completions/mean_terminated_length": 2367.75, "completions/min_length": 2067.0, "completions/min_terminated_length": 2067.0, "epoch": 0.5378151260504201, "frac_reward_zero_std": 0.0, "grad_norm": 0.24936741590499878, "kl": 0.5856636166572571, "learning_rate": 3.7833333333333337e-06, "loss": 0.0006, "num_tokens": 5868329.0, "reward": 8.437995910644531, "reward_std": 0.8260601758956909, "rewards/reward_model/mean": 8.437995910644531, "rewards/reward_model/std": 0.8260601758956909, "step": 320 }, { "completion_length": 1405.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 1405.5, "completions/mean_terminated_length": 1405.5, "completions/min_length": 1232.0, "completions/min_terminated_length": 1232.0, "epoch": 0.5394957983193277, "frac_reward_zero_std": 0.0, "grad_norm": 0.40040045976638794, "kl": 0.7034329771995544, "learning_rate": 3.777777777777778e-06, "loss": 0.0007, "num_tokens": 5883599.0, "reward": 9.75, "reward_std": 0.28867512941360474, "rewards/reward_model/mean": 9.75, "rewards/reward_model/std": 0.28867512941360474, "step": 321 }, { "completion_length": 2568.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 2568.25, "completions/mean_terminated_length": 2568.25, "completions/min_length": 2339.0, "completions/min_terminated_length": 2339.0, "epoch": 0.5411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.22290842235088348, "kl": 0.4961845278739929, "learning_rate": 3.7722222222222225e-06, "loss": 0.0005, "num_tokens": 5903460.0, "reward": 8.125, "reward_std": 1.299038052558899, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 1.299038052558899, "step": 322 }, { "completion_length": 2574.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 2574.5, "completions/mean_terminated_length": 2574.5, "completions/min_length": 2271.0, "completions/min_terminated_length": 2271.0, "epoch": 0.5428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.24352754652500153, "kl": 0.48642972111701965, "learning_rate": 3.766666666666667e-06, "loss": 0.0005, "num_tokens": 5922538.0, "reward": 7.9626312255859375, "reward_std": 0.7469905614852905, "rewards/reward_model/mean": 7.9626312255859375, "rewards/reward_model/std": 0.7469905018806458, "step": 323 }, { "completion_length": 230.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5445378151260504, "frac_reward_zero_std": 1.0, "grad_norm": 0.016352355480194092, "kl": 1.4648537635803223, "learning_rate": 3.7611111111111113e-06, "loss": 0.0015, "num_tokens": 5932222.0, "reward": 9.0, "reward_std": 0.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.0, "step": 324 }, { "completion_length": 2593.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 2593.75, "completions/mean_terminated_length": 2593.75, "completions/min_length": 2479.0, "completions/min_terminated_length": 2479.0, "epoch": 0.5462184873949579, "frac_reward_zero_std": 0.0, "grad_norm": 0.252117782831192, "kl": 0.4662327170372009, "learning_rate": 3.7555555555555557e-06, "loss": 0.0005, "num_tokens": 5951993.0, "reward": 8.160984992980957, "reward_std": 1.8355817794799805, "rewards/reward_model/mean": 8.160984992980957, "rewards/reward_model/std": 1.8355820178985596, "step": 325 }, { "completion_length": 574.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 574.0, "completions/mean_terminated_length": 574.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5478991596638656, "frac_reward_zero_std": 0.0, "grad_norm": 0.5098720788955688, "kl": 0.9151164889335632, "learning_rate": 3.7500000000000005e-06, "loss": 0.0009, "num_tokens": 5964013.0, "reward": 5.917305946350098, "reward_std": 0.39032450318336487, "rewards/reward_model/mean": 5.917305946350098, "rewards/reward_model/std": 0.3903244137763977, "step": 326 }, { "completion_length": 2315.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 2315.75, "completions/mean_terminated_length": 2315.75, "completions/min_length": 2146.0, "completions/min_terminated_length": 2146.0, "epoch": 0.5495798319327732, "frac_reward_zero_std": 0.0, "grad_norm": 0.2519376873970032, "kl": 0.7739394903182983, "learning_rate": 3.744444444444445e-06, "loss": 0.0008, "num_tokens": 5983272.0, "reward": 9.0, "reward_std": 0.8164966106414795, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.8164966106414795, "step": 327 }, { "completion_length": 2902.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3270.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 2902.0, "completions/mean_terminated_length": 2902.0, "completions/min_length": 2562.0, "completions/min_terminated_length": 2562.0, "epoch": 0.5512605042016807, "frac_reward_zero_std": 0.0, "grad_norm": 0.2465457022190094, "kl": 0.4950849711894989, "learning_rate": 3.7388888888888893e-06, "loss": 0.0005, "num_tokens": 6004520.0, "reward": 7.6588969230651855, "reward_std": 1.058329701423645, "rewards/reward_model/mean": 7.6588969230651855, "rewards/reward_model/std": 1.058329701423645, "step": 328 }, { "completion_length": 2645.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 2645.5, "completions/mean_terminated_length": 2645.5, "completions/min_length": 2522.0, "completions/min_terminated_length": 2522.0, "epoch": 0.5529411764705883, "frac_reward_zero_std": 0.0, "grad_norm": 0.23811551928520203, "kl": 0.5148739218711853, "learning_rate": 3.7333333333333337e-06, "loss": 0.0005, "num_tokens": 6023666.0, "reward": 8.227792739868164, "reward_std": 0.37653422355651855, "rewards/reward_model/mean": 8.227792739868164, "rewards/reward_model/std": 0.37653419375419617, "step": 329 }, { "completion_length": 265.5, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.5510560870170593, "kl": 1.1760433912277222, "learning_rate": 3.727777777777778e-06, "loss": 0.0012, "num_tokens": 6035392.0, "reward": 8.75, "reward_std": 0.3535533845424652, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 0.3535533845424652, "step": 330 }, { "completion_length": 257.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.5563025210084034, "frac_reward_zero_std": 0.0, "grad_norm": 0.5730410814285278, "kl": 1.214462399482727, "learning_rate": 3.7222222222222225e-06, "loss": 0.0012, "num_tokens": 6046420.0, "reward": 8.4375, "reward_std": 0.9655525088310242, "rewards/reward_model/mean": 8.4375, "rewards/reward_model/std": 0.9655525088310242, "step": 331 }, { "completion_length": 350.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5579831932773109, "frac_reward_zero_std": 0.0, "grad_norm": 0.7308914661407471, "kl": 0.9910828471183777, "learning_rate": 3.716666666666667e-06, "loss": 0.001, "num_tokens": 6057316.0, "reward": 7.875, "reward_std": 0.322748601436615, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 0.3227486312389374, "step": 332 }, { "completion_length": 1654.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1654.75, "completions/mean_terminated_length": 1654.75, "completions/min_length": 1436.0, "completions/min_terminated_length": 1436.0, "epoch": 0.5596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.413554847240448, "kl": 0.4082599878311157, "learning_rate": 3.7111111111111113e-06, "loss": 0.0004, "num_tokens": 6074559.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 333 }, { "completion_length": 1638.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1638.25, "completions/mean_terminated_length": 1638.25, "completions/min_length": 1450.0, "completions/min_terminated_length": 1450.0, "epoch": 0.561344537815126, "frac_reward_zero_std": 0.0, "grad_norm": 0.38662171363830566, "kl": 0.39538225531578064, "learning_rate": 3.705555555555556e-06, "loss": 0.0004, "num_tokens": 6091020.0, "reward": 8.5, "reward_std": 1.9148542881011963, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.9148542881011963, "step": 334 }, { "completion_length": 334.25, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.6256248950958252, "kl": 0.9595475792884827, "learning_rate": 3.7e-06, "loss": 0.001, "num_tokens": 6102817.0, "reward": 8.1875, "reward_std": 0.4732423424720764, "rewards/reward_model/mean": 8.1875, "rewards/reward_model/std": 0.4732423722743988, "step": 335 }, { "completion_length": 1410.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1410.25, "completions/mean_terminated_length": 1410.25, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "epoch": 0.5647058823529412, "frac_reward_zero_std": 0.0, "grad_norm": 0.522068202495575, "kl": 0.43463587760925293, "learning_rate": 3.694444444444445e-06, "loss": 0.0004, "num_tokens": 6119090.0, "reward": 8.875, "reward_std": 0.8539125919342041, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.8539125919342041, "step": 336 }, { "completion_length": 2513.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2661.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 2513.25, "completions/mean_terminated_length": 2513.25, "completions/min_length": 2370.0, "completions/min_terminated_length": 2370.0, "epoch": 0.5663865546218487, "frac_reward_zero_std": 0.0, "grad_norm": 0.2718733251094818, "kl": 0.48305216431617737, "learning_rate": 3.688888888888889e-06, "loss": 0.0005, "num_tokens": 6139359.0, "reward": 8.5625, "reward_std": 2.258087396621704, "rewards/reward_model/mean": 8.5625, "rewards/reward_model/std": 2.258087396621704, "step": 337 }, { "completion_length": 2095.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 2095.0, "completions/mean_terminated_length": 2095.0, "completions/min_length": 1850.0, "completions/min_terminated_length": 1850.0, "epoch": 0.5680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.3519833981990814, "kl": 0.3278481960296631, "learning_rate": 3.6833333333333338e-06, "loss": 0.0003, "num_tokens": 6156779.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 338 }, { "completion_length": 320.75, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5697478991596638, "frac_reward_zero_std": 0.0, "grad_norm": 0.4907205402851105, "kl": 0.983814001083374, "learning_rate": 3.6777777777777778e-06, "loss": 0.001, "num_tokens": 6168426.0, "reward": 7.375, "reward_std": 0.6291528940200806, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 0.6291528940200806, "step": 339 }, { "completion_length": 2951.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 2951.5, "completions/mean_terminated_length": 2951.5, "completions/min_length": 2770.0, "completions/min_terminated_length": 2770.0, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22113487124443054, "kl": 0.4703660309314728, "learning_rate": 3.6722222222222226e-06, "loss": 0.0005, "num_tokens": 6189828.0, "reward": 8.08819580078125, "reward_std": 0.5853735208511353, "rewards/reward_model/mean": 8.08819580078125, "rewards/reward_model/std": 0.5853736400604248, "step": 340 }, { "completion_length": 1875.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1875.25, "completions/mean_terminated_length": 1875.25, "completions/min_length": 1111.0, "completions/min_terminated_length": 1111.0, "epoch": 0.573109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.3157437741756439, "kl": 0.3444173038005829, "learning_rate": 3.6666666666666666e-06, "loss": 0.0003, "num_tokens": 6207765.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 341 }, { "completion_length": 2756.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 2756.0, "completions/mean_terminated_length": 2756.0, "completions/min_length": 2424.0, "completions/min_terminated_length": 2424.0, "epoch": 0.5747899159663865, "frac_reward_zero_std": 0.0, "grad_norm": 0.24052739143371582, "kl": 0.4601094722747803, "learning_rate": 3.6611111111111114e-06, "loss": 0.0005, "num_tokens": 6228001.0, "reward": 9.5, "reward_std": 1.0, "rewards/reward_model/mean": 9.5, "rewards/reward_model/std": 1.0, "step": 342 }, { "completion_length": 289.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.5295632481575012, "kl": 1.093843936920166, "learning_rate": 3.6555555555555562e-06, "loss": 0.0011, "num_tokens": 6238677.0, "reward": 8.875, "reward_std": 0.25, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.25, "step": 343 }, { "completion_length": 288.75, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.5781512605042017, "frac_reward_zero_std": 0.0, "grad_norm": 0.4892807900905609, "kl": 1.1656584739685059, "learning_rate": 3.65e-06, "loss": 0.0012, "num_tokens": 6250096.0, "reward": 8.8125, "reward_std": 0.125, "rewards/reward_model/mean": 8.8125, "rewards/reward_model/std": 0.125, "step": 344 }, { "completion_length": 3056.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3541.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 3056.0, "completions/mean_terminated_length": 3056.0, "completions/min_length": 2591.0, "completions/min_terminated_length": 2591.0, "epoch": 0.5798319327731093, "frac_reward_zero_std": 0.0, "grad_norm": 0.19821204245090485, "kl": 0.39397579431533813, "learning_rate": 3.644444444444445e-06, "loss": 0.0004, "num_tokens": 6272492.0, "reward": 7.870750427246094, "reward_std": 1.414124608039856, "rewards/reward_model/mean": 7.870750427246094, "rewards/reward_model/std": 1.4141244888305664, "step": 345 }, { "completion_length": 262.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5815126050420169, "frac_reward_zero_std": 0.0, "grad_norm": 0.43488001823425293, "kl": 1.1847769021987915, "learning_rate": 3.638888888888889e-06, "loss": 0.0012, "num_tokens": 6282108.0, "reward": 8.875, "reward_std": 0.25, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.25, "step": 346 }, { "completion_length": 2774.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3022.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 2774.5, "completions/mean_terminated_length": 2774.5, "completions/min_length": 2164.0, "completions/min_terminated_length": 2164.0, "epoch": 0.5831932773109244, "frac_reward_zero_std": 0.0, "grad_norm": 0.2428930252790451, "kl": 0.4620867669582367, "learning_rate": 3.633333333333334e-06, "loss": 0.0005, "num_tokens": 6302218.0, "reward": 9.005943298339844, "reward_std": 0.8126783967018127, "rewards/reward_model/mean": 9.005943298339844, "rewards/reward_model/std": 0.8126783967018127, "step": 347 }, { "completion_length": 2617.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2885.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 2617.5, "completions/mean_terminated_length": 2617.5, "completions/min_length": 2383.0, "completions/min_terminated_length": 2383.0, "epoch": 0.584873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.2244744896888733, "kl": 0.522290825843811, "learning_rate": 3.627777777777778e-06, "loss": 0.0005, "num_tokens": 6321368.0, "reward": 8.48847770690918, "reward_std": 0.5614652633666992, "rewards/reward_model/mean": 8.48847770690918, "rewards/reward_model/std": 0.561465322971344, "step": 348 }, { "completion_length": 1709.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1709.0, "completions/mean_terminated_length": 1709.0, "completions/min_length": 1578.0, "completions/min_terminated_length": 1578.0, "epoch": 0.5865546218487395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013478255132213235, "kl": 0.34176814556121826, "learning_rate": 3.6222222222222226e-06, "loss": 0.0003, "num_tokens": 6338708.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 349 }, { "completion_length": 321.75, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.5882352941176471, "frac_reward_zero_std": 0.0, "grad_norm": 0.6858517527580261, "kl": 1.1373226642608643, "learning_rate": 3.616666666666667e-06, "loss": 0.0011, "num_tokens": 6349963.0, "reward": 8.137499809265137, "reward_std": 0.125, "rewards/reward_model/mean": 8.137499809265137, "rewards/reward_model/std": 0.125, "step": 350 }, { "completion_length": 318.75, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.7904449105262756, "kl": 0.9482468366622925, "learning_rate": 3.6111111111111115e-06, "loss": 0.0009, "num_tokens": 6361014.0, "reward": 8.125, "reward_std": 0.5951189994812012, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 0.595119059085846, "step": 351 }, { "completion_length": 2691.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 2691.0, "completions/mean_terminated_length": 2691.0, "completions/min_length": 2577.0, "completions/min_terminated_length": 2577.0, "epoch": 0.5915966386554622, "frac_reward_zero_std": 0.0, "grad_norm": 0.2261233925819397, "kl": 0.4394461512565613, "learning_rate": 3.605555555555556e-06, "loss": 0.0004, "num_tokens": 6382670.0, "reward": 8.03146743774414, "reward_std": 1.4514906406402588, "rewards/reward_model/mean": 8.03146743774414, "rewards/reward_model/std": 1.4514905214309692, "step": 352 }, { "completion_length": 2609.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2931.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 2609.25, "completions/mean_terminated_length": 2609.25, "completions/min_length": 2348.0, "completions/min_terminated_length": 2348.0, "epoch": 0.5932773109243697, "frac_reward_zero_std": 0.0, "grad_norm": 0.24388550221920013, "kl": 0.48458898067474365, "learning_rate": 3.6000000000000003e-06, "loss": 0.0005, "num_tokens": 6403051.0, "reward": 8.5625, "reward_std": 1.1433686017990112, "rewards/reward_model/mean": 8.5625, "rewards/reward_model/std": 1.1433686017990112, "step": 353 }, { "completion_length": 2441.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 2441.5, "completions/mean_terminated_length": 2441.5, "completions/min_length": 2355.0, "completions/min_terminated_length": 2355.0, "epoch": 0.5949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.30120784044265747, "kl": 0.5056976079940796, "learning_rate": 3.5944444444444447e-06, "loss": 0.0005, "num_tokens": 6422189.0, "reward": 7.515214920043945, "reward_std": 2.208329439163208, "rewards/reward_model/mean": 7.515214920043945, "rewards/reward_model/std": 2.208329439163208, "step": 354 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5966386554621849, "frac_reward_zero_std": 1.0, "grad_norm": 3.6571454984368756e-05, "kl": 2.739680051803589, "learning_rate": 3.588888888888889e-06, "loss": 0.0027, "num_tokens": 6430517.0, "reward": 5.0, "reward_std": 0.0, "rewards/reward_model/mean": 5.0, "rewards/reward_model/std": 0.0, "step": 355 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5983193277310924, "frac_reward_zero_std": 1.0, "grad_norm": 2.606996304166387e-06, "kl": 1.293775200843811, "learning_rate": 3.5833333333333335e-06, "loss": 0.0013, "num_tokens": 6438709.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 356 }, { "completion_length": 2853.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 2853.25, "completions/mean_terminated_length": 2853.25, "completions/min_length": 2410.0, "completions/min_terminated_length": 2410.0, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.25039204955101013, "kl": 0.474241703748703, "learning_rate": 3.577777777777778e-06, "loss": 0.0005, "num_tokens": 6461042.0, "reward": 7.93405818939209, "reward_std": 1.743273377418518, "rewards/reward_model/mean": 7.93405818939209, "rewards/reward_model/std": 1.7432732582092285, "step": 357 }, { "completion_length": 2484.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 2484.75, "completions/mean_terminated_length": 2484.75, "completions/min_length": 2272.0, "completions/min_terminated_length": 2272.0, "epoch": 0.6016806722689075, "frac_reward_zero_std": 0.0, "grad_norm": 0.2618462145328522, "kl": 0.5813424587249756, "learning_rate": 3.5722222222222223e-06, "loss": 0.0006, "num_tokens": 6480077.0, "reward": 9.0625, "reward_std": 0.875, "rewards/reward_model/mean": 9.0625, "rewards/reward_model/std": 0.875, "step": 358 }, { "completion_length": 1927.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 1927.75, "completions/mean_terminated_length": 1927.75, "completions/min_length": 1387.0, "completions/min_terminated_length": 1387.0, "epoch": 0.6033613445378151, "frac_reward_zero_std": 0.0, "grad_norm": 0.367982417345047, "kl": 0.3646852672100067, "learning_rate": 3.566666666666667e-06, "loss": 0.0004, "num_tokens": 6496564.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 359 }, { "completion_length": 2916.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3082.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 2916.75, "completions/mean_terminated_length": 2916.75, "completions/min_length": 2748.0, "completions/min_terminated_length": 2748.0, "epoch": 0.6050420168067226, "frac_reward_zero_std": 0.0, "grad_norm": 0.2301839292049408, "kl": 0.4210048317909241, "learning_rate": 3.561111111111111e-06, "loss": 0.0004, "num_tokens": 6518847.0, "reward": 8.218905448913574, "reward_std": 0.9460164308547974, "rewards/reward_model/mean": 8.218905448913574, "rewards/reward_model/std": 0.9460163116455078, "step": 360 }, { "completion_length": 371.5, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.6067226890756302, "frac_reward_zero_std": 0.0, "grad_norm": 0.4018150568008423, "kl": 0.9139790534973145, "learning_rate": 3.555555555555556e-06, "loss": 0.0009, "num_tokens": 6529569.0, "reward": 7.875, "reward_std": 0.75, "rewards/reward_model/mean": 7.875, "rewards/reward_model/std": 0.75, "step": 361 }, { "completion_length": 342.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 342.0, "completions/mean_terminated_length": 342.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.4517315626144409, "kl": 0.925557017326355, "learning_rate": 3.5500000000000003e-06, "loss": 0.0009, "num_tokens": 6541389.0, "reward": 8.4375, "reward_std": 0.125, "rewards/reward_model/mean": 8.4375, "rewards/reward_model/std": 0.125, "step": 362 }, { "completion_length": 2601.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2827.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 2601.5, "completions/mean_terminated_length": 2601.5, "completions/min_length": 2439.0, "completions/min_terminated_length": 2439.0, "epoch": 0.6100840336134454, "frac_reward_zero_std": 0.0, "grad_norm": 0.22458426654338837, "kl": 0.526951014995575, "learning_rate": 3.5444444444444447e-06, "loss": 0.0005, "num_tokens": 6560711.0, "reward": 8.748923301696777, "reward_std": 0.8164992332458496, "rewards/reward_model/mean": 8.748923301696777, "rewards/reward_model/std": 0.8164994120597839, "step": 363 }, { "completion_length": 2426.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2665.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 2426.5, "completions/mean_terminated_length": 2426.5, "completions/min_length": 2166.0, "completions/min_terminated_length": 2166.0, "epoch": 0.611764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.24633418023586273, "kl": 0.4865972399711609, "learning_rate": 3.538888888888889e-06, "loss": 0.0005, "num_tokens": 6579309.0, "reward": 8.049957275390625, "reward_std": 0.9723284840583801, "rewards/reward_model/mean": 8.049957275390625, "rewards/reward_model/std": 0.9723284840583801, "step": 364 }, { "completion_length": 542.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 542.0, "completions/mean_terminated_length": 542.0, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.6134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.5689728856086731, "kl": 0.9688239693641663, "learning_rate": 3.5333333333333335e-06, "loss": 0.001, "num_tokens": 6591465.0, "reward": 6.302643775939941, "reward_std": 1.7320598363876343, "rewards/reward_model/mean": 6.302643775939941, "rewards/reward_model/std": 1.7320598363876343, "step": 365 }, { "completion_length": 545.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 545.75, "completions/mean_terminated_length": 545.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6151260504201681, "frac_reward_zero_std": 0.0, "grad_norm": 0.4972035586833954, "kl": 0.9355713725090027, "learning_rate": 3.5277777777777784e-06, "loss": 0.0009, "num_tokens": 6604072.0, "reward": 8.424234390258789, "reward_std": 1.15153169631958, "rewards/reward_model/mean": 8.424234390258789, "rewards/reward_model/std": 1.15153169631958, "step": 366 }, { "completion_length": 2558.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2612.0, "completions/max_terminated_length": 2612.0, "completions/mean_length": 2558.5, "completions/mean_terminated_length": 2558.5, "completions/min_length": 2502.0, "completions/min_terminated_length": 2502.0, "epoch": 0.6168067226890757, "frac_reward_zero_std": 0.0, "grad_norm": 0.2452915459871292, "kl": 0.4797371029853821, "learning_rate": 3.5222222222222223e-06, "loss": 0.0005, "num_tokens": 6624478.0, "reward": 8.9375, "reward_std": 0.875, "rewards/reward_model/mean": 8.9375, "rewards/reward_model/std": 0.875, "step": 367 }, { "completion_length": 263.25, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 0.5835832357406616, "kl": 1.2101694345474243, "learning_rate": 3.516666666666667e-06, "loss": 0.0012, "num_tokens": 6634127.0, "reward": 8.9375, "reward_std": 0.125, "rewards/reward_model/mean": 8.9375, "rewards/reward_model/std": 0.125, "step": 368 }, { "completion_length": 1561.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1901.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 1561.25, "completions/mean_terminated_length": 1561.25, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "epoch": 0.6201680672268908, "frac_reward_zero_std": 0.0, "grad_norm": 0.3950733542442322, "kl": 0.4149470329284668, "learning_rate": 3.511111111111111e-06, "loss": 0.0004, "num_tokens": 6650020.0, "reward": 9.0, "reward_std": 1.6832507848739624, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 1.683250904083252, "step": 369 }, { "completion_length": 2692.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3271.0, "completions/max_terminated_length": 3271.0, "completions/mean_length": 2692.0, "completions/mean_terminated_length": 2692.0, "completions/min_length": 2461.0, "completions/min_terminated_length": 2461.0, "epoch": 0.6218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.2428463101387024, "kl": 0.44819653034210205, "learning_rate": 3.505555555555556e-06, "loss": 0.0004, "num_tokens": 6672216.0, "reward": 9.062994956970215, "reward_std": 0.6246710419654846, "rewards/reward_model/mean": 9.062994956970215, "rewards/reward_model/std": 0.6246709227561951, "step": 370 }, { "completion_length": 1699.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1699.5, "completions/mean_terminated_length": 1699.5, "completions/min_length": 1509.0, "completions/min_terminated_length": 1509.0, "epoch": 0.6235294117647059, "frac_reward_zero_std": 0.0, "grad_norm": 0.3164796531200409, "kl": 0.3432800769805908, "learning_rate": 3.5e-06, "loss": 0.0003, "num_tokens": 6687418.0, "reward": 8.125, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 1.9311050176620483, "step": 371 }, { "completion_length": 2836.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 2836.5, "completions/mean_terminated_length": 2836.5, "completions/min_length": 2693.0, "completions/min_terminated_length": 2693.0, "epoch": 0.6252100840336134, "frac_reward_zero_std": 0.0, "grad_norm": 0.23035331070423126, "kl": 0.4091682732105255, "learning_rate": 3.4944444444444448e-06, "loss": 0.0004, "num_tokens": 6708920.0, "reward": 8.449344635009766, "reward_std": 0.776966392993927, "rewards/reward_model/mean": 8.449344635009766, "rewards/reward_model/std": 0.776966392993927, "step": 372 }, { "completion_length": 292.75, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.626890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.8259927034378052, "kl": 1.2085371017456055, "learning_rate": 3.4888888888888896e-06, "loss": 0.0012, "num_tokens": 6719591.0, "reward": 6.875, "reward_std": 0.7772815823554993, "rewards/reward_model/mean": 6.875, "rewards/reward_model/std": 0.7772815823554993, "step": 373 }, { "completion_length": 2818.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3478.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 2818.5, "completions/mean_terminated_length": 2818.5, "completions/min_length": 2333.0, "completions/min_terminated_length": 2333.0, "epoch": 0.6285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.24244414269924164, "kl": 0.46453791856765747, "learning_rate": 3.4833333333333336e-06, "loss": 0.0005, "num_tokens": 6741329.0, "reward": 9.171806335449219, "reward_std": 0.6070655584335327, "rewards/reward_model/mean": 9.171806335449219, "rewards/reward_model/std": 0.6070655584335327, "step": 374 }, { "completion_length": 323.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6302521008403361, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026480199303478003, "kl": 1.0174778699874878, "learning_rate": 3.4777777777777784e-06, "loss": 0.001, "num_tokens": 6753629.0, "reward": 9.0, "reward_std": 0.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.0, "step": 375 }, { "completion_length": 357.5, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6319327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.6044626832008362, "kl": 0.9555953145027161, "learning_rate": 3.4722222222222224e-06, "loss": 0.001, "num_tokens": 6764819.0, "reward": 8.25, "reward_std": 0.20412415266036987, "rewards/reward_model/mean": 8.25, "rewards/reward_model/std": 0.20412415266036987, "step": 376 }, { "completion_length": 2380.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 2380.0, "completions/mean_terminated_length": 2380.0, "completions/min_length": 2225.0, "completions/min_terminated_length": 2225.0, "epoch": 0.6336134453781512, "frac_reward_zero_std": 0.0, "grad_norm": 0.2673751711845398, "kl": 0.5172151327133179, "learning_rate": 3.4666666666666672e-06, "loss": 0.0005, "num_tokens": 6783043.0, "reward": 9.4375, "reward_std": 0.5153881907463074, "rewards/reward_model/mean": 9.4375, "rewards/reward_model/std": 0.5153881907463074, "step": 377 }, { "completion_length": 2991.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3151.0, "completions/max_terminated_length": 3151.0, "completions/mean_length": 2991.0, "completions/mean_terminated_length": 2991.0, "completions/min_length": 2800.0, "completions/min_terminated_length": 2800.0, "epoch": 0.6352941176470588, "frac_reward_zero_std": 0.0, "grad_norm": 0.20062915980815887, "kl": 0.38919225335121155, "learning_rate": 3.461111111111111e-06, "loss": 0.0004, "num_tokens": 6805859.0, "reward": 9.3590726852417, "reward_std": 0.7400889992713928, "rewards/reward_model/mean": 9.3590726852417, "rewards/reward_model/std": 0.7400889992713928, "step": 378 }, { "completion_length": 3862.75, "completions/clipped_ratio": 0.25, "completions/max_length": 7232.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 3862.75, "completions/mean_terminated_length": 2739.666748046875, "completions/min_length": 2482.0, "completions/min_terminated_length": 2482.0, "epoch": 0.6369747899159663, "frac_reward_zero_std": 0.0, "grad_norm": 0.1502574384212494, "kl": 0.3546273708343506, "learning_rate": 3.455555555555556e-06, "loss": 0.0004, "num_tokens": 6831450.0, "reward": 7.031451225280762, "reward_std": 1.2368788719177246, "rewards/reward_model/mean": 7.031451225280762, "rewards/reward_model/std": 1.2368788719177246, "step": 379 }, { "completion_length": 242.5, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6386554621848739, "frac_reward_zero_std": 0.0, "grad_norm": 0.4096352159976959, "kl": 1.4422924518585205, "learning_rate": 3.45e-06, "loss": 0.0014, "num_tokens": 6841204.0, "reward": 9.0, "reward_std": 0.20412415266036987, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.20412415266036987, "step": 380 }, { "completion_length": 139.75, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 3.364586591720581, "kl": 1.418567419052124, "learning_rate": 3.444444444444445e-06, "loss": 0.0014, "num_tokens": 6849271.0, "reward": 8.75, "reward_std": 1.5, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.5, "step": 381 }, { "completion_length": 276.25, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 276.25, "completions/mean_terminated_length": 276.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6420168067226891, "frac_reward_zero_std": 0.0, "grad_norm": 0.5434556007385254, "kl": 1.226255178451538, "learning_rate": 3.4388888888888892e-06, "loss": 0.0012, "num_tokens": 6859184.0, "reward": 7.6875, "reward_std": 0.4732423424720764, "rewards/reward_model/mean": 7.6875, "rewards/reward_model/std": 0.4732423722743988, "step": 382 }, { "completion_length": 1627.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1627.5, "completions/mean_terminated_length": 1627.5, "completions/min_length": 1075.0, "completions/min_terminated_length": 1075.0, "epoch": 0.6436974789915967, "frac_reward_zero_std": 0.0, "grad_norm": 0.3591630458831787, "kl": 0.39100292325019836, "learning_rate": 3.4333333333333336e-06, "loss": 0.0004, "num_tokens": 6875446.0, "reward": 8.875, "reward_std": 1.9311050176620483, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.9311050176620483, "step": 383 }, { "completion_length": 2654.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2977.0, "completions/max_terminated_length": 2977.0, "completions/mean_length": 2654.75, "completions/mean_terminated_length": 2654.75, "completions/min_length": 2375.0, "completions/min_terminated_length": 2375.0, "epoch": 0.6453781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.23516781628131866, "kl": 0.4711880385875702, "learning_rate": 3.427777777777778e-06, "loss": 0.0005, "num_tokens": 6895305.0, "reward": 7.5625, "reward_std": 2.7108962535858154, "rewards/reward_model/mean": 7.5625, "rewards/reward_model/std": 2.7108962535858154, "step": 384 }, { "completion_length": 280.25, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.6687840819358826, "kl": 1.1973485946655273, "learning_rate": 3.4222222222222224e-06, "loss": 0.0012, "num_tokens": 6905162.0, "reward": 6.800000190734863, "reward_std": 1.0384283065795898, "rewards/reward_model/mean": 6.800000190734863, "rewards/reward_model/std": 1.0384283065795898, "step": 385 }, { "completion_length": 1285.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 1285.75, "completions/mean_terminated_length": 1285.75, "completions/min_length": 1155.0, "completions/min_terminated_length": 1155.0, "epoch": 0.6487394957983194, "frac_reward_zero_std": 0.0, "grad_norm": 0.4363056421279907, "kl": 0.47353795170783997, "learning_rate": 3.416666666666667e-06, "loss": 0.0005, "num_tokens": 6919485.0, "reward": 8.5, "reward_std": 1.7795131206512451, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 1.7795131206512451, "step": 386 }, { "completion_length": 2428.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 2428.5, "completions/mean_terminated_length": 2428.5, "completions/min_length": 2289.0, "completions/min_terminated_length": 2289.0, "epoch": 0.6504201680672269, "frac_reward_zero_std": 0.0, "grad_norm": 0.2599104344844818, "kl": 0.4969731271266937, "learning_rate": 3.4111111111111113e-06, "loss": 0.0005, "num_tokens": 6938299.0, "reward": 8.75, "reward_std": 0.28867512941360474, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 0.28867512941360474, "step": 387 }, { "completion_length": 366.5, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 366.5, "completions/mean_terminated_length": 366.5, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.6521008403361345, "frac_reward_zero_std": 0.0, "grad_norm": 0.6845762133598328, "kl": 0.9161630868911743, "learning_rate": 3.4055555555555557e-06, "loss": 0.0009, "num_tokens": 6949465.0, "reward": 7.375, "reward_std": 1.010362982749939, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 1.010362982749939, "step": 388 }, { "completion_length": 2573.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2849.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 2573.0, "completions/mean_terminated_length": 2573.0, "completions/min_length": 2244.0, "completions/min_terminated_length": 2244.0, "epoch": 0.653781512605042, "frac_reward_zero_std": 0.0, "grad_norm": 0.2595427334308624, "kl": 0.508905291557312, "learning_rate": 3.4000000000000005e-06, "loss": 0.0005, "num_tokens": 6968413.0, "reward": 8.1875, "reward_std": 1.007782220840454, "rewards/reward_model/mean": 8.1875, "rewards/reward_model/std": 1.007782220840454, "step": 389 }, { "completion_length": 2527.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 2527.0, "completions/mean_terminated_length": 2527.0, "completions/min_length": 2295.0, "completions/min_terminated_length": 2295.0, "epoch": 0.6554621848739496, "frac_reward_zero_std": 0.0, "grad_norm": 0.23779208958148956, "kl": 0.48245739936828613, "learning_rate": 3.3944444444444445e-06, "loss": 0.0005, "num_tokens": 6988197.0, "reward": 9.110245704650879, "reward_std": 0.6081050634384155, "rewards/reward_model/mean": 9.110245704650879, "rewards/reward_model/std": 0.6081050038337708, "step": 390 }, { "completion_length": 2619.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 2619.5, "completions/mean_terminated_length": 2619.5, "completions/min_length": 2498.0, "completions/min_terminated_length": 2498.0, "epoch": 0.6571428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.24607634544372559, "kl": 0.4333855211734772, "learning_rate": 3.3888888888888893e-06, "loss": 0.0004, "num_tokens": 7008307.0, "reward": 8.125, "reward_std": 0.4787135720252991, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 0.4787135720252991, "step": 391 }, { "completion_length": 1773.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1773.5, "completions/mean_terminated_length": 1773.5, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "epoch": 0.6588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016868688398972154, "kl": 0.38211238384246826, "learning_rate": 3.3833333333333333e-06, "loss": 0.0004, "num_tokens": 7025461.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 392 }, { "completion_length": 332.5, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 332.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.6605042016806723, "frac_reward_zero_std": 0.0, "grad_norm": 0.4347955584526062, "kl": 0.9926878809928894, "learning_rate": 3.377777777777778e-06, "loss": 0.001, "num_tokens": 7035875.0, "reward": 8.375, "reward_std": 1.0897247791290283, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.0897247791290283, "step": 393 }, { "completion_length": 1727.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 1727.5, "completions/mean_terminated_length": 1727.5, "completions/min_length": 1667.0, "completions/min_terminated_length": 1667.0, "epoch": 0.6621848739495798, "frac_reward_zero_std": 0.0, "grad_norm": 0.3720357418060303, "kl": 0.3603493869304657, "learning_rate": 3.372222222222222e-06, "loss": 0.0004, "num_tokens": 7054021.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 394 }, { "completion_length": 2544.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2644.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 2544.75, "completions/mean_terminated_length": 2544.75, "completions/min_length": 2437.0, "completions/min_terminated_length": 2437.0, "epoch": 0.6638655462184874, "frac_reward_zero_std": 0.0, "grad_norm": 0.2450246512889862, "kl": 0.5328658223152161, "learning_rate": 3.366666666666667e-06, "loss": 0.0005, "num_tokens": 7073464.0, "reward": 7.75, "reward_std": 0.8660253882408142, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 0.8660253882408142, "step": 395 }, { "completion_length": 1697.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 1697.75, "completions/mean_terminated_length": 1697.75, "completions/min_length": 1481.0, "completions/min_terminated_length": 1481.0, "epoch": 0.6655462184873949, "frac_reward_zero_std": 1.0, "grad_norm": 0.00202116253785789, "kl": 0.4121755063533783, "learning_rate": 3.3611111111111117e-06, "loss": 0.0004, "num_tokens": 7089303.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 396 }, { "completion_length": 2680.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2789.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 2680.0, "completions/mean_terminated_length": 2680.0, "completions/min_length": 2507.0, "completions/min_terminated_length": 2507.0, "epoch": 0.6672268907563025, "frac_reward_zero_std": 0.0, "grad_norm": 0.23588807880878448, "kl": 0.4839998185634613, "learning_rate": 3.3555555555555557e-06, "loss": 0.0005, "num_tokens": 7109347.0, "reward": 8.499490737915039, "reward_std": 1.3627090454101562, "rewards/reward_model/mean": 8.499490737915039, "rewards/reward_model/std": 1.3627089262008667, "step": 397 }, { "completion_length": 369.75, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.66890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.6593483090400696, "kl": 0.8930791020393372, "learning_rate": 3.3500000000000005e-06, "loss": 0.0009, "num_tokens": 7119474.0, "reward": 8.9375, "reward_std": 0.125, "rewards/reward_model/mean": 8.9375, "rewards/reward_model/std": 0.125, "step": 398 }, { "completion_length": 1455.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1455.25, "completions/mean_terminated_length": 1455.25, "completions/min_length": 1133.0, "completions/min_terminated_length": 1133.0, "epoch": 0.6705882352941176, "frac_reward_zero_std": 0.0, "grad_norm": 0.3591264486312866, "kl": 0.4290870726108551, "learning_rate": 3.3444444444444445e-06, "loss": 0.0004, "num_tokens": 7133767.0, "reward": 9.25, "reward_std": 1.1902379989624023, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 1.190238118171692, "step": 399 }, { "completion_length": 368.75, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 368.75, "completions/mean_terminated_length": 368.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6722689075630253, "frac_reward_zero_std": 0.0, "grad_norm": 0.681368350982666, "kl": 0.9673736691474915, "learning_rate": 3.3388888888888893e-06, "loss": 0.001, "num_tokens": 7145994.0, "reward": 8.125, "reward_std": 0.7772815823554993, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 0.7772815823554993, "step": 400 }, { "completion_length": 50.25, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.6739495798319328, "frac_reward_zero_std": 1.0, "grad_norm": 0.005487820599228144, "kl": 1.914741039276123, "learning_rate": 3.3333333333333333e-06, "loss": 0.0019, "num_tokens": 7153699.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 401 }, { "completion_length": 1575.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 1575.5, "completions/mean_terminated_length": 1575.5, "completions/min_length": 1460.0, "completions/min_terminated_length": 1460.0, "epoch": 0.6756302521008404, "frac_reward_zero_std": 0.0, "grad_norm": 0.4023473560810089, "kl": 0.424016535282135, "learning_rate": 3.327777777777778e-06, "loss": 0.0004, "num_tokens": 7168713.0, "reward": 8.125, "reward_std": 2.462214469909668, "rewards/reward_model/mean": 8.125, "rewards/reward_model/std": 2.462214469909668, "step": 402 }, { "completion_length": 1573.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 1573.5, "completions/mean_terminated_length": 1573.5, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "epoch": 0.6773109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.3969379961490631, "kl": 0.4201173484325409, "learning_rate": 3.322222222222222e-06, "loss": 0.0004, "num_tokens": 7183771.0, "reward": 8.875, "reward_std": 2.25, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 2.25, "step": 403 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6789915966386555, "frac_reward_zero_std": 1.0, "grad_norm": 3.2419518447568407e-07, "kl": 0.7377337217330933, "learning_rate": 3.316666666666667e-06, "loss": 0.0007, "num_tokens": 7191967.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 404 }, { "completion_length": 2325.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2613.0, "completions/max_terminated_length": 2613.0, "completions/mean_length": 2325.5, "completions/mean_terminated_length": 2325.5, "completions/min_length": 2113.0, "completions/min_terminated_length": 2113.0, "epoch": 0.680672268907563, "frac_reward_zero_std": 0.0, "grad_norm": 0.2661324441432953, "kl": 0.6026577949523926, "learning_rate": 3.3111111111111118e-06, "loss": 0.0006, "num_tokens": 7209937.0, "reward": 8.024206161499023, "reward_std": 0.9380781054496765, "rewards/reward_model/mean": 8.024206161499023, "rewards/reward_model/std": 0.9380781054496765, "step": 405 }, { "completion_length": 325.75, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.6823529411764706, "frac_reward_zero_std": 0.0, "grad_norm": 0.7103143334388733, "kl": 0.9401676654815674, "learning_rate": 3.3055555555555558e-06, "loss": 0.0009, "num_tokens": 7221960.0, "reward": 7.4375, "reward_std": 0.4732423424720764, "rewards/reward_model/mean": 7.4375, "rewards/reward_model/std": 0.4732423722743988, "step": 406 }, { "completion_length": 1499.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 1499.5, "completions/mean_terminated_length": 1499.5, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "epoch": 0.6840336134453782, "frac_reward_zero_std": 0.0, "grad_norm": 0.37637561559677124, "kl": 0.505764901638031, "learning_rate": 3.3000000000000006e-06, "loss": 0.0005, "num_tokens": 7237402.0, "reward": 9.875, "reward_std": 0.25, "rewards/reward_model/mean": 9.875, "rewards/reward_model/std": 0.25, "step": 407 }, { "completion_length": 2587.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 2587.25, "completions/mean_terminated_length": 2587.25, "completions/min_length": 2383.0, "completions/min_terminated_length": 2383.0, "epoch": 0.6857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23304714262485504, "kl": 0.3865312337875366, "learning_rate": 3.2944444444444446e-06, "loss": 0.0004, "num_tokens": 7258703.0, "reward": 8.054327011108398, "reward_std": 2.2484207153320312, "rewards/reward_model/mean": 8.054327011108398, "rewards/reward_model/std": 2.2484207153320312, "step": 408 }, { "completion_length": 2469.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2469.75, "completions/mean_terminated_length": 2469.75, "completions/min_length": 1947.0, "completions/min_terminated_length": 1947.0, "epoch": 0.6873949579831933, "frac_reward_zero_std": 0.0, "grad_norm": 0.2391023337841034, "kl": 0.40689554810523987, "learning_rate": 3.2888888888888894e-06, "loss": 0.0004, "num_tokens": 7279086.0, "reward": 8.331256866455078, "reward_std": 0.8763896226882935, "rewards/reward_model/mean": 8.331256866455078, "rewards/reward_model/std": 0.8763895630836487, "step": 409 }, { "completion_length": 363.75, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 363.75, "completions/mean_terminated_length": 363.75, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.6890756302521008, "frac_reward_zero_std": 0.0, "grad_norm": 0.6118437051773071, "kl": 0.9001212120056152, "learning_rate": 3.2833333333333334e-06, "loss": 0.0009, "num_tokens": 7289685.0, "reward": 8.375, "reward_std": 0.25, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.25, "step": 410 }, { "completion_length": 318.5, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 318.5, "completions/mean_terminated_length": 318.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.6907563025210084, "frac_reward_zero_std": 0.0, "grad_norm": 0.7379865050315857, "kl": 1.1553810834884644, "learning_rate": 3.277777777777778e-06, "loss": 0.0012, "num_tokens": 7299683.0, "reward": 7.9375, "reward_std": 0.42695629596710205, "rewards/reward_model/mean": 7.9375, "rewards/reward_model/std": 0.42695629596710205, "step": 411 }, { "completion_length": 335.5, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.692436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.6067933440208435, "kl": 0.9998348951339722, "learning_rate": 3.2722222222222226e-06, "loss": 0.001, "num_tokens": 7310585.0, "reward": 7.8125, "reward_std": 0.8625301718711853, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 0.8625302314758301, "step": 412 }, { "completion_length": 2627.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3004.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 2627.25, "completions/mean_terminated_length": 2627.25, "completions/min_length": 2105.0, "completions/min_terminated_length": 2105.0, "epoch": 0.6941176470588235, "frac_reward_zero_std": 0.0, "grad_norm": 0.2939527928829193, "kl": 0.4167979657649994, "learning_rate": 3.266666666666667e-06, "loss": 0.0004, "num_tokens": 7332058.0, "reward": 9.00445556640625, "reward_std": 1.2223494052886963, "rewards/reward_model/mean": 9.00445556640625, "rewards/reward_model/std": 1.2223494052886963, "step": 413 }, { "completion_length": 2190.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 2190.5, "completions/mean_terminated_length": 2190.5, "completions/min_length": 1959.0, "completions/min_terminated_length": 1959.0, "epoch": 0.6957983193277311, "frac_reward_zero_std": 0.0, "grad_norm": 0.22502215206623077, "kl": 0.42680925130844116, "learning_rate": 3.2611111111111114e-06, "loss": 0.0004, "num_tokens": 7349412.0, "reward": 7.8125, "reward_std": 1.8860783576965332, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 1.8860783576965332, "step": 414 }, { "completion_length": 1482.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 1482.25, "completions/mean_terminated_length": 1482.25, "completions/min_length": 1222.0, "completions/min_terminated_length": 1222.0, "epoch": 0.6974789915966386, "frac_reward_zero_std": 0.0, "grad_norm": 0.4203389585018158, "kl": 0.4068883955478668, "learning_rate": 3.255555555555556e-06, "loss": 0.0004, "num_tokens": 7365345.0, "reward": 8.5, "reward_std": 3.0, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 3.0, "step": 415 }, { "completion_length": 2670.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2753.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 2670.5, "completions/mean_terminated_length": 2670.5, "completions/min_length": 2608.0, "completions/min_terminated_length": 2608.0, "epoch": 0.6991596638655462, "frac_reward_zero_std": 0.0, "grad_norm": 0.2636106014251709, "kl": 0.43991971015930176, "learning_rate": 3.2500000000000002e-06, "loss": 0.0004, "num_tokens": 7386007.0, "reward": 9.50544548034668, "reward_std": 0.8308428525924683, "rewards/reward_model/mean": 9.50544548034668, "rewards/reward_model/std": 0.8308427333831787, "step": 416 }, { "completion_length": 2372.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 2372.75, "completions/mean_terminated_length": 2372.75, "completions/min_length": 1891.0, "completions/min_terminated_length": 1891.0, "epoch": 0.7008403361344537, "frac_reward_zero_std": 0.0, "grad_norm": 0.32366567850112915, "kl": 0.5305396914482117, "learning_rate": 3.2444444444444446e-06, "loss": 0.0005, "num_tokens": 7404598.0, "reward": 7.6233601570129395, "reward_std": 0.7097952365875244, "rewards/reward_model/mean": 7.6233601570129395, "rewards/reward_model/std": 0.709795355796814, "step": 417 }, { "completion_length": 2421.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 2421.25, "completions/mean_terminated_length": 2421.25, "completions/min_length": 2200.0, "completions/min_terminated_length": 2200.0, "epoch": 0.7025210084033613, "frac_reward_zero_std": 0.0, "grad_norm": 0.2678946554660797, "kl": 0.4320683181285858, "learning_rate": 3.238888888888889e-06, "loss": 0.0004, "num_tokens": 7424731.0, "reward": 9.1875, "reward_std": 0.3145764470100403, "rewards/reward_model/mean": 9.1875, "rewards/reward_model/std": 0.3145764470100403, "step": 418 }, { "completion_length": 2536.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2645.0, "completions/max_terminated_length": 2645.0, "completions/mean_length": 2536.5, "completions/mean_terminated_length": 2536.5, "completions/min_length": 2327.0, "completions/min_terminated_length": 2327.0, "epoch": 0.704201680672269, "frac_reward_zero_std": 0.0, "grad_norm": 0.22222469747066498, "kl": 0.33580076694488525, "learning_rate": 3.2333333333333334e-06, "loss": 0.0003, "num_tokens": 7445793.0, "reward": 8.3125, "reward_std": 1.375, "rewards/reward_model/mean": 8.3125, "rewards/reward_model/std": 1.375, "step": 419 }, { "completion_length": 2898.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2898.5, "completions/mean_terminated_length": 2898.5, "completions/min_length": 2141.0, "completions/min_terminated_length": 2141.0, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.0, "grad_norm": 0.21620996296405792, "kl": 0.37706634402275085, "learning_rate": 3.227777777777778e-06, "loss": 0.0004, "num_tokens": 7467571.0, "reward": 7.368702411651611, "reward_std": 2.0254738330841064, "rewards/reward_model/mean": 7.368702411651611, "rewards/reward_model/std": 2.0254738330841064, "step": 420 }, { "completion_length": 1390.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1390.5, "completions/mean_terminated_length": 1390.5, "completions/min_length": 1221.0, "completions/min_terminated_length": 1221.0, "epoch": 0.7075630252100841, "frac_reward_zero_std": 0.0, "grad_norm": 0.3499312400817871, "kl": 0.4640088379383087, "learning_rate": 3.2222222222222227e-06, "loss": 0.0005, "num_tokens": 7482465.0, "reward": 6.375, "reward_std": 2.7195281982421875, "rewards/reward_model/mean": 6.375, "rewards/reward_model/std": 2.7195281982421875, "step": 421 }, { "completion_length": 2421.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 2421.25, "completions/mean_terminated_length": 2421.25, "completions/min_length": 2095.0, "completions/min_terminated_length": 2095.0, "epoch": 0.7092436974789916, "frac_reward_zero_std": 0.0, "grad_norm": 0.22053168714046478, "kl": 0.36029741168022156, "learning_rate": 3.2166666666666666e-06, "loss": 0.0004, "num_tokens": 7502510.0, "reward": 8.5625, "reward_std": 0.8260095119476318, "rewards/reward_model/mean": 8.5625, "rewards/reward_model/std": 0.8260095119476318, "step": 422 }, { "completion_length": 1860.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 1860.75, "completions/mean_terminated_length": 1860.75, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "epoch": 0.7109243697478992, "frac_reward_zero_std": 0.0, "grad_norm": 0.364641010761261, "kl": 0.3490104377269745, "learning_rate": 3.2111111111111115e-06, "loss": 0.0003, "num_tokens": 7519913.0, "reward": 7.375, "reward_std": 4.922313213348389, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 4.922313213348389, "step": 423 }, { "completion_length": 1768.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1768.75, "completions/mean_terminated_length": 1768.75, "completions/min_length": 1421.0, "completions/min_terminated_length": 1421.0, "epoch": 0.7126050420168067, "frac_reward_zero_std": 0.0, "grad_norm": 0.3673297166824341, "kl": 0.3888038694858551, "learning_rate": 3.2055555555555555e-06, "loss": 0.0004, "num_tokens": 7537144.0, "reward": 9.625, "reward_std": 0.75, "rewards/reward_model/mean": 9.625, "rewards/reward_model/std": 0.75, "step": 424 }, { "completion_length": 36.5, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7142857142857143, "frac_reward_zero_std": 1.0, "grad_norm": 0.006167990155518055, "kl": 2.0814332962036133, "learning_rate": 3.2000000000000003e-06, "loss": 0.0021, "num_tokens": 7544782.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 425 }, { "completion_length": 2596.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 2596.0, "completions/mean_terminated_length": 2596.0, "completions/min_length": 2306.0, "completions/min_terminated_length": 2306.0, "epoch": 0.7159663865546219, "frac_reward_zero_std": 0.0, "grad_norm": 0.23090402781963348, "kl": 0.3331482410430908, "learning_rate": 3.1944444444444443e-06, "loss": 0.0003, "num_tokens": 7565358.0, "reward": 8.8125, "reward_std": 1.4343262910842896, "rewards/reward_model/mean": 8.8125, "rewards/reward_model/std": 1.4343262910842896, "step": 426 }, { "completion_length": 359.5, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7176470588235294, "frac_reward_zero_std": 0.0, "grad_norm": 0.7299246191978455, "kl": 0.8759728670120239, "learning_rate": 3.188888888888889e-06, "loss": 0.0009, "num_tokens": 7576432.0, "reward": 7.25, "reward_std": 0.5, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 0.5, "step": 427 }, { "completion_length": 1460.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1460.0, "completions/mean_terminated_length": 1460.0, "completions/min_length": 1199.0, "completions/min_terminated_length": 1199.0, "epoch": 0.719327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.4390854835510254, "kl": 0.5082501769065857, "learning_rate": 3.183333333333334e-06, "loss": 0.0005, "num_tokens": 7592904.0, "reward": 9.375, "reward_std": 1.25, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 1.25, "step": 428 }, { "completion_length": 2338.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2574.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 2338.0, "completions/mean_terminated_length": 2338.0, "completions/min_length": 1791.0, "completions/min_terminated_length": 1791.0, "epoch": 0.7210084033613445, "frac_reward_zero_std": 0.0, "grad_norm": 0.23791323602199554, "kl": 0.38422390818595886, "learning_rate": 3.177777777777778e-06, "loss": 0.0004, "num_tokens": 7612512.0, "reward": 7.25, "reward_std": 1.6955825090408325, "rewards/reward_model/mean": 7.25, "rewards/reward_model/std": 1.6955825090408325, "step": 429 }, { "completion_length": 361.5, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7226890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.5719473958015442, "kl": 1.198698878288269, "learning_rate": 3.1722222222222227e-06, "loss": 0.0012, "num_tokens": 7624754.0, "reward": 7.5, "reward_std": 1.0206207036972046, "rewards/reward_model/mean": 7.5, "rewards/reward_model/std": 1.0206208229064941, "step": 430 }, { "completion_length": 433.5, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 433.5, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.7243697478991596, "frac_reward_zero_std": 0.0, "grad_norm": 0.5106713771820068, "kl": 0.6850795149803162, "learning_rate": 3.1666666666666667e-06, "loss": 0.0007, "num_tokens": 7637216.0, "reward": 8.0, "reward_std": 0.5, "rewards/reward_model/mean": 8.0, "rewards/reward_model/std": 0.5, "step": 431 }, { "completion_length": 2789.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3166.0, "completions/max_terminated_length": 3166.0, "completions/mean_length": 2789.75, "completions/mean_terminated_length": 2789.75, "completions/min_length": 2645.0, "completions/min_terminated_length": 2645.0, "epoch": 0.7260504201680672, "frac_reward_zero_std": 0.0, "grad_norm": 0.20267575979232788, "kl": 0.3429641127586365, "learning_rate": 3.1611111111111115e-06, "loss": 0.0003, "num_tokens": 7658315.0, "reward": 8.625, "reward_std": 1.1814539432525635, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 1.1814539432525635, "step": 432 }, { "completion_length": 12.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7277310924369748, "frac_reward_zero_std": 1.0, "grad_norm": 1.8006809114012867e-05, "kl": 1.6123958826065063, "learning_rate": 3.1555555555555555e-06, "loss": 0.0016, "num_tokens": 7666267.0, "reward": 5.0, "reward_std": 0.0, "rewards/reward_model/mean": 5.0, "rewards/reward_model/std": 0.0, "step": 433 }, { "completion_length": 1852.25, "completions/clipped_ratio": 0.0, "completions/max_length": 1999.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1852.25, "completions/mean_terminated_length": 1852.25, "completions/min_length": 1609.0, "completions/min_terminated_length": 1609.0, "epoch": 0.7294117647058823, "frac_reward_zero_std": 0.0, "grad_norm": 0.3309252858161926, "kl": 0.3283352255821228, "learning_rate": 3.1500000000000003e-06, "loss": 0.0003, "num_tokens": 7683516.0, "reward": 9.875, "reward_std": 0.25, "rewards/reward_model/mean": 9.875, "rewards/reward_model/std": 0.25, "step": 434 }, { "completion_length": 2222.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 2222.5, "completions/mean_terminated_length": 2222.5, "completions/min_length": 2089.0, "completions/min_terminated_length": 2089.0, "epoch": 0.7310924369747899, "frac_reward_zero_std": 0.0, "grad_norm": 0.2202591896057129, "kl": 0.42230889201164246, "learning_rate": 3.144444444444445e-06, "loss": 0.0004, "num_tokens": 7701842.0, "reward": 9.6875, "reward_std": 0.625, "rewards/reward_model/mean": 9.6875, "rewards/reward_model/std": 0.625, "step": 435 }, { "completion_length": 2931.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3158.0, "completions/max_terminated_length": 3158.0, "completions/mean_length": 2931.25, "completions/mean_terminated_length": 2931.25, "completions/min_length": 2484.0, "completions/min_terminated_length": 2484.0, "epoch": 0.7327731092436974, "frac_reward_zero_std": 0.0, "grad_norm": 0.21833336353302002, "kl": 0.5321593284606934, "learning_rate": 3.138888888888889e-06, "loss": 0.0005, "num_tokens": 7723811.0, "reward": 8.144652366638184, "reward_std": 1.6265151500701904, "rewards/reward_model/mean": 8.144652366638184, "rewards/reward_model/std": 1.6265151500701904, "step": 436 }, { "completion_length": 280.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7344537815126051, "frac_reward_zero_std": 0.0, "grad_norm": 0.6133278608322144, "kl": 1.1166269779205322, "learning_rate": 3.133333333333334e-06, "loss": 0.0011, "num_tokens": 7733999.0, "reward": 8.625, "reward_std": 0.14433756470680237, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 0.14433756470680237, "step": 437 }, { "completion_length": 2501.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 2501.5, "completions/mean_terminated_length": 2501.5, "completions/min_length": 2106.0, "completions/min_terminated_length": 2106.0, "epoch": 0.7361344537815127, "frac_reward_zero_std": 0.0, "grad_norm": 0.2427554726600647, "kl": 0.4551956057548523, "learning_rate": 3.127777777777778e-06, "loss": 0.0005, "num_tokens": 7754501.0, "reward": 9.3125, "reward_std": 0.9437292814254761, "rewards/reward_model/mean": 9.3125, "rewards/reward_model/std": 0.9437292814254761, "step": 438 }, { "completion_length": 2383.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 2383.25, "completions/mean_terminated_length": 2383.25, "completions/min_length": 2180.0, "completions/min_terminated_length": 2180.0, "epoch": 0.7378151260504202, "frac_reward_zero_std": 0.0, "grad_norm": 0.26357850432395935, "kl": 0.39432790875434875, "learning_rate": 3.1222222222222228e-06, "loss": 0.0004, "num_tokens": 7774202.0, "reward": 9.6875, "reward_std": 0.625, "rewards/reward_model/mean": 9.6875, "rewards/reward_model/std": 0.625, "step": 439 }, { "completion_length": 296.75, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 296.75, "completions/mean_terminated_length": 296.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7394957983193278, "frac_reward_zero_std": 0.0, "grad_norm": 0.3580112159252167, "kl": 1.0777561664581299, "learning_rate": 3.1166666666666668e-06, "loss": 0.0011, "num_tokens": 7784137.0, "reward": 8.1875, "reward_std": 0.125, "rewards/reward_model/mean": 8.1875, "rewards/reward_model/std": 0.125, "step": 440 }, { "completion_length": 1677.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1677.75, "completions/mean_terminated_length": 1677.75, "completions/min_length": 1380.0, "completions/min_terminated_length": 1380.0, "epoch": 0.7411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.3705597221851349, "kl": 0.36553627252578735, "learning_rate": 3.1111111111111116e-06, "loss": 0.0004, "num_tokens": 7801028.0, "reward": 8.875, "reward_std": 1.314977765083313, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 1.3149778842926025, "step": 441 }, { "completion_length": 1932.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1932.0, "completions/mean_terminated_length": 1932.0, "completions/min_length": 1718.0, "completions/min_terminated_length": 1718.0, "epoch": 0.7428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2825237810611725, "kl": 0.44736677408218384, "learning_rate": 3.1055555555555556e-06, "loss": 0.0004, "num_tokens": 7818500.0, "reward": 7.5, "reward_std": 1.0801234245300293, "rewards/reward_model/mean": 7.5, "rewards/reward_model/std": 1.0801235437393188, "step": 442 }, { "completion_length": 1473.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1473.0, "completions/mean_terminated_length": 1473.0, "completions/min_length": 1176.0, "completions/min_terminated_length": 1176.0, "epoch": 0.7445378151260504, "frac_reward_zero_std": 1.0, "grad_norm": 0.011470720171928406, "kl": 0.5145735144615173, "learning_rate": 3.1000000000000004e-06, "loss": 0.0005, "num_tokens": 7833476.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 443 }, { "completion_length": 319.25, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.746218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.8381687998771667, "kl": 1.0946135520935059, "learning_rate": 3.094444444444445e-06, "loss": 0.0011, "num_tokens": 7844981.0, "reward": 7.625, "reward_std": 1.010362982749939, "rewards/reward_model/mean": 7.625, "rewards/reward_model/std": 1.010362982749939, "step": 444 }, { "completion_length": 1671.5, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1671.5, "completions/mean_terminated_length": 1671.5, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "epoch": 0.7478991596638656, "frac_reward_zero_std": 1.0, "grad_norm": 0.00311764283105731, "kl": 0.35973402857780457, "learning_rate": 3.088888888888889e-06, "loss": 0.0004, "num_tokens": 7861143.0, "reward": 10.0, "reward_std": 0.0, "rewards/reward_model/mean": 10.0, "rewards/reward_model/std": 0.0, "step": 445 }, { "completion_length": 2006.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 2006.75, "completions/mean_terminated_length": 2006.75, "completions/min_length": 1672.0, "completions/min_terminated_length": 1672.0, "epoch": 0.7495798319327731, "frac_reward_zero_std": 0.0, "grad_norm": 0.210985466837883, "kl": 0.366791695356369, "learning_rate": 3.0833333333333336e-06, "loss": 0.0004, "num_tokens": 7879066.0, "reward": 8.0625, "reward_std": 0.42695629596710205, "rewards/reward_model/mean": 8.0625, "rewards/reward_model/std": 0.42695629596710205, "step": 446 }, { "completion_length": 312.5, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7512605042016807, "frac_reward_zero_std": 0.0, "grad_norm": 0.7183101773262024, "kl": 1.07097327709198, "learning_rate": 3.077777777777778e-06, "loss": 0.0011, "num_tokens": 7889340.0, "reward": 8.9375, "reward_std": 0.3145764470100403, "rewards/reward_model/mean": 8.9375, "rewards/reward_model/std": 0.3145764470100403, "step": 447 }, { "completion_length": 272.25, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.8803057074546814, "kl": 1.3437713384628296, "learning_rate": 3.0722222222222224e-06, "loss": 0.0013, "num_tokens": 7899677.0, "reward": 8.262499809265137, "reward_std": 0.5186117887496948, "rewards/reward_model/mean": 8.262499809265137, "rewards/reward_model/std": 0.5186117887496948, "step": 448 }, { "completion_length": 263.5, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7546218487394958, "frac_reward_zero_std": 0.0, "grad_norm": 0.8395476937294006, "kl": 1.2998098134994507, "learning_rate": 3.066666666666667e-06, "loss": 0.0013, "num_tokens": 7909295.0, "reward": 8.25, "reward_std": 0.4564354717731476, "rewards/reward_model/mean": 8.25, "rewards/reward_model/std": 0.4564354717731476, "step": 449 }, { "completion_length": 2215.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 2215.75, "completions/mean_terminated_length": 2215.75, "completions/min_length": 2092.0, "completions/min_terminated_length": 2092.0, "epoch": 0.7563025210084033, "frac_reward_zero_std": 0.0, "grad_norm": 0.2492198944091797, "kl": 0.4474806785583496, "learning_rate": 3.0611111111111112e-06, "loss": 0.0004, "num_tokens": 7927690.0, "reward": 8.5, "reward_std": 0.28867512941360474, "rewards/reward_model/mean": 8.5, "rewards/reward_model/std": 0.28867512941360474, "step": 450 }, { "completion_length": 276.25, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 276.25, "completions/mean_terminated_length": 276.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7579831932773109, "frac_reward_zero_std": 0.0, "grad_norm": 0.6820639371871948, "kl": 1.162339210510254, "learning_rate": 3.055555555555556e-06, "loss": 0.0012, "num_tokens": 7938519.0, "reward": 8.375, "reward_std": 0.25, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.25, "step": 451 }, { "completion_length": 2633.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 2633.75, "completions/mean_terminated_length": 2633.75, "completions/min_length": 2430.0, "completions/min_terminated_length": 2430.0, "epoch": 0.7596638655462185, "frac_reward_zero_std": 0.0, "grad_norm": 0.204949751496315, "kl": 0.3215061128139496, "learning_rate": 3.05e-06, "loss": 0.0003, "num_tokens": 7958170.0, "reward": 8.5625, "reward_std": 0.5153881907463074, "rewards/reward_model/mean": 8.5625, "rewards/reward_model/std": 0.5153881907463074, "step": 452 }, { "completion_length": 291.75, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.761344537815126, "frac_reward_zero_std": 1.0, "grad_norm": 0.001799565157853067, "kl": 1.0171741247177124, "learning_rate": 3.044444444444445e-06, "loss": 0.001, "num_tokens": 7968621.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_model/mean": 0.0, "rewards/reward_model/std": 0.0, "step": 453 }, { "completion_length": 2634.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3293.0, "completions/max_terminated_length": 3293.0, "completions/mean_length": 2634.5, "completions/mean_terminated_length": 2634.5, "completions/min_length": 2338.0, "completions/min_terminated_length": 2338.0, "epoch": 0.7630252100840336, "frac_reward_zero_std": 0.0, "grad_norm": 0.22290703654289246, "kl": 0.3588775098323822, "learning_rate": 3.038888888888889e-06, "loss": 0.0004, "num_tokens": 7988459.0, "reward": 8.75, "reward_std": 1.0408329963684082, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.0408329963684082, "step": 454 }, { "completion_length": 301.75, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.7647058823529411, "frac_reward_zero_std": 0.0, "grad_norm": 0.5139786005020142, "kl": 1.0628443956375122, "learning_rate": 3.0333333333333337e-06, "loss": 0.0011, "num_tokens": 7999266.0, "reward": 7.75, "reward_std": 0.6123724579811096, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 0.6123724579811096, "step": 455 }, { "completion_length": 2497.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2781.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 2497.75, "completions/mean_terminated_length": 2497.75, "completions/min_length": 2271.0, "completions/min_terminated_length": 2271.0, "epoch": 0.7663865546218488, "frac_reward_zero_std": 0.0, "grad_norm": 0.2121046632528305, "kl": 0.3944380283355713, "learning_rate": 3.0277777777777776e-06, "loss": 0.0004, "num_tokens": 8019325.0, "reward": 8.75, "reward_std": 1.172603964805603, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 1.172603964805603, "step": 456 }, { "completion_length": 292.75, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7680672268907563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018121593166142702, "kl": 1.2022079229354858, "learning_rate": 3.0222222222222225e-06, "loss": 0.0012, "num_tokens": 8030408.0, "reward": 9.0, "reward_std": 0.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.0, "step": 457 }, { "completion_length": 2627.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 2627.25, "completions/mean_terminated_length": 2627.25, "completions/min_length": 2280.0, "completions/min_terminated_length": 2280.0, "epoch": 0.7697478991596639, "frac_reward_zero_std": 0.0, "grad_norm": 0.25572511553764343, "kl": 0.37036368250846863, "learning_rate": 3.0166666666666673e-06, "loss": 0.0004, "num_tokens": 8050149.0, "reward": 9.625, "reward_std": 0.75, "rewards/reward_model/mean": 9.625, "rewards/reward_model/std": 0.75, "step": 458 }, { "completion_length": 294.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.7714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.9476527571678162, "kl": 1.0749303102493286, "learning_rate": 3.0111111111111113e-06, "loss": 0.0011, "num_tokens": 8061497.0, "reward": 6.25, "reward_std": 3.523729085922241, "rewards/reward_model/mean": 6.25, "rewards/reward_model/std": 3.523729085922241, "step": 459 }, { "completion_length": 2446.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 2446.0, "completions/mean_terminated_length": 2446.0, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "epoch": 0.773109243697479, "frac_reward_zero_std": 0.0, "grad_norm": 0.18646523356437683, "kl": 0.43717890977859497, "learning_rate": 3.005555555555556e-06, "loss": 0.0004, "num_tokens": 8081057.0, "reward": 8.625, "reward_std": 0.7216877937316895, "rewards/reward_model/mean": 8.625, "rewards/reward_model/std": 0.7216878533363342, "step": 460 }, { "completion_length": 2570.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3041.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 2570.5, "completions/mean_terminated_length": 2570.5, "completions/min_length": 2226.0, "completions/min_terminated_length": 2226.0, "epoch": 0.7747899159663866, "frac_reward_zero_std": 0.0, "grad_norm": 0.24103441834449768, "kl": 0.3680728077888489, "learning_rate": 3e-06, "loss": 0.0004, "num_tokens": 8100031.0, "reward": 9.375, "reward_std": 0.7216877937316895, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 0.7216878533363342, "step": 461 }, { "completion_length": 2773.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 2773.5, "completions/mean_terminated_length": 2773.5, "completions/min_length": 2524.0, "completions/min_terminated_length": 2524.0, "epoch": 0.7764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.2051316648721695, "kl": 0.36059391498565674, "learning_rate": 2.994444444444445e-06, "loss": 0.0004, "num_tokens": 8122957.0, "reward": 8.6875, "reward_std": 1.8860783576965332, "rewards/reward_model/mean": 8.6875, "rewards/reward_model/std": 1.8860783576965332, "step": 462 }, { "completion_length": 1975.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1975.75, "completions/mean_terminated_length": 1975.75, "completions/min_length": 1699.0, "completions/min_terminated_length": 1699.0, "epoch": 0.7781512605042017, "frac_reward_zero_std": 0.0, "grad_norm": 0.31091874837875366, "kl": 0.35764190554618835, "learning_rate": 2.988888888888889e-06, "loss": 0.0004, "num_tokens": 8141328.0, "reward": 9.0, "reward_std": 2.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 2.0, "step": 463 }, { "completion_length": 2129.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 2129.5, "completions/mean_terminated_length": 2129.5, "completions/min_length": 1929.0, "completions/min_terminated_length": 1929.0, "epoch": 0.7798319327731092, "frac_reward_zero_std": 0.0, "grad_norm": 0.23099564015865326, "kl": 0.4275173246860504, "learning_rate": 2.9833333333333337e-06, "loss": 0.0004, "num_tokens": 8158354.0, "reward": 8.375, "reward_std": 1.9843134880065918, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 1.9843134880065918, "step": 464 }, { "completion_length": 252.75, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7815126050420168, "frac_reward_zero_std": 0.0, "grad_norm": 0.46659764647483826, "kl": 1.0949368476867676, "learning_rate": 2.9777777777777777e-06, "loss": 0.0011, "num_tokens": 8168737.0, "reward": 8.375, "reward_std": 0.14433756470680237, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.14433756470680237, "step": 465 }, { "completion_length": 303.75, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7831932773109244, "frac_reward_zero_std": 0.0, "grad_norm": 0.6316483020782471, "kl": 0.9128633737564087, "learning_rate": 2.9722222222222225e-06, "loss": 0.0009, "num_tokens": 8178656.0, "reward": 6.5625, "reward_std": 1.2479149103164673, "rewards/reward_model/mean": 6.5625, "rewards/reward_model/std": 1.2479149103164673, "step": 466 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7848739495798319, "frac_reward_zero_std": 0.0, "grad_norm": 0.5026215314865112, "kl": 0.9611730575561523, "learning_rate": 2.9666666666666673e-06, "loss": 0.001, "num_tokens": 8191012.0, "reward": 6.375, "reward_std": 0.6291528940200806, "rewards/reward_model/mean": 6.375, "rewards/reward_model/std": 0.6291528940200806, "step": 467 }, { "completion_length": 284.25, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.7865546218487395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013979588402435184, "kl": 1.2525960206985474, "learning_rate": 2.9611111111111113e-06, "loss": 0.0013, "num_tokens": 8201777.0, "reward": 9.0, "reward_std": 0.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.0, "step": 468 }, { "completion_length": 2432.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 2432.0, "completions/mean_terminated_length": 2432.0, "completions/min_length": 1859.0, "completions/min_terminated_length": 1859.0, "epoch": 0.788235294117647, "frac_reward_zero_std": 0.0, "grad_norm": 0.21950338780879974, "kl": 0.4229564368724823, "learning_rate": 2.955555555555556e-06, "loss": 0.0004, "num_tokens": 8220485.0, "reward": 8.875, "reward_std": 0.7772815823554993, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.7772815823554993, "step": 469 }, { "completion_length": 2576.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 2576.5, "completions/mean_terminated_length": 2576.5, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "epoch": 0.7899159663865546, "frac_reward_zero_std": 0.0, "grad_norm": 0.2019379734992981, "kl": 0.3797975778579712, "learning_rate": 2.95e-06, "loss": 0.0004, "num_tokens": 8240927.0, "reward": 7.8125, "reward_std": 1.5991533994674683, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 1.5991533994674683, "step": 470 }, { "completion_length": 3294.5, "completions/clipped_ratio": 0.0, "completions/max_length": 4702.0, "completions/max_terminated_length": 4702.0, "completions/mean_length": 3294.5, "completions/mean_terminated_length": 3294.5, "completions/min_length": 2663.0, "completions/min_terminated_length": 2663.0, "epoch": 0.7915966386554621, "frac_reward_zero_std": 0.0, "grad_norm": 0.15410743653774261, "kl": 0.28884583711624146, "learning_rate": 2.944444444444445e-06, "loss": 0.0003, "num_tokens": 8264633.0, "reward": 9.0625, "reward_std": 0.875, "rewards/reward_model/mean": 9.0625, "rewards/reward_model/std": 0.875, "step": 471 }, { "completion_length": 2523.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 2523.25, "completions/mean_terminated_length": 2523.25, "completions/min_length": 2460.0, "completions/min_terminated_length": 2460.0, "epoch": 0.7932773109243697, "frac_reward_zero_std": 0.0, "grad_norm": 0.2220030575990677, "kl": 0.38866809010505676, "learning_rate": 2.938888888888889e-06, "loss": 0.0004, "num_tokens": 8283230.0, "reward": 7.8125, "reward_std": 0.9437292814254761, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 0.9437292814254761, "step": 472 }, { "completion_length": 335.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.7949579831932773, "frac_reward_zero_std": 0.0, "grad_norm": 0.6103029251098633, "kl": 0.9431692957878113, "learning_rate": 2.9333333333333338e-06, "loss": 0.0009, "num_tokens": 8294226.0, "reward": 7.5625, "reward_std": 0.125, "rewards/reward_model/mean": 7.5625, "rewards/reward_model/std": 0.125, "step": 473 }, { "completion_length": 2220.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2394.0, "completions/max_terminated_length": 2394.0, "completions/mean_length": 2220.75, "completions/mean_terminated_length": 2220.75, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "epoch": 0.7966386554621848, "frac_reward_zero_std": 0.0, "grad_norm": 0.24137230217456818, "kl": 0.40519964694976807, "learning_rate": 2.927777777777778e-06, "loss": 0.0004, "num_tokens": 8313621.0, "reward": 8.9375, "reward_std": 0.6574888825416565, "rewards/reward_model/mean": 8.9375, "rewards/reward_model/std": 0.6574889421463013, "step": 474 }, { "completion_length": 278.5, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7983193277310925, "frac_reward_zero_std": 0.0, "grad_norm": 1.0909346342086792, "kl": 1.1162244081497192, "learning_rate": 2.9222222222222226e-06, "loss": 0.0011, "num_tokens": 8323915.0, "reward": 8.75, "reward_std": 0.3535533845424652, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 0.3535533845424652, "step": 475 }, { "completion_length": 1580.75, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 1580.75, "completions/mean_terminated_length": 1580.75, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.42269042134284973, "kl": 0.4337494671344757, "learning_rate": 2.916666666666667e-06, "loss": 0.0004, "num_tokens": 8339378.0, "reward": 9.375, "reward_std": 0.75, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 0.75, "step": 476 }, { "completion_length": 328.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.8016806722689076, "frac_reward_zero_std": 0.0, "grad_norm": 1.0535587072372437, "kl": 1.0401971340179443, "learning_rate": 2.9111111111111114e-06, "loss": 0.001, "num_tokens": 8351422.0, "reward": 8.25, "reward_std": 0.3535533845424652, "rewards/reward_model/mean": 8.25, "rewards/reward_model/std": 0.3535533845424652, "step": 477 }, { "completion_length": 1398.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1398.75, "completions/mean_terminated_length": 1398.75, "completions/min_length": 1235.0, "completions/min_terminated_length": 1235.0, "epoch": 0.8033613445378152, "frac_reward_zero_std": 0.0, "grad_norm": 0.39420226216316223, "kl": 0.40025004744529724, "learning_rate": 2.9055555555555558e-06, "loss": 0.0004, "num_tokens": 8366709.0, "reward": 9.25, "reward_std": 0.9574271440505981, "rewards/reward_model/mean": 9.25, "rewards/reward_model/std": 0.9574271440505981, "step": 478 }, { "completion_length": 287.75, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.8050420168067227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007705939933657646, "kl": 1.2240272760391235, "learning_rate": 2.9e-06, "loss": 0.0012, "num_tokens": 8377080.0, "reward": 9.0, "reward_std": 0.0, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 0.0, "step": 479 }, { "completion_length": 1695.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1695.75, "completions/mean_terminated_length": 1695.75, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "epoch": 0.8067226890756303, "frac_reward_zero_std": 0.0, "grad_norm": 0.3496275246143341, "kl": 0.4244525730609894, "learning_rate": 2.8944444444444446e-06, "loss": 0.0004, "num_tokens": 8393271.0, "reward": 6.625, "reward_std": 3.902456521987915, "rewards/reward_model/mean": 6.625, "rewards/reward_model/std": 3.902456521987915, "step": 480 }, { "completion_length": 1932.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 1932.25, "completions/mean_terminated_length": 1932.25, "completions/min_length": 1724.0, "completions/min_terminated_length": 1724.0, "epoch": 0.8084033613445378, "frac_reward_zero_std": 0.0, "grad_norm": 0.3640977442264557, "kl": 0.7901957035064697, "learning_rate": 2.888888888888889e-06, "loss": 0.0008, "num_tokens": 8409832.0, "reward": 9.375, "reward_std": 0.9464846849441528, "rewards/reward_model/mean": 9.375, "rewards/reward_model/std": 0.9464847445487976, "step": 481 }, { "completion_length": 335.5, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8100840336134454, "frac_reward_zero_std": 0.0, "grad_norm": 0.6656431555747986, "kl": 1.0875601768493652, "learning_rate": 2.8833333333333334e-06, "loss": 0.0011, "num_tokens": 8422090.0, "reward": 6.6875, "reward_std": 0.8003905415534973, "rewards/reward_model/mean": 6.6875, "rewards/reward_model/std": 0.8003905415534973, "step": 482 }, { "completion_length": 297.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8117647058823529, "frac_reward_zero_std": 0.0, "grad_norm": 0.5610443949699402, "kl": 0.9642500877380371, "learning_rate": 2.8777777777777782e-06, "loss": 0.001, "num_tokens": 8431950.0, "reward": 5.6875, "reward_std": 0.125, "rewards/reward_model/mean": 5.6875, "rewards/reward_model/std": 0.125, "step": 483 }, { "completion_length": 2636.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 2636.25, "completions/mean_terminated_length": 2636.25, "completions/min_length": 2577.0, "completions/min_terminated_length": 2577.0, "epoch": 0.8134453781512605, "frac_reward_zero_std": 0.0, "grad_norm": 0.20206233859062195, "kl": 0.3824126720428467, "learning_rate": 2.872222222222222e-06, "loss": 0.0004, "num_tokens": 8452191.0, "reward": 7.8125, "reward_std": 1.9080421924591064, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 1.9080421924591064, "step": 484 }, { "completion_length": 339.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.8151260504201681, "frac_reward_zero_std": 0.0, "grad_norm": 1.0165866613388062, "kl": 0.9398729205131531, "learning_rate": 2.866666666666667e-06, "loss": 0.0009, "num_tokens": 8464831.0, "reward": 9.1875, "reward_std": 0.125, "rewards/reward_model/mean": 9.1875, "rewards/reward_model/std": 0.125, "step": 485 }, { "completion_length": 327.25, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8168067226890756, "frac_reward_zero_std": 0.0, "grad_norm": 0.5722994804382324, "kl": 1.0113110542297363, "learning_rate": 2.861111111111111e-06, "loss": 0.001, "num_tokens": 8475676.0, "reward": 8.875, "reward_std": 0.25, "rewards/reward_model/mean": 8.875, "rewards/reward_model/std": 0.25, "step": 486 }, { "completion_length": 284.5, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8184873949579832, "frac_reward_zero_std": 0.0, "grad_norm": 1.5203814506530762, "kl": 3.0067973136901855, "learning_rate": 2.855555555555556e-06, "loss": 0.003, "num_tokens": 8487526.0, "reward": 9.1875, "reward_std": 0.125, "rewards/reward_model/mean": 9.1875, "rewards/reward_model/std": 0.125, "step": 487 }, { "completion_length": 2633.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2744.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 2633.5, "completions/mean_terminated_length": 2633.5, "completions/min_length": 2445.0, "completions/min_terminated_length": 2445.0, "epoch": 0.8201680672268907, "frac_reward_zero_std": 0.0, "grad_norm": 0.19445210695266724, "kl": 0.3773399293422699, "learning_rate": 2.85e-06, "loss": 0.0004, "num_tokens": 8507312.0, "reward": 9.4375, "reward_std": 0.7180703282356262, "rewards/reward_model/mean": 9.4375, "rewards/reward_model/std": 0.7180703282356262, "step": 488 }, { "completion_length": 2859.25, "completions/clipped_ratio": 0.0, "completions/max_length": 3251.0, "completions/max_terminated_length": 3251.0, "completions/mean_length": 2859.25, "completions/mean_terminated_length": 2859.25, "completions/min_length": 2620.0, "completions/min_terminated_length": 2620.0, "epoch": 0.8218487394957983, "frac_reward_zero_std": 0.0, "grad_norm": 0.2145254760980606, "kl": 0.3481994867324829, "learning_rate": 2.8444444444444446e-06, "loss": 0.0003, "num_tokens": 8528401.0, "reward": 6.375, "reward_std": 2.3318448066711426, "rewards/reward_model/mean": 6.375, "rewards/reward_model/std": 2.3318448066711426, "step": 489 }, { "completion_length": 340.75, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.8235294117647058, "frac_reward_zero_std": 0.0, "grad_norm": 0.45591264963150024, "kl": 0.9131271839141846, "learning_rate": 2.8388888888888895e-06, "loss": 0.0009, "num_tokens": 8540656.0, "reward": 7.8125, "reward_std": 0.5907269716262817, "rewards/reward_model/mean": 7.8125, "rewards/reward_model/std": 0.5907269716262817, "step": 490 }, { "completion_length": 273.25, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8252100840336134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5277583003044128, "kl": 1.2598211765289307, "learning_rate": 2.8333333333333335e-06, "loss": 0.0013, "num_tokens": 8550997.0, "reward": 8.75, "reward_std": 0.20412415266036987, "rewards/reward_model/mean": 8.75, "rewards/reward_model/std": 0.20412415266036987, "step": 491 }, { "completion_length": 1722.75, "completions/clipped_ratio": 0.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1722.75, "completions/mean_terminated_length": 1722.75, "completions/min_length": 1526.0, "completions/min_terminated_length": 1526.0, "epoch": 0.826890756302521, "frac_reward_zero_std": 0.0, "grad_norm": 0.36949422955513, "kl": 0.43771082162857056, "learning_rate": 2.8277777777777783e-06, "loss": 0.0004, "num_tokens": 8568028.0, "reward": 9.75, "reward_std": 0.5, "rewards/reward_model/mean": 9.75, "rewards/reward_model/std": 0.5, "step": 492 }, { "completion_length": 2203.5, "completions/clipped_ratio": 0.0, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 2203.5, "completions/mean_terminated_length": 2203.5, "completions/min_length": 1756.0, "completions/min_terminated_length": 1756.0, "epoch": 0.8285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.21307273209095, "kl": 0.3633190989494324, "learning_rate": 2.8222222222222223e-06, "loss": 0.0004, "num_tokens": 8586702.0, "reward": 7.75, "reward_std": 2.254624843597412, "rewards/reward_model/mean": 7.75, "rewards/reward_model/std": 2.254624843597412, "step": 493 }, { "completion_length": 2487.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2643.0, "completions/max_terminated_length": 2643.0, "completions/mean_length": 2487.25, "completions/mean_terminated_length": 2487.25, "completions/min_length": 2400.0, "completions/min_terminated_length": 2400.0, "epoch": 0.8302521008403362, "frac_reward_zero_std": 0.0, "grad_norm": 0.22060628235340118, "kl": 0.4187045395374298, "learning_rate": 2.816666666666667e-06, "loss": 0.0004, "num_tokens": 8606435.0, "reward": 9.0, "reward_std": 1.670828104019165, "rewards/reward_model/mean": 9.0, "rewards/reward_model/std": 1.670828104019165, "step": 494 }, { "completion_length": 2614.75, "completions/clipped_ratio": 0.0, "completions/max_length": 3125.0, "completions/max_terminated_length": 3125.0, "completions/mean_length": 2614.75, "completions/mean_terminated_length": 2614.75, "completions/min_length": 2407.0, "completions/min_terminated_length": 2407.0, "epoch": 0.8319327731092437, "frac_reward_zero_std": 0.0, "grad_norm": 0.20532122254371643, "kl": 0.38791146874427795, "learning_rate": 2.811111111111111e-06, "loss": 0.0004, "num_tokens": 8627014.0, "reward": 9.9375, "reward_std": 0.125, "rewards/reward_model/mean": 9.9375, "rewards/reward_model/std": 0.125, "step": 495 }, { "completion_length": 2414.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2650.0, "completions/max_terminated_length": 2650.0, "completions/mean_length": 2414.0, "completions/mean_terminated_length": 2414.0, "completions/min_length": 2118.0, "completions/min_terminated_length": 2118.0, "epoch": 0.8336134453781513, "frac_reward_zero_std": 0.0, "grad_norm": 0.24500900506973267, "kl": 0.41920095682144165, "learning_rate": 2.805555555555556e-06, "loss": 0.0004, "num_tokens": 8647778.0, "reward": 8.375, "reward_std": 0.14433756470680237, "rewards/reward_model/mean": 8.375, "rewards/reward_model/std": 0.14433756470680237, "step": 496 }, { "completion_length": 2251.5, "completions/clipped_ratio": 0.0, "completions/max_length": 3407.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 2251.5, "completions/mean_terminated_length": 2251.5, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "epoch": 0.8352941176470589, "frac_reward_zero_std": 0.0, "grad_norm": 0.24598845839500427, "kl": 0.33193981647491455, "learning_rate": 2.8000000000000003e-06, "loss": 0.0003, "num_tokens": 8666480.0, "reward": 6.875, "reward_std": 4.661455154418945, "rewards/reward_model/mean": 6.875, "rewards/reward_model/std": 4.6614556312561035, "step": 497 }, { "completion_length": 2489.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2637.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 2489.0, "completions/mean_terminated_length": 2489.0, "completions/min_length": 2345.0, "completions/min_terminated_length": 2345.0, "epoch": 0.8369747899159664, "frac_reward_zero_std": 0.0, "grad_norm": 0.22297415137290955, "kl": 0.3635345995426178, "learning_rate": 2.7944444444444447e-06, "loss": 0.0004, "num_tokens": 8687368.0, "reward": 9.1875, "reward_std": 0.9437292814254761, "rewards/reward_model/mean": 9.1875, "rewards/reward_model/std": 0.9437292814254761, "step": 498 }, { "completion_length": 2200.25, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 2200.25, "completions/mean_terminated_length": 2200.25, "completions/min_length": 1920.0, "completions/min_terminated_length": 1920.0, "epoch": 0.838655462184874, "frac_reward_zero_std": 0.0, "grad_norm": 0.23436479270458221, "kl": 0.40902072191238403, "learning_rate": 2.788888888888889e-06, "loss": 0.0004, "num_tokens": 8705205.0, "reward": 9.0625, "reward_std": 0.375, "rewards/reward_model/mean": 9.0625, "rewards/reward_model/std": 0.375, "step": 499 }, { "completion_length": 1925.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 1925.0, "completions/mean_terminated_length": 1925.0, "completions/min_length": 1510.0, "completions/min_terminated_length": 1510.0, "epoch": 0.8403361344537815, "frac_reward_zero_std": 0.0, "grad_norm": 0.36077001690864563, "kl": 0.3835509121417999, "learning_rate": 2.7833333333333335e-06, "loss": 0.0004, "num_tokens": 8722881.0, "reward": 7.375, "reward_std": 3.1983067989349365, "rewards/reward_model/mean": 7.375, "rewards/reward_model/std": 3.1983067989349365, "step": 500 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 8722881, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }