diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6577 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.998452810727179, + "eval_steps": 500, + "global_step": 242, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.453125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 2762.078369140625, + "completions/mean_terminated_length": 1656.8284912109375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.0041258380608561115, + "grad_norm": 0.10848142206668854, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 1505084.0, + "reward": 0.029017861932516098, + "reward_std": 0.047291483730077744, + "rewards/code_format_reward/mean": 0.0223214291036129, + "rewards/code_format_reward/std": 0.14789186418056488, + "rewards/curriculum_aware_reward_fn/mean": 0.0066964286379516125, + "rewards/curriculum_aware_reward_fn/std": 0.0310124009847641, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4107142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2660.15625, + "completions/mean_terminated_length": 1659.416748046875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.008251676121712223, + "grad_norm": 0.12915591895580292, + "kl": 0.0005238056182861328, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 2947042.0, + "reward": 0.04441964253783226, + "reward_std": 0.0783877819776535, + "rewards/code_format_reward/mean": 0.02901785634458065, + "rewards/code_format_reward/std": 0.16804419457912445, + "rewards/curriculum_aware_reward_fn/mean": 0.015401787124574184, + "rewards/curriculum_aware_reward_fn/std": 0.045581694692373276, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4486607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 2641.801513671875, + "completions/mean_terminated_length": 1458.4251708984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.012377514182568335, + "grad_norm": 0.12791307270526886, + "kl": 0.0005018711090087891, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 4383850.0, + "reward": 0.04955357685685158, + "reward_std": 0.08979818224906921, + "rewards/code_format_reward/mean": 0.0334821417927742, + "rewards/code_format_reward/std": 0.1800929754972458, + "rewards/curriculum_aware_reward_fn/mean": 0.01607142947614193, + "rewards/curriculum_aware_reward_fn/std": 0.04751747474074364, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5803571428571428, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3912.0, + "completions/mean_length": 3049.80810546875, + "completions/mean_terminated_length": 1602.94677734375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.016503352243424446, + "grad_norm": 0.26403144001960754, + "kl": 0.0005340576171875, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 6031011.0, + "reward": 0.06015625596046448, + "reward_std": 0.08077409863471985, + "rewards/code_format_reward/mean": 0.0424107126891613, + "rewards/code_format_reward/std": 0.20174959301948547, + "rewards/curriculum_aware_reward_fn/mean": 0.01774553582072258, + "rewards/curriculum_aware_reward_fn/std": 0.04849924519658089, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5669642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 2951.32373046875, + "completions/mean_terminated_length": 1452.6236572265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.020629190304280558, + "grad_norm": 0.11759795993566513, + "kl": 0.0006890296936035156, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 7632987.0, + "reward": 0.10122768580913544, + "reward_std": 0.150455042719841, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.26866820454597473, + "rewards/curriculum_aware_reward_fn/mean": 0.02310268022119999, + "rewards/curriculum_aware_reward_fn/std": 0.05603185296058655, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4196428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 2626.33935546875, + "completions/mean_terminated_length": 1563.6614990234375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.02475502836513667, + "grad_norm": 0.176630437374115, + "kl": 0.0012578964233398438, + "learning_rate": 1e-06, + "loss": 0.0312, + "num_tokens": 9085730.0, + "reward": 0.18604911863803864, + "reward_std": 0.23488061130046844, + "rewards/code_format_reward/mean": 0.1428571492433548, + "rewards/code_format_reward/std": 0.3503182828426361, + "rewards/curriculum_aware_reward_fn/mean": 0.04319196566939354, + "rewards/curriculum_aware_reward_fn/std": 0.06799682974815369, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4263392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 2600.0625, + "completions/mean_terminated_length": 1488.2957763671875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.02888086642599278, + "grad_norm": 0.1658683717250824, + "kl": 0.0018901824951171875, + "learning_rate": 1e-06, + "loss": 0.0443, + "num_tokens": 10532126.0, + "reward": 0.2216518074274063, + "reward_std": 0.23757225275039673, + "rewards/code_format_reward/mean": 0.1674107164144516, + "rewards/code_format_reward/std": 0.37375950813293457, + "rewards/curriculum_aware_reward_fn/mean": 0.054241079837083817, + "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4709821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 2653.466552734375, + "completions/mean_terminated_length": 1369.185546875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.03300670448684889, + "grad_norm": 0.16779452562332153, + "kl": 0.0017452239990234375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 11985080.0, + "reward": 0.2589285969734192, + "reward_std": 0.23284269869327545, + "rewards/code_format_reward/mean": 0.1986607164144516, + "rewards/code_format_reward/std": 0.3994380831718445, + "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, + "rewards/curriculum_aware_reward_fn/std": 0.07362107187509537, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5066964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 2908.83056640625, + "completions/mean_terminated_length": 1689.429931640625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.037132542547705004, + "grad_norm": 0.1578310877084732, + "kl": 0.00200653076171875, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 13550561.0, + "reward": 0.36138394474983215, + "reward_std": 0.30128249526023865, + "rewards/code_format_reward/mean": 0.2924107015132904, + "rewards/code_format_reward/std": 0.4553784430027008, + "rewards/curriculum_aware_reward_fn/mean": 0.06897322088479996, + "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4330357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 2671.055908203125, + "completions/mean_terminated_length": 1582.712646484375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.041258380608561115, + "grad_norm": 0.1712757647037506, + "kl": 0.0027599334716796875, + "learning_rate": 1e-06, + "loss": 0.0316, + "num_tokens": 15018507.0, + "reward": 0.500558078289032, + "reward_std": 0.3041485548019409, + "rewards/code_format_reward/mean": 0.421875, + "rewards/code_format_reward/std": 0.4944108724594116, + "rewards/curriculum_aware_reward_fn/mean": 0.078683041036129, + "rewards/curriculum_aware_reward_fn/std": 0.07499326020479202, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5022321428571428, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 2796.071533203125, + "completions/mean_terminated_length": 1484.484375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.04538421866941723, + "grad_norm": 0.7890152335166931, + "kl": 0.005886077880859375, + "learning_rate": 1e-06, + "loss": 0.0268, + "num_tokens": 16537021.0, + "reward": 0.49921879172325134, + "reward_std": 0.2203681766986847, + "rewards/code_format_reward/mean": 0.4285714328289032, + "rewards/code_format_reward/std": 0.49542486667633057, + "rewards/curriculum_aware_reward_fn/mean": 0.07064732164144516, + "rewards/curriculum_aware_reward_fn/std": 0.07495728880167007, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4821428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 2733.216552734375, + "completions/mean_terminated_length": 1464.4180908203125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.04951005673027334, + "grad_norm": 0.15826576948165894, + "kl": 0.0032367706298828125, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 18030295.0, + "reward": 0.5366071462631226, + "reward_std": 0.24472731351852417, + "rewards/code_format_reward/mean": 0.4709821343421936, + "rewards/code_format_reward/std": 0.49971529841423035, + "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, + "rewards/curriculum_aware_reward_fn/std": 0.07449494302272797, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 3284.473388671875, + "completions/mean_terminated_length": 1837.8385009765625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.05363589479112945, + "grad_norm": 0.12409358471632004, + "kl": 0.0019292831420898438, + "learning_rate": 1e-06, + "loss": 0.0648, + "num_tokens": 19771120.0, + "reward": 0.3962053656578064, + "reward_std": 0.24470412731170654, + "rewards/code_format_reward/mean": 0.3459821343421936, + "rewards/code_format_reward/std": 0.47621920704841614, + "rewards/curriculum_aware_reward_fn/mean": 0.0502232126891613, + "rewards/curriculum_aware_reward_fn/std": 0.07086833566427231, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4955357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 2699.68994140625, + "completions/mean_terminated_length": 1328.0928955078125, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.05776173285198556, + "grad_norm": 0.15436327457427979, + "kl": 0.004604339599609375, + "learning_rate": 1e-06, + "loss": 0.0586, + "num_tokens": 21266482.0, + "reward": 0.5574777126312256, + "reward_std": 0.19266396760940552, + "rewards/code_format_reward/mean": 0.4888392984867096, + "rewards/code_format_reward/std": 0.5004342794418335, + "rewards/curriculum_aware_reward_fn/mean": 0.0686383917927742, + "rewards/curriculum_aware_reward_fn/std": 0.07938338816165924, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4799107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4050.0, + "completions/mean_length": 2604.18310546875, + "completions/mean_terminated_length": 1227.61376953125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.06188757091284167, + "grad_norm": 0.14027918875217438, + "kl": 0.004291534423828125, + "learning_rate": 1e-06, + "loss": 0.0539, + "num_tokens": 22688586.0, + "reward": 0.5889509320259094, + "reward_std": 0.16005173325538635, + "rewards/code_format_reward/mean": 0.515625, + "rewards/code_format_reward/std": 0.500314474105835, + "rewards/curriculum_aware_reward_fn/mean": 0.07332588732242584, + "rewards/curriculum_aware_reward_fn/std": 0.07506514340639114, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 2344.2724609375, + "completions/mean_terminated_length": 1293.2357177734375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.06601340897369778, + "grad_norm": 0.14219032227993011, + "kl": 0.004871368408203125, + "learning_rate": 1e-06, + "loss": 0.081, + "num_tokens": 24024252.0, + "reward": 0.7078125476837158, + "reward_std": 0.1512121558189392, + "rewards/code_format_reward/mean": 0.6227678656578064, + "rewards/code_format_reward/std": 0.48523563146591187, + "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584, + "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4129464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 2521.03369140625, + "completions/mean_terminated_length": 1413.167236328125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.07013924703455389, + "grad_norm": 0.1439032107591629, + "kl": 0.0036773681640625, + "learning_rate": 1e-06, + "loss": 0.0766, + "num_tokens": 25413592.0, + "reward": 0.662834882736206, + "reward_std": 0.19979971647262573, + "rewards/code_format_reward/mean": 0.5848214030265808, + "rewards/code_format_reward/std": 0.49330365657806396, + "rewards/curriculum_aware_reward_fn/mean": 0.07801339775323868, + "rewards/curriculum_aware_reward_fn/std": 0.07958128303289413, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4352678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 2537.692138671875, + "completions/mean_terminated_length": 1336.62451171875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.07426508509541001, + "grad_norm": 0.20727689564228058, + "kl": 0.0038776397705078125, + "learning_rate": 1e-06, + "loss": 0.0636, + "num_tokens": 26820657.0, + "reward": 0.6412946581840515, + "reward_std": 0.1462433785200119, + "rewards/code_format_reward/mean": 0.5602678656578064, + "rewards/code_format_reward/std": 0.49690937995910645, + "rewards/curriculum_aware_reward_fn/mean": 0.08102678507566452, + "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 2791.4130859375, + "completions/mean_terminated_length": 1565.8917236328125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.07839092315626611, + "grad_norm": 0.22532987594604492, + "kl": 0.002948760986328125, + "learning_rate": 1e-06, + "loss": 0.0666, + "num_tokens": 28362529.0, + "reward": 0.5607143640518188, + "reward_std": 0.17854316532611847, + "rewards/code_format_reward/mean": 0.4977678656578064, + "rewards/code_format_reward/std": 0.5005539655685425, + "rewards/curriculum_aware_reward_fn/mean": 0.06294643133878708, + "rewards/curriculum_aware_reward_fn/std": 0.074107825756073, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4069.0, + "completions/mean_length": 2899.388427734375, + "completions/mean_terminated_length": 1518.6827392578125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.08251676121712223, + "grad_norm": 0.14237935841083527, + "kl": 0.0032501220703125, + "learning_rate": 1e-06, + "loss": 0.0718, + "num_tokens": 29931508.0, + "reward": 0.5177456140518188, + "reward_std": 0.18259648978710175, + "rewards/code_format_reward/mean": 0.4598214328289032, + "rewards/code_format_reward/std": 0.49894019961357117, + "rewards/curriculum_aware_reward_fn/mean": 0.05792411044239998, + "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4464285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4025.0, + "completions/mean_length": 2652.560302734375, + "completions/mean_terminated_length": 1488.4959716796875, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.08664259927797834, + "grad_norm": 0.9559803009033203, + "kl": 0.0038776397705078125, + "learning_rate": 1e-06, + "loss": 0.0722, + "num_tokens": 31381718.0, + "reward": 0.6266741156578064, + "reward_std": 0.17785993218421936, + "rewards/code_format_reward/mean": 0.5513392686843872, + "rewards/code_format_reward/std": 0.49791330099105835, + "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774, + "rewards/curriculum_aware_reward_fn/std": 0.07508309930562973, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4285714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3976.0, + "completions/mean_length": 2559.6318359375, + "completions/mean_terminated_length": 1407.35546875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.09076843733883445, + "grad_norm": 0.15661275386810303, + "kl": 0.004146575927734375, + "learning_rate": 1e-06, + "loss": 0.0993, + "num_tokens": 32786432.0, + "reward": 0.6441964507102966, + "reward_std": 0.21372844278812408, + "rewards/code_format_reward/mean": 0.5691964030265808, + "rewards/code_format_reward/std": 0.4957422614097595, + "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, + "rewards/curriculum_aware_reward_fn/std": 0.07508385181427002, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4196428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 2585.23681640625, + "completions/mean_terminated_length": 1492.8385009765625, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.09489427539969056, + "grad_norm": 0.17427508533000946, + "kl": 0.00446319580078125, + "learning_rate": 1e-06, + "loss": 0.0821, + "num_tokens": 34208021.0, + "reward": 0.6626116633415222, + "reward_std": 0.19658702611923218, + "rewards/code_format_reward/mean": 0.5825892686843872, + "rewards/code_format_reward/std": 0.4936830997467041, + "rewards/curriculum_aware_reward_fn/mean": 0.08002232015132904, + "rewards/curriculum_aware_reward_fn/std": 0.074915312230587, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3348214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 2340.810302734375, + "completions/mean_terminated_length": 1457.3255615234375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.09902011346054668, + "grad_norm": 0.1813523918390274, + "kl": 0.0107879638671875, + "learning_rate": 1e-06, + "loss": 0.109, + "num_tokens": 35529753.0, + "reward": 0.751897394657135, + "reward_std": 0.21504586935043335, + "rewards/code_format_reward/mean": 0.6651785969734192, + "rewards/code_format_reward/std": 0.47245556116104126, + "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164, + "rewards/curriculum_aware_reward_fn/std": 0.07416162639856339, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3705357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 2455.239013671875, + "completions/mean_terminated_length": 1489.400634765625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.10314595152140278, + "grad_norm": 0.17525888979434967, + "kl": 0.00499725341796875, + "learning_rate": 1e-06, + "loss": 0.0594, + "num_tokens": 36900957.0, + "reward": 0.7077009081840515, + "reward_std": 0.20892252027988434, + "rewards/code_format_reward/mean": 0.625, + "rewards/code_format_reward/std": 0.48466411232948303, + "rewards/curriculum_aware_reward_fn/mean": 0.08270090073347092, + "rewards/curriculum_aware_reward_fn/std": 0.07535793632268906, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3526785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 2328.640625, + "completions/mean_terminated_length": 1365.7344970703125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.1072717895822589, + "grad_norm": 0.15375342965126038, + "kl": 0.00540924072265625, + "learning_rate": 1e-06, + "loss": 0.0594, + "num_tokens": 38222347.0, + "reward": 0.7113839983940125, + "reward_std": 0.1420706957578659, + "rewards/code_format_reward/mean": 0.6316964030265808, + "rewards/code_format_reward/std": 0.4828835427761078, + "rewards/curriculum_aware_reward_fn/mean": 0.07968749850988388, + "rewards/curriculum_aware_reward_fn/std": 0.07493705302476883, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4508928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4056.0, + "completions/mean_length": 2649.419677734375, + "completions/mean_terminated_length": 1461.5771484375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.111397627643115, + "grad_norm": 0.1293102651834488, + "kl": 0.00531768798828125, + "learning_rate": 1e-06, + "loss": 0.065, + "num_tokens": 39685605.0, + "reward": 0.6244419813156128, + "reward_std": 0.17081955075263977, + "rewards/code_format_reward/mean": 0.5491071343421936, + "rewards/code_format_reward/std": 0.4981389045715332, + "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774, + "rewards/curriculum_aware_reward_fn/std": 0.07900315523147583, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 2605.43310546875, + "completions/mean_terminated_length": 1446.103271484375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.11552346570397112, + "grad_norm": 0.16880519688129425, + "kl": 0.0055389404296875, + "learning_rate": 1e-06, + "loss": 0.0838, + "num_tokens": 41124915.0, + "reward": 0.6412946581840515, + "reward_std": 0.23654666543006897, + "rewards/code_format_reward/mean": 0.5602678656578064, + "rewards/code_format_reward/std": 0.49690937995910645, + "rewards/curriculum_aware_reward_fn/mean": 0.08102679252624512, + "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3973214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 2520.87060546875, + "completions/mean_terminated_length": 1482.4517822265625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.11964930376482723, + "grad_norm": 0.16759856045246124, + "kl": 0.005680084228515625, + "learning_rate": 1e-06, + "loss": 0.0741, + "num_tokens": 42532123.0, + "reward": 0.6870536208152771, + "reward_std": 0.23582594096660614, + "rewards/code_format_reward/mean": 0.6026785969734192, + "rewards/code_format_reward/std": 0.48989060521125793, + "rewards/curriculum_aware_reward_fn/mean": 0.08437500894069672, + "rewards/curriculum_aware_reward_fn/std": 0.07844439893960953, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 2569.11181640625, + "completions/mean_terminated_length": 1590.3370361328125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.12377514182568335, + "grad_norm": 0.25616586208343506, + "kl": 0.00691986083984375, + "learning_rate": 1e-06, + "loss": 0.0843, + "num_tokens": 43947307.0, + "reward": 0.6825892925262451, + "reward_std": 0.2599073052406311, + "rewards/code_format_reward/mean": 0.6049107313156128, + "rewards/code_format_reward/std": 0.4894163906574249, + "rewards/curriculum_aware_reward_fn/mean": 0.07767857611179352, + "rewards/curriculum_aware_reward_fn/std": 0.07570379227399826, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4241071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 2515.082763671875, + "completions/mean_terminated_length": 1350.841064453125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.12790097988653945, + "grad_norm": 0.1738700419664383, + "kl": 0.005664825439453125, + "learning_rate": 1e-06, + "loss": 0.0742, + "num_tokens": 45356850.0, + "reward": 0.6507812738418579, + "reward_std": 0.21465569734573364, + "rewards/code_format_reward/mean": 0.578125, + "rewards/code_format_reward/std": 0.4944108724594116, + "rewards/curriculum_aware_reward_fn/mean": 0.07265625894069672, + "rewards/curriculum_aware_reward_fn/std": 0.07896901667118073, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3482142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 2322.555908203125, + "completions/mean_terminated_length": 1375.099365234375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.13202681794739557, + "grad_norm": 0.19044172763824463, + "kl": 0.00688934326171875, + "learning_rate": 1e-06, + "loss": 0.0985, + "num_tokens": 46668801.0, + "reward": 0.7402902841567993, + "reward_std": 0.2134973704814911, + "rewards/code_format_reward/mean": 0.65625, + "rewards/code_format_reward/std": 0.47548985481262207, + "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096, + "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4017857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2533.83056640625, + "completions/mean_terminated_length": 1484.6119384765625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.1361526560082517, + "grad_norm": 0.1775142401456833, + "kl": 0.0061492919921875, + "learning_rate": 1e-06, + "loss": 0.1187, + "num_tokens": 48082396.0, + "reward": 0.6822544932365417, + "reward_std": 0.2595500349998474, + "rewards/code_format_reward/mean": 0.5982142686843872, + "rewards/code_format_reward/std": 0.49080711603164673, + "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096, + "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4464285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4036.0, + "completions/mean_length": 2743.6630859375, + "completions/mean_terminated_length": 1653.0684814453125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.14027849406910778, + "grad_norm": 0.17279711365699768, + "kl": 0.0059051513671875, + "learning_rate": 1e-06, + "loss": 0.1089, + "num_tokens": 49573595.0, + "reward": 0.6310268640518188, + "reward_std": 0.2643914222717285, + "rewards/code_format_reward/mean": 0.5580357313156128, + "rewards/code_format_reward/std": 0.4971756041049957, + "rewards/curriculum_aware_reward_fn/mean": 0.07299107313156128, + "rewards/curriculum_aware_reward_fn/std": 0.07505691051483154, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4084821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4014.0, + "completions/mean_length": 2337.328125, + "completions/mean_terminated_length": 1122.84912109375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.1444043321299639, + "grad_norm": 0.17568987607955933, + "kl": 0.00757598876953125, + "learning_rate": 1e-06, + "loss": 0.0903, + "num_tokens": 50905954.0, + "reward": 0.6639509201049805, + "reward_std": 0.19163252413272858, + "rewards/code_format_reward/mean": 0.5892857313156128, + "rewards/code_format_reward/std": 0.4925134479999542, + "rewards/curriculum_aware_reward_fn/mean": 0.07466518133878708, + "rewards/curriculum_aware_reward_fn/std": 0.07508310675621033, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 2267.618408203125, + "completions/mean_terminated_length": 1495.6348876953125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.14853017019082002, + "grad_norm": 2.414182662963867, + "kl": 0.00893402099609375, + "learning_rate": 1e-06, + "loss": 0.0851, + "num_tokens": 52191977.0, + "reward": 0.7859375476837158, + "reward_std": 0.21878042817115784, + "rewards/code_format_reward/mean": 0.7008928656578064, + "rewards/code_format_reward/std": 0.45837870240211487, + "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584, + "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2924107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 2191.107177734375, + "completions/mean_terminated_length": 1403.9117431640625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.15265600825167613, + "grad_norm": 0.3153815269470215, + "kl": 0.010066986083984375, + "learning_rate": 1e-06, + "loss": 0.0934, + "num_tokens": 53442519.0, + "reward": 0.7965402007102966, + "reward_std": 0.2366231083869934, + "rewards/code_format_reward/mean": 0.7098214030265808, + "rewards/code_format_reward/std": 0.4543519914150238, + "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164, + "rewards/curriculum_aware_reward_fn/std": 0.07416163384914398, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2611607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 2011.8907470703125, + "completions/mean_terminated_length": 1275.21142578125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.15678184631253222, + "grad_norm": 0.18517161905765533, + "kl": 0.009616851806640625, + "learning_rate": 1e-06, + "loss": 0.0701, + "num_tokens": 54602188.0, + "reward": 0.8469865918159485, + "reward_std": 0.15580664575099945, + "rewards/code_format_reward/mean": 0.7455357313156128, + "rewards/code_format_reward/std": 0.4360465407371521, + "rewards/curriculum_aware_reward_fn/mean": 0.10145089775323868, + "rewards/curriculum_aware_reward_fn/std": 0.07025929540395737, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2455357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 2168.19873046875, + "completions/mean_terminated_length": 1540.8077392578125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.16090768437338834, + "grad_norm": 0.20817974209785461, + "kl": 0.00962066650390625, + "learning_rate": 1e-06, + "loss": 0.1159, + "num_tokens": 55845050.0, + "reward": 0.8487723469734192, + "reward_std": 0.23730090260505676, + "rewards/code_format_reward/mean": 0.7566964030265808, + "rewards/code_format_reward/std": 0.42955654859542847, + "rewards/curriculum_aware_reward_fn/mean": 0.0920758917927742, + "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4241071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2592.7255859375, + "completions/mean_terminated_length": 1485.662841796875, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.16503352243424446, + "grad_norm": 0.19099269807338715, + "kl": 0.0080718994140625, + "learning_rate": 1e-06, + "loss": 0.1264, + "num_tokens": 57271114.0, + "reward": 0.6504464149475098, + "reward_std": 0.27461573481559753, + "rewards/code_format_reward/mean": 0.578125, + "rewards/code_format_reward/std": 0.4944108724594116, + "rewards/curriculum_aware_reward_fn/mean": 0.07232142984867096, + "rewards/curriculum_aware_reward_fn/std": 0.0750359445810318, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.359375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4036.0, + "completions/mean_length": 2346.348388671875, + "completions/mean_terminated_length": 1364.8363037109375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.16915936049510058, + "grad_norm": 0.20166631042957306, + "kl": 0.009296417236328125, + "learning_rate": 1e-06, + "loss": 0.0774, + "num_tokens": 58593780.0, + "reward": 0.7228795289993286, + "reward_std": 0.20277433097362518, + "rewards/code_format_reward/mean": 0.6428571343421936, + "rewards/code_format_reward/std": 0.47969305515289307, + "rewards/curriculum_aware_reward_fn/mean": 0.08002232760190964, + "rewards/curriculum_aware_reward_fn/std": 0.07558422535657883, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2477678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 1821.66748046875, + "completions/mean_terminated_length": 1072.554931640625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.17328519855595667, + "grad_norm": 0.3720153272151947, + "kl": 0.01447296142578125, + "learning_rate": 1e-06, + "loss": 0.0418, + "num_tokens": 59652129.0, + "reward": 0.8479911088943481, + "reward_std": 0.14334198832511902, + "rewards/code_format_reward/mean": 0.7522321343421936, + "rewards/code_format_reward/std": 0.4321989119052887, + "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096, + "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3526785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 2276.546875, + "completions/mean_terminated_length": 1285.2586669921875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.1774110366168128, + "grad_norm": 46.85969924926758, + "kl": 2.2713623046875, + "learning_rate": 1e-06, + "loss": 0.1385, + "num_tokens": 60933570.0, + "reward": 0.7345982789993286, + "reward_std": 0.22185997664928436, + "rewards/code_format_reward/mean": 0.6495535969734192, + "rewards/code_format_reward/std": 0.4776431620121002, + "rewards/curriculum_aware_reward_fn/mean": 0.08504464477300644, + "rewards/curriculum_aware_reward_fn/std": 0.07900101691484451, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2477678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 2081.060302734375, + "completions/mean_terminated_length": 1417.3857421875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.1815368746776689, + "grad_norm": 0.20214147865772247, + "kl": 0.01105499267578125, + "learning_rate": 1e-06, + "loss": 0.1317, + "num_tokens": 62112236.0, + "reward": 0.8328125476837158, + "reward_std": 0.2402763068675995, + "rewards/code_format_reward/mean": 0.7410714030265808, + "rewards/code_format_reward/std": 0.43853598833084106, + "rewards/curriculum_aware_reward_fn/mean": 0.09174107015132904, + "rewards/curriculum_aware_reward_fn/std": 0.0778549388051033, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3258928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2354.993408203125, + "completions/mean_terminated_length": 1513.3145751953125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.18566271273852503, + "grad_norm": 0.5439615845680237, + "kl": 0.01001739501953125, + "learning_rate": 1e-06, + "loss": 0.1551, + "num_tokens": 63457521.0, + "reward": 0.7627232670783997, + "reward_std": 0.2988319396972656, + "rewards/code_format_reward/mean": 0.6763392686843872, + "rewards/code_format_reward/std": 0.46839532256126404, + "rewards/curriculum_aware_reward_fn/mean": 0.08638393133878708, + "rewards/curriculum_aware_reward_fn/std": 0.07488906383514404, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 1772.7857666015625, + "completions/mean_terminated_length": 1228.7823486328125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.18978855079938112, + "grad_norm": 0.18871024250984192, + "kl": 0.01403045654296875, + "learning_rate": 1e-06, + "loss": 0.0889, + "num_tokens": 64520732.0, + "reward": 0.9100447297096252, + "reward_std": 0.13270780444145203, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252743124961853, + "rewards/curriculum_aware_reward_fn/mean": 0.09977678954601288, + "rewards/curriculum_aware_reward_fn/std": 0.07365463674068451, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3214285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 2240.32373046875, + "completions/mean_terminated_length": 1361.319091796875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.19391438886023724, + "grad_norm": 0.5957781672477722, + "kl": 0.01197052001953125, + "learning_rate": 1e-06, + "loss": 0.0987, + "num_tokens": 65795367.0, + "reward": 0.762276828289032, + "reward_std": 0.21752412617206573, + "rewards/code_format_reward/mean": 0.6785714030265808, + "rewards/code_format_reward/std": 0.4675469994544983, + "rewards/curriculum_aware_reward_fn/mean": 0.0837053582072258, + "rewards/curriculum_aware_reward_fn/std": 0.07457634806632996, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2299107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 1868.493408203125, + "completions/mean_terminated_length": 1203.4696044921875, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.19804022692109335, + "grad_norm": 0.22301578521728516, + "kl": 0.01412200927734375, + "learning_rate": 1e-06, + "loss": 0.1032, + "num_tokens": 66888015.0, + "reward": 0.865178644657135, + "reward_std": 0.18127720057964325, + "rewards/code_format_reward/mean": 0.7700892686843872, + "rewards/code_format_reward/std": 0.42124560475349426, + "rewards/curriculum_aware_reward_fn/mean": 0.09508929401636124, + "rewards/curriculum_aware_reward_fn/std": 0.07234017550945282, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1748.5804443359375, + "completions/mean_terminated_length": 1158.4468994140625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.20216606498194944, + "grad_norm": 0.18596109747886658, + "kl": 0.016571044921875, + "learning_rate": 1e-06, + "loss": 0.0924, + "num_tokens": 67942181.0, + "reward": 0.8970983028411865, + "reward_std": 0.15365570783615112, + "rewards/code_format_reward/mean": 0.8013392686843872, + "rewards/code_format_reward/std": 0.3994380831718445, + "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096, + "rewards/curriculum_aware_reward_fn/std": 0.07687902450561523, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2767857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4053.0, + "completions/mean_length": 1998.4866943359375, + "completions/mean_terminated_length": 1195.734619140625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.20629190304280556, + "grad_norm": 0.22046102583408356, + "kl": 0.01477813720703125, + "learning_rate": 1e-06, + "loss": 0.1574, + "num_tokens": 69098841.0, + "reward": 0.8946428894996643, + "reward_std": 0.26137298345565796, + "rewards/code_format_reward/mean": 0.7232142686843872, + "rewards/code_format_reward/std": 0.44790980219841003, + "rewards/curriculum_aware_reward_fn/mean": 0.17142857611179352, + "rewards/curriculum_aware_reward_fn/std": 0.141336590051651, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 1979.493408203125, + "completions/mean_terminated_length": 1331.5831298828125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.21041774110366168, + "grad_norm": 0.22472943365573883, + "kl": 0.01605224609375, + "learning_rate": 1e-06, + "loss": 0.1532, + "num_tokens": 70250422.0, + "reward": 0.9502232670783997, + "reward_std": 0.27020275592803955, + "rewards/code_format_reward/mean": 0.7700892686843872, + "rewards/code_format_reward/std": 0.42124560475349426, + "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648, + "rewards/curriculum_aware_reward_fn/std": 0.13901372253894806, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3370535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 2305.919677734375, + "completions/mean_terminated_length": 1395.8114013671875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.2145435791645178, + "grad_norm": 0.19396060705184937, + "kl": 0.01288604736328125, + "learning_rate": 1e-06, + "loss": 0.1115, + "num_tokens": 71575367.0, + "reward": 0.8095982670783997, + "reward_std": 0.2751266360282898, + "rewards/code_format_reward/mean": 0.6696428656578064, + "rewards/code_format_reward/std": 0.4708675146102905, + "rewards/curriculum_aware_reward_fn/mean": 0.13995537161827087, + "rewards/curriculum_aware_reward_fn/std": 0.14260126650333405, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2745535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 2132.26123046875, + "completions/mean_terminated_length": 1389.0615234375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.2186694172253739, + "grad_norm": 2.5251505374908447, + "kl": 0.0143890380859375, + "learning_rate": 1e-06, + "loss": 0.12, + "num_tokens": 72795332.0, + "reward": 0.8919642567634583, + "reward_std": 0.2576311528682709, + "rewards/code_format_reward/mean": 0.7299107313156128, + "rewards/code_format_reward/std": 0.444502055644989, + "rewards/curriculum_aware_reward_fn/mean": 0.16205357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.15102121233940125, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2299107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 1957.2969970703125, + "completions/mean_terminated_length": 1318.7855224609375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.22279525528623, + "grad_norm": 0.37278586626052856, + "kl": 0.01513671875, + "learning_rate": 1e-06, + "loss": 0.1216, + "num_tokens": 73926567.0, + "reward": 0.932366132736206, + "reward_std": 0.22399599850177765, + "rewards/code_format_reward/mean": 0.7589285969734192, + "rewards/code_format_reward/std": 0.4282117187976837, + "rewards/curriculum_aware_reward_fn/mean": 0.17343749105930328, + "rewards/curriculum_aware_reward_fn/std": 0.139581561088562, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1986607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 1917.977783203125, + "completions/mean_terminated_length": 1378.022216796875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.22692109334708613, + "grad_norm": 1.5422786474227905, + "kl": 0.01769256591796875, + "learning_rate": 1e-06, + "loss": 0.1373, + "num_tokens": 75080997.0, + "reward": 0.9751116037368774, + "reward_std": 0.24655158817768097, + "rewards/code_format_reward/mean": 0.8013392686843872, + "rewards/code_format_reward/std": 0.3994380533695221, + "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023, + "rewards/curriculum_aware_reward_fn/std": 0.14535558223724365, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2745535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 2178.58935546875, + "completions/mean_terminated_length": 1452.923095703125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.23104693140794225, + "grad_norm": 0.19924461841583252, + "kl": 0.0144500732421875, + "learning_rate": 1e-06, + "loss": 0.1163, + "num_tokens": 76347561.0, + "reward": 0.8948661088943481, + "reward_std": 0.23191384971141815, + "rewards/code_format_reward/mean": 0.7254464030265808, + "rewards/code_format_reward/std": 0.44678795337677, + "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256, + "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1716.118408203125, + "completions/mean_terminated_length": 1198.752685546875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.23517276946879834, + "grad_norm": 0.2652409076690674, + "kl": 0.0170440673828125, + "learning_rate": 1e-06, + "loss": 0.0689, + "num_tokens": 77394926.0, + "reward": 1.0386160612106323, + "reward_std": 0.16931238770484924, + "rewards/code_format_reward/mean": 0.8236607313156128, + "rewards/code_format_reward/std": 0.3815346360206604, + "rewards/curriculum_aware_reward_fn/mean": 0.21495535969734192, + "rewards/curriculum_aware_reward_fn/std": 0.13793620467185974, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2767857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 2039.154052734375, + "completions/mean_terminated_length": 1251.966064453125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.23929860752965446, + "grad_norm": 0.22779759764671326, + "kl": 0.0151824951171875, + "learning_rate": 1e-06, + "loss": 0.1741, + "num_tokens": 78570204.0, + "reward": 0.8920758962631226, + "reward_std": 0.2624998390674591, + "rewards/code_format_reward/mean": 0.7276785969734192, + "rewards/code_format_reward/std": 0.4456520676612854, + "rewards/curriculum_aware_reward_fn/mean": 0.16439732909202576, + "rewards/curriculum_aware_reward_fn/std": 0.14416027069091797, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2566964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 1969.24560546875, + "completions/mean_terminated_length": 1234.78076171875, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.24342444559051057, + "grad_norm": 9.569337844848633, + "kl": 0.660430908203125, + "learning_rate": 1e-06, + "loss": 0.0865, + "num_tokens": 79723060.0, + "reward": 0.9041295051574707, + "reward_std": 0.21036864817142487, + "rewards/code_format_reward/mean": 0.7477678656578064, + "rewards/code_format_reward/std": 0.4347792863845825, + "rewards/curriculum_aware_reward_fn/mean": 0.15636160969734192, + "rewards/curriculum_aware_reward_fn/std": 0.14369189739227295, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1985.6898193359375, + "completions/mean_terminated_length": 1222.385986328125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.2475502836513667, + "grad_norm": 0.23437850177288055, + "kl": 0.01544952392578125, + "learning_rate": 1e-06, + "loss": 0.1137, + "num_tokens": 80894980.0, + "reward": 0.8974330425262451, + "reward_std": 0.2371726632118225, + "rewards/code_format_reward/mean": 0.734375, + "rewards/code_format_reward/std": 0.44215917587280273, + "rewards/curriculum_aware_reward_fn/mean": 0.16305804252624512, + "rewards/curriculum_aware_reward_fn/std": 0.14288581907749176, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 1866.6630859375, + "completions/mean_terminated_length": 1242.6199951171875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.2516761217122228, + "grad_norm": 0.21880938112735748, + "kl": 0.0179290771484375, + "learning_rate": 1e-06, + "loss": 0.0995, + "num_tokens": 81992338.0, + "reward": 0.9709821939468384, + "reward_std": 0.22359324991703033, + "rewards/code_format_reward/mean": 0.7767857313156128, + "rewards/code_format_reward/std": 0.41686633229255676, + "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.13704219460487366, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1919642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1788.6876220703125, + "completions/mean_terminated_length": 1240.54150390625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.2558019597730789, + "grad_norm": 0.2513861656188965, + "kl": 0.01721954345703125, + "learning_rate": 1e-06, + "loss": 0.1641, + "num_tokens": 83078318.0, + "reward": 0.9954241514205933, + "reward_std": 0.2597261369228363, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252743124961853, + "rewards/curriculum_aware_reward_fn/mean": 0.18515624105930328, + "rewards/curriculum_aware_reward_fn/std": 0.14922460913658142, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 1818.868408203125, + "completions/mean_terminated_length": 1181.2713623046875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.259927797833935, + "grad_norm": 0.5408481359481812, + "kl": 0.0183258056640625, + "learning_rate": 1e-06, + "loss": 0.1045, + "num_tokens": 84156539.0, + "reward": 0.9498884081840515, + "reward_std": 0.2111586630344391, + "rewards/code_format_reward/mean": 0.7767857313156128, + "rewards/code_format_reward/std": 0.41686636209487915, + "rewards/curriculum_aware_reward_fn/mean": 0.17310269176959991, + "rewards/curriculum_aware_reward_fn/std": 0.1488838940858841, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2165178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3919.0, + "completions/mean_length": 1976.8460693359375, + "completions/mean_terminated_length": 1391.2108154296875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.26405363589479114, + "grad_norm": 1.840519666671753, + "kl": 0.41724395751953125, + "learning_rate": 1e-06, + "loss": 0.1207, + "num_tokens": 85322859.0, + "reward": 0.9379464983940125, + "reward_std": 0.2321721613407135, + "rewards/code_format_reward/mean": 0.78125, + "rewards/code_format_reward/std": 0.4138607978820801, + "rewards/curriculum_aware_reward_fn/mean": 0.15669642388820648, + "rewards/curriculum_aware_reward_fn/std": 0.1403089463710785, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2165178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3828.0, + "completions/mean_length": 1842.6920166015625, + "completions/mean_terminated_length": 1219.98291015625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.26817947395564723, + "grad_norm": 0.20033693313598633, + "kl": 0.01674652099609375, + "learning_rate": 1e-06, + "loss": 0.0878, + "num_tokens": 86418634.0, + "reward": 0.9722098112106323, + "reward_std": 0.20250627398490906, + "rewards/code_format_reward/mean": 0.7857142686843872, + "rewards/code_format_reward/std": 0.41078460216522217, + "rewards/curriculum_aware_reward_fn/mean": 0.18649554252624512, + "rewards/curriculum_aware_reward_fn/std": 0.1416252702474594, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2366071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 1811.4888916015625, + "completions/mean_terminated_length": 1103.4239501953125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.2723053120165034, + "grad_norm": 0.21493232250213623, + "kl": 0.0157470703125, + "learning_rate": 1e-06, + "loss": 0.0875, + "num_tokens": 87501102.0, + "reward": 0.9482142925262451, + "reward_std": 0.1998186707496643, + "rewards/code_format_reward/mean": 0.7633928656578064, + "rewards/code_format_reward/std": 0.4254741966724396, + "rewards/curriculum_aware_reward_fn/mean": 0.18482144176959991, + "rewards/curriculum_aware_reward_fn/std": 0.13863980770111084, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 1813.9376220703125, + "completions/mean_terminated_length": 1240.234619140625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.27643115007735947, + "grad_norm": 0.2010592669248581, + "kl": 0.0166473388671875, + "learning_rate": 1e-06, + "loss": 0.0958, + "num_tokens": 88571731.0, + "reward": 1.012834906578064, + "reward_std": 0.20978660881519318, + "rewards/code_format_reward/mean": 0.8035714030265808, + "rewards/code_format_reward/std": 0.3977404832839966, + "rewards/curriculum_aware_reward_fn/mean": 0.209263414144516, + "rewards/curriculum_aware_reward_fn/std": 0.14208464324474335, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 1591.321533203125, + "completions/mean_terminated_length": 1166.2454833984375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.28055698813821556, + "grad_norm": 0.3400166928768158, + "kl": 0.01915740966796875, + "learning_rate": 1e-06, + "loss": 0.0839, + "num_tokens": 89549775.0, + "reward": 1.0475447177886963, + "reward_std": 0.17855383455753326, + "rewards/code_format_reward/mean": 0.8526785969734192, + "rewards/code_format_reward/std": 0.3548222780227661, + "rewards/curriculum_aware_reward_fn/mean": 0.19486607611179352, + "rewards/curriculum_aware_reward_fn/std": 0.13271550834178925, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 1908.4599609375, + "completions/mean_terminated_length": 1311.8580322265625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.2846828261990717, + "grad_norm": 0.35188552737236023, + "kl": 0.01520538330078125, + "learning_rate": 1e-06, + "loss": 0.1303, + "num_tokens": 90663597.0, + "reward": 0.9658482670783997, + "reward_std": 0.23443441092967987, + "rewards/code_format_reward/mean": 0.7857142686843872, + "rewards/code_format_reward/std": 0.41078460216522217, + "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648, + "rewards/curriculum_aware_reward_fn/std": 0.13937532901763916, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1316964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4009.0, + "completions/mean_length": 1637.712158203125, + "completions/mean_terminated_length": 1264.8612060546875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.2888086642599278, + "grad_norm": 0.21659672260284424, + "kl": 0.0170745849609375, + "learning_rate": 1e-06, + "loss": 0.0605, + "num_tokens": 91653793.0, + "reward": 1.070424199104309, + "reward_std": 0.16760630905628204, + "rewards/code_format_reward/mean": 0.8705357313156128, + "rewards/code_format_reward/std": 0.3360883891582489, + "rewards/curriculum_aware_reward_fn/mean": 0.19988839328289032, + "rewards/curriculum_aware_reward_fn/std": 0.13450957834720612, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 1594.72998046875, + "completions/mean_terminated_length": 1147.13427734375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.2929345023207839, + "grad_norm": 0.24188809096813202, + "kl": 0.01702117919921875, + "learning_rate": 1e-06, + "loss": 0.111, + "num_tokens": 92642758.0, + "reward": 1.0497767925262451, + "reward_std": 0.20430657267570496, + "rewards/code_format_reward/mean": 0.8482142686843872, + "rewards/code_format_reward/std": 0.3592142164707184, + "rewards/curriculum_aware_reward_fn/mean": 0.20156250894069672, + "rewards/curriculum_aware_reward_fn/std": 0.13481204211711884, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2232142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 1918.1295166015625, + "completions/mean_terminated_length": 1292.3045654296875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.29706034038164003, + "grad_norm": 0.23266896605491638, + "kl": 0.01465606689453125, + "learning_rate": 1e-06, + "loss": 0.1266, + "num_tokens": 93783375.0, + "reward": 0.9430804252624512, + "reward_std": 0.2125602513551712, + "rewards/code_format_reward/mean": 0.7790178656578064, + "rewards/code_format_reward/std": 0.4153723120689392, + "rewards/curriculum_aware_reward_fn/mean": 0.1640625, + "rewards/curriculum_aware_reward_fn/std": 0.1429663747549057, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2522321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 1978.180908203125, + "completions/mean_terminated_length": 1263.8118896484375, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.3011861784424961, + "grad_norm": 0.2386295050382614, + "kl": 0.014862060546875, + "learning_rate": 1e-06, + "loss": 0.1095, + "num_tokens": 94935747.0, + "reward": 0.9466517567634583, + "reward_std": 0.2304624617099762, + "rewards/code_format_reward/mean": 0.7544642686843872, + "rewards/code_format_reward/std": 0.43088552355766296, + "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343, + "rewards/curriculum_aware_reward_fn/std": 0.14789843559265137, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2633928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 2113.1318359375, + "completions/mean_terminated_length": 1404.10595703125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.30531201650335227, + "grad_norm": 0.1983596831560135, + "kl": 0.01523590087890625, + "learning_rate": 1e-06, + "loss": 0.084, + "num_tokens": 96176073.0, + "reward": 0.9197545051574707, + "reward_std": 0.20995503664016724, + "rewards/code_format_reward/mean": 0.7433035969734192, + "rewards/code_format_reward/std": 0.4372987747192383, + "rewards/curriculum_aware_reward_fn/mean": 0.17645089328289032, + "rewards/curriculum_aware_reward_fn/std": 0.141729936003685, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2232142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1784.3818359375, + "completions/mean_terminated_length": 1120.12353515625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.30943785456420836, + "grad_norm": 0.20856866240501404, + "kl": 0.0185394287109375, + "learning_rate": 1e-06, + "loss": 0.0662, + "num_tokens": 97242659.0, + "reward": 0.9758929014205933, + "reward_std": 0.1802317053079605, + "rewards/code_format_reward/mean": 0.7790178656578064, + "rewards/code_format_reward/std": 0.4153723120689392, + "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, + "rewards/curriculum_aware_reward_fn/std": 0.14509600400924683, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2589285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 2063.15625, + "completions/mean_terminated_length": 1352.885498046875, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.31356369262506445, + "grad_norm": 0.28605571389198303, + "kl": 0.064544677734375, + "learning_rate": 1e-06, + "loss": 0.1413, + "num_tokens": 98452756.0, + "reward": 0.8792411088943481, + "reward_std": 0.2832396924495697, + "rewards/code_format_reward/mean": 0.7433035969734192, + "rewards/code_format_reward/std": 0.43729880452156067, + "rewards/curriculum_aware_reward_fn/mean": 0.13593749701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14190621674060822, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 1724.165283203125, + "completions/mean_terminated_length": 1224.15673828125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.3176895306859206, + "grad_norm": 0.21490244567394257, + "kl": 0.01772308349609375, + "learning_rate": 1e-06, + "loss": 0.0601, + "num_tokens": 99505350.0, + "reward": 1.0366071462631226, + "reward_std": 0.17229565978050232, + "rewards/code_format_reward/mean": 0.8236607313156128, + "rewards/code_format_reward/std": 0.3815346360206604, + "rewards/curriculum_aware_reward_fn/mean": 0.21294644474983215, + "rewards/curriculum_aware_reward_fn/std": 0.14066724479198456, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2276785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1939.0380859375, + "completions/mean_terminated_length": 1303.1705322265625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3218153687467767, + "grad_norm": 0.22845226526260376, + "kl": 0.015228271484375, + "learning_rate": 1e-06, + "loss": 0.1391, + "num_tokens": 100650427.0, + "reward": 0.9670760035514832, + "reward_std": 0.2431958168745041, + "rewards/code_format_reward/mean": 0.78125, + "rewards/code_format_reward/std": 0.4138607978820801, + "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548, + "rewards/curriculum_aware_reward_fn/std": 0.13783639669418335, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 1808.8326416015625, + "completions/mean_terminated_length": 1370.8642578125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.3259412068076328, + "grad_norm": 0.2206607162952423, + "kl": 0.01694488525390625, + "learning_rate": 1e-06, + "loss": 0.1064, + "num_tokens": 101732923.0, + "reward": 1.0236607789993286, + "reward_std": 0.21730710566043854, + "rewards/code_format_reward/mean": 0.8415178656578064, + "rewards/code_format_reward/std": 0.36560073494911194, + "rewards/curriculum_aware_reward_fn/mean": 0.18214286863803864, + "rewards/curriculum_aware_reward_fn/std": 0.13819824159145355, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2477678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1931.15185546875, + "completions/mean_terminated_length": 1218.100830078125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.3300670448684889, + "grad_norm": 0.2661918103694916, + "kl": 0.01607513427734375, + "learning_rate": 1e-06, + "loss": 0.1196, + "num_tokens": 102861237.0, + "reward": 0.9046875238418579, + "reward_std": 0.2178276777267456, + "rewards/code_format_reward/mean": 0.75, + "rewards/code_format_reward/std": 0.43349677324295044, + "rewards/curriculum_aware_reward_fn/mean": 0.15468750894069672, + "rewards/curriculum_aware_reward_fn/std": 0.14217200875282288, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2321428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 1844.3438720703125, + "completions/mean_terminated_length": 1163.6104736328125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.334192882929345, + "grad_norm": 0.21588768064975739, + "kl": 0.01670074462890625, + "learning_rate": 1e-06, + "loss": 0.1077, + "num_tokens": 103959747.0, + "reward": 0.9659598469734192, + "reward_std": 0.1847652792930603, + "rewards/code_format_reward/mean": 0.7700892686843872, + "rewards/code_format_reward/std": 0.42124560475349426, + "rewards/curriculum_aware_reward_fn/mean": 0.1958705335855484, + "rewards/curriculum_aware_reward_fn/std": 0.16381129622459412, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2209821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 1976.0068359375, + "completions/mean_terminated_length": 1374.63330078125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.33831872099020116, + "grad_norm": 0.20742100477218628, + "kl": 0.01610565185546875, + "learning_rate": 1e-06, + "loss": 0.097, + "num_tokens": 105114190.0, + "reward": 0.9648438096046448, + "reward_std": 0.21596986055374146, + "rewards/code_format_reward/mean": 0.7790178656578064, + "rewards/code_format_reward/std": 0.4153723120689392, + "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548, + "rewards/curriculum_aware_reward_fn/std": 0.14072753489017487, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4000.0, + "completions/mean_length": 2044.899658203125, + "completions/mean_terminated_length": 1361.199462890625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.34244455905105725, + "grad_norm": 0.26717469096183777, + "kl": 0.01522064208984375, + "learning_rate": 1e-06, + "loss": 0.1235, + "num_tokens": 106323115.0, + "reward": 0.9233258962631226, + "reward_std": 0.23657990992069244, + "rewards/code_format_reward/mean": 0.7522321343421936, + "rewards/code_format_reward/std": 0.4321989119052887, + "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14227430522441864, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2522321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 2159.348388671875, + "completions/mean_terminated_length": 1506.089599609375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.34657039711191334, + "grad_norm": 2.97698974609375, + "kl": 0.319793701171875, + "learning_rate": 1e-06, + "loss": 0.0768, + "num_tokens": 107574248.0, + "reward": 0.9077010154724121, + "reward_std": 0.20499999821186066, + "rewards/code_format_reward/mean": 0.75, + "rewards/code_format_reward/std": 0.43349677324295044, + "rewards/curriculum_aware_reward_fn/mean": 0.15770089626312256, + "rewards/curriculum_aware_reward_fn/std": 0.14362619817256927, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1941964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 1893.1898193359375, + "completions/mean_terminated_length": 1362.3184814453125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.3506962351727695, + "grad_norm": 0.23494893312454224, + "kl": 0.01613616943359375, + "learning_rate": 1e-06, + "loss": 0.1371, + "num_tokens": 108703958.0, + "reward": 0.9831473231315613, + "reward_std": 0.28516101837158203, + "rewards/code_format_reward/mean": 0.8080357313156128, + "rewards/code_format_reward/std": 0.3942854404449463, + "rewards/curriculum_aware_reward_fn/mean": 0.17511160671710968, + "rewards/curriculum_aware_reward_fn/std": 0.1376536637544632, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2611607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4058.0, + "completions/mean_length": 2132.754638671875, + "completions/mean_terminated_length": 1438.797607421875, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.3548220732336256, + "grad_norm": 0.21388478577136993, + "kl": 0.01474761962890625, + "learning_rate": 1e-06, + "loss": 0.1361, + "num_tokens": 109917858.0, + "reward": 0.914843738079071, + "reward_std": 0.25436264276504517, + "rewards/code_format_reward/mean": 0.7410714030265808, + "rewards/code_format_reward/std": 0.43853598833084106, + "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023, + "rewards/curriculum_aware_reward_fn/std": 0.14255832135677338, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1919642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4035.0, + "completions/mean_length": 1760.279052734375, + "completions/mean_terminated_length": 1205.384033203125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.35894791129448167, + "grad_norm": 0.2028041034936905, + "kl": 0.0170440673828125, + "learning_rate": 1e-06, + "loss": 0.1113, + "num_tokens": 110991643.0, + "reward": 1.0066964626312256, + "reward_std": 0.16249890625476837, + "rewards/code_format_reward/mean": 0.8058035969734192, + "rewards/code_format_reward/std": 0.3960230052471161, + "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, + "rewards/curriculum_aware_reward_fn/std": 0.13506683707237244, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4056.0, + "completions/mean_length": 1839.2568359375, + "completions/mean_terminated_length": 1264.0084228515625, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.3630737493553378, + "grad_norm": 0.25737714767456055, + "kl": 0.01641082763671875, + "learning_rate": 1e-06, + "loss": 0.1086, + "num_tokens": 112092172.0, + "reward": 0.9707589745521545, + "reward_std": 0.23582224547863007, + "rewards/code_format_reward/mean": 0.7946428656578064, + "rewards/code_format_reward/std": 0.40441393852233887, + "rewards/curriculum_aware_reward_fn/mean": 0.17611606419086456, + "rewards/curriculum_aware_reward_fn/std": 0.14302924275398254, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2388392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 1971.482177734375, + "completions/mean_terminated_length": 1304.8446044921875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.3671995874161939, + "grad_norm": 0.43521085381507874, + "kl": 0.01739501953125, + "learning_rate": 1e-06, + "loss": 0.1102, + "num_tokens": 113258112.0, + "reward": 0.9188616275787354, + "reward_std": 0.2197439968585968, + "rewards/code_format_reward/mean": 0.7611607313156128, + "rewards/code_format_reward/std": 0.4268510043621063, + "rewards/curriculum_aware_reward_fn/mean": 0.15770088136196136, + "rewards/curriculum_aware_reward_fn/std": 0.14327529072761536, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1830357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 1960.5224609375, + "completions/mean_terminated_length": 1482.0819091796875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.37132542547705005, + "grad_norm": 1.3905837535858154, + "kl": 0.028656005859375, + "learning_rate": 1e-06, + "loss": 0.0801, + "num_tokens": 114416783.0, + "reward": 1.009151816368103, + "reward_std": 0.2122829705476761, + "rewards/code_format_reward/mean": 0.8169642686843872, + "rewards/code_format_reward/std": 0.387128084897995, + "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343, + "rewards/curriculum_aware_reward_fn/std": 0.13840459287166595, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 1792.435302734375, + "completions/mean_terminated_length": 1213.3267822265625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.37545126353790614, + "grad_norm": 0.19705531001091003, + "kl": 0.017974853515625, + "learning_rate": 1e-06, + "loss": 0.0856, + "num_tokens": 115498786.0, + "reward": 0.9724330902099609, + "reward_std": 0.17091991007328033, + "rewards/code_format_reward/mean": 0.8013392686843872, + "rewards/code_format_reward/std": 0.3994380533695221, + "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776, + "rewards/curriculum_aware_reward_fn/std": 0.14368249475955963, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2209821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 1975.6273193359375, + "completions/mean_terminated_length": 1374.146240234375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.37957710159876223, + "grad_norm": 0.43463876843452454, + "kl": 0.0468292236328125, + "learning_rate": 1e-06, + "loss": 0.0951, + "num_tokens": 116643371.0, + "reward": 0.9460937976837158, + "reward_std": 0.21486541628837585, + "rewards/code_format_reward/mean": 0.7790178656578064, + "rewards/code_format_reward/std": 0.4153723120689392, + "rewards/curriculum_aware_reward_fn/mean": 0.16707590222358704, + "rewards/curriculum_aware_reward_fn/std": 0.1629006415605545, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3997.0, + "completions/mean_length": 1699.790283203125, + "completions/mean_terminated_length": 1217.9786376953125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.3837029396596184, + "grad_norm": 0.2277679443359375, + "kl": 0.0191802978515625, + "learning_rate": 1e-06, + "loss": 0.1171, + "num_tokens": 117679158.0, + "reward": 1.0132813453674316, + "reward_std": 0.2020494043827057, + "rewards/code_format_reward/mean": 0.8348214030265808, + "rewards/code_format_reward/std": 0.37175676226615906, + "rewards/curriculum_aware_reward_fn/mean": 0.17845983803272247, + "rewards/curriculum_aware_reward_fn/std": 0.1565471738576889, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4056.0, + "completions/mean_length": 1753.99560546875, + "completions/mean_terminated_length": 1377.818603515625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.38782877772047447, + "grad_norm": 0.22877489030361176, + "kl": 0.019775390625, + "learning_rate": 1e-06, + "loss": 0.0487, + "num_tokens": 118731312.0, + "reward": 1.0579241514205933, + "reward_std": 0.16506682336330414, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3480229377746582, + "rewards/curriculum_aware_reward_fn/mean": 0.19854912161827087, + "rewards/curriculum_aware_reward_fn/std": 0.13795046508312225, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4084.0, + "completions/mean_length": 1703.1160888671875, + "completions/mean_terminated_length": 1282.3201904296875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.39195461578133056, + "grad_norm": 0.2043149769306183, + "kl": 0.0197296142578125, + "learning_rate": 1e-06, + "loss": 0.0728, + "num_tokens": 119770878.0, + "reward": 1.0570311546325684, + "reward_std": 0.18754906952381134, + "rewards/code_format_reward/mean": 0.8504464030265808, + "rewards/code_format_reward/std": 0.3570319712162018, + "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352, + "rewards/curriculum_aware_reward_fn/std": 0.1388920247554779, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1004464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4057.0, + "completions/mean_length": 1530.4085693359375, + "completions/mean_terminated_length": 1243.927978515625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.3960804538421867, + "grad_norm": 0.30418333411216736, + "kl": 0.0242156982421875, + "learning_rate": 1e-06, + "loss": 0.1313, + "num_tokens": 120728556.0, + "reward": 1.0689733028411865, + "reward_std": 0.18825943768024445, + "rewards/code_format_reward/mean": 0.8995535969734192, + "rewards/code_format_reward/std": 0.30093035101890564, + "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256, + "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 1664.134033203125, + "completions/mean_terminated_length": 1251.4151611328125, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.4002062919030428, + "grad_norm": 0.6233256459236145, + "kl": 0.0251312255859375, + "learning_rate": 1e-06, + "loss": 0.074, + "num_tokens": 121740935.0, + "reward": 1.0614955425262451, + "reward_std": 0.16232284903526306, + "rewards/code_format_reward/mean": 0.8549107313156128, + "rewards/code_format_reward/std": 0.3525845408439636, + "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352, + "rewards/curriculum_aware_reward_fn/std": 0.14282289147377014, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 1824.4576416015625, + "completions/mean_terminated_length": 1292.5537109375, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.4043321299638989, + "grad_norm": 0.22889189422130585, + "kl": 0.01766204833984375, + "learning_rate": 1e-06, + "loss": 0.104, + "num_tokens": 122834834.0, + "reward": 1.1388393640518188, + "reward_std": 0.28884097933769226, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252743124961853, + "rewards/curriculum_aware_reward_fn/mean": 0.3285714089870453, + "rewards/curriculum_aware_reward_fn/std": 0.318551242351532, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4000.0, + "completions/mean_length": 1661.8616943359375, + "completions/mean_terminated_length": 1248.7572021484375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.40845796802475504, + "grad_norm": 0.7959763407707214, + "kl": 0.0198974609375, + "learning_rate": 1e-06, + "loss": 0.0787, + "num_tokens": 123850430.0, + "reward": 1.2450892925262451, + "reward_std": 0.2383604198694229, + "rewards/code_format_reward/mean": 0.8571428656578064, + "rewards/code_format_reward/std": 0.3503182828426361, + "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387, + "rewards/curriculum_aware_reward_fn/std": 0.3007444441318512, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2410714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 2082.6005859375, + "completions/mean_terminated_length": 1443.050048828125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.4125838060856111, + "grad_norm": 0.22675098478794098, + "kl": 0.016845703125, + "learning_rate": 1e-06, + "loss": 0.1181, + "num_tokens": 125048924.0, + "reward": 1.1112724542617798, + "reward_std": 0.3455579876899719, + "rewards/code_format_reward/mean": 0.7589285969734192, + "rewards/code_format_reward/std": 0.4282117187976837, + "rewards/curriculum_aware_reward_fn/mean": 0.35234373807907104, + "rewards/curriculum_aware_reward_fn/std": 0.30698445439338684, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3952.0, + "completions/mean_length": 1744.680908203125, + "completions/mean_terminated_length": 1323.91845703125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.4167096441464673, + "grad_norm": 0.22585518658161163, + "kl": 0.0187835693359375, + "learning_rate": 1e-06, + "loss": 0.1183, + "num_tokens": 126114405.0, + "reward": 1.2287946939468384, + "reward_std": 0.2684269845485687, + "rewards/code_format_reward/mean": 0.8482142686843872, + "rewards/code_format_reward/std": 0.3592142164707184, + "rewards/curriculum_aware_reward_fn/mean": 0.3805803656578064, + "rewards/curriculum_aware_reward_fn/std": 0.29996660351753235, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2098214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3967.0, + "completions/mean_length": 1884.2835693359375, + "completions/mean_terminated_length": 1296.9915771484375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.42083548220732336, + "grad_norm": 0.3677929639816284, + "kl": 0.0188140869140625, + "learning_rate": 1e-06, + "loss": 0.1043, + "num_tokens": 127244766.0, + "reward": 1.1637277603149414, + "reward_std": 0.3110141158103943, + "rewards/code_format_reward/mean": 0.7901785969734192, + "rewards/code_format_reward/std": 0.40763622522354126, + "rewards/curriculum_aware_reward_fn/mean": 0.37354913353919983, + "rewards/curriculum_aware_reward_fn/std": 0.3067074716091156, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1755.8148193359375, + "completions/mean_terminated_length": 1183.76953125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.42496132026817945, + "grad_norm": 0.229450061917305, + "kl": 0.0190887451171875, + "learning_rate": 1e-06, + "loss": 0.1335, + "num_tokens": 128312631.0, + "reward": 1.1440848112106323, + "reward_std": 0.22956174612045288, + "rewards/code_format_reward/mean": 0.8035714030265808, + "rewards/code_format_reward/std": 0.39774051308631897, + "rewards/curriculum_aware_reward_fn/mean": 0.3405133783817291, + "rewards/curriculum_aware_reward_fn/std": 0.2985129952430725, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 1909.044677734375, + "completions/mean_terminated_length": 1374.45556640625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.4290871583290356, + "grad_norm": 0.24083521962165833, + "kl": 0.017913818359375, + "learning_rate": 1e-06, + "loss": 0.1011, + "num_tokens": 129424734.0, + "reward": 1.1671875715255737, + "reward_std": 0.3163076937198639, + "rewards/code_format_reward/mean": 0.8035714030265808, + "rewards/code_format_reward/std": 0.3977404832839966, + "rewards/curriculum_aware_reward_fn/mean": 0.36361604928970337, + "rewards/curriculum_aware_reward_fn/std": 0.29430437088012695, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0758928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 1389.8460693359375, + "completions/mean_terminated_length": 1167.6014404296875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.4332129963898917, + "grad_norm": 0.2619832158088684, + "kl": 0.0225067138671875, + "learning_rate": 1e-06, + "loss": 0.1098, + "num_tokens": 130308331.0, + "reward": 1.3381696939468384, + "reward_std": 0.24706150591373444, + "rewards/code_format_reward/mean": 0.9241071343421936, + "rewards/code_format_reward/std": 0.265122652053833, + "rewards/curriculum_aware_reward_fn/mean": 0.4140624701976776, + "rewards/curriculum_aware_reward_fn/std": 0.2952509820461273, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4042.0, + "completions/mean_length": 1655.8326416015625, + "completions/mean_terminated_length": 1307.2371826171875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4373388344507478, + "grad_norm": 0.24595658481121063, + "kl": 0.0212249755859375, + "learning_rate": 1e-06, + "loss": 0.0767, + "num_tokens": 131325034.0, + "reward": 1.2672991752624512, + "reward_std": 0.24971547722816467, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.3310886323451996, + "rewards/curriculum_aware_reward_fn/mean": 0.3922991156578064, + "rewards/curriculum_aware_reward_fn/std": 0.2960248589515686, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4057.0, + "completions/mean_length": 1656.15185546875, + "completions/mean_terminated_length": 1188.94677734375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.44146467251160393, + "grad_norm": 0.3704790771007538, + "kl": 0.020477294921875, + "learning_rate": 1e-06, + "loss": 0.1244, + "num_tokens": 132320691.0, + "reward": 1.2327009439468384, + "reward_std": 0.2673065662384033, + "rewards/code_format_reward/mean": 0.8370535969734192, + "rewards/code_format_reward/std": 0.3697296679019928, + "rewards/curriculum_aware_reward_fn/mean": 0.3956473171710968, + "rewards/curriculum_aware_reward_fn/std": 0.29913750290870667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 1538.372802734375, + "completions/mean_terminated_length": 1135.2325439453125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.44559051057246, + "grad_norm": 0.32088541984558105, + "kl": 0.0222625732421875, + "learning_rate": 1e-06, + "loss": 0.0987, + "num_tokens": 133278850.0, + "reward": 1.200446605682373, + "reward_std": 0.1880311220884323, + "rewards/code_format_reward/mean": 0.8660714030265808, + "rewards/code_format_reward/std": 0.34095627069473267, + "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, + "rewards/curriculum_aware_reward_fn/std": 0.31286945939064026, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 1741.90185546875, + "completions/mean_terminated_length": 1206.5863037109375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.44971634863331617, + "grad_norm": 0.22313106060028076, + "kl": 0.0215301513671875, + "learning_rate": 1e-06, + "loss": 0.0738, + "num_tokens": 134327455.0, + "reward": 1.2027901411056519, + "reward_std": 0.25760817527770996, + "rewards/code_format_reward/mean": 0.8147321343421936, + "rewards/code_format_reward/std": 0.38894903659820557, + "rewards/curriculum_aware_reward_fn/mean": 0.38805803656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3171059191226959, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 1705.8482666015625, + "completions/mean_terminated_length": 1336.237060546875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.45384218669417226, + "grad_norm": 0.5322986245155334, + "kl": 0.064666748046875, + "learning_rate": 1e-06, + "loss": 0.1118, + "num_tokens": 135364563.0, + "reward": 1.2229912281036377, + "reward_std": 0.2728971242904663, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334254264831543, + "rewards/curriculum_aware_reward_fn/mean": 0.35915178060531616, + "rewards/curriculum_aware_reward_fn/std": 0.29508450627326965, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0669642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3892.0, + "completions/mean_length": 1267.446533203125, + "completions/mean_terminated_length": 1064.440185546875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.45796802475502835, + "grad_norm": 0.2848234474658966, + "kl": 0.0265960693359375, + "learning_rate": 1e-06, + "loss": 0.0998, + "num_tokens": 136178695.0, + "reward": 1.4200893640518188, + "reward_std": 0.19471189379692078, + "rewards/code_format_reward/mean": 0.9308035969734192, + "rewards/code_format_reward/std": 0.25407159328460693, + "rewards/curriculum_aware_reward_fn/mean": 0.4892856776714325, + "rewards/curriculum_aware_reward_fn/std": 0.28257983922958374, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0825892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 1507.4710693359375, + "completions/mean_terminated_length": 1274.4404296875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.4620938628158845, + "grad_norm": 0.24630674719810486, + "kl": 0.023223876953125, + "learning_rate": 1e-06, + "loss": 0.0871, + "num_tokens": 137136779.0, + "reward": 1.3398438692092896, + "reward_std": 0.24005301296710968, + "rewards/code_format_reward/mean": 0.9174107313156128, + "rewards/code_format_reward/std": 0.2755681276321411, + "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096, + "rewards/curriculum_aware_reward_fn/std": 0.2932768166065216, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2209821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1992.060302734375, + "completions/mean_terminated_length": 1395.24072265625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.4662197008767406, + "grad_norm": 4.038174629211426, + "kl": 0.0250091552734375, + "learning_rate": 1e-06, + "loss": 0.1024, + "num_tokens": 138315249.0, + "reward": 1.1348215341567993, + "reward_std": 0.31040358543395996, + "rewards/code_format_reward/mean": 0.78125, + "rewards/code_format_reward/std": 0.4138607978820801, + "rewards/curriculum_aware_reward_fn/mean": 0.35357141494750977, + "rewards/curriculum_aware_reward_fn/std": 0.2992880642414093, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4021.0, + "completions/mean_length": 1731.1160888671875, + "completions/mean_terminated_length": 1177.3553466796875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.4703455389375967, + "grad_norm": 0.21624675393104553, + "kl": 0.021209716796875, + "learning_rate": 1e-06, + "loss": 0.1069, + "num_tokens": 139343846.0, + "reward": 1.1998885869979858, + "reward_std": 0.27098482847213745, + "rewards/code_format_reward/mean": 0.8125, + "rewards/code_format_reward/std": 0.3907487094402313, + "rewards/curriculum_aware_reward_fn/mean": 0.3873883783817291, + "rewards/curriculum_aware_reward_fn/std": 0.30484986305236816, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0892857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 1448.509033203125, + "completions/mean_terminated_length": 1188.9510498046875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.4744713769984528, + "grad_norm": 0.717583179473877, + "kl": 0.0406646728515625, + "learning_rate": 1e-06, + "loss": 0.0648, + "num_tokens": 140255075.0, + "reward": 1.3869420289993286, + "reward_std": 0.22124750912189484, + "rewards/code_format_reward/mean": 0.9107142686843872, + "rewards/code_format_reward/std": 0.2854745090007782, + "rewards/curriculum_aware_reward_fn/mean": 0.47622767090797424, + "rewards/curriculum_aware_reward_fn/std": 0.29526206851005554, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 1866.904052734375, + "completions/mean_terminated_length": 1411.497314453125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.4785972150593089, + "grad_norm": 0.25914421677589417, + "kl": 0.0201873779296875, + "learning_rate": 1e-06, + "loss": 0.104, + "num_tokens": 141357756.0, + "reward": 1.1671875715255737, + "reward_std": 0.29631105065345764, + "rewards/code_format_reward/mean": 0.8303571343421936, + "rewards/code_format_reward/std": 0.37573832273483276, + "rewards/curriculum_aware_reward_fn/mean": 0.33683034777641296, + "rewards/curriculum_aware_reward_fn/std": 0.30225586891174316, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 1616.9376220703125, + "completions/mean_terminated_length": 1226.180908203125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.48272305312016506, + "grad_norm": 0.3568706810474396, + "kl": 0.0214080810546875, + "learning_rate": 1e-06, + "loss": 0.1059, + "num_tokens": 142361990.0, + "reward": 1.2168527841567993, + "reward_std": 0.20818667113780975, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334251284599304, + "rewards/curriculum_aware_reward_fn/mean": 0.35301336646080017, + "rewards/curriculum_aware_reward_fn/std": 0.30549928545951843, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1428571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1734.7567138671875, + "completions/mean_terminated_length": 1341.2161865234375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.48684889118102115, + "grad_norm": 0.2393302172422409, + "kl": 0.0227508544921875, + "learning_rate": 1e-06, + "loss": 0.0651, + "num_tokens": 143411237.0, + "reward": 1.2437500953674316, + "reward_std": 0.2932717204093933, + "rewards/code_format_reward/mean": 0.8526785969734192, + "rewards/code_format_reward/std": 0.3548222780227661, + "rewards/curriculum_aware_reward_fn/mean": 0.3910714089870453, + "rewards/curriculum_aware_reward_fn/std": 0.3008536696434021, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 1937.966552734375, + "completions/mean_terminated_length": 1395.444091796875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.49097472924187724, + "grad_norm": 144.79115295410156, + "kl": 11.142135620117188, + "learning_rate": 1e-06, + "loss": 0.2232, + "num_tokens": 144552754.0, + "reward": 1.1873886585235596, + "reward_std": 0.27385786175727844, + "rewards/code_format_reward/mean": 0.7991071343421936, + "rewards/code_format_reward/std": 0.4011159837245941, + "rewards/curriculum_aware_reward_fn/mean": 0.3882812559604645, + "rewards/curriculum_aware_reward_fn/std": 0.3559999465942383, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 2016.8951416015625, + "completions/mean_terminated_length": 1449.8665771484375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.4951005673027334, + "grad_norm": 0.3591211438179016, + "kl": 0.019683837890625, + "learning_rate": 1e-06, + "loss": 0.1078, + "num_tokens": 145719796.0, + "reward": 1.1426339149475098, + "reward_std": 0.36062249541282654, + "rewards/code_format_reward/mean": 0.7857142686843872, + "rewards/code_format_reward/std": 0.41078460216522217, + "rewards/curriculum_aware_reward_fn/mean": 0.35691961646080017, + "rewards/curriculum_aware_reward_fn/std": 0.3005719482898712, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1584821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 1744.2857666015625, + "completions/mean_terminated_length": 1301.389892578125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.4992264053635895, + "grad_norm": 0.3126465976238251, + "kl": 0.047943115234375, + "learning_rate": 1e-06, + "loss": 0.0751, + "num_tokens": 146769224.0, + "reward": 1.2447545528411865, + "reward_std": 0.2551679313182831, + "rewards/code_format_reward/mean": 0.84375, + "rewards/code_format_reward/std": 0.36349809169769287, + "rewards/curriculum_aware_reward_fn/mean": 0.40100446343421936, + "rewards/curriculum_aware_reward_fn/std": 0.30527445673942566, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1205357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4042.0, + "completions/mean_length": 1506.857177734375, + "completions/mean_terminated_length": 1152.0, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5033522434244456, + "grad_norm": 0.2568581998348236, + "kl": 0.0231475830078125, + "learning_rate": 1e-06, + "loss": 0.0733, + "num_tokens": 147711045.0, + "reward": 1.3008930683135986, + "reward_std": 0.24989202618598938, + "rewards/code_format_reward/mean": 0.8794642686843872, + "rewards/code_format_reward/std": 0.3259509205818176, + "rewards/curriculum_aware_reward_fn/mean": 0.42142853140830994, + "rewards/curriculum_aware_reward_fn/std": 0.30747321248054504, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4052.0, + "completions/mean_length": 1926.27685546875, + "completions/mean_terminated_length": 1395.9000244140625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.5074780814853017, + "grad_norm": 0.23189201951026917, + "kl": 0.01876068115234375, + "learning_rate": 1e-06, + "loss": 0.0917, + "num_tokens": 148856558.0, + "reward": 1.1736607551574707, + "reward_std": 0.2991105318069458, + "rewards/code_format_reward/mean": 0.8035714030265808, + "rewards/code_format_reward/std": 0.3977404832839966, + "rewards/curriculum_aware_reward_fn/mean": 0.37008926272392273, + "rewards/curriculum_aware_reward_fn/std": 0.3080544173717499, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 1860.857177734375, + "completions/mean_terminated_length": 1432.85107421875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.5116039195461578, + "grad_norm": 0.2435297667980194, + "kl": 0.0208282470703125, + "learning_rate": 1e-06, + "loss": 0.1266, + "num_tokens": 149947157.0, + "reward": 1.2025669813156128, + "reward_std": 0.3079543709754944, + "rewards/code_format_reward/mean": 0.8415178656578064, + "rewards/code_format_reward/std": 0.36560073494911194, + "rewards/curriculum_aware_reward_fn/mean": 0.3610491156578064, + "rewards/curriculum_aware_reward_fn/std": 0.30205413699150085, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3839.0, + "completions/mean_length": 1687.22998046875, + "completions/mean_terminated_length": 1225.97607421875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5157297576070139, + "grad_norm": 1.005789875984192, + "kl": 0.0277099609375, + "learning_rate": 1e-06, + "loss": 0.0732, + "num_tokens": 150966362.0, + "reward": 1.2402902841567993, + "reward_std": 0.2671992778778076, + "rewards/code_format_reward/mean": 0.8415178656578064, + "rewards/code_format_reward/std": 0.36560073494911194, + "rewards/curriculum_aware_reward_fn/mean": 0.39877229928970337, + "rewards/curriculum_aware_reward_fn/std": 0.29583922028541565, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 1716.3304443359375, + "completions/mean_terminated_length": 1230.1612548828125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.51985559566787, + "grad_norm": 0.23711848258972168, + "kl": 0.0218658447265625, + "learning_rate": 1e-06, + "loss": 0.1152, + "num_tokens": 151995531.0, + "reward": 1.2325893640518188, + "reward_std": 0.2793101370334625, + "rewards/code_format_reward/mean": 0.8303571343421936, + "rewards/code_format_reward/std": 0.37573832273483276, + "rewards/curriculum_aware_reward_fn/mean": 0.4022321403026581, + "rewards/curriculum_aware_reward_fn/std": 0.296654611825943, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 1562.4285888671875, + "completions/mean_terminated_length": 1286.4949951171875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.5239814337287262, + "grad_norm": 0.25690746307373047, + "kl": 0.0216522216796875, + "learning_rate": 1e-06, + "loss": 0.0878, + "num_tokens": 152969745.0, + "reward": 1.32421875, + "reward_std": 0.25885266065597534, + "rewards/code_format_reward/mean": 0.9017857313156128, + "rewards/code_format_reward/std": 0.29793688654899597, + "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096, + "rewards/curriculum_aware_reward_fn/std": 0.3098207712173462, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 1784.1585693359375, + "completions/mean_terminated_length": 1419.759765625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.5281072717895823, + "grad_norm": 0.2524595558643341, + "kl": 0.0201263427734375, + "learning_rate": 1e-06, + "loss": 0.073, + "num_tokens": 154036553.0, + "reward": 1.2431920766830444, + "reward_std": 0.2910597026348114, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334254264831543, + "rewards/curriculum_aware_reward_fn/mean": 0.3793526291847229, + "rewards/curriculum_aware_reward_fn/std": 0.3008619546890259, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1830357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 1931.9130859375, + "completions/mean_terminated_length": 1447.062744140625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5322331098504384, + "grad_norm": 0.620236337184906, + "kl": 0.0193634033203125, + "learning_rate": 1e-06, + "loss": 0.093, + "num_tokens": 155200845.0, + "reward": 1.173437476158142, + "reward_std": 0.2876041829586029, + "rewards/code_format_reward/mean": 0.8214285969734192, + "rewards/code_format_reward/std": 0.3834212124347687, + "rewards/curriculum_aware_reward_fn/mean": 0.3520089089870453, + "rewards/curriculum_aware_reward_fn/std": 0.30668607354164124, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1763392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3971.0, + "completions/mean_length": 1707.0491943359375, + "completions/mean_terminated_length": 1195.593505859375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.5363589479112945, + "grad_norm": 0.3680073022842407, + "kl": 0.05645751953125, + "learning_rate": 1e-06, + "loss": 0.1403, + "num_tokens": 156233115.0, + "reward": 1.2227678298950195, + "reward_std": 0.28796523809432983, + "rewards/code_format_reward/mean": 0.8258928656578064, + "rewards/code_format_reward/std": 0.37962549924850464, + "rewards/curriculum_aware_reward_fn/mean": 0.3968749940395355, + "rewards/curriculum_aware_reward_fn/std": 0.3155217170715332, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 1826.3773193359375, + "completions/mean_terminated_length": 1325.4522705078125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.5404847859721505, + "grad_norm": 0.22368091344833374, + "kl": 0.0381317138671875, + "learning_rate": 1e-06, + "loss": 0.064, + "num_tokens": 157330149.0, + "reward": 1.1661831140518188, + "reward_std": 0.26647037267684937, + "rewards/code_format_reward/mean": 0.8147321343421936, + "rewards/code_format_reward/std": 0.38894903659820557, + "rewards/curriculum_aware_reward_fn/mean": 0.3514508605003357, + "rewards/curriculum_aware_reward_fn/std": 0.3114034831523895, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 1739.4263916015625, + "completions/mean_terminated_length": 1360.9093017578125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.5446106240330068, + "grad_norm": 5.251157283782959, + "kl": 1.00634765625, + "learning_rate": 1e-06, + "loss": 0.0779, + "num_tokens": 158392779.0, + "reward": 1.2868304252624512, + "reward_std": 0.26909175515174866, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.4252232015132904, + "rewards/curriculum_aware_reward_fn/std": 0.2903171479701996, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 1873.0045166015625, + "completions/mean_terminated_length": 1352.4683837890625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5487364620938628, + "grad_norm": 0.2221236675977707, + "kl": 0.0202484130859375, + "learning_rate": 1e-06, + "loss": 0.078, + "num_tokens": 159511220.0, + "reward": 1.130357265472412, + "reward_std": 0.2697470784187317, + "rewards/code_format_reward/mean": 0.8147321343421936, + "rewards/code_format_reward/std": 0.38894903659820557, + "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, + "rewards/curriculum_aware_reward_fn/std": 0.30488425493240356, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1049107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 1628.3438720703125, + "completions/mean_terminated_length": 1339.1173095703125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5528623001547189, + "grad_norm": 0.2322208136320114, + "kl": 0.0207061767578125, + "learning_rate": 1e-06, + "loss": 0.093, + "num_tokens": 160512094.0, + "reward": 1.2918527126312256, + "reward_std": 0.24635407328605652, + "rewards/code_format_reward/mean": 0.8973214030265808, + "rewards/code_format_reward/std": 0.30387791991233826, + "rewards/curriculum_aware_reward_fn/mean": 0.3945312201976776, + "rewards/curriculum_aware_reward_fn/std": 0.28814804553985596, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4014.0, + "completions/mean_length": 1850.7724609375, + "completions/mean_terminated_length": 1442.0106201171875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.556988138215575, + "grad_norm": 0.2326047718524933, + "kl": 0.02008056640625, + "learning_rate": 1e-06, + "loss": 0.102, + "num_tokens": 161625734.0, + "reward": 1.2510045766830444, + "reward_std": 0.3010588586330414, + "rewards/code_format_reward/mean": 0.8459821343421936, + "rewards/code_format_reward/std": 0.36136940121650696, + "rewards/curriculum_aware_reward_fn/mean": 0.4050223231315613, + "rewards/curriculum_aware_reward_fn/std": 0.3193120062351227, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1272321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1741.94873046875, + "completions/mean_terminated_length": 1398.77490234375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.5611139762764311, + "grad_norm": 0.2658071219921112, + "kl": 0.0209503173828125, + "learning_rate": 1e-06, + "loss": 0.0679, + "num_tokens": 162668816.0, + "reward": 1.3080357313156128, + "reward_std": 0.2536565661430359, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.3310886323451996, + "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, + "rewards/curriculum_aware_reward_fn/std": 0.28444406390190125, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1614.5938720703125, + "completions/mean_terminated_length": 1193.4674072265625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.5652398143372873, + "grad_norm": 0.24400874972343445, + "kl": 0.022796630859375, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 163658504.0, + "reward": 1.2035715579986572, + "reward_std": 0.22330662608146667, + "rewards/code_format_reward/mean": 0.8549107313156128, + "rewards/code_format_reward/std": 0.3525845408439636, + "rewards/curriculum_aware_reward_fn/mean": 0.3486607074737549, + "rewards/curriculum_aware_reward_fn/std": 0.31307879090309143, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3776.0, + "completions/mean_length": 1638.0938720703125, + "completions/mean_terminated_length": 1243.300537109375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5693656523981434, + "grad_norm": 0.217624232172966, + "kl": 0.01983642578125, + "learning_rate": 1e-06, + "loss": 0.0868, + "num_tokens": 164669143.0, + "reward": 1.2754465341567993, + "reward_std": 0.2384774088859558, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.32119113206863403, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0870535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 1413.9129638671875, + "completions/mean_terminated_length": 1158.163818359375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5734914904589995, + "grad_norm": 0.2783685624599457, + "kl": 0.0218353271484375, + "learning_rate": 1e-06, + "loss": 0.0606, + "num_tokens": 165554219.0, + "reward": 1.3580358028411865, + "reward_std": 0.22280895709991455, + "rewards/code_format_reward/mean": 0.9129464030265808, + "rewards/code_format_reward/std": 0.2822287082672119, + "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, + "rewards/curriculum_aware_reward_fn/std": 0.2796177864074707, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1116071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 1759.0826416015625, + "completions/mean_terminated_length": 1465.5, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5776173285198556, + "grad_norm": 0.2369956076145172, + "kl": 0.020294189453125, + "learning_rate": 1e-06, + "loss": 0.074, + "num_tokens": 166618536.0, + "reward": 1.2844866514205933, + "reward_std": 0.29120200872421265, + "rewards/code_format_reward/mean": 0.890625, + "rewards/code_format_reward/std": 0.3124580383300781, + "rewards/curriculum_aware_reward_fn/mean": 0.3938615620136261, + "rewards/curriculum_aware_reward_fn/std": 0.29871317744255066, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3962.0, + "completions/mean_length": 1629.52685546875, + "completions/mean_terminated_length": 1117.6171875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.5817431665807117, + "grad_norm": 0.22888554632663727, + "kl": 0.02154541015625, + "learning_rate": 1e-06, + "loss": 0.0965, + "num_tokens": 167620136.0, + "reward": 1.2700893878936768, + "reward_std": 0.22284676134586334, + "rewards/code_format_reward/mean": 0.828125, + "rewards/code_format_reward/std": 0.3776935040950775, + "rewards/curriculum_aware_reward_fn/mean": 0.4419642984867096, + "rewards/curriculum_aware_reward_fn/std": 0.2943010926246643, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 1929.825927734375, + "completions/mean_terminated_length": 1451.73291015625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.5858690046415678, + "grad_norm": 0.31053388118743896, + "kl": 0.0206146240234375, + "learning_rate": 1e-06, + "loss": 0.0926, + "num_tokens": 168775254.0, + "reward": 1.1685268878936768, + "reward_std": 0.2711971700191498, + "rewards/code_format_reward/mean": 0.8169642686843872, + "rewards/code_format_reward/std": 0.387128084897995, + "rewards/curriculum_aware_reward_fn/mean": 0.3515625, + "rewards/curriculum_aware_reward_fn/std": 0.3206498324871063, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 2099.91748046875, + "completions/mean_terminated_length": 1611.9862060546875, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.589994842702424, + "grad_norm": 1.0171209573745728, + "kl": 0.0308685302734375, + "learning_rate": 1e-06, + "loss": 0.092, + "num_tokens": 169998306.0, + "reward": 1.1707589626312256, + "reward_std": 0.32189029455184937, + "rewards/code_format_reward/mean": 0.8058035969734192, + "rewards/code_format_reward/std": 0.3960230052471161, + "rewards/curriculum_aware_reward_fn/mean": 0.3649553656578064, + "rewards/curriculum_aware_reward_fn/std": 0.3043855130672455, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 1779.51123046875, + "completions/mean_terminated_length": 1357.7757568359375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.5941206807632801, + "grad_norm": 0.2087247222661972, + "kl": 0.0209197998046875, + "learning_rate": 1e-06, + "loss": 0.0873, + "num_tokens": 171055451.0, + "reward": 1.2494419813156128, + "reward_std": 0.22824469208717346, + "rewards/code_format_reward/mean": 0.8459821343421936, + "rewards/code_format_reward/std": 0.36136940121650696, + "rewards/curriculum_aware_reward_fn/mean": 0.4034597873687744, + "rewards/curriculum_aware_reward_fn/std": 0.2976873219013214, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 1915.65185546875, + "completions/mean_terminated_length": 1434.4305419921875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5982465188241362, + "grad_norm": 0.22233060002326965, + "kl": 0.021392822265625, + "learning_rate": 1e-06, + "loss": 0.0797, + "num_tokens": 172183898.0, + "reward": 1.1960937976837158, + "reward_std": 0.28867703676223755, + "rewards/code_format_reward/mean": 0.8191964030265808, + "rewards/code_format_reward/std": 0.38528555631637573, + "rewards/curriculum_aware_reward_fn/mean": 0.37689733505249023, + "rewards/curriculum_aware_reward_fn/std": 0.3042333126068115, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1272321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4037.0, + "completions/mean_length": 1554.509033203125, + "completions/mean_terminated_length": 1184.01025390625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.6023723568849922, + "grad_norm": 0.25177621841430664, + "kl": 0.0221405029296875, + "learning_rate": 1e-06, + "loss": 0.0628, + "num_tokens": 173136882.0, + "reward": 1.3223215341567993, + "reward_std": 0.2333114594221115, + "rewards/code_format_reward/mean": 0.8727678656578064, + "rewards/code_format_reward/std": 0.3336053788661957, + "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, + "rewards/curriculum_aware_reward_fn/std": 0.2995332181453705, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1026785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 1709.7098388671875, + "completions/mean_terminated_length": 1436.6517333984375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.6064981949458483, + "grad_norm": 0.23244866728782654, + "kl": 0.02008056640625, + "learning_rate": 1e-06, + "loss": 0.0525, + "num_tokens": 174169469.0, + "reward": 1.5930804014205933, + "reward_std": 0.3070758581161499, + "rewards/code_format_reward/mean": 0.8995535969734192, + "rewards/code_format_reward/std": 0.30093035101890564, + "rewards/curriculum_aware_reward_fn/mean": 0.6935268044471741, + "rewards/curriculum_aware_reward_fn/std": 0.43586739897727966, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4094.0, + "completions/mean_length": 2015.8974609375, + "completions/mean_terminated_length": 1542.8876953125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6106240330067045, + "grad_norm": 0.22977960109710693, + "kl": 0.020538330078125, + "learning_rate": 1e-06, + "loss": 0.0895, + "num_tokens": 175345031.0, + "reward": 1.3625000715255737, + "reward_std": 0.37156882882118225, + "rewards/code_format_reward/mean": 0.8191964030265808, + "rewards/code_format_reward/std": 0.38528555631637573, + "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034, + "rewards/curriculum_aware_reward_fn/std": 0.44534605741500854, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 1978.6429443359375, + "completions/mean_terminated_length": 1532.2811279296875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.6147498710675606, + "grad_norm": 54.07054138183594, + "kl": 5.5802459716796875, + "learning_rate": 1e-06, + "loss": 0.1208, + "num_tokens": 176520752.0, + "reward": 1.4250000715255737, + "reward_std": 0.3232128322124481, + "rewards/code_format_reward/mean": 0.8258928656578064, + "rewards/code_format_reward/std": 0.37962549924850464, + "rewards/curriculum_aware_reward_fn/mean": 0.5991071462631226, + "rewards/curriculum_aware_reward_fn/std": 0.4462988078594208, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1473214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3912.0, + "completions/mean_length": 1759.966552734375, + "completions/mean_terminated_length": 1356.358642578125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.6188757091284167, + "grad_norm": 0.2159195989370346, + "kl": 0.0220794677734375, + "learning_rate": 1e-06, + "loss": 0.0831, + "num_tokens": 177595706.0, + "reward": 1.485267996788025, + "reward_std": 0.31410348415374756, + "rewards/code_format_reward/mean": 0.8504464030265808, + "rewards/code_format_reward/std": 0.3570319712162018, + "rewards/curriculum_aware_reward_fn/mean": 0.6348214745521545, + "rewards/curriculum_aware_reward_fn/std": 0.43525242805480957, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 1610.05810546875, + "completions/mean_terminated_length": 1262.1527099609375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6230015471892728, + "grad_norm": 0.26271775364875793, + "kl": 0.0230865478515625, + "learning_rate": 1e-06, + "loss": 0.0389, + "num_tokens": 178580982.0, + "reward": 1.5331473350524902, + "reward_std": 0.29026809334754944, + "rewards/code_format_reward/mean": 0.8772321343421936, + "rewards/code_format_reward/std": 0.3285374045372009, + "rewards/curriculum_aware_reward_fn/mean": 0.6559152007102966, + "rewards/curriculum_aware_reward_fn/std": 0.42833590507507324, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0758928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 1439.6920166015625, + "completions/mean_terminated_length": 1221.541015625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.6271273852501289, + "grad_norm": 0.2625259459018707, + "kl": 0.0258941650390625, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 179483746.0, + "reward": 1.6277902126312256, + "reward_std": 0.24596910178661346, + "rewards/code_format_reward/mean": 0.9241071343421936, + "rewards/code_format_reward/std": 0.2651226818561554, + "rewards/curriculum_aware_reward_fn/mean": 0.7036830186843872, + "rewards/curriculum_aware_reward_fn/std": 0.3924238681793213, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 1682.0826416015625, + "completions/mean_terminated_length": 1287.077880859375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.631253223310985, + "grad_norm": 0.22368377447128296, + "kl": 0.0244598388671875, + "learning_rate": 1e-06, + "loss": 0.056, + "num_tokens": 180512524.0, + "reward": 1.5142858028411865, + "reward_std": 0.2848253846168518, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.6526784896850586, + "rewards/curriculum_aware_reward_fn/std": 0.4242819547653198, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1473214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3971.0, + "completions/mean_length": 1798.2724609375, + "completions/mean_terminated_length": 1401.28271484375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.6353790613718412, + "grad_norm": 3.2692782878875732, + "kl": 0.0210113525390625, + "learning_rate": 1e-06, + "loss": 0.0781, + "num_tokens": 181588987.0, + "reward": 1.4170759916305542, + "reward_std": 0.3211572766304016, + "rewards/code_format_reward/mean": 0.8549107313156128, + "rewards/code_format_reward/std": 0.3525845408439636, + "rewards/curriculum_aware_reward_fn/mean": 0.5621652007102966, + "rewards/curriculum_aware_reward_fn/std": 0.4653093218803406, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 1706.3795166015625, + "completions/mean_terminated_length": 1286.157470703125, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.6395048994326973, + "grad_norm": 0.20925132930278778, + "kl": 0.02191162109375, + "learning_rate": 1e-06, + "loss": 0.0643, + "num_tokens": 182636882.0, + "reward": 1.430915355682373, + "reward_std": 0.28560274839401245, + "rewards/code_format_reward/mean": 0.8504464030265808, + "rewards/code_format_reward/std": 0.3570319712162018, + "rewards/curriculum_aware_reward_fn/mean": 0.5804687142372131, + "rewards/curriculum_aware_reward_fn/std": 0.4736019968986511, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 1834.4130859375, + "completions/mean_terminated_length": 1265.8575439453125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.6436307374935534, + "grad_norm": 0.2284567803144455, + "kl": 0.0224151611328125, + "learning_rate": 1e-06, + "loss": 0.061, + "num_tokens": 183734975.0, + "reward": 1.3992189168930054, + "reward_std": 0.2940734326839447, + "rewards/code_format_reward/mean": 0.8013392686843872, + "rewards/code_format_reward/std": 0.3994380831718445, + "rewards/curriculum_aware_reward_fn/mean": 0.5978794693946838, + "rewards/curriculum_aware_reward_fn/std": 0.4464387595653534, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2120535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 1867.7857666015625, + "completions/mean_terminated_length": 1268.1246337890625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.6477565755544095, + "grad_norm": 0.24589860439300537, + "kl": 0.0221405029296875, + "learning_rate": 1e-06, + "loss": 0.1204, + "num_tokens": 184850196.0, + "reward": 1.4158483743667603, + "reward_std": 0.31856080889701843, + "rewards/code_format_reward/mean": 0.7879464030265808, + "rewards/code_format_reward/std": 0.40921953320503235, + "rewards/curriculum_aware_reward_fn/mean": 0.6279017329216003, + "rewards/curriculum_aware_reward_fn/std": 0.4837891459465027, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1900.57373046875, + "completions/mean_terminated_length": 1416.0245361328125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6518824136152656, + "grad_norm": 0.23172283172607422, + "kl": 0.02203369140625, + "learning_rate": 1e-06, + "loss": 0.094, + "num_tokens": 185946744.0, + "reward": 1.4448662996292114, + "reward_std": 0.3671303391456604, + "rewards/code_format_reward/mean": 0.828125, + "rewards/code_format_reward/std": 0.3776935040950775, + "rewards/curriculum_aware_reward_fn/mean": 0.6167410612106323, + "rewards/curriculum_aware_reward_fn/std": 0.4445529878139496, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 1822.5179443359375, + "completions/mean_terminated_length": 1457.34716796875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.6560082516761218, + "grad_norm": 0.21886523067951202, + "kl": 0.023040771484375, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 187044471.0, + "reward": 1.4851562976837158, + "reward_std": 0.27218568325042725, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334251284599304, + "rewards/curriculum_aware_reward_fn/mean": 0.6213169693946838, + "rewards/curriculum_aware_reward_fn/std": 0.43654102087020874, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1428571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4056.0, + "completions/mean_length": 1853.3037109375, + "completions/mean_terminated_length": 1479.5208740234375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6601340897369778, + "grad_norm": 0.25297442078590393, + "kl": 0.022857666015625, + "learning_rate": 1e-06, + "loss": 0.0641, + "num_tokens": 188175443.0, + "reward": 1.4321428537368774, + "reward_std": 0.3305543065071106, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3480229377746582, + "rewards/curriculum_aware_reward_fn/mean": 0.5727678537368774, + "rewards/curriculum_aware_reward_fn/std": 0.4538741111755371, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2120535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 2163.453125, + "completions/mean_terminated_length": 1643.362548828125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.6642599277978339, + "grad_norm": 15.494752883911133, + "kl": 0.241455078125, + "learning_rate": 1e-06, + "loss": 0.1041, + "num_tokens": 189418118.0, + "reward": 1.4010045528411865, + "reward_std": 0.42570188641548157, + "rewards/code_format_reward/mean": 0.7879464030265808, + "rewards/code_format_reward/std": 0.40921953320503235, + "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162, + "rewards/curriculum_aware_reward_fn/std": 0.46330952644348145, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 1600.154052734375, + "completions/mean_terminated_length": 1293.6466064453125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.66838576585869, + "grad_norm": 0.23208573460578918, + "kl": 0.02618408203125, + "learning_rate": 1e-06, + "loss": 0.041, + "num_tokens": 190404746.0, + "reward": 1.5679688453674316, + "reward_std": 0.28690239787101746, + "rewards/code_format_reward/mean": 0.8928571343421936, + "rewards/code_format_reward/std": 0.3096405565738678, + "rewards/curriculum_aware_reward_fn/mean": 0.6751116514205933, + "rewards/curriculum_aware_reward_fn/std": 0.43228229880332947, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 1678.0067138671875, + "completions/mean_terminated_length": 1332.5791015625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.6725116039195461, + "grad_norm": 0.21358643472194672, + "kl": 0.02490234375, + "learning_rate": 1e-06, + "loss": 0.079, + "num_tokens": 191424173.0, + "reward": 1.4213169813156128, + "reward_std": 0.3107840418815613, + "rewards/code_format_reward/mean": 0.8727678656578064, + "rewards/code_format_reward/std": 0.3336053788661957, + "rewards/curriculum_aware_reward_fn/mean": 0.5485491156578064, + "rewards/curriculum_aware_reward_fn/std": 0.4599668085575104, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1071428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4073.0, + "completions/mean_length": 1544.6473388671875, + "completions/mean_terminated_length": 1238.4849853515625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.6766374419804023, + "grad_norm": 0.22339744865894318, + "kl": 0.027435302734375, + "learning_rate": 1e-06, + "loss": 0.1136, + "num_tokens": 192387628.0, + "reward": 1.5645090341567993, + "reward_std": 0.3053089678287506, + "rewards/code_format_reward/mean": 0.8928571343421936, + "rewards/code_format_reward/std": 0.3096405565738678, + "rewards/curriculum_aware_reward_fn/mean": 0.6716518402099609, + "rewards/curriculum_aware_reward_fn/std": 0.42687925696372986, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1722.247802734375, + "completions/mean_terminated_length": 1340.971435546875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.6807632800412584, + "grad_norm": 0.22346089780330658, + "kl": 0.026641845703125, + "learning_rate": 1e-06, + "loss": 0.0703, + "num_tokens": 193439790.0, + "reward": 1.5340402126312256, + "reward_std": 0.3189355731010437, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.6724330186843872, + "rewards/curriculum_aware_reward_fn/std": 0.4248095750808716, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1473214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 1860.029052734375, + "completions/mean_terminated_length": 1473.70947265625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.6848891181021145, + "grad_norm": 0.4491024613380432, + "kl": 0.0242767333984375, + "learning_rate": 1e-06, + "loss": 0.0949, + "num_tokens": 194559180.0, + "reward": 1.485602855682373, + "reward_std": 0.37783434987068176, + "rewards/code_format_reward/mean": 0.8571428656578064, + "rewards/code_format_reward/std": 0.3503182828426361, + "rewards/curriculum_aware_reward_fn/mean": 0.6284598112106323, + "rewards/curriculum_aware_reward_fn/std": 0.4338955879211426, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 1602.30810546875, + "completions/mean_terminated_length": 1246.0662841796875, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.6890149561629706, + "grad_norm": 0.2428234964609146, + "kl": 0.031341552734375, + "learning_rate": 1e-06, + "loss": 0.0549, + "num_tokens": 195544246.0, + "reward": 1.5504463911056519, + "reward_std": 0.2811921536922455, + "rewards/code_format_reward/mean": 0.8772321343421936, + "rewards/code_format_reward/std": 0.3285374045372009, + "rewards/curriculum_aware_reward_fn/mean": 0.6732142567634583, + "rewards/curriculum_aware_reward_fn/std": 0.4268362522125244, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4023.0, + "completions/mean_length": 1865.9130859375, + "completions/mean_terminated_length": 1507.71240234375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.6931407942238267, + "grad_norm": 0.2817108929157257, + "kl": 0.025543212890625, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 196671669.0, + "reward": 1.4904019832611084, + "reward_std": 0.3173461854457855, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334254264831543, + "rewards/curriculum_aware_reward_fn/mean": 0.6265625357627869, + "rewards/curriculum_aware_reward_fn/std": 0.43362295627593994, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 1881.388427734375, + "completions/mean_terminated_length": 1436.0911865234375, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.6972666322846828, + "grad_norm": 0.3142848312854767, + "kl": 0.0292205810546875, + "learning_rate": 1e-06, + "loss": 0.0543, + "num_tokens": 197800782.0, + "reward": 1.4146206378936768, + "reward_std": 0.3862914443016052, + "rewards/code_format_reward/mean": 0.8348214030265808, + "rewards/code_format_reward/std": 0.37175676226615906, + "rewards/curriculum_aware_reward_fn/mean": 0.5797991156578064, + "rewards/curriculum_aware_reward_fn/std": 0.4677990674972534, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 2074.52685546875, + "completions/mean_terminated_length": 1523.2159423828125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.701392470345539, + "grad_norm": 0.2322724312543869, + "kl": 0.026275634765625, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 199023176.0, + "reward": 1.3515626192092896, + "reward_std": 0.3802144229412079, + "rewards/code_format_reward/mean": 0.7857142686843872, + "rewards/code_format_reward/std": 0.41078460216522217, + "rewards/curriculum_aware_reward_fn/mean": 0.5658482313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4406941831111908, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1990.6407470703125, + "completions/mean_terminated_length": 1560.513427734375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.7055183084063951, + "grad_norm": 0.2236924022436142, + "kl": 0.026702880859375, + "learning_rate": 1e-06, + "loss": 0.0675, + "num_tokens": 200189656.0, + "reward": 1.4627233743667603, + "reward_std": 0.32308143377304077, + "rewards/code_format_reward/mean": 0.8303571343421936, + "rewards/code_format_reward/std": 0.37573832273483276, + "rewards/curriculum_aware_reward_fn/mean": 0.6323660612106323, + "rewards/curriculum_aware_reward_fn/std": 0.43709608912467957, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4032.0, + "completions/mean_length": 1724.93310546875, + "completions/mean_terminated_length": 1285.8465576171875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.7096441464672512, + "grad_norm": 0.22402040660381317, + "kl": 0.0262603759765625, + "learning_rate": 1e-06, + "loss": 0.0855, + "num_tokens": 201223736.0, + "reward": 1.447767972946167, + "reward_std": 0.3190096616744995, + "rewards/code_format_reward/mean": 0.84375, + "rewards/code_format_reward/std": 0.36349809169769287, + "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774, + "rewards/curriculum_aware_reward_fn/std": 0.48629483580589294, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 1807.4287109375, + "completions/mean_terminated_length": 1324.9730224609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.7137699845281072, + "grad_norm": 0.20310117304325104, + "kl": 0.026702880859375, + "learning_rate": 1e-06, + "loss": 0.0933, + "num_tokens": 202291253.0, + "reward": 1.5252233743667603, + "reward_std": 0.3091839551925659, + "rewards/code_format_reward/mean": 0.828125, + "rewards/code_format_reward/std": 0.3776935040950775, + "rewards/curriculum_aware_reward_fn/mean": 0.6970981955528259, + "rewards/curriculum_aware_reward_fn/std": 0.44606462121009827, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0892857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4081.0, + "completions/mean_length": 1324.3170166015625, + "completions/mean_terminated_length": 1052.5833740234375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.7178958225889633, + "grad_norm": 0.25287967920303345, + "kl": 0.0323333740234375, + "learning_rate": 1e-06, + "loss": 0.0458, + "num_tokens": 203151912.0, + "reward": 1.601562738418579, + "reward_std": 0.264218270778656, + "rewards/code_format_reward/mean": 0.9107142686843872, + "rewards/code_format_reward/std": 0.2854744791984558, + "rewards/curriculum_aware_reward_fn/mean": 0.6908482313156128, + "rewards/curriculum_aware_reward_fn/std": 0.4577873945236206, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 1976.7076416015625, + "completions/mean_terminated_length": 1487.64013671875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.7220216606498195, + "grad_norm": 0.22337962687015533, + "kl": 0.0275726318359375, + "learning_rate": 1e-06, + "loss": 0.0868, + "num_tokens": 204336881.0, + "reward": 1.3909599781036377, + "reward_std": 0.35256850719451904, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252740144729614, + "rewards/curriculum_aware_reward_fn/mean": 0.580691933631897, + "rewards/curriculum_aware_reward_fn/std": 0.4459373652935028, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1631.310302734375, + "completions/mean_terminated_length": 1362.878662109375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7261474987106756, + "grad_norm": 0.22130419313907623, + "kl": 0.028656005859375, + "learning_rate": 1e-06, + "loss": 0.0352, + "num_tokens": 205337467.0, + "reward": 1.6895090341567993, + "reward_std": 0.2801817059516907, + "rewards/code_format_reward/mean": 0.9040178656578064, + "rewards/code_format_reward/std": 0.29489603638648987, + "rewards/curriculum_aware_reward_fn/mean": 0.7854910492897034, + "rewards/curriculum_aware_reward_fn/std": 0.40007179975509644, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4021.0, + "completions/mean_length": 1944.5826416015625, + "completions/mean_terminated_length": 1476.8831787109375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.7302733367715317, + "grad_norm": 0.2612294852733612, + "kl": 0.027130126953125, + "learning_rate": 1e-06, + "loss": 0.0948, + "num_tokens": 206477860.0, + "reward": 1.4131697416305542, + "reward_std": 0.355887770652771, + "rewards/code_format_reward/mean": 0.8214285969734192, + "rewards/code_format_reward/std": 0.3834212124347687, + "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4985761046409607, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1941964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 1801.9443359375, + "completions/mean_terminated_length": 1249.0830078125, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.7343991748323878, + "grad_norm": 0.2323751598596573, + "kl": 0.0294036865234375, + "learning_rate": 1e-06, + "loss": 0.0978, + "num_tokens": 207566363.0, + "reward": 1.3904019594192505, + "reward_std": 0.31305989623069763, + "rewards/code_format_reward/mean": 0.8080357313156128, + "rewards/code_format_reward/std": 0.3942854404449463, + "rewards/curriculum_aware_reward_fn/mean": 0.5823659896850586, + "rewards/curriculum_aware_reward_fn/std": 0.44665220379829407, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1071428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 1608.8773193359375, + "completions/mean_terminated_length": 1310.4224853515625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7385250128932439, + "grad_norm": 0.24387770891189575, + "kl": 0.0294342041015625, + "learning_rate": 1e-06, + "loss": 0.0693, + "num_tokens": 208537103.0, + "reward": 1.486830472946167, + "reward_std": 0.3190726041793823, + "rewards/code_format_reward/mean": 0.8950892686843872, + "rewards/code_format_reward/std": 0.3067808747291565, + "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902, + "rewards/curriculum_aware_reward_fn/std": 0.4914354383945465, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4007.0, + "completions/mean_length": 1846.66748046875, + "completions/mean_terminated_length": 1357.68212890625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.7426508509541001, + "grad_norm": 0.42223504185676575, + "kl": 0.0312347412109375, + "learning_rate": 1e-06, + "loss": 0.0936, + "num_tokens": 209624936.0, + "reward": 1.4097100496292114, + "reward_std": 0.31943628191947937, + "rewards/code_format_reward/mean": 0.8236607313156128, + "rewards/code_format_reward/std": 0.3815346360206604, + "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195, + "rewards/curriculum_aware_reward_fn/std": 0.4472143352031708, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 1833.060302734375, + "completions/mean_terminated_length": 1303.1707763671875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7467766890149562, + "grad_norm": 0.22536613047122955, + "kl": 0.029296875, + "learning_rate": 1e-06, + "loss": 0.0963, + "num_tokens": 210728513.0, + "reward": 1.450334906578064, + "reward_std": 0.2970465123653412, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252740144729614, + "rewards/curriculum_aware_reward_fn/mean": 0.6400669813156128, + "rewards/curriculum_aware_reward_fn/std": 0.4401248097419739, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 1873.4866943359375, + "completions/mean_terminated_length": 1475.7738037109375, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.7509025270758123, + "grad_norm": 0.25091269612312317, + "kl": 0.0291290283203125, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 211823881.0, + "reward": 1.4522322416305542, + "reward_std": 0.3361148536205292, + "rewards/code_format_reward/mean": 0.8504464030265808, + "rewards/code_format_reward/std": 0.3570319712162018, + "rewards/curriculum_aware_reward_fn/mean": 0.6017856597900391, + "rewards/curriculum_aware_reward_fn/std": 0.46313437819480896, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1651785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 1746.0023193359375, + "completions/mean_terminated_length": 1281.0294189453125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.7550283651366684, + "grad_norm": 0.24613533914089203, + "kl": 0.03045654296875, + "learning_rate": 1e-06, + "loss": 0.0629, + "num_tokens": 212866385.0, + "reward": 1.489174246788025, + "reward_std": 0.297740638256073, + "rewards/code_format_reward/mean": 0.8348214030265808, + "rewards/code_format_reward/std": 0.37175676226615906, + "rewards/curriculum_aware_reward_fn/mean": 0.6543526649475098, + "rewards/curriculum_aware_reward_fn/std": 0.4452936053276062, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 2007.6295166015625, + "completions/mean_terminated_length": 1587.7158203125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7591542031975245, + "grad_norm": 0.23514395952224731, + "kl": 0.0292510986328125, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 214051270.0, + "reward": 1.4108260869979858, + "reward_std": 0.36220675706863403, + "rewards/code_format_reward/mean": 0.8370535969734192, + "rewards/code_format_reward/std": 0.3697296679019928, + "rewards/curriculum_aware_reward_fn/mean": 0.5737723112106323, + "rewards/curriculum_aware_reward_fn/std": 0.4432297646999359, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 1622.7344970703125, + "completions/mean_terminated_length": 1232.8914794921875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7632800412583806, + "grad_norm": 0.2874307930469513, + "kl": 0.0306396484375, + "learning_rate": 1e-06, + "loss": 0.1072, + "num_tokens": 215048952.0, + "reward": 1.3916295766830444, + "reward_std": 0.27101579308509827, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.5300223231315613, + "rewards/curriculum_aware_reward_fn/std": 0.4591200649738312, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2276785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 2090.618408203125, + "completions/mean_terminated_length": 1499.4364013671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.7674058793192368, + "grad_norm": 0.22228585183620453, + "kl": 0.0281829833984375, + "learning_rate": 1e-06, + "loss": 0.1194, + "num_tokens": 216246425.0, + "reward": 1.2716518640518188, + "reward_std": 0.4031871259212494, + "rewards/code_format_reward/mean": 0.78125, + "rewards/code_format_reward/std": 0.4138607978820801, + "rewards/curriculum_aware_reward_fn/mean": 0.4904017746448517, + "rewards/curriculum_aware_reward_fn/std": 0.4519253373146057, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0870535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4049.0, + "completions/mean_length": 1512.0535888671875, + "completions/mean_terminated_length": 1265.66259765625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.7715317173800929, + "grad_norm": 0.22118829190731049, + "kl": 0.031341552734375, + "learning_rate": 1e-06, + "loss": 0.0398, + "num_tokens": 217192268.0, + "reward": 1.527009129524231, + "reward_std": 0.26555386185646057, + "rewards/code_format_reward/mean": 0.9196428656578064, + "rewards/code_format_reward/std": 0.2721492052078247, + "rewards/curriculum_aware_reward_fn/mean": 0.6073660254478455, + "rewards/curriculum_aware_reward_fn/std": 0.46336886286735535, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1183035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1622.5201416015625, + "completions/mean_terminated_length": 1290.635498046875, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.7756575554409489, + "grad_norm": 0.22401097416877747, + "kl": 0.0296630859375, + "learning_rate": 1e-06, + "loss": 0.0562, + "num_tokens": 218182868.0, + "reward": 1.4532368183135986, + "reward_std": 0.2841085195541382, + "rewards/code_format_reward/mean": 0.8839285969734192, + "rewards/code_format_reward/std": 0.32066863775253296, + "rewards/curriculum_aware_reward_fn/mean": 0.5693081021308899, + "rewards/curriculum_aware_reward_fn/std": 0.4530544579029083, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 1664.4554443359375, + "completions/mean_terminated_length": 1273.8963623046875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.779783393501805, + "grad_norm": 0.2380165308713913, + "kl": 0.0315399169921875, + "learning_rate": 1e-06, + "loss": 0.0685, + "num_tokens": 219187502.0, + "reward": 1.4876116514205933, + "reward_std": 0.32384008169174194, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3480229377746582, + "rewards/curriculum_aware_reward_fn/mean": 0.6282365918159485, + "rewards/curriculum_aware_reward_fn/std": 0.4722626507282257, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1138392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 1659.7723388671875, + "completions/mean_terminated_length": 1346.8060302734375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.7839092315626611, + "grad_norm": 0.34679973125457764, + "kl": 0.0302276611328125, + "learning_rate": 1e-06, + "loss": 0.1048, + "num_tokens": 220184697.0, + "reward": 1.539955496788025, + "reward_std": 0.3225671648979187, + "rewards/code_format_reward/mean": 0.8861607313156128, + "rewards/code_format_reward/std": 0.31797102093696594, + "rewards/curriculum_aware_reward_fn/mean": 0.6537945866584778, + "rewards/curriculum_aware_reward_fn/std": 0.4496917724609375, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1183035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3969.0, + "completions/mean_length": 1594.0648193359375, + "completions/mean_terminated_length": 1258.362060546875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.7880350696235173, + "grad_norm": 0.347991406917572, + "kl": 0.0323486328125, + "learning_rate": 1e-06, + "loss": 0.11, + "num_tokens": 221148836.0, + "reward": 1.6414064168930054, + "reward_std": 0.328239381313324, + "rewards/code_format_reward/mean": 0.8861607313156128, + "rewards/code_format_reward/std": 0.31797102093696594, + "rewards/curriculum_aware_reward_fn/mean": 0.755245566368103, + "rewards/curriculum_aware_reward_fn/std": 0.41560760140419006, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 1887.9844970703125, + "completions/mean_terminated_length": 1513.255859375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.7921609076843734, + "grad_norm": 0.7645904421806335, + "kl": 0.0301361083984375, + "learning_rate": 1e-06, + "loss": 0.0756, + "num_tokens": 222288485.0, + "reward": 1.4526787996292114, + "reward_std": 0.30151334404945374, + "rewards/code_format_reward/mean": 0.8571428656578064, + "rewards/code_format_reward/std": 0.3503182828426361, + "rewards/curriculum_aware_reward_fn/mean": 0.5955356955528259, + "rewards/curriculum_aware_reward_fn/std": 0.44083017110824585, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1725.9442138671875, + "completions/mean_terminated_length": 1359.440673828125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.7962867457452295, + "grad_norm": 0.2471107840538025, + "kl": 0.0307464599609375, + "learning_rate": 1e-06, + "loss": 0.0607, + "num_tokens": 223335947.0, + "reward": 1.5162948369979858, + "reward_std": 0.32580631971359253, + "rewards/code_format_reward/mean": 0.8660714030265808, + "rewards/code_format_reward/std": 0.34095630049705505, + "rewards/curriculum_aware_reward_fn/mean": 0.6502231955528259, + "rewards/curriculum_aware_reward_fn/std": 0.4345405101776123, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2254464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 2046.5001220703125, + "completions/mean_terminated_length": 1449.9595947265625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.8004125838060856, + "grad_norm": 0.2045373022556305, + "kl": 0.0279083251953125, + "learning_rate": 1e-06, + "loss": 0.1047, + "num_tokens": 224518009.0, + "reward": 1.3463170528411865, + "reward_std": 0.3773540258407593, + "rewards/code_format_reward/mean": 0.7745535969734192, + "rewards/code_format_reward/std": 0.41834309697151184, + "rewards/curriculum_aware_reward_fn/mean": 0.5717633962631226, + "rewards/curriculum_aware_reward_fn/std": 0.4517683684825897, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 1744.388427734375, + "completions/mean_terminated_length": 1345.289794921875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.8045384218669417, + "grad_norm": 0.2941077947616577, + "kl": 0.0295867919921875, + "learning_rate": 1e-06, + "loss": 0.0859, + "num_tokens": 225559174.0, + "reward": 1.5177457332611084, + "reward_std": 0.3033301830291748, + "rewards/code_format_reward/mean": 0.8549107313156128, + "rewards/code_format_reward/std": 0.3525845408439636, + "rewards/curriculum_aware_reward_fn/mean": 0.6628348231315613, + "rewards/curriculum_aware_reward_fn/std": 0.4127408564090729, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1205357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1809.15185546875, + "completions/mean_terminated_length": 1495.725830078125, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8086642599277978, + "grad_norm": 0.2557990550994873, + "kl": 0.031646728515625, + "learning_rate": 1e-06, + "loss": 0.1066, + "num_tokens": 226641974.0, + "reward": 1.4722100496292114, + "reward_std": 0.37684550881385803, + "rewards/code_format_reward/mean": 0.8861607313156128, + "rewards/code_format_reward/std": 0.31797102093696594, + "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195, + "rewards/curriculum_aware_reward_fn/std": 0.44295454025268555, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1995.5982666015625, + "completions/mean_terminated_length": 1503.7686767578125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.812790097988654, + "grad_norm": 0.20381715893745422, + "kl": 0.0287933349609375, + "learning_rate": 1e-06, + "loss": 0.0545, + "num_tokens": 227803885.0, + "reward": 1.3908482789993286, + "reward_std": 0.3567146360874176, + "rewards/code_format_reward/mean": 0.8147321343421936, + "rewards/code_format_reward/std": 0.38894903659820557, + "rewards/curriculum_aware_reward_fn/mean": 0.5761160254478455, + "rewards/curriculum_aware_reward_fn/std": 0.47179847955703735, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 1600.185302734375, + "completions/mean_terminated_length": 1199.3031005859375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.8169159360495101, + "grad_norm": 0.29694321751594543, + "kl": 0.0296630859375, + "learning_rate": 1e-06, + "loss": 0.0737, + "num_tokens": 228775491.0, + "reward": 1.4993302822113037, + "reward_std": 0.23577921092510223, + "rewards/code_format_reward/mean": 0.8638392686843872, + "rewards/code_format_reward/std": 0.34334254264831543, + "rewards/curriculum_aware_reward_fn/mean": 0.6354910731315613, + "rewards/curriculum_aware_reward_fn/std": 0.42602863907814026, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4077.0, + "completions/mean_length": 1745.0513916015625, + "completions/mean_terminated_length": 1346.0653076171875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.8210417741103662, + "grad_norm": 0.2424536943435669, + "kl": 0.0304412841796875, + "learning_rate": 1e-06, + "loss": 0.0717, + "num_tokens": 229817282.0, + "reward": 1.483035683631897, + "reward_std": 0.3085847795009613, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.6214285492897034, + "rewards/curriculum_aware_reward_fn/std": 0.45782846212387085, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0915178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 1753.384033203125, + "completions/mean_terminated_length": 1517.3955078125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.8251676121712223, + "grad_norm": 0.2286684364080429, + "kl": 0.0298004150390625, + "learning_rate": 1e-06, + "loss": 0.0483, + "num_tokens": 230885598.0, + "reward": 1.54676353931427, + "reward_std": 0.31968414783477783, + "rewards/code_format_reward/mean": 0.9129464030265808, + "rewards/code_format_reward/std": 0.2822287082672119, + "rewards/curriculum_aware_reward_fn/mean": 0.6338168978691101, + "rewards/curriculum_aware_reward_fn/std": 0.44459256529808044, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1600.5157470703125, + "completions/mean_terminated_length": 1244.017822265625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.8292934502320783, + "grad_norm": 0.22150000929832458, + "kl": 0.03021240234375, + "learning_rate": 1e-06, + "loss": 0.094, + "num_tokens": 231872199.0, + "reward": 1.434598445892334, + "reward_std": 0.298090398311615, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.3310886323451996, + "rewards/curriculum_aware_reward_fn/mean": 0.5595981478691101, + "rewards/curriculum_aware_reward_fn/std": 0.46301397681236267, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 1712.7254638671875, + "completions/mean_terminated_length": 1293.619384765625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8334192882929345, + "grad_norm": 0.24653927981853485, + "kl": 0.0312347412109375, + "learning_rate": 1e-06, + "loss": 0.0796, + "num_tokens": 232899585.0, + "reward": 1.3952010869979858, + "reward_std": 0.31651803851127625, + "rewards/code_format_reward/mean": 0.8549107313156128, + "rewards/code_format_reward/std": 0.3525845408439636, + "rewards/curriculum_aware_reward_fn/mean": 0.5402902364730835, + "rewards/curriculum_aware_reward_fn/std": 0.45736637711524963, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4076.0, + "completions/mean_length": 2037.8304443359375, + "completions/mean_terminated_length": 1603.946044921875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.8375451263537906, + "grad_norm": 0.22224228084087372, + "kl": 0.029876708984375, + "learning_rate": 1e-06, + "loss": 0.0668, + "num_tokens": 234095361.0, + "reward": 1.3809152841567993, + "reward_std": 0.37990495562553406, + "rewards/code_format_reward/mean": 0.828125, + "rewards/code_format_reward/std": 0.3776935040950775, + "rewards/curriculum_aware_reward_fn/mean": 0.5527901649475098, + "rewards/curriculum_aware_reward_fn/std": 0.43541234731674194, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3876.0, + "completions/mean_length": 1682.0201416015625, + "completions/mean_terminated_length": 1337.165771484375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8416709644146467, + "grad_norm": 0.25886648893356323, + "kl": 0.0328826904296875, + "learning_rate": 1e-06, + "loss": 0.0476, + "num_tokens": 235114591.0, + "reward": 1.5552457571029663, + "reward_std": 0.3339327573776245, + "rewards/code_format_reward/mean": 0.8839285969734192, + "rewards/code_format_reward/std": 0.32066863775253296, + "rewards/curriculum_aware_reward_fn/mean": 0.6713169813156128, + "rewards/curriculum_aware_reward_fn/std": 0.4169880151748657, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4068.0, + "completions/mean_length": 1614.8438720703125, + "completions/mean_terminated_length": 1260.392822265625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.8457968024755028, + "grad_norm": 0.25539520382881165, + "kl": 0.03082275390625, + "learning_rate": 1e-06, + "loss": 0.0544, + "num_tokens": 236103705.0, + "reward": 1.507924199104309, + "reward_std": 0.25643816590309143, + "rewards/code_format_reward/mean": 0.8794642686843872, + "rewards/code_format_reward/std": 0.3259509205818176, + "rewards/curriculum_aware_reward_fn/mean": 0.6284598708152771, + "rewards/curriculum_aware_reward_fn/std": 0.5155189037322998, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0959821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 1644.69873046875, + "completions/mean_terminated_length": 1384.4371337890625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.8499226405363589, + "grad_norm": 0.2182522863149643, + "kl": 0.032928466796875, + "learning_rate": 1e-06, + "loss": 0.0571, + "num_tokens": 237102526.0, + "reward": 1.6372768878936768, + "reward_std": 0.2757372558116913, + "rewards/code_format_reward/mean": 0.9151785969734192, + "rewards/code_format_reward/std": 0.2789272665977478, + "rewards/curriculum_aware_reward_fn/mean": 0.7220982313156128, + "rewards/curriculum_aware_reward_fn/std": 0.3957159221172333, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4093.0, + "completions/mean_length": 1881.337158203125, + "completions/mean_terminated_length": 1370.260986328125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.8540484785972151, + "grad_norm": 0.20540083944797516, + "kl": 0.03094482421875, + "learning_rate": 1e-06, + "loss": 0.0761, + "num_tokens": 238202951.0, + "reward": 1.3666294813156128, + "reward_std": 0.3059476613998413, + "rewards/code_format_reward/mean": 0.8147321343421936, + "rewards/code_format_reward/std": 0.38894903659820557, + "rewards/curriculum_aware_reward_fn/mean": 0.5518973469734192, + "rewards/curriculum_aware_reward_fn/std": 0.4562782943248749, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 1735.805908203125, + "completions/mean_terminated_length": 1398.6351318359375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.8581743166580712, + "grad_norm": 0.2484840750694275, + "kl": 0.0309600830078125, + "learning_rate": 1e-06, + "loss": 0.0979, + "num_tokens": 239242658.0, + "reward": 1.5085939168930054, + "reward_std": 0.34762099385261536, + "rewards/code_format_reward/mean": 0.8794642686843872, + "rewards/code_format_reward/std": 0.3259509205818176, + "rewards/curriculum_aware_reward_fn/mean": 0.6291294097900391, + "rewards/curriculum_aware_reward_fn/std": 0.4308629035949707, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 2012.274658203125, + "completions/mean_terminated_length": 1488.432861328125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8623001547189273, + "grad_norm": 0.19853706657886505, + "kl": 0.030975341796875, + "learning_rate": 1e-06, + "loss": 0.0884, + "num_tokens": 240423849.0, + "reward": 1.4322545528411865, + "reward_std": 0.3431403338909149, + "rewards/code_format_reward/mean": 0.8191964030265808, + "rewards/code_format_reward/std": 0.38528555631637573, + "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162, + "rewards/curriculum_aware_reward_fn/std": 0.4514111578464508, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2165178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 2111.489013671875, + "completions/mean_terminated_length": 1563.0626220703125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.8664259927797834, + "grad_norm": 0.24466735124588013, + "kl": 0.030792236328125, + "learning_rate": 1e-06, + "loss": 0.0651, + "num_tokens": 241637357.0, + "reward": 1.4133929014205933, + "reward_std": 0.33380740880966187, + "rewards/code_format_reward/mean": 0.796875, + "rewards/code_format_reward/std": 0.4027745723724365, + "rewards/curriculum_aware_reward_fn/mean": 0.6165178418159485, + "rewards/curriculum_aware_reward_fn/std": 0.46240732073783875, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4071.0, + "completions/mean_length": 1902.3037109375, + "completions/mean_terminated_length": 1425.4130859375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.8705518308406395, + "grad_norm": 0.1979178786277771, + "kl": 0.032989501953125, + "learning_rate": 1e-06, + "loss": 0.0419, + "num_tokens": 242745187.0, + "reward": 1.4494421482086182, + "reward_std": 0.26812899112701416, + "rewards/code_format_reward/mean": 0.8258928656578064, + "rewards/code_format_reward/std": 0.37962549924850464, + "rewards/curriculum_aware_reward_fn/mean": 0.6235490441322327, + "rewards/curriculum_aware_reward_fn/std": 0.47366687655448914, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4085.0, + "completions/mean_length": 1524.2188720703125, + "completions/mean_terminated_length": 1306.271240234375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.8746776689014956, + "grad_norm": 0.23214657604694366, + "kl": 0.03704833984375, + "learning_rate": 1e-06, + "loss": 0.054, + "num_tokens": 243692420.0, + "reward": 1.6546876430511475, + "reward_std": 0.2663484513759613, + "rewards/code_format_reward/mean": 0.9263392686843872, + "rewards/code_format_reward/std": 0.2615099549293518, + "rewards/curriculum_aware_reward_fn/mean": 0.7283481955528259, + "rewards/curriculum_aware_reward_fn/std": 0.40330010652542114, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 1860.7412109375, + "completions/mean_terminated_length": 1396.8193359375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.8788035069623518, + "grad_norm": 0.2359783798456192, + "kl": 0.0379638671875, + "learning_rate": 1e-06, + "loss": 0.1015, + "num_tokens": 244798745.0, + "reward": 1.4878350496292114, + "reward_std": 0.3436294496059418, + "rewards/code_format_reward/mean": 0.8392857313156128, + "rewards/code_format_reward/std": 0.3676777780056, + "rewards/curriculum_aware_reward_fn/mean": 0.6485491394996643, + "rewards/curriculum_aware_reward_fn/std": 0.4317037761211395, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1294642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3974.0, + "completions/mean_length": 1827.13623046875, + "completions/mean_terminated_length": 1489.7154541015625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.8829293450232079, + "grad_norm": 0.43438419699668884, + "kl": 0.032135009765625, + "learning_rate": 1e-06, + "loss": 0.0871, + "num_tokens": 245898247.0, + "reward": 1.5524554252624512, + "reward_std": 0.309803307056427, + "rewards/code_format_reward/mean": 0.8861607313156128, + "rewards/code_format_reward/std": 0.31797102093696594, + "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4090176224708557, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0959821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1742.2366943359375, + "completions/mean_terminated_length": 1492.3309326171875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.887055183084064, + "grad_norm": 0.29153332114219666, + "kl": 0.0323944091796875, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 246938763.0, + "reward": 1.610714316368103, + "reward_std": 0.3009406626224518, + "rewards/code_format_reward/mean": 0.9174107313156128, + "rewards/code_format_reward/std": 0.2755681276321411, + "rewards/curriculum_aware_reward_fn/mean": 0.6933035254478455, + "rewards/curriculum_aware_reward_fn/std": 0.40388187766075134, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3960.0, + "completions/mean_length": 1818.85498046875, + "completions/mean_terminated_length": 1418.412109375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.89118102114492, + "grad_norm": 0.21342267096042633, + "kl": 0.0300750732421875, + "learning_rate": 1e-06, + "loss": 0.083, + "num_tokens": 248023628.0, + "reward": 1.3243303298950195, + "reward_std": 0.2561975419521332, + "rewards/code_format_reward/mean": 0.8660714030265808, + "rewards/code_format_reward/std": 0.34095627069473267, + "rewards/curriculum_aware_reward_fn/mean": 0.45825889706611633, + "rewards/curriculum_aware_reward_fn/std": 0.4662056863307953, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1629464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 1759.1407470703125, + "completions/mean_terminated_length": 1304.23193359375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.8953068592057761, + "grad_norm": 0.20606671273708344, + "kl": 0.033477783203125, + "learning_rate": 1e-06, + "loss": 0.0774, + "num_tokens": 249074727.0, + "reward": 1.3753349781036377, + "reward_std": 0.2294098436832428, + "rewards/code_format_reward/mean": 0.8683035969734192, + "rewards/code_format_reward/std": 0.3385384678840637, + "rewards/curriculum_aware_reward_fn/mean": 0.507031261920929, + "rewards/curriculum_aware_reward_fn/std": 0.46656760573387146, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1071428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4040.0, + "completions/mean_length": 1633.0782470703125, + "completions/mean_terminated_length": 1337.5274658203125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.8994326972666323, + "grad_norm": 0.26139992475509644, + "kl": 0.03546142578125, + "learning_rate": 1e-06, + "loss": 0.0694, + "num_tokens": 250064611.0, + "reward": 1.6659599542617798, + "reward_std": 0.2920212149620056, + "rewards/code_format_reward/mean": 0.9040178656578064, + "rewards/code_format_reward/std": 0.29489603638648987, + "rewards/curriculum_aware_reward_fn/mean": 0.7619419693946838, + "rewards/curriculum_aware_reward_fn/std": 0.4175769090652466, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2098214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4019.0, + "completions/mean_length": 1939.6005859375, + "completions/mean_terminated_length": 1366.9971923828125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.9035585353274884, + "grad_norm": 0.2301202118396759, + "kl": 0.03155517578125, + "learning_rate": 1e-06, + "loss": 0.0979, + "num_tokens": 251204739.0, + "reward": 1.364174246788025, + "reward_std": 0.3046819865703583, + "rewards/code_format_reward/mean": 0.8169642686843872, + "rewards/code_format_reward/std": 0.387128084897995, + "rewards/curriculum_aware_reward_fn/mean": 0.5472097992897034, + "rewards/curriculum_aware_reward_fn/std": 0.4933087229728699, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 1786.0157470703125, + "completions/mean_terminated_length": 1221.352783203125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.9076843733883445, + "grad_norm": 0.5878376960754395, + "kl": 0.13665771484375, + "learning_rate": 1e-06, + "loss": 0.0782, + "num_tokens": 252260367.0, + "reward": 1.51551353931427, + "reward_std": 0.3167440593242645, + "rewards/code_format_reward/mean": 0.828125, + "rewards/code_format_reward/std": 0.3776935040950775, + "rewards/curriculum_aware_reward_fn/mean": 0.6873884201049805, + "rewards/curriculum_aware_reward_fn/std": 0.45862656831741333, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1651785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 1775.884033203125, + "completions/mean_terminated_length": 1316.8236083984375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.9118102114492006, + "grad_norm": 0.22542373836040497, + "kl": 0.0326995849609375, + "learning_rate": 1e-06, + "loss": 0.0955, + "num_tokens": 253308551.0, + "reward": 1.4619419574737549, + "reward_std": 0.32789382338523865, + "rewards/code_format_reward/mean": 0.8392857313156128, + "rewards/code_format_reward/std": 0.3676777780056, + "rewards/curriculum_aware_reward_fn/mean": 0.6226562857627869, + "rewards/curriculum_aware_reward_fn/std": 0.44398173689842224, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4030.0, + "completions/mean_length": 1829.544677734375, + "completions/mean_terminated_length": 1444.898193359375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.9159360495100567, + "grad_norm": 0.24280433356761932, + "kl": 0.032867431640625, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 254392535.0, + "reward": 1.5252233743667603, + "reward_std": 0.3197804391384125, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.6636161208152771, + "rewards/curriculum_aware_reward_fn/std": 0.4355611503124237, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4086.0, + "completions/mean_length": 1836.180908203125, + "completions/mean_terminated_length": 1590.0618896484375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.9200618875709129, + "grad_norm": 0.24582676589488983, + "kl": 0.0306854248046875, + "learning_rate": 1e-06, + "loss": 0.0706, + "num_tokens": 255481207.0, + "reward": 1.5697544813156128, + "reward_std": 0.3073837459087372, + "rewards/code_format_reward/mean": 0.9174107313156128, + "rewards/code_format_reward/std": 0.2755681276321411, + "rewards/curriculum_aware_reward_fn/mean": 0.65234375, + "rewards/curriculum_aware_reward_fn/std": 0.4572802186012268, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1116071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4023.0, + "completions/mean_length": 1784.857177734375, + "completions/mean_terminated_length": 1494.5125732421875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.924187725631769, + "grad_norm": 0.32395270466804504, + "kl": 0.029998779296875, + "learning_rate": 1e-06, + "loss": 0.1009, + "num_tokens": 256559521.0, + "reward": 1.5242189168930054, + "reward_std": 0.3307342529296875, + "rewards/code_format_reward/mean": 0.8928571343421936, + "rewards/code_format_reward/std": 0.3096405565738678, + "rewards/curriculum_aware_reward_fn/mean": 0.6313616633415222, + "rewards/curriculum_aware_reward_fn/std": 0.43223538994789124, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0803571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 1617.3192138671875, + "completions/mean_terminated_length": 1400.7354736328125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.9283135636926251, + "grad_norm": 0.21705108880996704, + "kl": 0.032135009765625, + "learning_rate": 1e-06, + "loss": 0.0599, + "num_tokens": 257553552.0, + "reward": 1.4319196939468384, + "reward_std": 0.19349054992198944, + "rewards/code_format_reward/mean": 0.9196428656578064, + "rewards/code_format_reward/std": 0.2721492052078247, + "rewards/curriculum_aware_reward_fn/mean": 0.5122767686843872, + "rewards/curriculum_aware_reward_fn/std": 0.4762067496776581, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 1747.665283203125, + "completions/mean_terminated_length": 1419.017822265625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.9324394017534812, + "grad_norm": 0.3296845555305481, + "kl": 0.0321044921875, + "learning_rate": 1e-06, + "loss": 0.0784, + "num_tokens": 258600945.0, + "reward": 1.543861746788025, + "reward_std": 0.30385205149650574, + "rewards/code_format_reward/mean": 0.8772321343421936, + "rewards/code_format_reward/std": 0.3285374045372009, + "rewards/curriculum_aware_reward_fn/mean": 0.666629433631897, + "rewards/curriculum_aware_reward_fn/std": 0.4261413514614105, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4091.0, + "completions/mean_length": 1931.638427734375, + "completions/mean_terminated_length": 1461.125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.9365652398143373, + "grad_norm": 0.23225350677967072, + "kl": 0.0313873291015625, + "learning_rate": 1e-06, + "loss": 0.1046, + "num_tokens": 259729778.0, + "reward": 1.358147382736206, + "reward_std": 0.32802098989486694, + "rewards/code_format_reward/mean": 0.8214285969734192, + "rewards/code_format_reward/std": 0.3834212124347687, + "rewards/curriculum_aware_reward_fn/mean": 0.5367187857627869, + "rewards/curriculum_aware_reward_fn/std": 0.46693459153175354, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4088.0, + "completions/mean_length": 1851.51123046875, + "completions/mean_terminated_length": 1442.8839111328125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.9406910778751933, + "grad_norm": 0.21108375489711761, + "kl": 0.03179931640625, + "learning_rate": 1e-06, + "loss": 0.0731, + "num_tokens": 260825178.0, + "reward": 1.5152901411056519, + "reward_std": 0.33156508207321167, + "rewards/code_format_reward/mean": 0.84375, + "rewards/code_format_reward/std": 0.36349809169769287, + "rewards/curriculum_aware_reward_fn/mean": 0.6715401411056519, + "rewards/curriculum_aware_reward_fn/std": 0.4522121250629425, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1428571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4065.0, + "completions/mean_length": 1721.6451416015625, + "completions/mean_terminated_length": 1325.9193115234375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9448169159360496, + "grad_norm": 0.3119603097438812, + "kl": 0.035125732421875, + "learning_rate": 1e-06, + "loss": 0.0564, + "num_tokens": 261878428.0, + "reward": 1.5291296243667603, + "reward_std": 0.29912132024765015, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3480229377746582, + "rewards/curriculum_aware_reward_fn/mean": 0.6697544455528259, + "rewards/curriculum_aware_reward_fn/std": 0.42858609557151794, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 1753.6607666015625, + "completions/mean_terminated_length": 1305.1275634765625, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9489427539969056, + "grad_norm": 0.22429104149341583, + "kl": 0.0317535400390625, + "learning_rate": 1e-06, + "loss": 0.1006, + "num_tokens": 262934310.0, + "reward": 1.381361722946167, + "reward_std": 0.3329118490219116, + "rewards/code_format_reward/mean": 0.8392857313156128, + "rewards/code_format_reward/std": 0.3676777780056, + "rewards/curriculum_aware_reward_fn/mean": 0.5420759320259094, + "rewards/curriculum_aware_reward_fn/std": 0.4664093852043152, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 1739.4263916015625, + "completions/mean_terminated_length": 1360.9093017578125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.9530685920577617, + "grad_norm": 0.22260624170303345, + "kl": 0.032623291015625, + "learning_rate": 1e-06, + "loss": 0.065, + "num_tokens": 263976059.0, + "reward": 1.5345982313156128, + "reward_std": 0.28138771653175354, + "rewards/code_format_reward/mean": 0.8683035969734192, + "rewards/code_format_reward/std": 0.3385384678840637, + "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936, + "rewards/curriculum_aware_reward_fn/std": 0.4627295732498169, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0892857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4026.0, + "completions/mean_length": 1576.6473388671875, + "completions/mean_terminated_length": 1329.6519775390625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.9571944301186178, + "grad_norm": 0.22776390612125397, + "kl": 0.03472900390625, + "learning_rate": 1e-06, + "loss": 0.041, + "num_tokens": 264949986.0, + "reward": 1.6268973350524902, + "reward_std": 0.2715020775794983, + "rewards/code_format_reward/mean": 0.9107142686843872, + "rewards/code_format_reward/std": 0.2854745090007782, + "rewards/curriculum_aware_reward_fn/mean": 0.716183066368103, + "rewards/curriculum_aware_reward_fn/std": 0.4235256612300873, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4092.0, + "completions/mean_length": 1813.0826416015625, + "completions/mean_terminated_length": 1375.9281005859375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.9613202681794739, + "grad_norm": 0.5938295125961304, + "kl": 0.03570556640625, + "learning_rate": 1e-06, + "loss": 0.0627, + "num_tokens": 266030928.0, + "reward": 1.4172991514205933, + "reward_std": 0.27423596382141113, + "rewards/code_format_reward/mean": 0.8392857313156128, + "rewards/code_format_reward/std": 0.3676777780056, + "rewards/curriculum_aware_reward_fn/mean": 0.5780134201049805, + "rewards/curriculum_aware_reward_fn/std": 0.4604513943195343, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 1860.4107666015625, + "completions/mean_terminated_length": 1453.40380859375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.9654461062403301, + "grad_norm": 0.2252698838710785, + "kl": 0.032440185546875, + "learning_rate": 1e-06, + "loss": 0.0919, + "num_tokens": 267137231.0, + "reward": 1.4508929252624512, + "reward_std": 0.35087722539901733, + "rewards/code_format_reward/mean": 0.84375, + "rewards/code_format_reward/std": 0.36349809169769287, + "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064, + "rewards/curriculum_aware_reward_fn/std": 0.4518719017505646, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4072.0, + "completions/mean_length": 1817.727783203125, + "completions/mean_terminated_length": 1417.086669921875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.9695719443011862, + "grad_norm": 0.2286522537469864, + "kl": 0.0334320068359375, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 268223397.0, + "reward": 1.3939732313156128, + "reward_std": 0.27675938606262207, + "rewards/code_format_reward/mean": 0.8526785969734192, + "rewards/code_format_reward/std": 0.3548222780227661, + "rewards/curriculum_aware_reward_fn/mean": 0.5412946343421936, + "rewards/curriculum_aware_reward_fn/std": 0.48773789405822754, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4053.0, + "completions/mean_length": 1985.325927734375, + "completions/mean_terminated_length": 1491.0909423828125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.9736977823620423, + "grad_norm": 0.20148524641990662, + "kl": 0.03509521484375, + "learning_rate": 1e-06, + "loss": 0.0899, + "num_tokens": 269388133.0, + "reward": 1.4079241752624512, + "reward_std": 0.3953123390674591, + "rewards/code_format_reward/mean": 0.8080357313156128, + "rewards/code_format_reward/std": 0.3942854106426239, + "rewards/curriculum_aware_reward_fn/mean": 0.5998883843421936, + "rewards/curriculum_aware_reward_fn/std": 0.4576530158519745, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4057.0, + "completions/mean_length": 1784.509033203125, + "completions/mean_terminated_length": 1413.233154296875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.9778236204228984, + "grad_norm": 0.21030192077159882, + "kl": 0.036651611328125, + "learning_rate": 1e-06, + "loss": 0.0617, + "num_tokens": 270444745.0, + "reward": 1.5034600496292114, + "reward_std": 0.2465612292289734, + "rewards/code_format_reward/mean": 0.8616071343421936, + "rewards/code_format_reward/std": 0.34569787979125977, + "rewards/curriculum_aware_reward_fn/mean": 0.6418526768684387, + "rewards/curriculum_aware_reward_fn/std": 0.4315384030342102, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1071428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4069.0, + "completions/mean_length": 1658.04248046875, + "completions/mean_terminated_length": 1365.4874267578125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.9819494584837545, + "grad_norm": 0.22513382136821747, + "kl": 0.03680419921875, + "learning_rate": 1e-06, + "loss": 0.0645, + "num_tokens": 271464328.0, + "reward": 1.5197545289993286, + "reward_std": 0.30367511510849, + "rewards/code_format_reward/mean": 0.8928571343421936, + "rewards/code_format_reward/std": 0.3096405565738678, + "rewards/curriculum_aware_reward_fn/mean": 0.6268973350524902, + "rewards/curriculum_aware_reward_fn/std": 0.4268620014190674, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1473214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 1725.8326416015625, + "completions/mean_terminated_length": 1316.3272705078125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.9860752965446106, + "grad_norm": 0.2153979241847992, + "kl": 0.036102294921875, + "learning_rate": 1e-06, + "loss": 0.0744, + "num_tokens": 272505594.0, + "reward": 1.3497768640518188, + "reward_std": 0.31243571639060974, + "rewards/code_format_reward/mean": 0.8526785969734192, + "rewards/code_format_reward/std": 0.3548222780227661, + "rewards/curriculum_aware_reward_fn/mean": 0.4970982074737549, + "rewards/curriculum_aware_reward_fn/std": 0.46139857172966003, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4089.0, + "completions/mean_length": 1712.7344970703125, + "completions/mean_terminated_length": 1344.1881103515625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.9902011346054668, + "grad_norm": 0.22380459308624268, + "kl": 0.0360107421875, + "learning_rate": 1e-06, + "loss": 0.058, + "num_tokens": 273546494.0, + "reward": 1.521875023841858, + "reward_std": 0.3063981533050537, + "rewards/code_format_reward/mean": 0.8660714030265808, + "rewards/code_format_reward/std": 0.34095630049705505, + "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323, + "rewards/curriculum_aware_reward_fn/std": 0.42365700006484985, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1160714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 1675.0223388671875, + "completions/mean_terminated_length": 1357.1162109375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.9943269726663229, + "grad_norm": 0.22325699031352997, + "kl": 0.036895751953125, + "learning_rate": 1e-06, + "loss": 0.0827, + "num_tokens": 274574076.0, + "reward": 1.5726563930511475, + "reward_std": 0.253899484872818, + "rewards/code_format_reward/mean": 0.8816964030265808, + "rewards/code_format_reward/std": 0.32332828640937805, + "rewards/curriculum_aware_reward_fn/mean": 0.6909598112106323, + "rewards/curriculum_aware_reward_fn/std": 0.4153672456741333, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0948905109489051, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 1584.3941650390625, + "completions/mean_terminated_length": 1321.08056640625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.998452810727179, + "grad_norm": 0.24855820834636688, + "kl": 0.037872314453125, + "learning_rate": 1e-06, + "loss": 0.1187, + "num_tokens": 275609321.0, + "reward": 1.4710938930511475, + "reward_std": 0.3146342933177948, + "rewards/code_format_reward/mean": 0.8816964030265808, + "rewards/code_format_reward/std": 0.32332828640937805, + "rewards/curriculum_aware_reward_fn/mean": 0.5893973112106323, + "rewards/curriculum_aware_reward_fn/std": 0.4410484731197357, + "step": 242 + }, + { + "epoch": 0.998452810727179, + "step": 242, + "total_flos": 0.0, + "train_loss": 0.08443626471047583, + "train_runtime": 92210.0342, + "train_samples_per_second": 0.168, + "train_steps_per_second": 0.003 + } + ], + "logging_steps": 1, + "max_steps": 242, + "num_input_tokens_seen": 275609321, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}