{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2762.078369140625, "completions/mean_terminated_length": 1656.8284912109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.0041258380608561115, "grad_norm": 0.10848142206668854, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1505084.0, "reward": 0.029017861932516098, "reward_std": 0.047291483730077744, "rewards/code_format_reward/mean": 0.0223214291036129, "rewards/code_format_reward/std": 0.14789186418056488, "rewards/curriculum_aware_reward_fn/mean": 0.0066964286379516125, "rewards/curriculum_aware_reward_fn/std": 0.0310124009847641, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2660.15625, "completions/mean_terminated_length": 1659.416748046875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.008251676121712223, "grad_norm": 0.12915591895580292, "kl": 0.0005238056182861328, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 2947042.0, "reward": 0.04441964253783226, "reward_std": 0.0783877819776535, "rewards/code_format_reward/mean": 0.02901785634458065, "rewards/code_format_reward/std": 0.16804419457912445, "rewards/curriculum_aware_reward_fn/mean": 0.015401787124574184, "rewards/curriculum_aware_reward_fn/std": 0.045581694692373276, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4486607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 2641.801513671875, "completions/mean_terminated_length": 1458.4251708984375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.012377514182568335, "grad_norm": 0.12791307270526886, "kl": 0.0005018711090087891, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 4383850.0, "reward": 0.04955357685685158, "reward_std": 0.08979818224906921, "rewards/code_format_reward/mean": 0.0334821417927742, "rewards/code_format_reward/std": 0.1800929754972458, "rewards/curriculum_aware_reward_fn/mean": 0.01607142947614193, "rewards/curriculum_aware_reward_fn/std": 0.04751747474074364, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5803571428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 3049.80810546875, "completions/mean_terminated_length": 1602.94677734375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.016503352243424446, "grad_norm": 0.26403144001960754, "kl": 0.0005340576171875, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 6031011.0, "reward": 0.06015625596046448, "reward_std": 0.08077409863471985, "rewards/code_format_reward/mean": 0.0424107126891613, "rewards/code_format_reward/std": 0.20174959301948547, "rewards/curriculum_aware_reward_fn/mean": 0.01774553582072258, "rewards/curriculum_aware_reward_fn/std": 0.04849924519658089, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 2951.32373046875, "completions/mean_terminated_length": 1452.6236572265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.020629190304280558, "grad_norm": 0.11759795993566513, "kl": 0.0006890296936035156, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 7632987.0, "reward": 0.10122768580913544, "reward_std": 0.150455042719841, "rewards/code_format_reward/mean": 0.078125, "rewards/code_format_reward/std": 0.26866820454597473, "rewards/curriculum_aware_reward_fn/mean": 0.02310268022119999, "rewards/curriculum_aware_reward_fn/std": 0.05603185296058655, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2626.33935546875, "completions/mean_terminated_length": 1563.6614990234375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.02475502836513667, "grad_norm": 0.176630437374115, "kl": 0.0012578964233398438, "learning_rate": 1e-06, "loss": 0.0312, "num_tokens": 9085730.0, "reward": 0.18604911863803864, "reward_std": 0.23488061130046844, "rewards/code_format_reward/mean": 0.1428571492433548, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.04319196566939354, "rewards/curriculum_aware_reward_fn/std": 0.06799682974815369, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4263392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 2600.0625, "completions/mean_terminated_length": 1488.2957763671875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.02888086642599278, "grad_norm": 0.1658683717250824, "kl": 0.0018901824951171875, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 10532126.0, "reward": 0.2216518074274063, "reward_std": 0.23757225275039673, "rewards/code_format_reward/mean": 0.1674107164144516, "rewards/code_format_reward/std": 0.37375950813293457, "rewards/curriculum_aware_reward_fn/mean": 0.054241079837083817, "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4709821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2653.466552734375, "completions/mean_terminated_length": 1369.185546875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.03300670448684889, "grad_norm": 0.16779452562332153, "kl": 0.0017452239990234375, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 11985080.0, "reward": 0.2589285969734192, "reward_std": 0.23284269869327545, "rewards/code_format_reward/mean": 0.1986607164144516, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, "rewards/curriculum_aware_reward_fn/std": 0.07362107187509537, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5066964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2908.83056640625, "completions/mean_terminated_length": 1689.429931640625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.037132542547705004, "grad_norm": 0.1578310877084732, "kl": 0.00200653076171875, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 13550561.0, "reward": 0.36138394474983215, "reward_std": 0.30128249526023865, "rewards/code_format_reward/mean": 0.2924107015132904, "rewards/code_format_reward/std": 0.4553784430027008, "rewards/curriculum_aware_reward_fn/mean": 0.06897322088479996, "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4330357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 2671.055908203125, "completions/mean_terminated_length": 1582.712646484375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.041258380608561115, "grad_norm": 0.1712757647037506, "kl": 0.0027599334716796875, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 15018507.0, "reward": 0.500558078289032, "reward_std": 0.3041485548019409, "rewards/code_format_reward/mean": 0.421875, "rewards/code_format_reward/std": 0.4944108724594116, "rewards/curriculum_aware_reward_fn/mean": 0.078683041036129, "rewards/curriculum_aware_reward_fn/std": 0.07499326020479202, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5022321428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2796.071533203125, "completions/mean_terminated_length": 1484.484375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.04538421866941723, "grad_norm": 0.7890152335166931, "kl": 0.005886077880859375, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 16537021.0, "reward": 0.49921879172325134, "reward_std": 0.2203681766986847, "rewards/code_format_reward/mean": 0.4285714328289032, "rewards/code_format_reward/std": 0.49542486667633057, "rewards/curriculum_aware_reward_fn/mean": 0.07064732164144516, "rewards/curriculum_aware_reward_fn/std": 0.07495728880167007, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4821428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2733.216552734375, "completions/mean_terminated_length": 1464.4180908203125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.04951005673027334, "grad_norm": 0.15826576948165894, "kl": 0.0032367706298828125, "learning_rate": 1e-06, "loss": 0.0484, "num_tokens": 18030295.0, "reward": 0.5366071462631226, "reward_std": 0.24472731351852417, "rewards/code_format_reward/mean": 0.4709821343421936, "rewards/code_format_reward/std": 0.49971529841423035, "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776, "rewards/curriculum_aware_reward_fn/std": 0.07449494302272797, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 3284.473388671875, "completions/mean_terminated_length": 1837.8385009765625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.05363589479112945, "grad_norm": 0.12409358471632004, "kl": 0.0019292831420898438, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 19771120.0, "reward": 0.3962053656578064, "reward_std": 0.24470412731170654, "rewards/code_format_reward/mean": 0.3459821343421936, "rewards/code_format_reward/std": 0.47621920704841614, "rewards/curriculum_aware_reward_fn/mean": 0.0502232126891613, "rewards/curriculum_aware_reward_fn/std": 0.07086833566427231, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4955357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2699.68994140625, "completions/mean_terminated_length": 1328.0928955078125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.05776173285198556, "grad_norm": 0.15436327457427979, "kl": 0.004604339599609375, "learning_rate": 1e-06, "loss": 0.0586, "num_tokens": 21266482.0, "reward": 0.5574777126312256, "reward_std": 0.19266396760940552, "rewards/code_format_reward/mean": 0.4888392984867096, "rewards/code_format_reward/std": 0.5004342794418335, "rewards/curriculum_aware_reward_fn/mean": 0.0686383917927742, "rewards/curriculum_aware_reward_fn/std": 0.07938338816165924, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4799107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 2604.18310546875, "completions/mean_terminated_length": 1227.61376953125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.06188757091284167, "grad_norm": 0.14027918875217438, "kl": 0.004291534423828125, "learning_rate": 1e-06, "loss": 0.0539, "num_tokens": 22688586.0, "reward": 0.5889509320259094, "reward_std": 0.16005173325538635, "rewards/code_format_reward/mean": 0.515625, "rewards/code_format_reward/std": 0.500314474105835, "rewards/curriculum_aware_reward_fn/mean": 0.07332588732242584, "rewards/curriculum_aware_reward_fn/std": 0.07506514340639114, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2344.2724609375, "completions/mean_terminated_length": 1293.2357177734375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.06601340897369778, "grad_norm": 0.14219032227993011, "kl": 0.004871368408203125, "learning_rate": 1e-06, "loss": 0.081, "num_tokens": 24024252.0, "reward": 0.7078125476837158, "reward_std": 0.1512121558189392, "rewards/code_format_reward/mean": 0.6227678656578064, "rewards/code_format_reward/std": 0.48523563146591187, "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584, "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4129464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 2521.03369140625, "completions/mean_terminated_length": 1413.167236328125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.07013924703455389, "grad_norm": 0.1439032107591629, "kl": 0.0036773681640625, "learning_rate": 1e-06, "loss": 0.0766, "num_tokens": 25413592.0, "reward": 0.662834882736206, "reward_std": 0.19979971647262573, "rewards/code_format_reward/mean": 0.5848214030265808, "rewards/code_format_reward/std": 0.49330365657806396, "rewards/curriculum_aware_reward_fn/mean": 0.07801339775323868, "rewards/curriculum_aware_reward_fn/std": 0.07958128303289413, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4352678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 2537.692138671875, "completions/mean_terminated_length": 1336.62451171875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.07426508509541001, "grad_norm": 0.20727689564228058, "kl": 0.0038776397705078125, "learning_rate": 1e-06, "loss": 0.0636, "num_tokens": 26820657.0, "reward": 0.6412946581840515, "reward_std": 0.1462433785200119, "rewards/code_format_reward/mean": 0.5602678656578064, "rewards/code_format_reward/std": 0.49690937995910645, "rewards/curriculum_aware_reward_fn/mean": 0.08102678507566452, "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 2791.4130859375, "completions/mean_terminated_length": 1565.8917236328125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.07839092315626611, "grad_norm": 0.22532987594604492, "kl": 0.002948760986328125, "learning_rate": 1e-06, "loss": 0.0666, "num_tokens": 28362529.0, "reward": 0.5607143640518188, "reward_std": 0.17854316532611847, "rewards/code_format_reward/mean": 0.4977678656578064, "rewards/code_format_reward/std": 0.5005539655685425, "rewards/curriculum_aware_reward_fn/mean": 0.06294643133878708, "rewards/curriculum_aware_reward_fn/std": 0.074107825756073, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5357142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 2899.388427734375, "completions/mean_terminated_length": 1518.6827392578125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.08251676121712223, "grad_norm": 0.14237935841083527, "kl": 0.0032501220703125, "learning_rate": 1e-06, "loss": 0.0718, "num_tokens": 29931508.0, "reward": 0.5177456140518188, "reward_std": 0.18259648978710175, "rewards/code_format_reward/mean": 0.4598214328289032, "rewards/code_format_reward/std": 0.49894019961357117, "rewards/curriculum_aware_reward_fn/mean": 0.05792411044239998, "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 2652.560302734375, "completions/mean_terminated_length": 1488.4959716796875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.08664259927797834, "grad_norm": 0.9559803009033203, "kl": 0.0038776397705078125, "learning_rate": 1e-06, "loss": 0.0722, "num_tokens": 31381718.0, "reward": 0.6266741156578064, "reward_std": 0.17785993218421936, "rewards/code_format_reward/mean": 0.5513392686843872, "rewards/code_format_reward/std": 0.49791330099105835, "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774, "rewards/curriculum_aware_reward_fn/std": 0.07508309930562973, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4285714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 2559.6318359375, "completions/mean_terminated_length": 1407.35546875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.09076843733883445, "grad_norm": 0.15661275386810303, "kl": 0.004146575927734375, "learning_rate": 1e-06, "loss": 0.0993, "num_tokens": 32786432.0, "reward": 0.6441964507102966, "reward_std": 0.21372844278812408, "rewards/code_format_reward/mean": 0.5691964030265808, "rewards/code_format_reward/std": 0.4957422614097595, "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164, "rewards/curriculum_aware_reward_fn/std": 0.07508385181427002, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 2585.23681640625, "completions/mean_terminated_length": 1492.8385009765625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.09489427539969056, "grad_norm": 0.17427508533000946, "kl": 0.00446319580078125, "learning_rate": 1e-06, "loss": 0.0821, "num_tokens": 34208021.0, "reward": 0.6626116633415222, "reward_std": 0.19658702611923218, "rewards/code_format_reward/mean": 0.5825892686843872, "rewards/code_format_reward/std": 0.4936830997467041, "rewards/curriculum_aware_reward_fn/mean": 0.08002232015132904, "rewards/curriculum_aware_reward_fn/std": 0.074915312230587, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3348214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2340.810302734375, "completions/mean_terminated_length": 1457.3255615234375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.09902011346054668, "grad_norm": 0.1813523918390274, "kl": 0.0107879638671875, "learning_rate": 1e-06, "loss": 0.109, "num_tokens": 35529753.0, "reward": 0.751897394657135, "reward_std": 0.21504586935043335, "rewards/code_format_reward/mean": 0.6651785969734192, "rewards/code_format_reward/std": 0.47245556116104126, "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164, "rewards/curriculum_aware_reward_fn/std": 0.07416162639856339, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3705357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 2455.239013671875, "completions/mean_terminated_length": 1489.400634765625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.10314595152140278, "grad_norm": 0.17525888979434967, "kl": 0.00499725341796875, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 36900957.0, "reward": 0.7077009081840515, "reward_std": 0.20892252027988434, "rewards/code_format_reward/mean": 0.625, "rewards/code_format_reward/std": 0.48466411232948303, "rewards/curriculum_aware_reward_fn/mean": 0.08270090073347092, "rewards/curriculum_aware_reward_fn/std": 0.07535793632268906, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3526785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2328.640625, "completions/mean_terminated_length": 1365.7344970703125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1072717895822589, "grad_norm": 0.15375342965126038, "kl": 0.00540924072265625, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 38222347.0, "reward": 0.7113839983940125, "reward_std": 0.1420706957578659, "rewards/code_format_reward/mean": 0.6316964030265808, "rewards/code_format_reward/std": 0.4828835427761078, "rewards/curriculum_aware_reward_fn/mean": 0.07968749850988388, "rewards/curriculum_aware_reward_fn/std": 0.07493705302476883, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4508928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 2649.419677734375, "completions/mean_terminated_length": 1461.5771484375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.111397627643115, "grad_norm": 0.1293102651834488, "kl": 0.00531768798828125, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 39685605.0, "reward": 0.6244419813156128, "reward_std": 0.17081955075263977, "rewards/code_format_reward/mean": 0.5491071343421936, "rewards/code_format_reward/std": 0.4981389045715332, "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774, "rewards/curriculum_aware_reward_fn/std": 0.07900315523147583, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2605.43310546875, "completions/mean_terminated_length": 1446.103271484375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.11552346570397112, "grad_norm": 0.16880519688129425, "kl": 0.0055389404296875, "learning_rate": 1e-06, "loss": 0.0838, "num_tokens": 41124915.0, "reward": 0.6412946581840515, "reward_std": 0.23654666543006897, "rewards/code_format_reward/mean": 0.5602678656578064, "rewards/code_format_reward/std": 0.49690937995910645, "rewards/curriculum_aware_reward_fn/mean": 0.08102679252624512, "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3973214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2520.87060546875, "completions/mean_terminated_length": 1482.4517822265625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.11964930376482723, "grad_norm": 0.16759856045246124, "kl": 0.005680084228515625, "learning_rate": 1e-06, "loss": 0.0741, "num_tokens": 42532123.0, "reward": 0.6870536208152771, "reward_std": 0.23582594096660614, "rewards/code_format_reward/mean": 0.6026785969734192, "rewards/code_format_reward/std": 0.48989060521125793, "rewards/curriculum_aware_reward_fn/mean": 0.08437500894069672, "rewards/curriculum_aware_reward_fn/std": 0.07844439893960953, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2569.11181640625, "completions/mean_terminated_length": 1590.3370361328125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.12377514182568335, "grad_norm": 0.25616586208343506, "kl": 0.00691986083984375, "learning_rate": 1e-06, "loss": 0.0843, "num_tokens": 43947307.0, "reward": 0.6825892925262451, "reward_std": 0.2599073052406311, "rewards/code_format_reward/mean": 0.6049107313156128, "rewards/code_format_reward/std": 0.4894163906574249, "rewards/curriculum_aware_reward_fn/mean": 0.07767857611179352, "rewards/curriculum_aware_reward_fn/std": 0.07570379227399826, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4241071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 2515.082763671875, "completions/mean_terminated_length": 1350.841064453125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.12790097988653945, "grad_norm": 0.1738700419664383, "kl": 0.005664825439453125, "learning_rate": 1e-06, "loss": 0.0742, "num_tokens": 45356850.0, "reward": 0.6507812738418579, "reward_std": 0.21465569734573364, "rewards/code_format_reward/mean": 0.578125, "rewards/code_format_reward/std": 0.4944108724594116, "rewards/curriculum_aware_reward_fn/mean": 0.07265625894069672, "rewards/curriculum_aware_reward_fn/std": 0.07896901667118073, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3482142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 2322.555908203125, "completions/mean_terminated_length": 1375.099365234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.13202681794739557, "grad_norm": 0.19044172763824463, "kl": 0.00688934326171875, "learning_rate": 1e-06, "loss": 0.0985, "num_tokens": 46668801.0, "reward": 0.7402902841567993, "reward_std": 0.2134973704814911, "rewards/code_format_reward/mean": 0.65625, "rewards/code_format_reward/std": 0.47548985481262207, "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096, "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2533.83056640625, "completions/mean_terminated_length": 1484.6119384765625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1361526560082517, "grad_norm": 0.1775142401456833, "kl": 0.0061492919921875, "learning_rate": 1e-06, "loss": 0.1187, "num_tokens": 48082396.0, "reward": 0.6822544932365417, "reward_std": 0.2595500349998474, "rewards/code_format_reward/mean": 0.5982142686843872, "rewards/code_format_reward/std": 0.49080711603164673, "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096, "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4464285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 2743.6630859375, "completions/mean_terminated_length": 1653.0684814453125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.14027849406910778, "grad_norm": 0.17279711365699768, "kl": 0.0059051513671875, "learning_rate": 1e-06, "loss": 0.1089, "num_tokens": 49573595.0, "reward": 0.6310268640518188, "reward_std": 0.2643914222717285, "rewards/code_format_reward/mean": 0.5580357313156128, "rewards/code_format_reward/std": 0.4971756041049957, "rewards/curriculum_aware_reward_fn/mean": 0.07299107313156128, "rewards/curriculum_aware_reward_fn/std": 0.07505691051483154, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4084821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 2337.328125, "completions/mean_terminated_length": 1122.84912109375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1444043321299639, "grad_norm": 0.17568987607955933, "kl": 0.00757598876953125, "learning_rate": 1e-06, "loss": 0.0903, "num_tokens": 50905954.0, "reward": 0.6639509201049805, "reward_std": 0.19163252413272858, "rewards/code_format_reward/mean": 0.5892857313156128, "rewards/code_format_reward/std": 0.4925134479999542, "rewards/curriculum_aware_reward_fn/mean": 0.07466518133878708, "rewards/curriculum_aware_reward_fn/std": 0.07508310675621033, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 2267.618408203125, "completions/mean_terminated_length": 1495.6348876953125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.14853017019082002, "grad_norm": 2.414182662963867, "kl": 0.00893402099609375, "learning_rate": 1e-06, "loss": 0.0851, "num_tokens": 52191977.0, "reward": 0.7859375476837158, "reward_std": 0.21878042817115784, "rewards/code_format_reward/mean": 0.7008928656578064, "rewards/code_format_reward/std": 0.45837870240211487, "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584, "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2924107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2191.107177734375, "completions/mean_terminated_length": 1403.9117431640625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.15265600825167613, "grad_norm": 0.3153815269470215, "kl": 0.010066986083984375, "learning_rate": 1e-06, "loss": 0.0934, "num_tokens": 53442519.0, "reward": 0.7965402007102966, "reward_std": 0.2366231083869934, "rewards/code_format_reward/mean": 0.7098214030265808, "rewards/code_format_reward/std": 0.4543519914150238, "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164, "rewards/curriculum_aware_reward_fn/std": 0.07416163384914398, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2611607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 2011.8907470703125, "completions/mean_terminated_length": 1275.21142578125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.15678184631253222, "grad_norm": 0.18517161905765533, "kl": 0.009616851806640625, "learning_rate": 1e-06, "loss": 0.0701, "num_tokens": 54602188.0, "reward": 0.8469865918159485, "reward_std": 0.15580664575099945, "rewards/code_format_reward/mean": 0.7455357313156128, "rewards/code_format_reward/std": 0.4360465407371521, "rewards/curriculum_aware_reward_fn/mean": 0.10145089775323868, "rewards/curriculum_aware_reward_fn/std": 0.07025929540395737, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2455357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2168.19873046875, "completions/mean_terminated_length": 1540.8077392578125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.16090768437338834, "grad_norm": 0.20817974209785461, "kl": 0.00962066650390625, "learning_rate": 1e-06, "loss": 0.1159, "num_tokens": 55845050.0, "reward": 0.8487723469734192, "reward_std": 0.23730090260505676, "rewards/code_format_reward/mean": 0.7566964030265808, "rewards/code_format_reward/std": 0.42955654859542847, "rewards/curriculum_aware_reward_fn/mean": 0.0920758917927742, "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4241071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2592.7255859375, "completions/mean_terminated_length": 1485.662841796875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.16503352243424446, "grad_norm": 0.19099269807338715, "kl": 0.0080718994140625, "learning_rate": 1e-06, "loss": 0.1264, "num_tokens": 57271114.0, "reward": 0.6504464149475098, "reward_std": 0.27461573481559753, "rewards/code_format_reward/mean": 0.578125, "rewards/code_format_reward/std": 0.4944108724594116, "rewards/curriculum_aware_reward_fn/mean": 0.07232142984867096, "rewards/curriculum_aware_reward_fn/std": 0.0750359445810318, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 2346.348388671875, "completions/mean_terminated_length": 1364.8363037109375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.16915936049510058, "grad_norm": 0.20166631042957306, "kl": 0.009296417236328125, "learning_rate": 1e-06, "loss": 0.0774, "num_tokens": 58593780.0, "reward": 0.7228795289993286, "reward_std": 0.20277433097362518, "rewards/code_format_reward/mean": 0.6428571343421936, "rewards/code_format_reward/std": 0.47969305515289307, "rewards/curriculum_aware_reward_fn/mean": 0.08002232760190964, "rewards/curriculum_aware_reward_fn/std": 0.07558422535657883, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1821.66748046875, "completions/mean_terminated_length": 1072.554931640625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.17328519855595667, "grad_norm": 0.3720153272151947, "kl": 0.01447296142578125, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 59652129.0, "reward": 0.8479911088943481, "reward_std": 0.14334198832511902, "rewards/code_format_reward/mean": 0.7522321343421936, "rewards/code_format_reward/std": 0.4321989119052887, "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096, "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3526785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2276.546875, "completions/mean_terminated_length": 1285.2586669921875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1774110366168128, "grad_norm": 46.85969924926758, "kl": 2.2713623046875, "learning_rate": 1e-06, "loss": 0.1385, "num_tokens": 60933570.0, "reward": 0.7345982789993286, "reward_std": 0.22185997664928436, "rewards/code_format_reward/mean": 0.6495535969734192, "rewards/code_format_reward/std": 0.4776431620121002, "rewards/curriculum_aware_reward_fn/mean": 0.08504464477300644, "rewards/curriculum_aware_reward_fn/std": 0.07900101691484451, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2081.060302734375, "completions/mean_terminated_length": 1417.3857421875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.1815368746776689, "grad_norm": 0.20214147865772247, "kl": 0.01105499267578125, "learning_rate": 1e-06, "loss": 0.1317, "num_tokens": 62112236.0, "reward": 0.8328125476837158, "reward_std": 0.2402763068675995, "rewards/code_format_reward/mean": 0.7410714030265808, "rewards/code_format_reward/std": 0.43853598833084106, "rewards/curriculum_aware_reward_fn/mean": 0.09174107015132904, "rewards/curriculum_aware_reward_fn/std": 0.0778549388051033, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2354.993408203125, "completions/mean_terminated_length": 1513.3145751953125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.18566271273852503, "grad_norm": 0.5439615845680237, "kl": 0.01001739501953125, "learning_rate": 1e-06, "loss": 0.1551, "num_tokens": 63457521.0, "reward": 0.7627232670783997, "reward_std": 0.2988319396972656, "rewards/code_format_reward/mean": 0.6763392686843872, "rewards/code_format_reward/std": 0.46839532256126404, "rewards/curriculum_aware_reward_fn/mean": 0.08638393133878708, "rewards/curriculum_aware_reward_fn/std": 0.07488906383514404, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1772.7857666015625, "completions/mean_terminated_length": 1228.7823486328125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.18978855079938112, "grad_norm": 0.18871024250984192, "kl": 0.01403045654296875, "learning_rate": 1e-06, "loss": 0.0889, "num_tokens": 64520732.0, "reward": 0.9100447297096252, "reward_std": 0.13270780444145203, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252743124961853, "rewards/curriculum_aware_reward_fn/mean": 0.09977678954601288, "rewards/curriculum_aware_reward_fn/std": 0.07365463674068451, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 2240.32373046875, "completions/mean_terminated_length": 1361.319091796875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.19391438886023724, "grad_norm": 0.5957781672477722, "kl": 0.01197052001953125, "learning_rate": 1e-06, "loss": 0.0987, "num_tokens": 65795367.0, "reward": 0.762276828289032, "reward_std": 0.21752412617206573, "rewards/code_format_reward/mean": 0.6785714030265808, "rewards/code_format_reward/std": 0.4675469994544983, "rewards/curriculum_aware_reward_fn/mean": 0.0837053582072258, "rewards/curriculum_aware_reward_fn/std": 0.07457634806632996, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2299107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1868.493408203125, "completions/mean_terminated_length": 1203.4696044921875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.19804022692109335, "grad_norm": 0.22301578521728516, "kl": 0.01412200927734375, "learning_rate": 1e-06, "loss": 0.1032, "num_tokens": 66888015.0, "reward": 0.865178644657135, "reward_std": 0.18127720057964325, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.09508929401636124, "rewards/curriculum_aware_reward_fn/std": 0.07234017550945282, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1748.5804443359375, "completions/mean_terminated_length": 1158.4468994140625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.20216606498194944, "grad_norm": 0.18596109747886658, "kl": 0.016571044921875, "learning_rate": 1e-06, "loss": 0.0924, "num_tokens": 67942181.0, "reward": 0.8970983028411865, "reward_std": 0.15365570783615112, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096, "rewards/curriculum_aware_reward_fn/std": 0.07687902450561523, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 1998.4866943359375, "completions/mean_terminated_length": 1195.734619140625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.20629190304280556, "grad_norm": 0.22046102583408356, "kl": 0.01477813720703125, "learning_rate": 1e-06, "loss": 0.1574, "num_tokens": 69098841.0, "reward": 0.8946428894996643, "reward_std": 0.26137298345565796, "rewards/code_format_reward/mean": 0.7232142686843872, "rewards/code_format_reward/std": 0.44790980219841003, "rewards/curriculum_aware_reward_fn/mean": 0.17142857611179352, "rewards/curriculum_aware_reward_fn/std": 0.141336590051651, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1979.493408203125, "completions/mean_terminated_length": 1331.5831298828125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.21041774110366168, "grad_norm": 0.22472943365573883, "kl": 0.01605224609375, "learning_rate": 1e-06, "loss": 0.1532, "num_tokens": 70250422.0, "reward": 0.9502232670783997, "reward_std": 0.27020275592803955, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648, "rewards/curriculum_aware_reward_fn/std": 0.13901372253894806, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3370535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 2305.919677734375, "completions/mean_terminated_length": 1395.8114013671875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2145435791645178, "grad_norm": 0.19396060705184937, "kl": 0.01288604736328125, "learning_rate": 1e-06, "loss": 0.1115, "num_tokens": 71575367.0, "reward": 0.8095982670783997, "reward_std": 0.2751266360282898, "rewards/code_format_reward/mean": 0.6696428656578064, "rewards/code_format_reward/std": 0.4708675146102905, "rewards/curriculum_aware_reward_fn/mean": 0.13995537161827087, "rewards/curriculum_aware_reward_fn/std": 0.14260126650333405, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2745535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 2132.26123046875, "completions/mean_terminated_length": 1389.0615234375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2186694172253739, "grad_norm": 2.5251505374908447, "kl": 0.0143890380859375, "learning_rate": 1e-06, "loss": 0.12, "num_tokens": 72795332.0, "reward": 0.8919642567634583, "reward_std": 0.2576311528682709, "rewards/code_format_reward/mean": 0.7299107313156128, "rewards/code_format_reward/std": 0.444502055644989, "rewards/curriculum_aware_reward_fn/mean": 0.16205357015132904, "rewards/curriculum_aware_reward_fn/std": 0.15102121233940125, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2299107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1957.2969970703125, "completions/mean_terminated_length": 1318.7855224609375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.22279525528623, "grad_norm": 0.37278586626052856, "kl": 0.01513671875, "learning_rate": 1e-06, "loss": 0.1216, "num_tokens": 73926567.0, "reward": 0.932366132736206, "reward_std": 0.22399599850177765, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.17343749105930328, "rewards/curriculum_aware_reward_fn/std": 0.139581561088562, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1986607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1917.977783203125, "completions/mean_terminated_length": 1378.022216796875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.22692109334708613, "grad_norm": 1.5422786474227905, "kl": 0.01769256591796875, "learning_rate": 1e-06, "loss": 0.1373, "num_tokens": 75080997.0, "reward": 0.9751116037368774, "reward_std": 0.24655158817768097, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380533695221, "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023, "rewards/curriculum_aware_reward_fn/std": 0.14535558223724365, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2745535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 2178.58935546875, "completions/mean_terminated_length": 1452.923095703125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.23104693140794225, "grad_norm": 0.19924461841583252, "kl": 0.0144500732421875, "learning_rate": 1e-06, "loss": 0.1163, "num_tokens": 76347561.0, "reward": 0.8948661088943481, "reward_std": 0.23191384971141815, "rewards/code_format_reward/mean": 0.7254464030265808, "rewards/code_format_reward/std": 0.44678795337677, "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256, "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1716.118408203125, "completions/mean_terminated_length": 1198.752685546875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.23517276946879834, "grad_norm": 0.2652409076690674, "kl": 0.0170440673828125, "learning_rate": 1e-06, "loss": 0.0689, "num_tokens": 77394926.0, "reward": 1.0386160612106323, "reward_std": 0.16931238770484924, "rewards/code_format_reward/mean": 0.8236607313156128, "rewards/code_format_reward/std": 0.3815346360206604, "rewards/curriculum_aware_reward_fn/mean": 0.21495535969734192, "rewards/curriculum_aware_reward_fn/std": 0.13793620467185974, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 2039.154052734375, "completions/mean_terminated_length": 1251.966064453125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.23929860752965446, "grad_norm": 0.22779759764671326, "kl": 0.0151824951171875, "learning_rate": 1e-06, "loss": 0.1741, "num_tokens": 78570204.0, "reward": 0.8920758962631226, "reward_std": 0.2624998390674591, "rewards/code_format_reward/mean": 0.7276785969734192, "rewards/code_format_reward/std": 0.4456520676612854, "rewards/curriculum_aware_reward_fn/mean": 0.16439732909202576, "rewards/curriculum_aware_reward_fn/std": 0.14416027069091797, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1969.24560546875, "completions/mean_terminated_length": 1234.78076171875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.24342444559051057, "grad_norm": 9.569337844848633, "kl": 0.660430908203125, "learning_rate": 1e-06, "loss": 0.0865, "num_tokens": 79723060.0, "reward": 0.9041295051574707, "reward_std": 0.21036864817142487, "rewards/code_format_reward/mean": 0.7477678656578064, "rewards/code_format_reward/std": 0.4347792863845825, "rewards/curriculum_aware_reward_fn/mean": 0.15636160969734192, "rewards/curriculum_aware_reward_fn/std": 0.14369189739227295, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1985.6898193359375, "completions/mean_terminated_length": 1222.385986328125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.2475502836513667, "grad_norm": 0.23437850177288055, "kl": 0.01544952392578125, "learning_rate": 1e-06, "loss": 0.1137, "num_tokens": 80894980.0, "reward": 0.8974330425262451, "reward_std": 0.2371726632118225, "rewards/code_format_reward/mean": 0.734375, "rewards/code_format_reward/std": 0.44215917587280273, "rewards/curriculum_aware_reward_fn/mean": 0.16305804252624512, "rewards/curriculum_aware_reward_fn/std": 0.14288581907749176, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1866.6630859375, "completions/mean_terminated_length": 1242.6199951171875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2516761217122228, "grad_norm": 0.21880938112735748, "kl": 0.0179290771484375, "learning_rate": 1e-06, "loss": 0.0995, "num_tokens": 81992338.0, "reward": 0.9709821939468384, "reward_std": 0.22359324991703033, "rewards/code_format_reward/mean": 0.7767857313156128, "rewards/code_format_reward/std": 0.41686633229255676, "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, "rewards/curriculum_aware_reward_fn/std": 0.13704219460487366, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1919642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1788.6876220703125, "completions/mean_terminated_length": 1240.54150390625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.2558019597730789, "grad_norm": 0.2513861656188965, "kl": 0.01721954345703125, "learning_rate": 1e-06, "loss": 0.1641, "num_tokens": 83078318.0, "reward": 0.9954241514205933, "reward_std": 0.2597261369228363, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252743124961853, "rewards/curriculum_aware_reward_fn/mean": 0.18515624105930328, "rewards/curriculum_aware_reward_fn/std": 0.14922460913658142, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1818.868408203125, "completions/mean_terminated_length": 1181.2713623046875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.259927797833935, "grad_norm": 0.5408481359481812, "kl": 0.0183258056640625, "learning_rate": 1e-06, "loss": 0.1045, "num_tokens": 84156539.0, "reward": 0.9498884081840515, "reward_std": 0.2111586630344391, "rewards/code_format_reward/mean": 0.7767857313156128, "rewards/code_format_reward/std": 0.41686636209487915, "rewards/curriculum_aware_reward_fn/mean": 0.17310269176959991, "rewards/curriculum_aware_reward_fn/std": 0.1488838940858841, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2165178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3919.0, "completions/mean_length": 1976.8460693359375, "completions/mean_terminated_length": 1391.2108154296875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.26405363589479114, "grad_norm": 1.840519666671753, "kl": 0.41724395751953125, "learning_rate": 1e-06, "loss": 0.1207, "num_tokens": 85322859.0, "reward": 0.9379464983940125, "reward_std": 0.2321721613407135, "rewards/code_format_reward/mean": 0.78125, "rewards/code_format_reward/std": 0.4138607978820801, "rewards/curriculum_aware_reward_fn/mean": 0.15669642388820648, "rewards/curriculum_aware_reward_fn/std": 0.1403089463710785, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2165178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3828.0, "completions/mean_length": 1842.6920166015625, "completions/mean_terminated_length": 1219.98291015625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.26817947395564723, "grad_norm": 0.20033693313598633, "kl": 0.01674652099609375, "learning_rate": 1e-06, "loss": 0.0878, "num_tokens": 86418634.0, "reward": 0.9722098112106323, "reward_std": 0.20250627398490906, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.18649554252624512, "rewards/curriculum_aware_reward_fn/std": 0.1416252702474594, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2366071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 1811.4888916015625, "completions/mean_terminated_length": 1103.4239501953125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.2723053120165034, "grad_norm": 0.21493232250213623, "kl": 0.0157470703125, "learning_rate": 1e-06, "loss": 0.0875, "num_tokens": 87501102.0, "reward": 0.9482142925262451, "reward_std": 0.1998186707496643, "rewards/code_format_reward/mean": 0.7633928656578064, "rewards/code_format_reward/std": 0.4254741966724396, "rewards/curriculum_aware_reward_fn/mean": 0.18482144176959991, "rewards/curriculum_aware_reward_fn/std": 0.13863980770111084, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1813.9376220703125, "completions/mean_terminated_length": 1240.234619140625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.27643115007735947, "grad_norm": 0.2010592669248581, "kl": 0.0166473388671875, "learning_rate": 1e-06, "loss": 0.0958, "num_tokens": 88571731.0, "reward": 1.012834906578064, "reward_std": 0.20978660881519318, "rewards/code_format_reward/mean": 0.8035714030265808, "rewards/code_format_reward/std": 0.3977404832839966, "rewards/curriculum_aware_reward_fn/mean": 0.209263414144516, "rewards/curriculum_aware_reward_fn/std": 0.14208464324474335, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1591.321533203125, "completions/mean_terminated_length": 1166.2454833984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.28055698813821556, "grad_norm": 0.3400166928768158, "kl": 0.01915740966796875, "learning_rate": 1e-06, "loss": 0.0839, "num_tokens": 89549775.0, "reward": 1.0475447177886963, "reward_std": 0.17855383455753326, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.19486607611179352, "rewards/curriculum_aware_reward_fn/std": 0.13271550834178925, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1908.4599609375, "completions/mean_terminated_length": 1311.8580322265625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.2846828261990717, "grad_norm": 0.35188552737236023, "kl": 0.01520538330078125, "learning_rate": 1e-06, "loss": 0.1303, "num_tokens": 90663597.0, "reward": 0.9658482670783997, "reward_std": 0.23443441092967987, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648, "rewards/curriculum_aware_reward_fn/std": 0.13937532901763916, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 1637.712158203125, "completions/mean_terminated_length": 1264.8612060546875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2888086642599278, "grad_norm": 0.21659672260284424, "kl": 0.0170745849609375, "learning_rate": 1e-06, "loss": 0.0605, "num_tokens": 91653793.0, "reward": 1.070424199104309, "reward_std": 0.16760630905628204, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.19988839328289032, "rewards/curriculum_aware_reward_fn/std": 0.13450957834720612, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 1594.72998046875, "completions/mean_terminated_length": 1147.13427734375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.2929345023207839, "grad_norm": 0.24188809096813202, "kl": 0.01702117919921875, "learning_rate": 1e-06, "loss": 0.111, "num_tokens": 92642758.0, "reward": 1.0497767925262451, "reward_std": 0.20430657267570496, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.20156250894069672, "rewards/curriculum_aware_reward_fn/std": 0.13481204211711884, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1918.1295166015625, "completions/mean_terminated_length": 1292.3045654296875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.29706034038164003, "grad_norm": 0.23266896605491638, "kl": 0.01465606689453125, "learning_rate": 1e-06, "loss": 0.1266, "num_tokens": 93783375.0, "reward": 0.9430804252624512, "reward_std": 0.2125602513551712, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.1640625, "rewards/curriculum_aware_reward_fn/std": 0.1429663747549057, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2522321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1978.180908203125, "completions/mean_terminated_length": 1263.8118896484375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3011861784424961, "grad_norm": 0.2386295050382614, "kl": 0.014862060546875, "learning_rate": 1e-06, "loss": 0.1095, "num_tokens": 94935747.0, "reward": 0.9466517567634583, "reward_std": 0.2304624617099762, "rewards/code_format_reward/mean": 0.7544642686843872, "rewards/code_format_reward/std": 0.43088552355766296, "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343, "rewards/curriculum_aware_reward_fn/std": 0.14789843559265137, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2113.1318359375, "completions/mean_terminated_length": 1404.10595703125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.30531201650335227, "grad_norm": 0.1983596831560135, "kl": 0.01523590087890625, "learning_rate": 1e-06, "loss": 0.084, "num_tokens": 96176073.0, "reward": 0.9197545051574707, "reward_std": 0.20995503664016724, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.4372987747192383, "rewards/curriculum_aware_reward_fn/mean": 0.17645089328289032, "rewards/curriculum_aware_reward_fn/std": 0.141729936003685, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1784.3818359375, "completions/mean_terminated_length": 1120.12353515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.30943785456420836, "grad_norm": 0.20856866240501404, "kl": 0.0185394287109375, "learning_rate": 1e-06, "loss": 0.0662, "num_tokens": 97242659.0, "reward": 0.9758929014205933, "reward_std": 0.1802317053079605, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328, "rewards/curriculum_aware_reward_fn/std": 0.14509600400924683, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2063.15625, "completions/mean_terminated_length": 1352.885498046875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.31356369262506445, "grad_norm": 0.28605571389198303, "kl": 0.064544677734375, "learning_rate": 1e-06, "loss": 0.1413, "num_tokens": 98452756.0, "reward": 0.8792411088943481, "reward_std": 0.2832396924495697, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.43729880452156067, "rewards/curriculum_aware_reward_fn/mean": 0.13593749701976776, "rewards/curriculum_aware_reward_fn/std": 0.14190621674060822, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1724.165283203125, "completions/mean_terminated_length": 1224.15673828125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3176895306859206, "grad_norm": 0.21490244567394257, "kl": 0.01772308349609375, "learning_rate": 1e-06, "loss": 0.0601, "num_tokens": 99505350.0, "reward": 1.0366071462631226, "reward_std": 0.17229565978050232, "rewards/code_format_reward/mean": 0.8236607313156128, "rewards/code_format_reward/std": 0.3815346360206604, "rewards/curriculum_aware_reward_fn/mean": 0.21294644474983215, "rewards/curriculum_aware_reward_fn/std": 0.14066724479198456, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1939.0380859375, "completions/mean_terminated_length": 1303.1705322265625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3218153687467767, "grad_norm": 0.22845226526260376, "kl": 0.015228271484375, "learning_rate": 1e-06, "loss": 0.1391, "num_tokens": 100650427.0, "reward": 0.9670760035514832, "reward_std": 0.2431958168745041, "rewards/code_format_reward/mean": 0.78125, "rewards/code_format_reward/std": 0.4138607978820801, "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548, "rewards/curriculum_aware_reward_fn/std": 0.13783639669418335, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1808.8326416015625, "completions/mean_terminated_length": 1370.8642578125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3259412068076328, "grad_norm": 0.2206607162952423, "kl": 0.01694488525390625, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 101732923.0, "reward": 1.0236607789993286, "reward_std": 0.21730710566043854, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.18214286863803864, "rewards/curriculum_aware_reward_fn/std": 0.13819824159145355, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1931.15185546875, "completions/mean_terminated_length": 1218.100830078125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.3300670448684889, "grad_norm": 0.2661918103694916, "kl": 0.01607513427734375, "learning_rate": 1e-06, "loss": 0.1196, "num_tokens": 102861237.0, "reward": 0.9046875238418579, "reward_std": 0.2178276777267456, "rewards/code_format_reward/mean": 0.75, "rewards/code_format_reward/std": 0.43349677324295044, "rewards/curriculum_aware_reward_fn/mean": 0.15468750894069672, "rewards/curriculum_aware_reward_fn/std": 0.14217200875282288, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1844.3438720703125, "completions/mean_terminated_length": 1163.6104736328125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.334192882929345, "grad_norm": 0.21588768064975739, "kl": 0.01670074462890625, "learning_rate": 1e-06, "loss": 0.1077, "num_tokens": 103959747.0, "reward": 0.9659598469734192, "reward_std": 0.1847652792930603, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.1958705335855484, "rewards/curriculum_aware_reward_fn/std": 0.16381129622459412, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1976.0068359375, "completions/mean_terminated_length": 1374.63330078125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.33831872099020116, "grad_norm": 0.20742100477218628, "kl": 0.01610565185546875, "learning_rate": 1e-06, "loss": 0.097, "num_tokens": 105114190.0, "reward": 0.9648438096046448, "reward_std": 0.21596986055374146, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548, "rewards/curriculum_aware_reward_fn/std": 0.14072753489017487, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 4000.0, "completions/mean_length": 2044.899658203125, "completions/mean_terminated_length": 1361.199462890625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.34244455905105725, "grad_norm": 0.26717469096183777, "kl": 0.01522064208984375, "learning_rate": 1e-06, "loss": 0.1235, "num_tokens": 106323115.0, "reward": 0.9233258962631226, "reward_std": 0.23657990992069244, "rewards/code_format_reward/mean": 0.7522321343421936, "rewards/code_format_reward/std": 0.4321989119052887, "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776, "rewards/curriculum_aware_reward_fn/std": 0.14227430522441864, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2522321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2159.348388671875, "completions/mean_terminated_length": 1506.089599609375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.34657039711191334, "grad_norm": 2.97698974609375, "kl": 0.319793701171875, "learning_rate": 1e-06, "loss": 0.0768, "num_tokens": 107574248.0, "reward": 0.9077010154724121, "reward_std": 0.20499999821186066, "rewards/code_format_reward/mean": 0.75, "rewards/code_format_reward/std": 0.43349677324295044, "rewards/curriculum_aware_reward_fn/mean": 0.15770089626312256, "rewards/curriculum_aware_reward_fn/std": 0.14362619817256927, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1941964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1893.1898193359375, "completions/mean_terminated_length": 1362.3184814453125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3506962351727695, "grad_norm": 0.23494893312454224, "kl": 0.01613616943359375, "learning_rate": 1e-06, "loss": 0.1371, "num_tokens": 108703958.0, "reward": 0.9831473231315613, "reward_std": 0.28516101837158203, "rewards/code_format_reward/mean": 0.8080357313156128, "rewards/code_format_reward/std": 0.3942854404449463, "rewards/curriculum_aware_reward_fn/mean": 0.17511160671710968, "rewards/curriculum_aware_reward_fn/std": 0.1376536637544632, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2611607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 2132.754638671875, "completions/mean_terminated_length": 1438.797607421875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3548220732336256, "grad_norm": 0.21388478577136993, "kl": 0.01474761962890625, "learning_rate": 1e-06, "loss": 0.1361, "num_tokens": 109917858.0, "reward": 0.914843738079071, "reward_std": 0.25436264276504517, "rewards/code_format_reward/mean": 0.7410714030265808, "rewards/code_format_reward/std": 0.43853598833084106, "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023, "rewards/curriculum_aware_reward_fn/std": 0.14255832135677338, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1919642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 1760.279052734375, "completions/mean_terminated_length": 1205.384033203125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.35894791129448167, "grad_norm": 0.2028041034936905, "kl": 0.0170440673828125, "learning_rate": 1e-06, "loss": 0.1113, "num_tokens": 110991643.0, "reward": 1.0066964626312256, "reward_std": 0.16249890625476837, "rewards/code_format_reward/mean": 0.8058035969734192, "rewards/code_format_reward/std": 0.3960230052471161, "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, "rewards/curriculum_aware_reward_fn/std": 0.13506683707237244, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 1839.2568359375, "completions/mean_terminated_length": 1264.0084228515625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.3630737493553378, "grad_norm": 0.25737714767456055, "kl": 0.01641082763671875, "learning_rate": 1e-06, "loss": 0.1086, "num_tokens": 112092172.0, "reward": 0.9707589745521545, "reward_std": 0.23582224547863007, "rewards/code_format_reward/mean": 0.7946428656578064, "rewards/code_format_reward/std": 0.40441393852233887, "rewards/curriculum_aware_reward_fn/mean": 0.17611606419086456, "rewards/curriculum_aware_reward_fn/std": 0.14302924275398254, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1971.482177734375, "completions/mean_terminated_length": 1304.8446044921875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.3671995874161939, "grad_norm": 0.43521085381507874, "kl": 0.01739501953125, "learning_rate": 1e-06, "loss": 0.1102, "num_tokens": 113258112.0, "reward": 0.9188616275787354, "reward_std": 0.2197439968585968, "rewards/code_format_reward/mean": 0.7611607313156128, "rewards/code_format_reward/std": 0.4268510043621063, "rewards/curriculum_aware_reward_fn/mean": 0.15770088136196136, "rewards/curriculum_aware_reward_fn/std": 0.14327529072761536, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1960.5224609375, "completions/mean_terminated_length": 1482.0819091796875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.37132542547705005, "grad_norm": 1.3905837535858154, "kl": 0.028656005859375, "learning_rate": 1e-06, "loss": 0.0801, "num_tokens": 114416783.0, "reward": 1.009151816368103, "reward_std": 0.2122829705476761, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343, "rewards/curriculum_aware_reward_fn/std": 0.13840459287166595, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 1792.435302734375, "completions/mean_terminated_length": 1213.3267822265625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.37545126353790614, "grad_norm": 0.19705531001091003, "kl": 0.017974853515625, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 115498786.0, "reward": 0.9724330902099609, "reward_std": 0.17091991007328033, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380533695221, "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776, "rewards/curriculum_aware_reward_fn/std": 0.14368249475955963, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1975.6273193359375, "completions/mean_terminated_length": 1374.146240234375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.37957710159876223, "grad_norm": 0.43463876843452454, "kl": 0.0468292236328125, "learning_rate": 1e-06, "loss": 0.0951, "num_tokens": 116643371.0, "reward": 0.9460937976837158, "reward_std": 0.21486541628837585, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.16707590222358704, "rewards/curriculum_aware_reward_fn/std": 0.1629006415605545, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1674107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 1699.790283203125, "completions/mean_terminated_length": 1217.9786376953125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3837029396596184, "grad_norm": 0.2277679443359375, "kl": 0.0191802978515625, "learning_rate": 1e-06, "loss": 0.1171, "num_tokens": 117679158.0, "reward": 1.0132813453674316, "reward_std": 0.2020494043827057, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.17845983803272247, "rewards/curriculum_aware_reward_fn/std": 0.1565471738576889, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 1753.99560546875, "completions/mean_terminated_length": 1377.818603515625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.38782877772047447, "grad_norm": 0.22877489030361176, "kl": 0.019775390625, "learning_rate": 1e-06, "loss": 0.0487, "num_tokens": 118731312.0, "reward": 1.0579241514205933, "reward_std": 0.16506682336330414, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.19854912161827087, "rewards/curriculum_aware_reward_fn/std": 0.13795046508312225, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1703.1160888671875, "completions/mean_terminated_length": 1282.3201904296875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.39195461578133056, "grad_norm": 0.2043149769306183, "kl": 0.0197296142578125, "learning_rate": 1e-06, "loss": 0.0728, "num_tokens": 119770878.0, "reward": 1.0570311546325684, "reward_std": 0.18754906952381134, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352, "rewards/curriculum_aware_reward_fn/std": 0.1388920247554779, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 1530.4085693359375, "completions/mean_terminated_length": 1243.927978515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3960804538421867, "grad_norm": 0.30418333411216736, "kl": 0.0242156982421875, "learning_rate": 1e-06, "loss": 0.1313, "num_tokens": 120728556.0, "reward": 1.0689733028411865, "reward_std": 0.18825943768024445, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256, "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 1664.134033203125, "completions/mean_terminated_length": 1251.4151611328125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.4002062919030428, "grad_norm": 0.6233256459236145, "kl": 0.0251312255859375, "learning_rate": 1e-06, "loss": 0.074, "num_tokens": 121740935.0, "reward": 1.0614955425262451, "reward_std": 0.16232284903526306, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352, "rewards/curriculum_aware_reward_fn/std": 0.14282289147377014, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1824.4576416015625, "completions/mean_terminated_length": 1292.5537109375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4043321299638989, "grad_norm": 0.22889189422130585, "kl": 0.01766204833984375, "learning_rate": 1e-06, "loss": 0.104, "num_tokens": 122834834.0, "reward": 1.1388393640518188, "reward_std": 0.28884097933769226, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252743124961853, "rewards/curriculum_aware_reward_fn/mean": 0.3285714089870453, "rewards/curriculum_aware_reward_fn/std": 0.318551242351532, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4000.0, "completions/mean_length": 1661.8616943359375, "completions/mean_terminated_length": 1248.7572021484375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.40845796802475504, "grad_norm": 0.7959763407707214, "kl": 0.0198974609375, "learning_rate": 1e-06, "loss": 0.0787, "num_tokens": 123850430.0, "reward": 1.2450892925262451, "reward_std": 0.2383604198694229, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387, "rewards/curriculum_aware_reward_fn/std": 0.3007444441318512, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 2082.6005859375, "completions/mean_terminated_length": 1443.050048828125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.4125838060856111, "grad_norm": 0.22675098478794098, "kl": 0.016845703125, "learning_rate": 1e-06, "loss": 0.1181, "num_tokens": 125048924.0, "reward": 1.1112724542617798, "reward_std": 0.3455579876899719, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.35234373807907104, "rewards/curriculum_aware_reward_fn/std": 0.30698445439338684, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 1744.680908203125, "completions/mean_terminated_length": 1323.91845703125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4167096441464673, "grad_norm": 0.22585518658161163, "kl": 0.0187835693359375, "learning_rate": 1e-06, "loss": 0.1183, "num_tokens": 126114405.0, "reward": 1.2287946939468384, "reward_std": 0.2684269845485687, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.3805803656578064, "rewards/curriculum_aware_reward_fn/std": 0.29996660351753235, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2098214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 1884.2835693359375, "completions/mean_terminated_length": 1296.9915771484375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.42083548220732336, "grad_norm": 0.3677929639816284, "kl": 0.0188140869140625, "learning_rate": 1e-06, "loss": 0.1043, "num_tokens": 127244766.0, "reward": 1.1637277603149414, "reward_std": 0.3110141158103943, "rewards/code_format_reward/mean": 0.7901785969734192, "rewards/code_format_reward/std": 0.40763622522354126, "rewards/curriculum_aware_reward_fn/mean": 0.37354913353919983, "rewards/curriculum_aware_reward_fn/std": 0.3067074716091156, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1755.8148193359375, "completions/mean_terminated_length": 1183.76953125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.42496132026817945, "grad_norm": 0.229450061917305, "kl": 0.0190887451171875, "learning_rate": 1e-06, "loss": 0.1335, "num_tokens": 128312631.0, "reward": 1.1440848112106323, "reward_std": 0.22956174612045288, "rewards/code_format_reward/mean": 0.8035714030265808, "rewards/code_format_reward/std": 0.39774051308631897, "rewards/curriculum_aware_reward_fn/mean": 0.3405133783817291, "rewards/curriculum_aware_reward_fn/std": 0.2985129952430725, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1909.044677734375, "completions/mean_terminated_length": 1374.45556640625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.4290871583290356, "grad_norm": 0.24083521962165833, "kl": 0.017913818359375, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 129424734.0, "reward": 1.1671875715255737, "reward_std": 0.3163076937198639, "rewards/code_format_reward/mean": 0.8035714030265808, "rewards/code_format_reward/std": 0.3977404832839966, "rewards/curriculum_aware_reward_fn/mean": 0.36361604928970337, "rewards/curriculum_aware_reward_fn/std": 0.29430437088012695, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1389.8460693359375, "completions/mean_terminated_length": 1167.6014404296875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.4332129963898917, "grad_norm": 0.2619832158088684, "kl": 0.0225067138671875, "learning_rate": 1e-06, "loss": 0.1098, "num_tokens": 130308331.0, "reward": 1.3381696939468384, "reward_std": 0.24706150591373444, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.265122652053833, "rewards/curriculum_aware_reward_fn/mean": 0.4140624701976776, "rewards/curriculum_aware_reward_fn/std": 0.2952509820461273, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1655.8326416015625, "completions/mean_terminated_length": 1307.2371826171875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.4373388344507478, "grad_norm": 0.24595658481121063, "kl": 0.0212249755859375, "learning_rate": 1e-06, "loss": 0.0767, "num_tokens": 131325034.0, "reward": 1.2672991752624512, "reward_std": 0.24971547722816467, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.3922991156578064, "rewards/curriculum_aware_reward_fn/std": 0.2960248589515686, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 1656.15185546875, "completions/mean_terminated_length": 1188.94677734375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.44146467251160393, "grad_norm": 0.3704790771007538, "kl": 0.020477294921875, "learning_rate": 1e-06, "loss": 0.1244, "num_tokens": 132320691.0, "reward": 1.2327009439468384, "reward_std": 0.2673065662384033, "rewards/code_format_reward/mean": 0.8370535969734192, "rewards/code_format_reward/std": 0.3697296679019928, "rewards/curriculum_aware_reward_fn/mean": 0.3956473171710968, "rewards/curriculum_aware_reward_fn/std": 0.29913750290870667, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1538.372802734375, "completions/mean_terminated_length": 1135.2325439453125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.44559051057246, "grad_norm": 0.32088541984558105, "kl": 0.0222625732421875, "learning_rate": 1e-06, "loss": 0.0987, "num_tokens": 133278850.0, "reward": 1.200446605682373, "reward_std": 0.1880311220884323, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095627069473267, "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, "rewards/curriculum_aware_reward_fn/std": 0.31286945939064026, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1741.90185546875, "completions/mean_terminated_length": 1206.5863037109375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.44971634863331617, "grad_norm": 0.22313106060028076, "kl": 0.0215301513671875, "learning_rate": 1e-06, "loss": 0.0738, "num_tokens": 134327455.0, "reward": 1.2027901411056519, "reward_std": 0.25760817527770996, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.38805803656578064, "rewards/curriculum_aware_reward_fn/std": 0.3171059191226959, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1705.8482666015625, "completions/mean_terminated_length": 1336.237060546875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.45384218669417226, "grad_norm": 0.5322986245155334, "kl": 0.064666748046875, "learning_rate": 1e-06, "loss": 0.1118, "num_tokens": 135364563.0, "reward": 1.2229912281036377, "reward_std": 0.2728971242904663, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.35915178060531616, "rewards/curriculum_aware_reward_fn/std": 0.29508450627326965, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 1267.446533203125, "completions/mean_terminated_length": 1064.440185546875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.45796802475502835, "grad_norm": 0.2848234474658966, "kl": 0.0265960693359375, "learning_rate": 1e-06, "loss": 0.0998, "num_tokens": 136178695.0, "reward": 1.4200893640518188, "reward_std": 0.19471189379692078, "rewards/code_format_reward/mean": 0.9308035969734192, "rewards/code_format_reward/std": 0.25407159328460693, "rewards/curriculum_aware_reward_fn/mean": 0.4892856776714325, "rewards/curriculum_aware_reward_fn/std": 0.28257983922958374, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1507.4710693359375, "completions/mean_terminated_length": 1274.4404296875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4620938628158845, "grad_norm": 0.24630674719810486, "kl": 0.023223876953125, "learning_rate": 1e-06, "loss": 0.0871, "num_tokens": 137136779.0, "reward": 1.3398438692092896, "reward_std": 0.24005301296710968, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096, "rewards/curriculum_aware_reward_fn/std": 0.2932768166065216, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1992.060302734375, "completions/mean_terminated_length": 1395.24072265625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4662197008767406, "grad_norm": 4.038174629211426, "kl": 0.0250091552734375, "learning_rate": 1e-06, "loss": 0.1024, "num_tokens": 138315249.0, "reward": 1.1348215341567993, "reward_std": 0.31040358543395996, "rewards/code_format_reward/mean": 0.78125, "rewards/code_format_reward/std": 0.4138607978820801, "rewards/curriculum_aware_reward_fn/mean": 0.35357141494750977, "rewards/curriculum_aware_reward_fn/std": 0.2992880642414093, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 1731.1160888671875, "completions/mean_terminated_length": 1177.3553466796875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4703455389375967, "grad_norm": 0.21624675393104553, "kl": 0.021209716796875, "learning_rate": 1e-06, "loss": 0.1069, "num_tokens": 139343846.0, "reward": 1.1998885869979858, "reward_std": 0.27098482847213745, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.3873883783817291, "rewards/curriculum_aware_reward_fn/std": 0.30484986305236816, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1448.509033203125, "completions/mean_terminated_length": 1188.9510498046875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4744713769984528, "grad_norm": 0.717583179473877, "kl": 0.0406646728515625, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 140255075.0, "reward": 1.3869420289993286, "reward_std": 0.22124750912189484, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854745090007782, "rewards/curriculum_aware_reward_fn/mean": 0.47622767090797424, "rewards/curriculum_aware_reward_fn/std": 0.29526206851005554, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1866.904052734375, "completions/mean_terminated_length": 1411.497314453125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4785972150593089, "grad_norm": 0.25914421677589417, "kl": 0.0201873779296875, "learning_rate": 1e-06, "loss": 0.104, "num_tokens": 141357756.0, "reward": 1.1671875715255737, "reward_std": 0.29631105065345764, "rewards/code_format_reward/mean": 0.8303571343421936, "rewards/code_format_reward/std": 0.37573832273483276, "rewards/curriculum_aware_reward_fn/mean": 0.33683034777641296, "rewards/curriculum_aware_reward_fn/std": 0.30225586891174316, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 1616.9376220703125, "completions/mean_terminated_length": 1226.180908203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.48272305312016506, "grad_norm": 0.3568706810474396, "kl": 0.0214080810546875, "learning_rate": 1e-06, "loss": 0.1059, "num_tokens": 142361990.0, "reward": 1.2168527841567993, "reward_std": 0.20818667113780975, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334251284599304, "rewards/curriculum_aware_reward_fn/mean": 0.35301336646080017, "rewards/curriculum_aware_reward_fn/std": 0.30549928545951843, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1734.7567138671875, "completions/mean_terminated_length": 1341.2161865234375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.48684889118102115, "grad_norm": 0.2393302172422409, "kl": 0.0227508544921875, "learning_rate": 1e-06, "loss": 0.0651, "num_tokens": 143411237.0, "reward": 1.2437500953674316, "reward_std": 0.2932717204093933, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.3910714089870453, "rewards/curriculum_aware_reward_fn/std": 0.3008536696434021, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1937.966552734375, "completions/mean_terminated_length": 1395.444091796875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.49097472924187724, "grad_norm": 144.79115295410156, "kl": 11.142135620117188, "learning_rate": 1e-06, "loss": 0.2232, "num_tokens": 144552754.0, "reward": 1.1873886585235596, "reward_std": 0.27385786175727844, "rewards/code_format_reward/mean": 0.7991071343421936, "rewards/code_format_reward/std": 0.4011159837245941, "rewards/curriculum_aware_reward_fn/mean": 0.3882812559604645, "rewards/curriculum_aware_reward_fn/std": 0.3559999465942383, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 2016.8951416015625, "completions/mean_terminated_length": 1449.8665771484375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.4951005673027334, "grad_norm": 0.3591211438179016, "kl": 0.019683837890625, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 145719796.0, "reward": 1.1426339149475098, "reward_std": 0.36062249541282654, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.35691961646080017, "rewards/curriculum_aware_reward_fn/std": 0.3005719482898712, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 1744.2857666015625, "completions/mean_terminated_length": 1301.389892578125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4992264053635895, "grad_norm": 0.3126465976238251, "kl": 0.047943115234375, "learning_rate": 1e-06, "loss": 0.0751, "num_tokens": 146769224.0, "reward": 1.2447545528411865, "reward_std": 0.2551679313182831, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.40100446343421936, "rewards/curriculum_aware_reward_fn/std": 0.30527445673942566, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1506.857177734375, "completions/mean_terminated_length": 1152.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5033522434244456, "grad_norm": 0.2568581998348236, "kl": 0.0231475830078125, "learning_rate": 1e-06, "loss": 0.0733, "num_tokens": 147711045.0, "reward": 1.3008930683135986, "reward_std": 0.24989202618598938, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.42142853140830994, "rewards/curriculum_aware_reward_fn/std": 0.30747321248054504, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 1926.27685546875, "completions/mean_terminated_length": 1395.9000244140625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5074780814853017, "grad_norm": 0.23189201951026917, "kl": 0.01876068115234375, "learning_rate": 1e-06, "loss": 0.0917, "num_tokens": 148856558.0, "reward": 1.1736607551574707, "reward_std": 0.2991105318069458, "rewards/code_format_reward/mean": 0.8035714030265808, "rewards/code_format_reward/std": 0.3977404832839966, "rewards/curriculum_aware_reward_fn/mean": 0.37008926272392273, "rewards/curriculum_aware_reward_fn/std": 0.3080544173717499, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1860.857177734375, "completions/mean_terminated_length": 1432.85107421875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5116039195461578, "grad_norm": 0.2435297667980194, "kl": 0.0208282470703125, "learning_rate": 1e-06, "loss": 0.1266, "num_tokens": 149947157.0, "reward": 1.2025669813156128, "reward_std": 0.3079543709754944, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.3610491156578064, "rewards/curriculum_aware_reward_fn/std": 0.30205413699150085, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 1687.22998046875, "completions/mean_terminated_length": 1225.97607421875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5157297576070139, "grad_norm": 1.005789875984192, "kl": 0.0277099609375, "learning_rate": 1e-06, "loss": 0.0732, "num_tokens": 150966362.0, "reward": 1.2402902841567993, "reward_std": 0.2671992778778076, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.39877229928970337, "rewards/curriculum_aware_reward_fn/std": 0.29583922028541565, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1716.3304443359375, "completions/mean_terminated_length": 1230.1612548828125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.51985559566787, "grad_norm": 0.23711848258972168, "kl": 0.0218658447265625, "learning_rate": 1e-06, "loss": 0.1152, "num_tokens": 151995531.0, "reward": 1.2325893640518188, "reward_std": 0.2793101370334625, "rewards/code_format_reward/mean": 0.8303571343421936, "rewards/code_format_reward/std": 0.37573832273483276, "rewards/curriculum_aware_reward_fn/mean": 0.4022321403026581, "rewards/curriculum_aware_reward_fn/std": 0.296654611825943, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1562.4285888671875, "completions/mean_terminated_length": 1286.4949951171875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5239814337287262, "grad_norm": 0.25690746307373047, "kl": 0.0216522216796875, "learning_rate": 1e-06, "loss": 0.0878, "num_tokens": 152969745.0, "reward": 1.32421875, "reward_std": 0.25885266065597534, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.29793688654899597, "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096, "rewards/curriculum_aware_reward_fn/std": 0.3098207712173462, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1784.1585693359375, "completions/mean_terminated_length": 1419.759765625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5281072717895823, "grad_norm": 0.2524595558643341, "kl": 0.0201263427734375, "learning_rate": 1e-06, "loss": 0.073, "num_tokens": 154036553.0, "reward": 1.2431920766830444, "reward_std": 0.2910597026348114, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.3793526291847229, "rewards/curriculum_aware_reward_fn/std": 0.3008619546890259, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 1931.9130859375, "completions/mean_terminated_length": 1447.062744140625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.5322331098504384, "grad_norm": 0.620236337184906, "kl": 0.0193634033203125, "learning_rate": 1e-06, "loss": 0.093, "num_tokens": 155200845.0, "reward": 1.173437476158142, "reward_std": 0.2876041829586029, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.3520089089870453, "rewards/curriculum_aware_reward_fn/std": 0.30668607354164124, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1763392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 1707.0491943359375, "completions/mean_terminated_length": 1195.593505859375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5363589479112945, "grad_norm": 0.3680073022842407, "kl": 0.05645751953125, "learning_rate": 1e-06, "loss": 0.1403, "num_tokens": 156233115.0, "reward": 1.2227678298950195, "reward_std": 0.28796523809432983, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.3968749940395355, "rewards/curriculum_aware_reward_fn/std": 0.3155217170715332, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 1826.3773193359375, "completions/mean_terminated_length": 1325.4522705078125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5404847859721505, "grad_norm": 0.22368091344833374, "kl": 0.0381317138671875, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 157330149.0, "reward": 1.1661831140518188, "reward_std": 0.26647037267684937, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.3514508605003357, "rewards/curriculum_aware_reward_fn/std": 0.3114034831523895, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1739.4263916015625, "completions/mean_terminated_length": 1360.9093017578125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5446106240330068, "grad_norm": 5.251157283782959, "kl": 1.00634765625, "learning_rate": 1e-06, "loss": 0.0779, "num_tokens": 158392779.0, "reward": 1.2868304252624512, "reward_std": 0.26909175515174866, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.4252232015132904, "rewards/curriculum_aware_reward_fn/std": 0.2903171479701996, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1873.0045166015625, "completions/mean_terminated_length": 1352.4683837890625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.5487364620938628, "grad_norm": 0.2221236675977707, "kl": 0.0202484130859375, "learning_rate": 1e-06, "loss": 0.078, "num_tokens": 159511220.0, "reward": 1.130357265472412, "reward_std": 0.2697470784187317, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657, "rewards/curriculum_aware_reward_fn/std": 0.30488425493240356, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1049107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1628.3438720703125, "completions/mean_terminated_length": 1339.1173095703125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5528623001547189, "grad_norm": 0.2322208136320114, "kl": 0.0207061767578125, "learning_rate": 1e-06, "loss": 0.093, "num_tokens": 160512094.0, "reward": 1.2918527126312256, "reward_std": 0.24635407328605652, "rewards/code_format_reward/mean": 0.8973214030265808, "rewards/code_format_reward/std": 0.30387791991233826, "rewards/curriculum_aware_reward_fn/mean": 0.3945312201976776, "rewards/curriculum_aware_reward_fn/std": 0.28814804553985596, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 1850.7724609375, "completions/mean_terminated_length": 1442.0106201171875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.556988138215575, "grad_norm": 0.2326047718524933, "kl": 0.02008056640625, "learning_rate": 1e-06, "loss": 0.102, "num_tokens": 161625734.0, "reward": 1.2510045766830444, "reward_std": 0.3010588586330414, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.4050223231315613, "rewards/curriculum_aware_reward_fn/std": 0.3193120062351227, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1741.94873046875, "completions/mean_terminated_length": 1398.77490234375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5611139762764311, "grad_norm": 0.2658071219921112, "kl": 0.0209503173828125, "learning_rate": 1e-06, "loss": 0.0679, "num_tokens": 162668816.0, "reward": 1.3080357313156128, "reward_std": 0.2536565661430359, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904, "rewards/curriculum_aware_reward_fn/std": 0.28444406390190125, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1614.5938720703125, "completions/mean_terminated_length": 1193.4674072265625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5652398143372873, "grad_norm": 0.24400874972343445, "kl": 0.022796630859375, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 163658504.0, "reward": 1.2035715579986572, "reward_std": 0.22330662608146667, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.3486607074737549, "rewards/curriculum_aware_reward_fn/std": 0.31307879090309143, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 1638.0938720703125, "completions/mean_terminated_length": 1243.300537109375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5693656523981434, "grad_norm": 0.217624232172966, "kl": 0.01983642578125, "learning_rate": 1e-06, "loss": 0.0868, "num_tokens": 164669143.0, "reward": 1.2754465341567993, "reward_std": 0.2384774088859558, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616, "rewards/curriculum_aware_reward_fn/std": 0.32119113206863403, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 1413.9129638671875, "completions/mean_terminated_length": 1158.163818359375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.5734914904589995, "grad_norm": 0.2783685624599457, "kl": 0.0218353271484375, "learning_rate": 1e-06, "loss": 0.0606, "num_tokens": 165554219.0, "reward": 1.3580358028411865, "reward_std": 0.22280895709991455, "rewards/code_format_reward/mean": 0.9129464030265808, "rewards/code_format_reward/std": 0.2822287082672119, "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616, "rewards/curriculum_aware_reward_fn/std": 0.2796177864074707, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1116071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 1759.0826416015625, "completions/mean_terminated_length": 1465.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5776173285198556, "grad_norm": 0.2369956076145172, "kl": 0.020294189453125, "learning_rate": 1e-06, "loss": 0.074, "num_tokens": 166618536.0, "reward": 1.2844866514205933, "reward_std": 0.29120200872421265, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.3124580383300781, "rewards/curriculum_aware_reward_fn/mean": 0.3938615620136261, "rewards/curriculum_aware_reward_fn/std": 0.29871317744255066, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3962.0, "completions/mean_length": 1629.52685546875, "completions/mean_terminated_length": 1117.6171875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5817431665807117, "grad_norm": 0.22888554632663727, "kl": 0.02154541015625, "learning_rate": 1e-06, "loss": 0.0965, "num_tokens": 167620136.0, "reward": 1.2700893878936768, "reward_std": 0.22284676134586334, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.4419642984867096, "rewards/curriculum_aware_reward_fn/std": 0.2943010926246643, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1929.825927734375, "completions/mean_terminated_length": 1451.73291015625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5858690046415678, "grad_norm": 0.31053388118743896, "kl": 0.0206146240234375, "learning_rate": 1e-06, "loss": 0.0926, "num_tokens": 168775254.0, "reward": 1.1685268878936768, "reward_std": 0.2711971700191498, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.3515625, "rewards/curriculum_aware_reward_fn/std": 0.3206498324871063, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 2099.91748046875, "completions/mean_terminated_length": 1611.9862060546875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.589994842702424, "grad_norm": 1.0171209573745728, "kl": 0.0308685302734375, "learning_rate": 1e-06, "loss": 0.092, "num_tokens": 169998306.0, "reward": 1.1707589626312256, "reward_std": 0.32189029455184937, "rewards/code_format_reward/mean": 0.8058035969734192, "rewards/code_format_reward/std": 0.3960230052471161, "rewards/curriculum_aware_reward_fn/mean": 0.3649553656578064, "rewards/curriculum_aware_reward_fn/std": 0.3043855130672455, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1779.51123046875, "completions/mean_terminated_length": 1357.7757568359375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5941206807632801, "grad_norm": 0.2087247222661972, "kl": 0.0209197998046875, "learning_rate": 1e-06, "loss": 0.0873, "num_tokens": 171055451.0, "reward": 1.2494419813156128, "reward_std": 0.22824469208717346, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.4034597873687744, "rewards/curriculum_aware_reward_fn/std": 0.2976873219013214, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 1915.65185546875, "completions/mean_terminated_length": 1434.4305419921875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5982465188241362, "grad_norm": 0.22233060002326965, "kl": 0.021392822265625, "learning_rate": 1e-06, "loss": 0.0797, "num_tokens": 172183898.0, "reward": 1.1960937976837158, "reward_std": 0.28867703676223755, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.37689733505249023, "rewards/curriculum_aware_reward_fn/std": 0.3042333126068115, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 1554.509033203125, "completions/mean_terminated_length": 1184.01025390625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6023723568849922, "grad_norm": 0.25177621841430664, "kl": 0.0221405029296875, "learning_rate": 1e-06, "loss": 0.0628, "num_tokens": 173136882.0, "reward": 1.3223215341567993, "reward_std": 0.2333114594221115, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337, "rewards/curriculum_aware_reward_fn/std": 0.2995332181453705, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 1709.7098388671875, "completions/mean_terminated_length": 1436.6517333984375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6064981949458483, "grad_norm": 0.23244866728782654, "kl": 0.02008056640625, "learning_rate": 1e-06, "loss": 0.0525, "num_tokens": 174169469.0, "reward": 1.5930804014205933, "reward_std": 0.3070758581161499, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.6935268044471741, "rewards/curriculum_aware_reward_fn/std": 0.43586739897727966, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2015.8974609375, "completions/mean_terminated_length": 1542.8876953125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.6106240330067045, "grad_norm": 0.22977960109710693, "kl": 0.020538330078125, "learning_rate": 1e-06, "loss": 0.0895, "num_tokens": 175345031.0, "reward": 1.3625000715255737, "reward_std": 0.37156882882118225, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034, "rewards/curriculum_aware_reward_fn/std": 0.44534605741500854, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1978.6429443359375, "completions/mean_terminated_length": 1532.2811279296875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6147498710675606, "grad_norm": 54.07054138183594, "kl": 5.5802459716796875, "learning_rate": 1e-06, "loss": 0.1208, "num_tokens": 176520752.0, "reward": 1.4250000715255737, "reward_std": 0.3232128322124481, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.5991071462631226, "rewards/curriculum_aware_reward_fn/std": 0.4462988078594208, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 1759.966552734375, "completions/mean_terminated_length": 1356.358642578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6188757091284167, "grad_norm": 0.2159195989370346, "kl": 0.0220794677734375, "learning_rate": 1e-06, "loss": 0.0831, "num_tokens": 177595706.0, "reward": 1.485267996788025, "reward_std": 0.31410348415374756, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.6348214745521545, "rewards/curriculum_aware_reward_fn/std": 0.43525242805480957, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1610.05810546875, "completions/mean_terminated_length": 1262.1527099609375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6230015471892728, "grad_norm": 0.26271775364875793, "kl": 0.0230865478515625, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 178580982.0, "reward": 1.5331473350524902, "reward_std": 0.29026809334754944, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.6559152007102966, "rewards/curriculum_aware_reward_fn/std": 0.42833590507507324, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1439.6920166015625, "completions/mean_terminated_length": 1221.541015625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6271273852501289, "grad_norm": 0.2625259459018707, "kl": 0.0258941650390625, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 179483746.0, "reward": 1.6277902126312256, "reward_std": 0.24596910178661346, "rewards/code_format_reward/mean": 0.9241071343421936, "rewards/code_format_reward/std": 0.2651226818561554, "rewards/curriculum_aware_reward_fn/mean": 0.7036830186843872, "rewards/curriculum_aware_reward_fn/std": 0.3924238681793213, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 1682.0826416015625, "completions/mean_terminated_length": 1287.077880859375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.631253223310985, "grad_norm": 0.22368377447128296, "kl": 0.0244598388671875, "learning_rate": 1e-06, "loss": 0.056, "num_tokens": 180512524.0, "reward": 1.5142858028411865, "reward_std": 0.2848253846168518, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6526784896850586, "rewards/curriculum_aware_reward_fn/std": 0.4242819547653198, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 1798.2724609375, "completions/mean_terminated_length": 1401.28271484375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6353790613718412, "grad_norm": 3.2692782878875732, "kl": 0.0210113525390625, "learning_rate": 1e-06, "loss": 0.0781, "num_tokens": 181588987.0, "reward": 1.4170759916305542, "reward_std": 0.3211572766304016, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.5621652007102966, "rewards/curriculum_aware_reward_fn/std": 0.4653093218803406, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 1706.3795166015625, "completions/mean_terminated_length": 1286.157470703125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.6395048994326973, "grad_norm": 0.20925132930278778, "kl": 0.02191162109375, "learning_rate": 1e-06, "loss": 0.0643, "num_tokens": 182636882.0, "reward": 1.430915355682373, "reward_std": 0.28560274839401245, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.5804687142372131, "rewards/curriculum_aware_reward_fn/std": 0.4736019968986511, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1834.4130859375, "completions/mean_terminated_length": 1265.8575439453125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6436307374935534, "grad_norm": 0.2284567803144455, "kl": 0.0224151611328125, "learning_rate": 1e-06, "loss": 0.061, "num_tokens": 183734975.0, "reward": 1.3992189168930054, "reward_std": 0.2940734326839447, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.5978794693946838, "rewards/curriculum_aware_reward_fn/std": 0.4464387595653534, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1867.7857666015625, "completions/mean_terminated_length": 1268.1246337890625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6477565755544095, "grad_norm": 0.24589860439300537, "kl": 0.0221405029296875, "learning_rate": 1e-06, "loss": 0.1204, "num_tokens": 184850196.0, "reward": 1.4158483743667603, "reward_std": 0.31856080889701843, "rewards/code_format_reward/mean": 0.7879464030265808, "rewards/code_format_reward/std": 0.40921953320503235, "rewards/curriculum_aware_reward_fn/mean": 0.6279017329216003, "rewards/curriculum_aware_reward_fn/std": 0.4837891459465027, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1900.57373046875, "completions/mean_terminated_length": 1416.0245361328125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6518824136152656, "grad_norm": 0.23172283172607422, "kl": 0.02203369140625, "learning_rate": 1e-06, "loss": 0.094, "num_tokens": 185946744.0, "reward": 1.4448662996292114, "reward_std": 0.3671303391456604, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.6167410612106323, "rewards/curriculum_aware_reward_fn/std": 0.4445529878139496, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1822.5179443359375, "completions/mean_terminated_length": 1457.34716796875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6560082516761218, "grad_norm": 0.21886523067951202, "kl": 0.023040771484375, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 187044471.0, "reward": 1.4851562976837158, "reward_std": 0.27218568325042725, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334251284599304, "rewards/curriculum_aware_reward_fn/mean": 0.6213169693946838, "rewards/curriculum_aware_reward_fn/std": 0.43654102087020874, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 1853.3037109375, "completions/mean_terminated_length": 1479.5208740234375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6601340897369778, "grad_norm": 0.25297442078590393, "kl": 0.022857666015625, "learning_rate": 1e-06, "loss": 0.0641, "num_tokens": 188175443.0, "reward": 1.4321428537368774, "reward_std": 0.3305543065071106, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.5727678537368774, "rewards/curriculum_aware_reward_fn/std": 0.4538741111755371, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2163.453125, "completions/mean_terminated_length": 1643.362548828125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6642599277978339, "grad_norm": 15.494752883911133, "kl": 0.241455078125, "learning_rate": 1e-06, "loss": 0.1041, "num_tokens": 189418118.0, "reward": 1.4010045528411865, "reward_std": 0.42570188641548157, "rewards/code_format_reward/mean": 0.7879464030265808, "rewards/code_format_reward/std": 0.40921953320503235, "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162, "rewards/curriculum_aware_reward_fn/std": 0.46330952644348145, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1600.154052734375, "completions/mean_terminated_length": 1293.6466064453125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.66838576585869, "grad_norm": 0.23208573460578918, "kl": 0.02618408203125, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 190404746.0, "reward": 1.5679688453674316, "reward_std": 0.28690239787101746, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6751116514205933, "rewards/curriculum_aware_reward_fn/std": 0.43228229880332947, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1678.0067138671875, "completions/mean_terminated_length": 1332.5791015625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6725116039195461, "grad_norm": 0.21358643472194672, "kl": 0.02490234375, "learning_rate": 1e-06, "loss": 0.079, "num_tokens": 191424173.0, "reward": 1.4213169813156128, "reward_std": 0.3107840418815613, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.5485491156578064, "rewards/curriculum_aware_reward_fn/std": 0.4599668085575104, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1544.6473388671875, "completions/mean_terminated_length": 1238.4849853515625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.6766374419804023, "grad_norm": 0.22339744865894318, "kl": 0.027435302734375, "learning_rate": 1e-06, "loss": 0.1136, "num_tokens": 192387628.0, "reward": 1.5645090341567993, "reward_std": 0.3053089678287506, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6716518402099609, "rewards/curriculum_aware_reward_fn/std": 0.42687925696372986, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1722.247802734375, "completions/mean_terminated_length": 1340.971435546875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6807632800412584, "grad_norm": 0.22346089780330658, "kl": 0.026641845703125, "learning_rate": 1e-06, "loss": 0.0703, "num_tokens": 193439790.0, "reward": 1.5340402126312256, "reward_std": 0.3189355731010437, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6724330186843872, "rewards/curriculum_aware_reward_fn/std": 0.4248095750808716, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1860.029052734375, "completions/mean_terminated_length": 1473.70947265625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6848891181021145, "grad_norm": 0.4491024613380432, "kl": 0.0242767333984375, "learning_rate": 1e-06, "loss": 0.0949, "num_tokens": 194559180.0, "reward": 1.485602855682373, "reward_std": 0.37783434987068176, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.6284598112106323, "rewards/curriculum_aware_reward_fn/std": 0.4338955879211426, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1602.30810546875, "completions/mean_terminated_length": 1246.0662841796875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6890149561629706, "grad_norm": 0.2428234964609146, "kl": 0.031341552734375, "learning_rate": 1e-06, "loss": 0.0549, "num_tokens": 195544246.0, "reward": 1.5504463911056519, "reward_std": 0.2811921536922455, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.6732142567634583, "rewards/curriculum_aware_reward_fn/std": 0.4268362522125244, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 1865.9130859375, "completions/mean_terminated_length": 1507.71240234375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6931407942238267, "grad_norm": 0.2817108929157257, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 196671669.0, "reward": 1.4904019832611084, "reward_std": 0.3173461854457855, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.6265625357627869, "rewards/curriculum_aware_reward_fn/std": 0.43362295627593994, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1674107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1881.388427734375, "completions/mean_terminated_length": 1436.0911865234375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6972666322846828, "grad_norm": 0.3142848312854767, "kl": 0.0292205810546875, "learning_rate": 1e-06, "loss": 0.0543, "num_tokens": 197800782.0, "reward": 1.4146206378936768, "reward_std": 0.3862914443016052, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.5797991156578064, "rewards/curriculum_aware_reward_fn/std": 0.4677990674972534, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2074.52685546875, "completions/mean_terminated_length": 1523.2159423828125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.701392470345539, "grad_norm": 0.2322724312543869, "kl": 0.026275634765625, "learning_rate": 1e-06, "loss": 0.0639, "num_tokens": 199023176.0, "reward": 1.3515626192092896, "reward_std": 0.3802144229412079, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.5658482313156128, "rewards/curriculum_aware_reward_fn/std": 0.4406941831111908, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1990.6407470703125, "completions/mean_terminated_length": 1560.513427734375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7055183084063951, "grad_norm": 0.2236924022436142, "kl": 0.026702880859375, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 200189656.0, "reward": 1.4627233743667603, "reward_std": 0.32308143377304077, "rewards/code_format_reward/mean": 0.8303571343421936, "rewards/code_format_reward/std": 0.37573832273483276, "rewards/curriculum_aware_reward_fn/mean": 0.6323660612106323, "rewards/curriculum_aware_reward_fn/std": 0.43709608912467957, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 1724.93310546875, "completions/mean_terminated_length": 1285.8465576171875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7096441464672512, "grad_norm": 0.22402040660381317, "kl": 0.0262603759765625, "learning_rate": 1e-06, "loss": 0.0855, "num_tokens": 201223736.0, "reward": 1.447767972946167, "reward_std": 0.3190096616744995, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774, "rewards/curriculum_aware_reward_fn/std": 0.48629483580589294, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1807.4287109375, "completions/mean_terminated_length": 1324.9730224609375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.7137699845281072, "grad_norm": 0.20310117304325104, "kl": 0.026702880859375, "learning_rate": 1e-06, "loss": 0.0933, "num_tokens": 202291253.0, "reward": 1.5252233743667603, "reward_std": 0.3091839551925659, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.6970981955528259, "rewards/curriculum_aware_reward_fn/std": 0.44606462121009827, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1324.3170166015625, "completions/mean_terminated_length": 1052.5833740234375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7178958225889633, "grad_norm": 0.25287967920303345, "kl": 0.0323333740234375, "learning_rate": 1e-06, "loss": 0.0458, "num_tokens": 203151912.0, "reward": 1.601562738418579, "reward_std": 0.264218270778656, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854744791984558, "rewards/curriculum_aware_reward_fn/mean": 0.6908482313156128, "rewards/curriculum_aware_reward_fn/std": 0.4577873945236206, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1976.7076416015625, "completions/mean_terminated_length": 1487.64013671875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7220216606498195, "grad_norm": 0.22337962687015533, "kl": 0.0275726318359375, "learning_rate": 1e-06, "loss": 0.0868, "num_tokens": 204336881.0, "reward": 1.3909599781036377, "reward_std": 0.35256850719451904, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252740144729614, "rewards/curriculum_aware_reward_fn/mean": 0.580691933631897, "rewards/curriculum_aware_reward_fn/std": 0.4459373652935028, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1631.310302734375, "completions/mean_terminated_length": 1362.878662109375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7261474987106756, "grad_norm": 0.22130419313907623, "kl": 0.028656005859375, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 205337467.0, "reward": 1.6895090341567993, "reward_std": 0.2801817059516907, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489603638648987, "rewards/curriculum_aware_reward_fn/mean": 0.7854910492897034, "rewards/curriculum_aware_reward_fn/std": 0.40007179975509644, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 1944.5826416015625, "completions/mean_terminated_length": 1476.8831787109375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7302733367715317, "grad_norm": 0.2612294852733612, "kl": 0.027130126953125, "learning_rate": 1e-06, "loss": 0.0948, "num_tokens": 206477860.0, "reward": 1.4131697416305542, "reward_std": 0.355887770652771, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902, "rewards/curriculum_aware_reward_fn/std": 0.4985761046409607, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1941964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1801.9443359375, "completions/mean_terminated_length": 1249.0830078125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7343991748323878, "grad_norm": 0.2323751598596573, "kl": 0.0294036865234375, "learning_rate": 1e-06, "loss": 0.0978, "num_tokens": 207566363.0, "reward": 1.3904019594192505, "reward_std": 0.31305989623069763, "rewards/code_format_reward/mean": 0.8080357313156128, "rewards/code_format_reward/std": 0.3942854404449463, "rewards/curriculum_aware_reward_fn/mean": 0.5823659896850586, "rewards/curriculum_aware_reward_fn/std": 0.44665220379829407, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1608.8773193359375, "completions/mean_terminated_length": 1310.4224853515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7385250128932439, "grad_norm": 0.24387770891189575, "kl": 0.0294342041015625, "learning_rate": 1e-06, "loss": 0.0693, "num_tokens": 208537103.0, "reward": 1.486830472946167, "reward_std": 0.3190726041793823, "rewards/code_format_reward/mean": 0.8950892686843872, "rewards/code_format_reward/std": 0.3067808747291565, "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902, "rewards/curriculum_aware_reward_fn/std": 0.4914354383945465, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4007.0, "completions/mean_length": 1846.66748046875, "completions/mean_terminated_length": 1357.68212890625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.7426508509541001, "grad_norm": 0.42223504185676575, "kl": 0.0312347412109375, "learning_rate": 1e-06, "loss": 0.0936, "num_tokens": 209624936.0, "reward": 1.4097100496292114, "reward_std": 0.31943628191947937, "rewards/code_format_reward/mean": 0.8236607313156128, "rewards/code_format_reward/std": 0.3815346360206604, "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195, "rewards/curriculum_aware_reward_fn/std": 0.4472143352031708, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1833.060302734375, "completions/mean_terminated_length": 1303.1707763671875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.7467766890149562, "grad_norm": 0.22536613047122955, "kl": 0.029296875, "learning_rate": 1e-06, "loss": 0.0963, "num_tokens": 210728513.0, "reward": 1.450334906578064, "reward_std": 0.2970465123653412, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252740144729614, "rewards/curriculum_aware_reward_fn/mean": 0.6400669813156128, "rewards/curriculum_aware_reward_fn/std": 0.4401248097419739, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1873.4866943359375, "completions/mean_terminated_length": 1475.7738037109375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7509025270758123, "grad_norm": 0.25091269612312317, "kl": 0.0291290283203125, "learning_rate": 1e-06, "loss": 0.0639, "num_tokens": 211823881.0, "reward": 1.4522322416305542, "reward_std": 0.3361148536205292, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.6017856597900391, "rewards/curriculum_aware_reward_fn/std": 0.46313437819480896, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1746.0023193359375, "completions/mean_terminated_length": 1281.0294189453125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7550283651366684, "grad_norm": 0.24613533914089203, "kl": 0.03045654296875, "learning_rate": 1e-06, "loss": 0.0629, "num_tokens": 212866385.0, "reward": 1.489174246788025, "reward_std": 0.297740638256073, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.6543526649475098, "rewards/curriculum_aware_reward_fn/std": 0.4452936053276062, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1674107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 2007.6295166015625, "completions/mean_terminated_length": 1587.7158203125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7591542031975245, "grad_norm": 0.23514395952224731, "kl": 0.0292510986328125, "learning_rate": 1e-06, "loss": 0.046, "num_tokens": 214051270.0, "reward": 1.4108260869979858, "reward_std": 0.36220675706863403, "rewards/code_format_reward/mean": 0.8370535969734192, "rewards/code_format_reward/std": 0.3697296679019928, "rewards/curriculum_aware_reward_fn/mean": 0.5737723112106323, "rewards/curriculum_aware_reward_fn/std": 0.4432297646999359, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1622.7344970703125, "completions/mean_terminated_length": 1232.8914794921875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7632800412583806, "grad_norm": 0.2874307930469513, "kl": 0.0306396484375, "learning_rate": 1e-06, "loss": 0.1072, "num_tokens": 215048952.0, "reward": 1.3916295766830444, "reward_std": 0.27101579308509827, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.5300223231315613, "rewards/curriculum_aware_reward_fn/std": 0.4591200649738312, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 2090.618408203125, "completions/mean_terminated_length": 1499.4364013671875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7674058793192368, "grad_norm": 0.22228585183620453, "kl": 0.0281829833984375, "learning_rate": 1e-06, "loss": 0.1194, "num_tokens": 216246425.0, "reward": 1.2716518640518188, "reward_std": 0.4031871259212494, "rewards/code_format_reward/mean": 0.78125, "rewards/code_format_reward/std": 0.4138607978820801, "rewards/curriculum_aware_reward_fn/mean": 0.4904017746448517, "rewards/curriculum_aware_reward_fn/std": 0.4519253373146057, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 1512.0535888671875, "completions/mean_terminated_length": 1265.66259765625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7715317173800929, "grad_norm": 0.22118829190731049, "kl": 0.031341552734375, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 217192268.0, "reward": 1.527009129524231, "reward_std": 0.26555386185646057, "rewards/code_format_reward/mean": 0.9196428656578064, "rewards/code_format_reward/std": 0.2721492052078247, "rewards/curriculum_aware_reward_fn/mean": 0.6073660254478455, "rewards/curriculum_aware_reward_fn/std": 0.46336886286735535, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1622.5201416015625, "completions/mean_terminated_length": 1290.635498046875, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.7756575554409489, "grad_norm": 0.22401097416877747, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0562, "num_tokens": 218182868.0, "reward": 1.4532368183135986, "reward_std": 0.2841085195541382, "rewards/code_format_reward/mean": 0.8839285969734192, "rewards/code_format_reward/std": 0.32066863775253296, "rewards/curriculum_aware_reward_fn/mean": 0.5693081021308899, "rewards/curriculum_aware_reward_fn/std": 0.4530544579029083, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 1664.4554443359375, "completions/mean_terminated_length": 1273.8963623046875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.779783393501805, "grad_norm": 0.2380165308713913, "kl": 0.0315399169921875, "learning_rate": 1e-06, "loss": 0.0685, "num_tokens": 219187502.0, "reward": 1.4876116514205933, "reward_std": 0.32384008169174194, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.6282365918159485, "rewards/curriculum_aware_reward_fn/std": 0.4722626507282257, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1138392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1659.7723388671875, "completions/mean_terminated_length": 1346.8060302734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7839092315626611, "grad_norm": 0.34679973125457764, "kl": 0.0302276611328125, "learning_rate": 1e-06, "loss": 0.1048, "num_tokens": 220184697.0, "reward": 1.539955496788025, "reward_std": 0.3225671648979187, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.6537945866584778, "rewards/curriculum_aware_reward_fn/std": 0.4496917724609375, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 1594.0648193359375, "completions/mean_terminated_length": 1258.362060546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7880350696235173, "grad_norm": 0.347991406917572, "kl": 0.0323486328125, "learning_rate": 1e-06, "loss": 0.11, "num_tokens": 221148836.0, "reward": 1.6414064168930054, "reward_std": 0.328239381313324, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.755245566368103, "rewards/curriculum_aware_reward_fn/std": 0.41560760140419006, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 1887.9844970703125, "completions/mean_terminated_length": 1513.255859375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.7921609076843734, "grad_norm": 0.7645904421806335, "kl": 0.0301361083984375, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 222288485.0, "reward": 1.4526787996292114, "reward_std": 0.30151334404945374, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.5955356955528259, "rewards/curriculum_aware_reward_fn/std": 0.44083017110824585, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1725.9442138671875, "completions/mean_terminated_length": 1359.440673828125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7962867457452295, "grad_norm": 0.2471107840538025, "kl": 0.0307464599609375, "learning_rate": 1e-06, "loss": 0.0607, "num_tokens": 223335947.0, "reward": 1.5162948369979858, "reward_std": 0.32580631971359253, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095630049705505, "rewards/curriculum_aware_reward_fn/mean": 0.6502231955528259, "rewards/curriculum_aware_reward_fn/std": 0.4345405101776123, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2254464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 2046.5001220703125, "completions/mean_terminated_length": 1449.9595947265625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8004125838060856, "grad_norm": 0.2045373022556305, "kl": 0.0279083251953125, "learning_rate": 1e-06, "loss": 0.1047, "num_tokens": 224518009.0, "reward": 1.3463170528411865, "reward_std": 0.3773540258407593, "rewards/code_format_reward/mean": 0.7745535969734192, "rewards/code_format_reward/std": 0.41834309697151184, "rewards/curriculum_aware_reward_fn/mean": 0.5717633962631226, "rewards/curriculum_aware_reward_fn/std": 0.4517683684825897, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1744.388427734375, "completions/mean_terminated_length": 1345.289794921875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.8045384218669417, "grad_norm": 0.2941077947616577, "kl": 0.0295867919921875, "learning_rate": 1e-06, "loss": 0.0859, "num_tokens": 225559174.0, "reward": 1.5177457332611084, "reward_std": 0.3033301830291748, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.6628348231315613, "rewards/curriculum_aware_reward_fn/std": 0.4127408564090729, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1809.15185546875, "completions/mean_terminated_length": 1495.725830078125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8086642599277978, "grad_norm": 0.2557990550994873, "kl": 0.031646728515625, "learning_rate": 1e-06, "loss": 0.1066, "num_tokens": 226641974.0, "reward": 1.4722100496292114, "reward_std": 0.37684550881385803, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195, "rewards/curriculum_aware_reward_fn/std": 0.44295454025268555, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1995.5982666015625, "completions/mean_terminated_length": 1503.7686767578125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.812790097988654, "grad_norm": 0.20381715893745422, "kl": 0.0287933349609375, "learning_rate": 1e-06, "loss": 0.0545, "num_tokens": 227803885.0, "reward": 1.3908482789993286, "reward_std": 0.3567146360874176, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.5761160254478455, "rewards/curriculum_aware_reward_fn/std": 0.47179847955703735, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1600.185302734375, "completions/mean_terminated_length": 1199.3031005859375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8169159360495101, "grad_norm": 0.29694321751594543, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 228775491.0, "reward": 1.4993302822113037, "reward_std": 0.23577921092510223, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.6354910731315613, "rewards/curriculum_aware_reward_fn/std": 0.42602863907814026, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1745.0513916015625, "completions/mean_terminated_length": 1346.0653076171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8210417741103662, "grad_norm": 0.2424536943435669, "kl": 0.0304412841796875, "learning_rate": 1e-06, "loss": 0.0717, "num_tokens": 229817282.0, "reward": 1.483035683631897, "reward_std": 0.3085847795009613, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6214285492897034, "rewards/curriculum_aware_reward_fn/std": 0.45782846212387085, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1753.384033203125, "completions/mean_terminated_length": 1517.3955078125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8251676121712223, "grad_norm": 0.2286684364080429, "kl": 0.0298004150390625, "learning_rate": 1e-06, "loss": 0.0483, "num_tokens": 230885598.0, "reward": 1.54676353931427, "reward_std": 0.31968414783477783, "rewards/code_format_reward/mean": 0.9129464030265808, "rewards/code_format_reward/std": 0.2822287082672119, "rewards/curriculum_aware_reward_fn/mean": 0.6338168978691101, "rewards/curriculum_aware_reward_fn/std": 0.44459256529808044, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1600.5157470703125, "completions/mean_terminated_length": 1244.017822265625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8292934502320783, "grad_norm": 0.22150000929832458, "kl": 0.03021240234375, "learning_rate": 1e-06, "loss": 0.094, "num_tokens": 231872199.0, "reward": 1.434598445892334, "reward_std": 0.298090398311615, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.5595981478691101, "rewards/curriculum_aware_reward_fn/std": 0.46301397681236267, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1712.7254638671875, "completions/mean_terminated_length": 1293.619384765625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8334192882929345, "grad_norm": 0.24653927981853485, "kl": 0.0312347412109375, "learning_rate": 1e-06, "loss": 0.0796, "num_tokens": 232899585.0, "reward": 1.3952010869979858, "reward_std": 0.31651803851127625, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.5402902364730835, "rewards/curriculum_aware_reward_fn/std": 0.45736637711524963, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 2037.8304443359375, "completions/mean_terminated_length": 1603.946044921875, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.8375451263537906, "grad_norm": 0.22224228084087372, "kl": 0.029876708984375, "learning_rate": 1e-06, "loss": 0.0668, "num_tokens": 234095361.0, "reward": 1.3809152841567993, "reward_std": 0.37990495562553406, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.5527901649475098, "rewards/curriculum_aware_reward_fn/std": 0.43541234731674194, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 1682.0201416015625, "completions/mean_terminated_length": 1337.165771484375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8416709644146467, "grad_norm": 0.25886648893356323, "kl": 0.0328826904296875, "learning_rate": 1e-06, "loss": 0.0476, "num_tokens": 235114591.0, "reward": 1.5552457571029663, "reward_std": 0.3339327573776245, "rewards/code_format_reward/mean": 0.8839285969734192, "rewards/code_format_reward/std": 0.32066863775253296, "rewards/curriculum_aware_reward_fn/mean": 0.6713169813156128, "rewards/curriculum_aware_reward_fn/std": 0.4169880151748657, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 1614.8438720703125, "completions/mean_terminated_length": 1260.392822265625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.8457968024755028, "grad_norm": 0.25539520382881165, "kl": 0.03082275390625, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 236103705.0, "reward": 1.507924199104309, "reward_std": 0.25643816590309143, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.6284598708152771, "rewards/curriculum_aware_reward_fn/std": 0.5155189037322998, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1644.69873046875, "completions/mean_terminated_length": 1384.4371337890625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8499226405363589, "grad_norm": 0.2182522863149643, "kl": 0.032928466796875, "learning_rate": 1e-06, "loss": 0.0571, "num_tokens": 237102526.0, "reward": 1.6372768878936768, "reward_std": 0.2757372558116913, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.7220982313156128, "rewards/curriculum_aware_reward_fn/std": 0.3957159221172333, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1881.337158203125, "completions/mean_terminated_length": 1370.260986328125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8540484785972151, "grad_norm": 0.20540083944797516, "kl": 0.03094482421875, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 238202951.0, "reward": 1.3666294813156128, "reward_std": 0.3059476613998413, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.5518973469734192, "rewards/curriculum_aware_reward_fn/std": 0.4562782943248749, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1735.805908203125, "completions/mean_terminated_length": 1398.6351318359375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.8581743166580712, "grad_norm": 0.2484840750694275, "kl": 0.0309600830078125, "learning_rate": 1e-06, "loss": 0.0979, "num_tokens": 239242658.0, "reward": 1.5085939168930054, "reward_std": 0.34762099385261536, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.6291294097900391, "rewards/curriculum_aware_reward_fn/std": 0.4308629035949707, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 2012.274658203125, "completions/mean_terminated_length": 1488.432861328125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8623001547189273, "grad_norm": 0.19853706657886505, "kl": 0.030975341796875, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 240423849.0, "reward": 1.4322545528411865, "reward_std": 0.3431403338909149, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162, "rewards/curriculum_aware_reward_fn/std": 0.4514111578464508, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2165178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 2111.489013671875, "completions/mean_terminated_length": 1563.0626220703125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8664259927797834, "grad_norm": 0.24466735124588013, "kl": 0.030792236328125, "learning_rate": 1e-06, "loss": 0.0651, "num_tokens": 241637357.0, "reward": 1.4133929014205933, "reward_std": 0.33380740880966187, "rewards/code_format_reward/mean": 0.796875, "rewards/code_format_reward/std": 0.4027745723724365, "rewards/curriculum_aware_reward_fn/mean": 0.6165178418159485, "rewards/curriculum_aware_reward_fn/std": 0.46240732073783875, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1902.3037109375, "completions/mean_terminated_length": 1425.4130859375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8705518308406395, "grad_norm": 0.1979178786277771, "kl": 0.032989501953125, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 242745187.0, "reward": 1.4494421482086182, "reward_std": 0.26812899112701416, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.6235490441322327, "rewards/curriculum_aware_reward_fn/std": 0.47366687655448914, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1524.2188720703125, "completions/mean_terminated_length": 1306.271240234375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8746776689014956, "grad_norm": 0.23214657604694366, "kl": 0.03704833984375, "learning_rate": 1e-06, "loss": 0.054, "num_tokens": 243692420.0, "reward": 1.6546876430511475, "reward_std": 0.2663484513759613, "rewards/code_format_reward/mean": 0.9263392686843872, "rewards/code_format_reward/std": 0.2615099549293518, "rewards/curriculum_aware_reward_fn/mean": 0.7283481955528259, "rewards/curriculum_aware_reward_fn/std": 0.40330010652542114, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1860.7412109375, "completions/mean_terminated_length": 1396.8193359375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8788035069623518, "grad_norm": 0.2359783798456192, "kl": 0.0379638671875, "learning_rate": 1e-06, "loss": 0.1015, "num_tokens": 244798745.0, "reward": 1.4878350496292114, "reward_std": 0.3436294496059418, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6485491394996643, "rewards/curriculum_aware_reward_fn/std": 0.4317037761211395, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1294642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 1827.13623046875, "completions/mean_terminated_length": 1489.7154541015625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.8829293450232079, "grad_norm": 0.43438419699668884, "kl": 0.032135009765625, "learning_rate": 1e-06, "loss": 0.0871, "num_tokens": 245898247.0, "reward": 1.5524554252624512, "reward_std": 0.309803307056427, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936, "rewards/curriculum_aware_reward_fn/std": 0.4090176224708557, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1742.2366943359375, "completions/mean_terminated_length": 1492.3309326171875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.887055183084064, "grad_norm": 0.29153332114219666, "kl": 0.0323944091796875, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 246938763.0, "reward": 1.610714316368103, "reward_std": 0.3009406626224518, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.6933035254478455, "rewards/curriculum_aware_reward_fn/std": 0.40388187766075134, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 1818.85498046875, "completions/mean_terminated_length": 1418.412109375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.89118102114492, "grad_norm": 0.21342267096042633, "kl": 0.0300750732421875, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 248023628.0, "reward": 1.3243303298950195, "reward_std": 0.2561975419521332, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095627069473267, "rewards/curriculum_aware_reward_fn/mean": 0.45825889706611633, "rewards/curriculum_aware_reward_fn/std": 0.4662056863307953, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1629464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1759.1407470703125, "completions/mean_terminated_length": 1304.23193359375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8953068592057761, "grad_norm": 0.20606671273708344, "kl": 0.033477783203125, "learning_rate": 1e-06, "loss": 0.0774, "num_tokens": 249074727.0, "reward": 1.3753349781036377, "reward_std": 0.2294098436832428, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.507031261920929, "rewards/curriculum_aware_reward_fn/std": 0.46656760573387146, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 1633.0782470703125, "completions/mean_terminated_length": 1337.5274658203125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8994326972666323, "grad_norm": 0.26139992475509644, "kl": 0.03546142578125, "learning_rate": 1e-06, "loss": 0.0694, "num_tokens": 250064611.0, "reward": 1.6659599542617798, "reward_std": 0.2920212149620056, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489603638648987, "rewards/curriculum_aware_reward_fn/mean": 0.7619419693946838, "rewards/curriculum_aware_reward_fn/std": 0.4175769090652466, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2098214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1939.6005859375, "completions/mean_terminated_length": 1366.9971923828125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.9035585353274884, "grad_norm": 0.2301202118396759, "kl": 0.03155517578125, "learning_rate": 1e-06, "loss": 0.0979, "num_tokens": 251204739.0, "reward": 1.364174246788025, "reward_std": 0.3046819865703583, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.5472097992897034, "rewards/curriculum_aware_reward_fn/std": 0.4933087229728699, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1786.0157470703125, "completions/mean_terminated_length": 1221.352783203125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9076843733883445, "grad_norm": 0.5878376960754395, "kl": 0.13665771484375, "learning_rate": 1e-06, "loss": 0.0782, "num_tokens": 252260367.0, "reward": 1.51551353931427, "reward_std": 0.3167440593242645, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.6873884201049805, "rewards/curriculum_aware_reward_fn/std": 0.45862656831741333, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1775.884033203125, "completions/mean_terminated_length": 1316.8236083984375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.9118102114492006, "grad_norm": 0.22542373836040497, "kl": 0.0326995849609375, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 253308551.0, "reward": 1.4619419574737549, "reward_std": 0.32789382338523865, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6226562857627869, "rewards/curriculum_aware_reward_fn/std": 0.44398173689842224, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 1829.544677734375, "completions/mean_terminated_length": 1444.898193359375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9159360495100567, "grad_norm": 0.24280433356761932, "kl": 0.032867431640625, "learning_rate": 1e-06, "loss": 0.0575, "num_tokens": 254392535.0, "reward": 1.5252233743667603, "reward_std": 0.3197804391384125, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6636161208152771, "rewards/curriculum_aware_reward_fn/std": 0.4355611503124237, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1836.180908203125, "completions/mean_terminated_length": 1590.0618896484375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.9200618875709129, "grad_norm": 0.24582676589488983, "kl": 0.0306854248046875, "learning_rate": 1e-06, "loss": 0.0706, "num_tokens": 255481207.0, "reward": 1.5697544813156128, "reward_std": 0.3073837459087372, "rewards/code_format_reward/mean": 0.9174107313156128, "rewards/code_format_reward/std": 0.2755681276321411, "rewards/curriculum_aware_reward_fn/mean": 0.65234375, "rewards/curriculum_aware_reward_fn/std": 0.4572802186012268, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1116071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 1784.857177734375, "completions/mean_terminated_length": 1494.5125732421875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.924187725631769, "grad_norm": 0.32395270466804504, "kl": 0.029998779296875, "learning_rate": 1e-06, "loss": 0.1009, "num_tokens": 256559521.0, "reward": 1.5242189168930054, "reward_std": 0.3307342529296875, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6313616633415222, "rewards/curriculum_aware_reward_fn/std": 0.43223538994789124, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1617.3192138671875, "completions/mean_terminated_length": 1400.7354736328125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9283135636926251, "grad_norm": 0.21705108880996704, "kl": 0.032135009765625, "learning_rate": 1e-06, "loss": 0.0599, "num_tokens": 257553552.0, "reward": 1.4319196939468384, "reward_std": 0.19349054992198944, "rewards/code_format_reward/mean": 0.9196428656578064, "rewards/code_format_reward/std": 0.2721492052078247, "rewards/curriculum_aware_reward_fn/mean": 0.5122767686843872, "rewards/curriculum_aware_reward_fn/std": 0.4762067496776581, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1747.665283203125, "completions/mean_terminated_length": 1419.017822265625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9324394017534812, "grad_norm": 0.3296845555305481, "kl": 0.0321044921875, "learning_rate": 1e-06, "loss": 0.0784, "num_tokens": 258600945.0, "reward": 1.543861746788025, "reward_std": 0.30385205149650574, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.666629433631897, "rewards/curriculum_aware_reward_fn/std": 0.4261413514614105, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1931.638427734375, "completions/mean_terminated_length": 1461.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9365652398143373, "grad_norm": 0.23225350677967072, "kl": 0.0313873291015625, "learning_rate": 1e-06, "loss": 0.1046, "num_tokens": 259729778.0, "reward": 1.358147382736206, "reward_std": 0.32802098989486694, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.5367187857627869, "rewards/curriculum_aware_reward_fn/std": 0.46693459153175354, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1851.51123046875, "completions/mean_terminated_length": 1442.8839111328125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.9406910778751933, "grad_norm": 0.21108375489711761, "kl": 0.03179931640625, "learning_rate": 1e-06, "loss": 0.0731, "num_tokens": 260825178.0, "reward": 1.5152901411056519, "reward_std": 0.33156508207321167, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.6715401411056519, "rewards/curriculum_aware_reward_fn/std": 0.4522121250629425, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 1721.6451416015625, "completions/mean_terminated_length": 1325.9193115234375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9448169159360496, "grad_norm": 0.3119603097438812, "kl": 0.035125732421875, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 261878428.0, "reward": 1.5291296243667603, "reward_std": 0.29912132024765015, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.6697544455528259, "rewards/curriculum_aware_reward_fn/std": 0.42858609557151794, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 1753.6607666015625, "completions/mean_terminated_length": 1305.1275634765625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9489427539969056, "grad_norm": 0.22429104149341583, "kl": 0.0317535400390625, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 262934310.0, "reward": 1.381361722946167, "reward_std": 0.3329118490219116, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.5420759320259094, "rewards/curriculum_aware_reward_fn/std": 0.4664093852043152, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1739.4263916015625, "completions/mean_terminated_length": 1360.9093017578125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.9530685920577617, "grad_norm": 0.22260624170303345, "kl": 0.032623291015625, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 263976059.0, "reward": 1.5345982313156128, "reward_std": 0.28138771653175354, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936, "rewards/curriculum_aware_reward_fn/std": 0.4627295732498169, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1576.6473388671875, "completions/mean_terminated_length": 1329.6519775390625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9571944301186178, "grad_norm": 0.22776390612125397, "kl": 0.03472900390625, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 264949986.0, "reward": 1.6268973350524902, "reward_std": 0.2715020775794983, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854745090007782, "rewards/curriculum_aware_reward_fn/mean": 0.716183066368103, "rewards/curriculum_aware_reward_fn/std": 0.4235256612300873, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1813.0826416015625, "completions/mean_terminated_length": 1375.9281005859375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9613202681794739, "grad_norm": 0.5938295125961304, "kl": 0.03570556640625, "learning_rate": 1e-06, "loss": 0.0627, "num_tokens": 266030928.0, "reward": 1.4172991514205933, "reward_std": 0.27423596382141113, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.5780134201049805, "rewards/curriculum_aware_reward_fn/std": 0.4604513943195343, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1860.4107666015625, "completions/mean_terminated_length": 1453.40380859375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.9654461062403301, "grad_norm": 0.2252698838710785, "kl": 0.032440185546875, "learning_rate": 1e-06, "loss": 0.0919, "num_tokens": 267137231.0, "reward": 1.4508929252624512, "reward_std": 0.35087722539901733, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064, "rewards/curriculum_aware_reward_fn/std": 0.4518719017505646, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1817.727783203125, "completions/mean_terminated_length": 1417.086669921875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.9695719443011862, "grad_norm": 0.2286522537469864, "kl": 0.0334320068359375, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 268223397.0, "reward": 1.3939732313156128, "reward_std": 0.27675938606262207, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.5412946343421936, "rewards/curriculum_aware_reward_fn/std": 0.48773789405822754, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 1985.325927734375, "completions/mean_terminated_length": 1491.0909423828125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9736977823620423, "grad_norm": 0.20148524641990662, "kl": 0.03509521484375, "learning_rate": 1e-06, "loss": 0.0899, "num_tokens": 269388133.0, "reward": 1.4079241752624512, "reward_std": 0.3953123390674591, "rewards/code_format_reward/mean": 0.8080357313156128, "rewards/code_format_reward/std": 0.3942854106426239, "rewards/curriculum_aware_reward_fn/mean": 0.5998883843421936, "rewards/curriculum_aware_reward_fn/std": 0.4576530158519745, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 1784.509033203125, "completions/mean_terminated_length": 1413.233154296875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9778236204228984, "grad_norm": 0.21030192077159882, "kl": 0.036651611328125, "learning_rate": 1e-06, "loss": 0.0617, "num_tokens": 270444745.0, "reward": 1.5034600496292114, "reward_std": 0.2465612292289734, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6418526768684387, "rewards/curriculum_aware_reward_fn/std": 0.4315384030342102, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1658.04248046875, "completions/mean_terminated_length": 1365.4874267578125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9819494584837545, "grad_norm": 0.22513382136821747, "kl": 0.03680419921875, "learning_rate": 1e-06, "loss": 0.0645, "num_tokens": 271464328.0, "reward": 1.5197545289993286, "reward_std": 0.30367511510849, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6268973350524902, "rewards/curriculum_aware_reward_fn/std": 0.4268620014190674, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1725.8326416015625, "completions/mean_terminated_length": 1316.3272705078125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9860752965446106, "grad_norm": 0.2153979241847992, "kl": 0.036102294921875, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 272505594.0, "reward": 1.3497768640518188, "reward_std": 0.31243571639060974, "rewards/code_format_reward/mean": 0.8526785969734192, "rewards/code_format_reward/std": 0.3548222780227661, "rewards/curriculum_aware_reward_fn/mean": 0.4970982074737549, "rewards/curriculum_aware_reward_fn/std": 0.46139857172966003, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1712.7344970703125, "completions/mean_terminated_length": 1344.1881103515625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9902011346054668, "grad_norm": 0.22380459308624268, "kl": 0.0360107421875, "learning_rate": 1e-06, "loss": 0.058, "num_tokens": 273546494.0, "reward": 1.521875023841858, "reward_std": 0.3063981533050537, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095630049705505, "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323, "rewards/curriculum_aware_reward_fn/std": 0.42365700006484985, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1675.0223388671875, "completions/mean_terminated_length": 1357.1162109375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.9943269726663229, "grad_norm": 0.22325699031352997, "kl": 0.036895751953125, "learning_rate": 1e-06, "loss": 0.0827, "num_tokens": 274574076.0, "reward": 1.5726563930511475, "reward_std": 0.253899484872818, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.6909598112106323, "rewards/curriculum_aware_reward_fn/std": 0.4153672456741333, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948905109489051, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 1584.3941650390625, "completions/mean_terminated_length": 1321.08056640625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.998452810727179, "grad_norm": 0.24855820834636688, "kl": 0.037872314453125, "learning_rate": 1e-06, "loss": 0.1187, "num_tokens": 275609321.0, "reward": 1.4710938930511475, "reward_std": 0.3146342933177948, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.5893973112106323, "rewards/curriculum_aware_reward_fn/std": 0.4410484731197357, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 0.08443626471047583, "train_runtime": 92210.0342, "train_samples_per_second": 0.168, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 275609321, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }