diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,6577 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.998452810727179,
+  "eval_steps": 500,
+  "global_step": 242,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.453125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4094.0,
+      "completions/mean_length": 2762.078369140625,
+      "completions/mean_terminated_length": 1656.8284912109375,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.0041258380608561115,
+      "grad_norm": 0.10848142206668854,
+      "kl": 0.0,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 1505084.0,
+      "reward": 0.029017861932516098,
+      "reward_std": 0.047291483730077744,
+      "rewards/code_format_reward/mean": 0.0223214291036129,
+      "rewards/code_format_reward/std": 0.14789186418056488,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0066964286379516125,
+      "rewards/curriculum_aware_reward_fn/std": 0.0310124009847641,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4107142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 2660.15625,
+      "completions/mean_terminated_length": 1659.416748046875,
+      "completions/min_length": 305.0,
+      "completions/min_terminated_length": 305.0,
+      "epoch": 0.008251676121712223,
+      "grad_norm": 0.12915591895580292,
+      "kl": 0.0005238056182861328,
+      "learning_rate": 1e-06,
+      "loss": -0.0071,
+      "num_tokens": 2947042.0,
+      "reward": 0.04441964253783226,
+      "reward_std": 0.0783877819776535,
+      "rewards/code_format_reward/mean": 0.02901785634458065,
+      "rewards/code_format_reward/std": 0.16804419457912445,
+      "rewards/curriculum_aware_reward_fn/mean": 0.015401787124574184,
+      "rewards/curriculum_aware_reward_fn/std": 0.045581694692373276,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4486607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4059.0,
+      "completions/mean_length": 2641.801513671875,
+      "completions/mean_terminated_length": 1458.4251708984375,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.012377514182568335,
+      "grad_norm": 0.12791307270526886,
+      "kl": 0.0005018711090087891,
+      "learning_rate": 1e-06,
+      "loss": -0.0028,
+      "num_tokens": 4383850.0,
+      "reward": 0.04955357685685158,
+      "reward_std": 0.08979818224906921,
+      "rewards/code_format_reward/mean": 0.0334821417927742,
+      "rewards/code_format_reward/std": 0.1800929754972458,
+      "rewards/curriculum_aware_reward_fn/mean": 0.01607142947614193,
+      "rewards/curriculum_aware_reward_fn/std": 0.04751747474074364,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5803571428571428,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3912.0,
+      "completions/mean_length": 3049.80810546875,
+      "completions/mean_terminated_length": 1602.94677734375,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.016503352243424446,
+      "grad_norm": 0.26403144001960754,
+      "kl": 0.0005340576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 6031011.0,
+      "reward": 0.06015625596046448,
+      "reward_std": 0.08077409863471985,
+      "rewards/code_format_reward/mean": 0.0424107126891613,
+      "rewards/code_format_reward/std": 0.20174959301948547,
+      "rewards/curriculum_aware_reward_fn/mean": 0.01774553582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.04849924519658089,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5669642857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4033.0,
+      "completions/mean_length": 2951.32373046875,
+      "completions/mean_terminated_length": 1452.6236572265625,
+      "completions/min_length": 138.0,
+      "completions/min_terminated_length": 138.0,
+      "epoch": 0.020629190304280558,
+      "grad_norm": 0.11759795993566513,
+      "kl": 0.0006890296936035156,
+      "learning_rate": 1e-06,
+      "loss": 0.0234,
+      "num_tokens": 7632987.0,
+      "reward": 0.10122768580913544,
+      "reward_std": 0.150455042719841,
+      "rewards/code_format_reward/mean": 0.078125,
+      "rewards/code_format_reward/std": 0.26866820454597473,
+      "rewards/curriculum_aware_reward_fn/mean": 0.02310268022119999,
+      "rewards/curriculum_aware_reward_fn/std": 0.05603185296058655,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4196428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4064.0,
+      "completions/mean_length": 2626.33935546875,
+      "completions/mean_terminated_length": 1563.6614990234375,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.02475502836513667,
+      "grad_norm": 0.176630437374115,
+      "kl": 0.0012578964233398438,
+      "learning_rate": 1e-06,
+      "loss": 0.0312,
+      "num_tokens": 9085730.0,
+      "reward": 0.18604911863803864,
+      "reward_std": 0.23488061130046844,
+      "rewards/code_format_reward/mean": 0.1428571492433548,
+      "rewards/code_format_reward/std": 0.3503182828426361,
+      "rewards/curriculum_aware_reward_fn/mean": 0.04319196566939354,
+      "rewards/curriculum_aware_reward_fn/std": 0.06799682974815369,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4263392857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4045.0,
+      "completions/mean_length": 2600.0625,
+      "completions/mean_terminated_length": 1488.2957763671875,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.02888086642599278,
+      "grad_norm": 0.1658683717250824,
+      "kl": 0.0018901824951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0443,
+      "num_tokens": 10532126.0,
+      "reward": 0.2216518074274063,
+      "reward_std": 0.23757225275039673,
+      "rewards/code_format_reward/mean": 0.1674107164144516,
+      "rewards/code_format_reward/std": 0.37375950813293457,
+      "rewards/curriculum_aware_reward_fn/mean": 0.054241079837083817,
+      "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4709821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 2653.466552734375,
+      "completions/mean_terminated_length": 1369.185546875,
+      "completions/min_length": 356.0,
+      "completions/min_terminated_length": 356.0,
+      "epoch": 0.03300670448684889,
+      "grad_norm": 0.16779452562332153,
+      "kl": 0.0017452239990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 11985080.0,
+      "reward": 0.2589285969734192,
+      "reward_std": 0.23284269869327545,
+      "rewards/code_format_reward/mean": 0.1986607164144516,
+      "rewards/code_format_reward/std": 0.3994380831718445,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.07362107187509537,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5066964285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 2908.83056640625,
+      "completions/mean_terminated_length": 1689.429931640625,
+      "completions/min_length": 408.0,
+      "completions/min_terminated_length": 408.0,
+      "epoch": 0.037132542547705004,
+      "grad_norm": 0.1578310877084732,
+      "kl": 0.00200653076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.053,
+      "num_tokens": 13550561.0,
+      "reward": 0.36138394474983215,
+      "reward_std": 0.30128249526023865,
+      "rewards/code_format_reward/mean": 0.2924107015132904,
+      "rewards/code_format_reward/std": 0.4553784430027008,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06897322088479996,
+      "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4330357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4076.0,
+      "completions/mean_length": 2671.055908203125,
+      "completions/mean_terminated_length": 1582.712646484375,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.041258380608561115,
+      "grad_norm": 0.1712757647037506,
+      "kl": 0.0027599334716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0316,
+      "num_tokens": 15018507.0,
+      "reward": 0.500558078289032,
+      "reward_std": 0.3041485548019409,
+      "rewards/code_format_reward/mean": 0.421875,
+      "rewards/code_format_reward/std": 0.4944108724594116,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078683041036129,
+      "rewards/curriculum_aware_reward_fn/std": 0.07499326020479202,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5022321428571428,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4087.0,
+      "completions/mean_length": 2796.071533203125,
+      "completions/mean_terminated_length": 1484.484375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.04538421866941723,
+      "grad_norm": 0.7890152335166931,
+      "kl": 0.005886077880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0268,
+      "num_tokens": 16537021.0,
+      "reward": 0.49921879172325134,
+      "reward_std": 0.2203681766986847,
+      "rewards/code_format_reward/mean": 0.4285714328289032,
+      "rewards/code_format_reward/std": 0.49542486667633057,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07064732164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.07495728880167007,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4821428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4070.0,
+      "completions/mean_length": 2733.216552734375,
+      "completions/mean_terminated_length": 1464.4180908203125,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.04951005673027334,
+      "grad_norm": 0.15826576948165894,
+      "kl": 0.0032367706298828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0484,
+      "num_tokens": 18030295.0,
+      "reward": 0.5366071462631226,
+      "reward_std": 0.24472731351852417,
+      "rewards/code_format_reward/mean": 0.4709821343421936,
+      "rewards/code_format_reward/std": 0.49971529841423035,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06562499701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.07449494302272797,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.640625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4081.0,
+      "completions/mean_length": 3284.473388671875,
+      "completions/mean_terminated_length": 1837.8385009765625,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.05363589479112945,
+      "grad_norm": 0.12409358471632004,
+      "kl": 0.0019292831420898438,
+      "learning_rate": 1e-06,
+      "loss": 0.0648,
+      "num_tokens": 19771120.0,
+      "reward": 0.3962053656578064,
+      "reward_std": 0.24470412731170654,
+      "rewards/code_format_reward/mean": 0.3459821343421936,
+      "rewards/code_format_reward/std": 0.47621920704841614,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0502232126891613,
+      "rewards/curriculum_aware_reward_fn/std": 0.07086833566427231,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4955357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4070.0,
+      "completions/mean_length": 2699.68994140625,
+      "completions/mean_terminated_length": 1328.0928955078125,
+      "completions/min_length": 235.0,
+      "completions/min_terminated_length": 235.0,
+      "epoch": 0.05776173285198556,
+      "grad_norm": 0.15436327457427979,
+      "kl": 0.004604339599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0586,
+      "num_tokens": 21266482.0,
+      "reward": 0.5574777126312256,
+      "reward_std": 0.19266396760940552,
+      "rewards/code_format_reward/mean": 0.4888392984867096,
+      "rewards/code_format_reward/std": 0.5004342794418335,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0686383917927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.07938338816165924,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4799107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4050.0,
+      "completions/mean_length": 2604.18310546875,
+      "completions/mean_terminated_length": 1227.61376953125,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.06188757091284167,
+      "grad_norm": 0.14027918875217438,
+      "kl": 0.004291534423828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0539,
+      "num_tokens": 22688586.0,
+      "reward": 0.5889509320259094,
+      "reward_std": 0.16005173325538635,
+      "rewards/code_format_reward/mean": 0.515625,
+      "rewards/code_format_reward/std": 0.500314474105835,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07332588732242584,
+      "rewards/curriculum_aware_reward_fn/std": 0.07506514340639114,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 2344.2724609375,
+      "completions/mean_terminated_length": 1293.2357177734375,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.06601340897369778,
+      "grad_norm": 0.14219032227993011,
+      "kl": 0.004871368408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.081,
+      "num_tokens": 24024252.0,
+      "reward": 0.7078125476837158,
+      "reward_std": 0.1512121558189392,
+      "rewards/code_format_reward/mean": 0.6227678656578064,
+      "rewards/code_format_reward/std": 0.48523563146591187,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584,
+      "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4129464285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4049.0,
+      "completions/mean_length": 2521.03369140625,
+      "completions/mean_terminated_length": 1413.167236328125,
+      "completions/min_length": 342.0,
+      "completions/min_terminated_length": 342.0,
+      "epoch": 0.07013924703455389,
+      "grad_norm": 0.1439032107591629,
+      "kl": 0.0036773681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0766,
+      "num_tokens": 25413592.0,
+      "reward": 0.662834882736206,
+      "reward_std": 0.19979971647262573,
+      "rewards/code_format_reward/mean": 0.5848214030265808,
+      "rewards/code_format_reward/std": 0.49330365657806396,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07801339775323868,
+      "rewards/curriculum_aware_reward_fn/std": 0.07958128303289413,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4352678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4077.0,
+      "completions/mean_length": 2537.692138671875,
+      "completions/mean_terminated_length": 1336.62451171875,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.07426508509541001,
+      "grad_norm": 0.20727689564228058,
+      "kl": 0.0038776397705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0636,
+      "num_tokens": 26820657.0,
+      "reward": 0.6412946581840515,
+      "reward_std": 0.1462433785200119,
+      "rewards/code_format_reward/mean": 0.5602678656578064,
+      "rewards/code_format_reward/std": 0.49690937995910645,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08102678507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.484375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4078.0,
+      "completions/mean_length": 2791.4130859375,
+      "completions/mean_terminated_length": 1565.8917236328125,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.07839092315626611,
+      "grad_norm": 0.22532987594604492,
+      "kl": 0.002948760986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0666,
+      "num_tokens": 28362529.0,
+      "reward": 0.5607143640518188,
+      "reward_std": 0.17854316532611847,
+      "rewards/code_format_reward/mean": 0.4977678656578064,
+      "rewards/code_format_reward/std": 0.5005539655685425,
+      "rewards/curriculum_aware_reward_fn/mean": 0.06294643133878708,
+      "rewards/curriculum_aware_reward_fn/std": 0.074107825756073,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.5357142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4069.0,
+      "completions/mean_length": 2899.388427734375,
+      "completions/mean_terminated_length": 1518.6827392578125,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.08251676121712223,
+      "grad_norm": 0.14237935841083527,
+      "kl": 0.0032501220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0718,
+      "num_tokens": 29931508.0,
+      "reward": 0.5177456140518188,
+      "reward_std": 0.18259648978710175,
+      "rewards/code_format_reward/mean": 0.4598214328289032,
+      "rewards/code_format_reward/std": 0.49894019961357117,
+      "rewards/curriculum_aware_reward_fn/mean": 0.05792411044239998,
+      "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4464285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4025.0,
+      "completions/mean_length": 2652.560302734375,
+      "completions/mean_terminated_length": 1488.4959716796875,
+      "completions/min_length": 373.0,
+      "completions/min_terminated_length": 373.0,
+      "epoch": 0.08664259927797834,
+      "grad_norm": 0.9559803009033203,
+      "kl": 0.0038776397705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0722,
+      "num_tokens": 31381718.0,
+      "reward": 0.6266741156578064,
+      "reward_std": 0.17785993218421936,
+      "rewards/code_format_reward/mean": 0.5513392686843872,
+      "rewards/code_format_reward/std": 0.49791330099105835,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.07508309930562973,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4285714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3976.0,
+      "completions/mean_length": 2559.6318359375,
+      "completions/mean_terminated_length": 1407.35546875,
+      "completions/min_length": 237.0,
+      "completions/min_terminated_length": 237.0,
+      "epoch": 0.09076843733883445,
+      "grad_norm": 0.15661275386810303,
+      "kl": 0.004146575927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0993,
+      "num_tokens": 32786432.0,
+      "reward": 0.6441964507102966,
+      "reward_std": 0.21372844278812408,
+      "rewards/code_format_reward/mean": 0.5691964030265808,
+      "rewards/code_format_reward/std": 0.4957422614097595,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07499999552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.07508385181427002,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4196428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 2585.23681640625,
+      "completions/mean_terminated_length": 1492.8385009765625,
+      "completions/min_length": 321.0,
+      "completions/min_terminated_length": 321.0,
+      "epoch": 0.09489427539969056,
+      "grad_norm": 0.17427508533000946,
+      "kl": 0.00446319580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0821,
+      "num_tokens": 34208021.0,
+      "reward": 0.6626116633415222,
+      "reward_std": 0.19658702611923218,
+      "rewards/code_format_reward/mean": 0.5825892686843872,
+      "rewards/code_format_reward/std": 0.4936830997467041,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08002232015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.074915312230587,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3348214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4093.0,
+      "completions/mean_length": 2340.810302734375,
+      "completions/mean_terminated_length": 1457.3255615234375,
+      "completions/min_length": 178.0,
+      "completions/min_terminated_length": 178.0,
+      "epoch": 0.09902011346054668,
+      "grad_norm": 0.1813523918390274,
+      "kl": 0.0107879638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.109,
+      "num_tokens": 35529753.0,
+      "reward": 0.751897394657135,
+      "reward_std": 0.21504586935043335,
+      "rewards/code_format_reward/mean": 0.6651785969734192,
+      "rewards/code_format_reward/std": 0.47245556116104126,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.07416162639856339,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3705357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 2455.239013671875,
+      "completions/mean_terminated_length": 1489.400634765625,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.10314595152140278,
+      "grad_norm": 0.17525888979434967,
+      "kl": 0.00499725341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0594,
+      "num_tokens": 36900957.0,
+      "reward": 0.7077009081840515,
+      "reward_std": 0.20892252027988434,
+      "rewards/code_format_reward/mean": 0.625,
+      "rewards/code_format_reward/std": 0.48466411232948303,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08270090073347092,
+      "rewards/curriculum_aware_reward_fn/std": 0.07535793632268906,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3526785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 2328.640625,
+      "completions/mean_terminated_length": 1365.7344970703125,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.1072717895822589,
+      "grad_norm": 0.15375342965126038,
+      "kl": 0.00540924072265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0594,
+      "num_tokens": 38222347.0,
+      "reward": 0.7113839983940125,
+      "reward_std": 0.1420706957578659,
+      "rewards/code_format_reward/mean": 0.6316964030265808,
+      "rewards/code_format_reward/std": 0.4828835427761078,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07968749850988388,
+      "rewards/curriculum_aware_reward_fn/std": 0.07493705302476883,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4508928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4056.0,
+      "completions/mean_length": 2649.419677734375,
+      "completions/mean_terminated_length": 1461.5771484375,
+      "completions/min_length": 260.0,
+      "completions/min_terminated_length": 260.0,
+      "epoch": 0.111397627643115,
+      "grad_norm": 0.1293102651834488,
+      "kl": 0.00531768798828125,
+      "learning_rate": 1e-06,
+      "loss": 0.065,
+      "num_tokens": 39685605.0,
+      "reward": 0.6244419813156128,
+      "reward_std": 0.17081955075263977,
+      "rewards/code_format_reward/mean": 0.5491071343421936,
+      "rewards/code_format_reward/std": 0.4981389045715332,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0753348246216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.07900315523147583,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 2605.43310546875,
+      "completions/mean_terminated_length": 1446.103271484375,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 0.11552346570397112,
+      "grad_norm": 0.16880519688129425,
+      "kl": 0.0055389404296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0838,
+      "num_tokens": 41124915.0,
+      "reward": 0.6412946581840515,
+      "reward_std": 0.23654666543006897,
+      "rewards/code_format_reward/mean": 0.5602678656578064,
+      "rewards/code_format_reward/std": 0.49690937995910645,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08102679252624512,
+      "rewards/curriculum_aware_reward_fn/std": 0.07484103739261627,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3973214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 2520.87060546875,
+      "completions/mean_terminated_length": 1482.4517822265625,
+      "completions/min_length": 292.0,
+      "completions/min_terminated_length": 292.0,
+      "epoch": 0.11964930376482723,
+      "grad_norm": 0.16759856045246124,
+      "kl": 0.005680084228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0741,
+      "num_tokens": 42532123.0,
+      "reward": 0.6870536208152771,
+      "reward_std": 0.23582594096660614,
+      "rewards/code_format_reward/mean": 0.6026785969734192,
+      "rewards/code_format_reward/std": 0.48989060521125793,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08437500894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.07844439893960953,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.390625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4082.0,
+      "completions/mean_length": 2569.11181640625,
+      "completions/mean_terminated_length": 1590.3370361328125,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.12377514182568335,
+      "grad_norm": 0.25616586208343506,
+      "kl": 0.00691986083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0843,
+      "num_tokens": 43947307.0,
+      "reward": 0.6825892925262451,
+      "reward_std": 0.2599073052406311,
+      "rewards/code_format_reward/mean": 0.6049107313156128,
+      "rewards/code_format_reward/std": 0.4894163906574249,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07767857611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.07570379227399826,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4241071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4026.0,
+      "completions/mean_length": 2515.082763671875,
+      "completions/mean_terminated_length": 1350.841064453125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.12790097988653945,
+      "grad_norm": 0.1738700419664383,
+      "kl": 0.005664825439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0742,
+      "num_tokens": 45356850.0,
+      "reward": 0.6507812738418579,
+      "reward_std": 0.21465569734573364,
+      "rewards/code_format_reward/mean": 0.578125,
+      "rewards/code_format_reward/std": 0.4944108724594116,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07265625894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.07896901667118073,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3482142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 2322.555908203125,
+      "completions/mean_terminated_length": 1375.099365234375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.13202681794739557,
+      "grad_norm": 0.19044172763824463,
+      "kl": 0.00688934326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0985,
+      "num_tokens": 46668801.0,
+      "reward": 0.7402902841567993,
+      "reward_std": 0.2134973704814911,
+      "rewards/code_format_reward/mean": 0.65625,
+      "rewards/code_format_reward/std": 0.47548985481262207,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4017857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 2533.83056640625,
+      "completions/mean_terminated_length": 1484.6119384765625,
+      "completions/min_length": 245.0,
+      "completions/min_terminated_length": 245.0,
+      "epoch": 0.1361526560082517,
+      "grad_norm": 0.1775142401456833,
+      "kl": 0.0061492919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1187,
+      "num_tokens": 48082396.0,
+      "reward": 0.6822544932365417,
+      "reward_std": 0.2595500349998474,
+      "rewards/code_format_reward/mean": 0.5982142686843872,
+      "rewards/code_format_reward/std": 0.49080711603164673,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08404017984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.07453640550374985,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4464285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4036.0,
+      "completions/mean_length": 2743.6630859375,
+      "completions/mean_terminated_length": 1653.0684814453125,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 0.14027849406910778,
+      "grad_norm": 0.17279711365699768,
+      "kl": 0.0059051513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1089,
+      "num_tokens": 49573595.0,
+      "reward": 0.6310268640518188,
+      "reward_std": 0.2643914222717285,
+      "rewards/code_format_reward/mean": 0.5580357313156128,
+      "rewards/code_format_reward/std": 0.4971756041049957,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07299107313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.07505691051483154,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4084821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4014.0,
+      "completions/mean_length": 2337.328125,
+      "completions/mean_terminated_length": 1122.84912109375,
+      "completions/min_length": 146.0,
+      "completions/min_terminated_length": 146.0,
+      "epoch": 0.1444043321299639,
+      "grad_norm": 0.17568987607955933,
+      "kl": 0.00757598876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0903,
+      "num_tokens": 50905954.0,
+      "reward": 0.6639509201049805,
+      "reward_std": 0.19163252413272858,
+      "rewards/code_format_reward/mean": 0.5892857313156128,
+      "rewards/code_format_reward/std": 0.4925134479999542,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07466518133878708,
+      "rewards/curriculum_aware_reward_fn/std": 0.07508310675621033,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.296875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4062.0,
+      "completions/mean_length": 2267.618408203125,
+      "completions/mean_terminated_length": 1495.6348876953125,
+      "completions/min_length": 216.0,
+      "completions/min_terminated_length": 216.0,
+      "epoch": 0.14853017019082002,
+      "grad_norm": 2.414182662963867,
+      "kl": 0.00893402099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0851,
+      "num_tokens": 52191977.0,
+      "reward": 0.7859375476837158,
+      "reward_std": 0.21878042817115784,
+      "rewards/code_format_reward/mean": 0.7008928656578064,
+      "rewards/code_format_reward/std": 0.45837870240211487,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08504463732242584,
+      "rewards/curriculum_aware_reward_fn/std": 0.0744074136018753,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2924107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4064.0,
+      "completions/mean_length": 2191.107177734375,
+      "completions/mean_terminated_length": 1403.9117431640625,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "epoch": 0.15265600825167613,
+      "grad_norm": 0.3153815269470215,
+      "kl": 0.010066986083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0934,
+      "num_tokens": 53442519.0,
+      "reward": 0.7965402007102966,
+      "reward_std": 0.2366231083869934,
+      "rewards/code_format_reward/mean": 0.7098214030265808,
+      "rewards/code_format_reward/std": 0.4543519914150238,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08671874552965164,
+      "rewards/curriculum_aware_reward_fn/std": 0.07416163384914398,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2611607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4052.0,
+      "completions/mean_length": 2011.8907470703125,
+      "completions/mean_terminated_length": 1275.21142578125,
+      "completions/min_length": 252.0,
+      "completions/min_terminated_length": 252.0,
+      "epoch": 0.15678184631253222,
+      "grad_norm": 0.18517161905765533,
+      "kl": 0.009616851806640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0701,
+      "num_tokens": 54602188.0,
+      "reward": 0.8469865918159485,
+      "reward_std": 0.15580664575099945,
+      "rewards/code_format_reward/mean": 0.7455357313156128,
+      "rewards/code_format_reward/std": 0.4360465407371521,
+      "rewards/curriculum_aware_reward_fn/mean": 0.10145089775323868,
+      "rewards/curriculum_aware_reward_fn/std": 0.07025929540395737,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2455357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 2168.19873046875,
+      "completions/mean_terminated_length": 1540.8077392578125,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.16090768437338834,
+      "grad_norm": 0.20817974209785461,
+      "kl": 0.00962066650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1159,
+      "num_tokens": 55845050.0,
+      "reward": 0.8487723469734192,
+      "reward_std": 0.23730090260505676,
+      "rewards/code_format_reward/mean": 0.7566964030265808,
+      "rewards/code_format_reward/std": 0.42955654859542847,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0920758917927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.0731118693947792,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.4241071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 2592.7255859375,
+      "completions/mean_terminated_length": 1485.662841796875,
+      "completions/min_length": 390.0,
+      "completions/min_terminated_length": 390.0,
+      "epoch": 0.16503352243424446,
+      "grad_norm": 0.19099269807338715,
+      "kl": 0.0080718994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1264,
+      "num_tokens": 57271114.0,
+      "reward": 0.6504464149475098,
+      "reward_std": 0.27461573481559753,
+      "rewards/code_format_reward/mean": 0.578125,
+      "rewards/code_format_reward/std": 0.4944108724594116,
+      "rewards/curriculum_aware_reward_fn/mean": 0.07232142984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.0750359445810318,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.359375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4036.0,
+      "completions/mean_length": 2346.348388671875,
+      "completions/mean_terminated_length": 1364.8363037109375,
+      "completions/min_length": 310.0,
+      "completions/min_terminated_length": 310.0,
+      "epoch": 0.16915936049510058,
+      "grad_norm": 0.20166631042957306,
+      "kl": 0.009296417236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0774,
+      "num_tokens": 58593780.0,
+      "reward": 0.7228795289993286,
+      "reward_std": 0.20277433097362518,
+      "rewards/code_format_reward/mean": 0.6428571343421936,
+      "rewards/code_format_reward/std": 0.47969305515289307,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08002232760190964,
+      "rewards/curriculum_aware_reward_fn/std": 0.07558422535657883,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2477678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4089.0,
+      "completions/mean_length": 1821.66748046875,
+      "completions/mean_terminated_length": 1072.554931640625,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.17328519855595667,
+      "grad_norm": 0.3720153272151947,
+      "kl": 0.01447296142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0418,
+      "num_tokens": 59652129.0,
+      "reward": 0.8479911088943481,
+      "reward_std": 0.14334198832511902,
+      "rewards/code_format_reward/mean": 0.7522321343421936,
+      "rewards/code_format_reward/std": 0.4321989119052887,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.07215044647455215,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3526785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 2276.546875,
+      "completions/mean_terminated_length": 1285.2586669921875,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.1774110366168128,
+      "grad_norm": 46.85969924926758,
+      "kl": 2.2713623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1385,
+      "num_tokens": 60933570.0,
+      "reward": 0.7345982789993286,
+      "reward_std": 0.22185997664928436,
+      "rewards/code_format_reward/mean": 0.6495535969734192,
+      "rewards/code_format_reward/std": 0.4776431620121002,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08504464477300644,
+      "rewards/curriculum_aware_reward_fn/std": 0.07900101691484451,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2477678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 2081.060302734375,
+      "completions/mean_terminated_length": 1417.3857421875,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "epoch": 0.1815368746776689,
+      "grad_norm": 0.20214147865772247,
+      "kl": 0.01105499267578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1317,
+      "num_tokens": 62112236.0,
+      "reward": 0.8328125476837158,
+      "reward_std": 0.2402763068675995,
+      "rewards/code_format_reward/mean": 0.7410714030265808,
+      "rewards/code_format_reward/std": 0.43853598833084106,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09174107015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.0778549388051033,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3258928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 2354.993408203125,
+      "completions/mean_terminated_length": 1513.3145751953125,
+      "completions/min_length": 238.0,
+      "completions/min_terminated_length": 238.0,
+      "epoch": 0.18566271273852503,
+      "grad_norm": 0.5439615845680237,
+      "kl": 0.01001739501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1551,
+      "num_tokens": 63457521.0,
+      "reward": 0.7627232670783997,
+      "reward_std": 0.2988319396972656,
+      "rewards/code_format_reward/mean": 0.6763392686843872,
+      "rewards/code_format_reward/std": 0.46839532256126404,
+      "rewards/curriculum_aware_reward_fn/mean": 0.08638393133878708,
+      "rewards/curriculum_aware_reward_fn/std": 0.07488906383514404,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 1772.7857666015625,
+      "completions/mean_terminated_length": 1228.7823486328125,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.18978855079938112,
+      "grad_norm": 0.18871024250984192,
+      "kl": 0.01403045654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0889,
+      "num_tokens": 64520732.0,
+      "reward": 0.9100447297096252,
+      "reward_std": 0.13270780444145203,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252743124961853,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09977678954601288,
+      "rewards/curriculum_aware_reward_fn/std": 0.07365463674068451,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3214285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4043.0,
+      "completions/mean_length": 2240.32373046875,
+      "completions/mean_terminated_length": 1361.319091796875,
+      "completions/min_length": 230.0,
+      "completions/min_terminated_length": 230.0,
+      "epoch": 0.19391438886023724,
+      "grad_norm": 0.5957781672477722,
+      "kl": 0.01197052001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0987,
+      "num_tokens": 65795367.0,
+      "reward": 0.762276828289032,
+      "reward_std": 0.21752412617206573,
+      "rewards/code_format_reward/mean": 0.6785714030265808,
+      "rewards/code_format_reward/std": 0.4675469994544983,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0837053582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.07457634806632996,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2299107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4071.0,
+      "completions/mean_length": 1868.493408203125,
+      "completions/mean_terminated_length": 1203.4696044921875,
+      "completions/min_length": 300.0,
+      "completions/min_terminated_length": 300.0,
+      "epoch": 0.19804022692109335,
+      "grad_norm": 0.22301578521728516,
+      "kl": 0.01412200927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1032,
+      "num_tokens": 66888015.0,
+      "reward": 0.865178644657135,
+      "reward_std": 0.18127720057964325,
+      "rewards/code_format_reward/mean": 0.7700892686843872,
+      "rewards/code_format_reward/std": 0.42124560475349426,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09508929401636124,
+      "rewards/curriculum_aware_reward_fn/std": 0.07234017550945282,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 1748.5804443359375,
+      "completions/mean_terminated_length": 1158.4468994140625,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.20216606498194944,
+      "grad_norm": 0.18596109747886658,
+      "kl": 0.016571044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0924,
+      "num_tokens": 67942181.0,
+      "reward": 0.8970983028411865,
+      "reward_std": 0.15365570783615112,
+      "rewards/code_format_reward/mean": 0.8013392686843872,
+      "rewards/code_format_reward/std": 0.3994380831718445,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09575892984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.07687902450561523,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2767857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4053.0,
+      "completions/mean_length": 1998.4866943359375,
+      "completions/mean_terminated_length": 1195.734619140625,
+      "completions/min_length": 163.0,
+      "completions/min_terminated_length": 163.0,
+      "epoch": 0.20629190304280556,
+      "grad_norm": 0.22046102583408356,
+      "kl": 0.01477813720703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1574,
+      "num_tokens": 69098841.0,
+      "reward": 0.8946428894996643,
+      "reward_std": 0.26137298345565796,
+      "rewards/code_format_reward/mean": 0.7232142686843872,
+      "rewards/code_format_reward/std": 0.44790980219841003,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17142857611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.141336590051651,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.234375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4095.0,
+      "completions/mean_length": 1979.493408203125,
+      "completions/mean_terminated_length": 1331.5831298828125,
+      "completions/min_length": 181.0,
+      "completions/min_terminated_length": 181.0,
+      "epoch": 0.21041774110366168,
+      "grad_norm": 0.22472943365573883,
+      "kl": 0.01605224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.1532,
+      "num_tokens": 70250422.0,
+      "reward": 0.9502232670783997,
+      "reward_std": 0.27020275592803955,
+      "rewards/code_format_reward/mean": 0.7700892686843872,
+      "rewards/code_format_reward/std": 0.42124560475349426,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648,
+      "rewards/curriculum_aware_reward_fn/std": 0.13901372253894806,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.3370535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4095.0,
+      "completions/mean_length": 2305.919677734375,
+      "completions/mean_terminated_length": 1395.8114013671875,
+      "completions/min_length": 231.0,
+      "completions/min_terminated_length": 231.0,
+      "epoch": 0.2145435791645178,
+      "grad_norm": 0.19396060705184937,
+      "kl": 0.01288604736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1115,
+      "num_tokens": 71575367.0,
+      "reward": 0.8095982670783997,
+      "reward_std": 0.2751266360282898,
+      "rewards/code_format_reward/mean": 0.6696428656578064,
+      "rewards/code_format_reward/std": 0.4708675146102905,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13995537161827087,
+      "rewards/curriculum_aware_reward_fn/std": 0.14260126650333405,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2745535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4091.0,
+      "completions/mean_length": 2132.26123046875,
+      "completions/mean_terminated_length": 1389.0615234375,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.2186694172253739,
+      "grad_norm": 2.5251505374908447,
+      "kl": 0.0143890380859375,
+      "learning_rate": 1e-06,
+      "loss": 0.12,
+      "num_tokens": 72795332.0,
+      "reward": 0.8919642567634583,
+      "reward_std": 0.2576311528682709,
+      "rewards/code_format_reward/mean": 0.7299107313156128,
+      "rewards/code_format_reward/std": 0.444502055644989,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16205357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.15102121233940125,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2299107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4074.0,
+      "completions/mean_length": 1957.2969970703125,
+      "completions/mean_terminated_length": 1318.7855224609375,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 0.22279525528623,
+      "grad_norm": 0.37278586626052856,
+      "kl": 0.01513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1216,
+      "num_tokens": 73926567.0,
+      "reward": 0.932366132736206,
+      "reward_std": 0.22399599850177765,
+      "rewards/code_format_reward/mean": 0.7589285969734192,
+      "rewards/code_format_reward/std": 0.4282117187976837,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17343749105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.139581561088562,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1986607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4091.0,
+      "completions/mean_length": 1917.977783203125,
+      "completions/mean_terminated_length": 1378.022216796875,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.22692109334708613,
+      "grad_norm": 1.5422786474227905,
+      "kl": 0.01769256591796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1373,
+      "num_tokens": 75080997.0,
+      "reward": 0.9751116037368774,
+      "reward_std": 0.24655158817768097,
+      "rewards/code_format_reward/mean": 0.8013392686843872,
+      "rewards/code_format_reward/std": 0.3994380533695221,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.14535558223724365,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2745535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4052.0,
+      "completions/mean_length": 2178.58935546875,
+      "completions/mean_terminated_length": 1452.923095703125,
+      "completions/min_length": 242.0,
+      "completions/min_terminated_length": 242.0,
+      "epoch": 0.23104693140794225,
+      "grad_norm": 0.19924461841583252,
+      "kl": 0.0144500732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1163,
+      "num_tokens": 76347561.0,
+      "reward": 0.8948661088943481,
+      "reward_std": 0.23191384971141815,
+      "rewards/code_format_reward/mean": 0.7254464030265808,
+      "rewards/code_format_reward/std": 0.44678795337677,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256,
+      "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1785714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4061.0,
+      "completions/mean_length": 1716.118408203125,
+      "completions/mean_terminated_length": 1198.752685546875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.23517276946879834,
+      "grad_norm": 0.2652409076690674,
+      "kl": 0.0170440673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0689,
+      "num_tokens": 77394926.0,
+      "reward": 1.0386160612106323,
+      "reward_std": 0.16931238770484924,
+      "rewards/code_format_reward/mean": 0.8236607313156128,
+      "rewards/code_format_reward/std": 0.3815346360206604,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21495535969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.13793620467185974,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2767857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4045.0,
+      "completions/mean_length": 2039.154052734375,
+      "completions/mean_terminated_length": 1251.966064453125,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.23929860752965446,
+      "grad_norm": 0.22779759764671326,
+      "kl": 0.0151824951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.1741,
+      "num_tokens": 78570204.0,
+      "reward": 0.8920758962631226,
+      "reward_std": 0.2624998390674591,
+      "rewards/code_format_reward/mean": 0.7276785969734192,
+      "rewards/code_format_reward/std": 0.4456520676612854,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16439732909202576,
+      "rewards/curriculum_aware_reward_fn/std": 0.14416027069091797,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2566964285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4077.0,
+      "completions/mean_length": 1969.24560546875,
+      "completions/mean_terminated_length": 1234.78076171875,
+      "completions/min_length": 187.0,
+      "completions/min_terminated_length": 187.0,
+      "epoch": 0.24342444559051057,
+      "grad_norm": 9.569337844848633,
+      "kl": 0.660430908203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0865,
+      "num_tokens": 79723060.0,
+      "reward": 0.9041295051574707,
+      "reward_std": 0.21036864817142487,
+      "rewards/code_format_reward/mean": 0.7477678656578064,
+      "rewards/code_format_reward/std": 0.4347792863845825,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15636160969734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.14369189739227295,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.265625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 1985.6898193359375,
+      "completions/mean_terminated_length": 1222.385986328125,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.2475502836513667,
+      "grad_norm": 0.23437850177288055,
+      "kl": 0.01544952392578125,
+      "learning_rate": 1e-06,
+      "loss": 0.1137,
+      "num_tokens": 80894980.0,
+      "reward": 0.8974330425262451,
+      "reward_std": 0.2371726632118225,
+      "rewards/code_format_reward/mean": 0.734375,
+      "rewards/code_format_reward/std": 0.44215917587280273,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16305804252624512,
+      "rewards/curriculum_aware_reward_fn/std": 0.14288581907749176,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 1866.6630859375,
+      "completions/mean_terminated_length": 1242.6199951171875,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.2516761217122228,
+      "grad_norm": 0.21880938112735748,
+      "kl": 0.0179290771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0995,
+      "num_tokens": 81992338.0,
+      "reward": 0.9709821939468384,
+      "reward_std": 0.22359324991703033,
+      "rewards/code_format_reward/mean": 0.7767857313156128,
+      "rewards/code_format_reward/std": 0.41686633229255676,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.13704219460487366,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1919642857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4051.0,
+      "completions/mean_length": 1788.6876220703125,
+      "completions/mean_terminated_length": 1240.54150390625,
+      "completions/min_length": 280.0,
+      "completions/min_terminated_length": 280.0,
+      "epoch": 0.2558019597730789,
+      "grad_norm": 0.2513861656188965,
+      "kl": 0.01721954345703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1641,
+      "num_tokens": 83078318.0,
+      "reward": 0.9954241514205933,
+      "reward_std": 0.2597261369228363,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252743124961853,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18515624105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.14922460913658142,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.21875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4082.0,
+      "completions/mean_length": 1818.868408203125,
+      "completions/mean_terminated_length": 1181.2713623046875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.259927797833935,
+      "grad_norm": 0.5408481359481812,
+      "kl": 0.0183258056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.1045,
+      "num_tokens": 84156539.0,
+      "reward": 0.9498884081840515,
+      "reward_std": 0.2111586630344391,
+      "rewards/code_format_reward/mean": 0.7767857313156128,
+      "rewards/code_format_reward/std": 0.41686636209487915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17310269176959991,
+      "rewards/curriculum_aware_reward_fn/std": 0.1488838940858841,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2165178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3919.0,
+      "completions/mean_length": 1976.8460693359375,
+      "completions/mean_terminated_length": 1391.2108154296875,
+      "completions/min_length": 145.0,
+      "completions/min_terminated_length": 145.0,
+      "epoch": 0.26405363589479114,
+      "grad_norm": 1.840519666671753,
+      "kl": 0.41724395751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1207,
+      "num_tokens": 85322859.0,
+      "reward": 0.9379464983940125,
+      "reward_std": 0.2321721613407135,
+      "rewards/code_format_reward/mean": 0.78125,
+      "rewards/code_format_reward/std": 0.4138607978820801,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15669642388820648,
+      "rewards/curriculum_aware_reward_fn/std": 0.1403089463710785,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2165178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3828.0,
+      "completions/mean_length": 1842.6920166015625,
+      "completions/mean_terminated_length": 1219.98291015625,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.26817947395564723,
+      "grad_norm": 0.20033693313598633,
+      "kl": 0.01674652099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0878,
+      "num_tokens": 86418634.0,
+      "reward": 0.9722098112106323,
+      "reward_std": 0.20250627398490906,
+      "rewards/code_format_reward/mean": 0.7857142686843872,
+      "rewards/code_format_reward/std": 0.41078460216522217,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18649554252624512,
+      "rewards/curriculum_aware_reward_fn/std": 0.1416252702474594,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2366071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4040.0,
+      "completions/mean_length": 1811.4888916015625,
+      "completions/mean_terminated_length": 1103.4239501953125,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.2723053120165034,
+      "grad_norm": 0.21493232250213623,
+      "kl": 0.0157470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0875,
+      "num_tokens": 87501102.0,
+      "reward": 0.9482142925262451,
+      "reward_std": 0.1998186707496643,
+      "rewards/code_format_reward/mean": 0.7633928656578064,
+      "rewards/code_format_reward/std": 0.4254741966724396,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18482144176959991,
+      "rewards/curriculum_aware_reward_fn/std": 0.13863980770111084,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4087.0,
+      "completions/mean_length": 1813.9376220703125,
+      "completions/mean_terminated_length": 1240.234619140625,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.27643115007735947,
+      "grad_norm": 0.2010592669248581,
+      "kl": 0.0166473388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0958,
+      "num_tokens": 88571731.0,
+      "reward": 1.012834906578064,
+      "reward_std": 0.20978660881519318,
+      "rewards/code_format_reward/mean": 0.8035714030265808,
+      "rewards/code_format_reward/std": 0.3977404832839966,
+      "rewards/curriculum_aware_reward_fn/mean": 0.209263414144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.14208464324474335,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4043.0,
+      "completions/mean_length": 1591.321533203125,
+      "completions/mean_terminated_length": 1166.2454833984375,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.28055698813821556,
+      "grad_norm": 0.3400166928768158,
+      "kl": 0.01915740966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0839,
+      "num_tokens": 89549775.0,
+      "reward": 1.0475447177886963,
+      "reward_std": 0.17855383455753326,
+      "rewards/code_format_reward/mean": 0.8526785969734192,
+      "rewards/code_format_reward/std": 0.3548222780227661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19486607611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.13271550834178925,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2142857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4080.0,
+      "completions/mean_length": 1908.4599609375,
+      "completions/mean_terminated_length": 1311.8580322265625,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.2846828261990717,
+      "grad_norm": 0.35188552737236023,
+      "kl": 0.01520538330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.1303,
+      "num_tokens": 90663597.0,
+      "reward": 0.9658482670783997,
+      "reward_std": 0.23443441092967987,
+      "rewards/code_format_reward/mean": 0.7857142686843872,
+      "rewards/code_format_reward/std": 0.41078460216522217,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18013392388820648,
+      "rewards/curriculum_aware_reward_fn/std": 0.13937532901763916,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1316964285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4009.0,
+      "completions/mean_length": 1637.712158203125,
+      "completions/mean_terminated_length": 1264.8612060546875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.2888086642599278,
+      "grad_norm": 0.21659672260284424,
+      "kl": 0.0170745849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0605,
+      "num_tokens": 91653793.0,
+      "reward": 1.070424199104309,
+      "reward_std": 0.16760630905628204,
+      "rewards/code_format_reward/mean": 0.8705357313156128,
+      "rewards/code_format_reward/std": 0.3360883891582489,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19988839328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.13450957834720612,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1517857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4034.0,
+      "completions/mean_length": 1594.72998046875,
+      "completions/mean_terminated_length": 1147.13427734375,
+      "completions/min_length": 255.0,
+      "completions/min_terminated_length": 255.0,
+      "epoch": 0.2929345023207839,
+      "grad_norm": 0.24188809096813202,
+      "kl": 0.01702117919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.111,
+      "num_tokens": 92642758.0,
+      "reward": 1.0497767925262451,
+      "reward_std": 0.20430657267570496,
+      "rewards/code_format_reward/mean": 0.8482142686843872,
+      "rewards/code_format_reward/std": 0.3592142164707184,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20156250894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.13481204211711884,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2232142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4065.0,
+      "completions/mean_length": 1918.1295166015625,
+      "completions/mean_terminated_length": 1292.3045654296875,
+      "completions/min_length": 250.0,
+      "completions/min_terminated_length": 250.0,
+      "epoch": 0.29706034038164003,
+      "grad_norm": 0.23266896605491638,
+      "kl": 0.01465606689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1266,
+      "num_tokens": 93783375.0,
+      "reward": 0.9430804252624512,
+      "reward_std": 0.2125602513551712,
+      "rewards/code_format_reward/mean": 0.7790178656578064,
+      "rewards/code_format_reward/std": 0.4153723120689392,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1640625,
+      "rewards/curriculum_aware_reward_fn/std": 0.1429663747549057,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2522321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4081.0,
+      "completions/mean_length": 1978.180908203125,
+      "completions/mean_terminated_length": 1263.8118896484375,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.3011861784424961,
+      "grad_norm": 0.2386295050382614,
+      "kl": 0.014862060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1095,
+      "num_tokens": 94935747.0,
+      "reward": 0.9466517567634583,
+      "reward_std": 0.2304624617099762,
+      "rewards/code_format_reward/mean": 0.7544642686843872,
+      "rewards/code_format_reward/std": 0.43088552355766296,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.14789843559265137,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2633928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 2113.1318359375,
+      "completions/mean_terminated_length": 1404.10595703125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.30531201650335227,
+      "grad_norm": 0.1983596831560135,
+      "kl": 0.01523590087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.084,
+      "num_tokens": 96176073.0,
+      "reward": 0.9197545051574707,
+      "reward_std": 0.20995503664016724,
+      "rewards/code_format_reward/mean": 0.7433035969734192,
+      "rewards/code_format_reward/std": 0.4372987747192383,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17645089328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.141729936003685,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2232142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1784.3818359375,
+      "completions/mean_terminated_length": 1120.12353515625,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.30943785456420836,
+      "grad_norm": 0.20856866240501404,
+      "kl": 0.0185394287109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0662,
+      "num_tokens": 97242659.0,
+      "reward": 0.9758929014205933,
+      "reward_std": 0.1802317053079605,
+      "rewards/code_format_reward/mean": 0.7790178656578064,
+      "rewards/code_format_reward/std": 0.4153723120689392,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19687499105930328,
+      "rewards/curriculum_aware_reward_fn/std": 0.14509600400924683,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2589285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4064.0,
+      "completions/mean_length": 2063.15625,
+      "completions/mean_terminated_length": 1352.885498046875,
+      "completions/min_length": 256.0,
+      "completions/min_terminated_length": 256.0,
+      "epoch": 0.31356369262506445,
+      "grad_norm": 0.28605571389198303,
+      "kl": 0.064544677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1413,
+      "num_tokens": 98452756.0,
+      "reward": 0.8792411088943481,
+      "reward_std": 0.2832396924495697,
+      "rewards/code_format_reward/mean": 0.7433035969734192,
+      "rewards/code_format_reward/std": 0.43729880452156067,
+      "rewards/curriculum_aware_reward_fn/mean": 0.13593749701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14190621674060822,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1741071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4071.0,
+      "completions/mean_length": 1724.165283203125,
+      "completions/mean_terminated_length": 1224.15673828125,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.3176895306859206,
+      "grad_norm": 0.21490244567394257,
+      "kl": 0.01772308349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0601,
+      "num_tokens": 99505350.0,
+      "reward": 1.0366071462631226,
+      "reward_std": 0.17229565978050232,
+      "rewards/code_format_reward/mean": 0.8236607313156128,
+      "rewards/code_format_reward/std": 0.3815346360206604,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21294644474983215,
+      "rewards/curriculum_aware_reward_fn/std": 0.14066724479198456,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2276785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4051.0,
+      "completions/mean_length": 1939.0380859375,
+      "completions/mean_terminated_length": 1303.1705322265625,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.3218153687467767,
+      "grad_norm": 0.22845226526260376,
+      "kl": 0.015228271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1391,
+      "num_tokens": 100650427.0,
+      "reward": 0.9670760035514832,
+      "reward_std": 0.2431958168745041,
+      "rewards/code_format_reward/mean": 0.78125,
+      "rewards/code_format_reward/std": 0.4138607978820801,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.13783639669418335,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 1808.8326416015625,
+      "completions/mean_terminated_length": 1370.8642578125,
+      "completions/min_length": 211.0,
+      "completions/min_terminated_length": 211.0,
+      "epoch": 0.3259412068076328,
+      "grad_norm": 0.2206607162952423,
+      "kl": 0.01694488525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1064,
+      "num_tokens": 101732923.0,
+      "reward": 1.0236607789993286,
+      "reward_std": 0.21730710566043854,
+      "rewards/code_format_reward/mean": 0.8415178656578064,
+      "rewards/code_format_reward/std": 0.36560073494911194,
+      "rewards/curriculum_aware_reward_fn/mean": 0.18214286863803864,
+      "rewards/curriculum_aware_reward_fn/std": 0.13819824159145355,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2477678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1931.15185546875,
+      "completions/mean_terminated_length": 1218.100830078125,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.3300670448684889,
+      "grad_norm": 0.2661918103694916,
+      "kl": 0.01607513427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1196,
+      "num_tokens": 102861237.0,
+      "reward": 0.9046875238418579,
+      "reward_std": 0.2178276777267456,
+      "rewards/code_format_reward/mean": 0.75,
+      "rewards/code_format_reward/std": 0.43349677324295044,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15468750894069672,
+      "rewards/curriculum_aware_reward_fn/std": 0.14217200875282288,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2321428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4048.0,
+      "completions/mean_length": 1844.3438720703125,
+      "completions/mean_terminated_length": 1163.6104736328125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.334192882929345,
+      "grad_norm": 0.21588768064975739,
+      "kl": 0.01670074462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1077,
+      "num_tokens": 103959747.0,
+      "reward": 0.9659598469734192,
+      "reward_std": 0.1847652792930603,
+      "rewards/code_format_reward/mean": 0.7700892686843872,
+      "rewards/code_format_reward/std": 0.42124560475349426,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1958705335855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.16381129622459412,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2209821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 1976.0068359375,
+      "completions/mean_terminated_length": 1374.63330078125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.33831872099020116,
+      "grad_norm": 0.20742100477218628,
+      "kl": 0.01610565185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.097,
+      "num_tokens": 105114190.0,
+      "reward": 0.9648438096046448,
+      "reward_std": 0.21596986055374146,
+      "rewards/code_format_reward/mean": 0.7790178656578064,
+      "rewards/code_format_reward/std": 0.4153723120689392,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1858258992433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.14072753489017487,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.25,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4000.0,
+      "completions/mean_length": 2044.899658203125,
+      "completions/mean_terminated_length": 1361.199462890625,
+      "completions/min_length": 196.0,
+      "completions/min_terminated_length": 196.0,
+      "epoch": 0.34244455905105725,
+      "grad_norm": 0.26717469096183777,
+      "kl": 0.01522064208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1235,
+      "num_tokens": 106323115.0,
+      "reward": 0.9233258962631226,
+      "reward_std": 0.23657990992069244,
+      "rewards/code_format_reward/mean": 0.7522321343421936,
+      "rewards/code_format_reward/std": 0.4321989119052887,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14227430522441864,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2522321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4075.0,
+      "completions/mean_length": 2159.348388671875,
+      "completions/mean_terminated_length": 1506.089599609375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.34657039711191334,
+      "grad_norm": 2.97698974609375,
+      "kl": 0.319793701171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0768,
+      "num_tokens": 107574248.0,
+      "reward": 0.9077010154724121,
+      "reward_std": 0.20499999821186066,
+      "rewards/code_format_reward/mean": 0.75,
+      "rewards/code_format_reward/std": 0.43349677324295044,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15770089626312256,
+      "rewards/curriculum_aware_reward_fn/std": 0.14362619817256927,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1941964285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4067.0,
+      "completions/mean_length": 1893.1898193359375,
+      "completions/mean_terminated_length": 1362.3184814453125,
+      "completions/min_length": 284.0,
+      "completions/min_terminated_length": 284.0,
+      "epoch": 0.3506962351727695,
+      "grad_norm": 0.23494893312454224,
+      "kl": 0.01613616943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1371,
+      "num_tokens": 108703958.0,
+      "reward": 0.9831473231315613,
+      "reward_std": 0.28516101837158203,
+      "rewards/code_format_reward/mean": 0.8080357313156128,
+      "rewards/code_format_reward/std": 0.3942854404449463,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17511160671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.1376536637544632,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2611607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4058.0,
+      "completions/mean_length": 2132.754638671875,
+      "completions/mean_terminated_length": 1438.797607421875,
+      "completions/min_length": 254.0,
+      "completions/min_terminated_length": 254.0,
+      "epoch": 0.3548220732336256,
+      "grad_norm": 0.21388478577136993,
+      "kl": 0.01474761962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1361,
+      "num_tokens": 109917858.0,
+      "reward": 0.914843738079071,
+      "reward_std": 0.25436264276504517,
+      "rewards/code_format_reward/mean": 0.7410714030265808,
+      "rewards/code_format_reward/std": 0.43853598833084106,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17377233505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.14255832135677338,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1919642857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4035.0,
+      "completions/mean_length": 1760.279052734375,
+      "completions/mean_terminated_length": 1205.384033203125,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.35894791129448167,
+      "grad_norm": 0.2028041034936905,
+      "kl": 0.0170440673828125,
+      "learning_rate": 1e-06,
+      "loss": 0.1113,
+      "num_tokens": 110991643.0,
+      "reward": 1.0066964626312256,
+      "reward_std": 0.16249890625476837,
+      "rewards/code_format_reward/mean": 0.8058035969734192,
+      "rewards/code_format_reward/std": 0.3960230052471161,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.13506683707237244,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.203125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4056.0,
+      "completions/mean_length": 1839.2568359375,
+      "completions/mean_terminated_length": 1264.0084228515625,
+      "completions/min_length": 308.0,
+      "completions/min_terminated_length": 308.0,
+      "epoch": 0.3630737493553378,
+      "grad_norm": 0.25737714767456055,
+      "kl": 0.01641082763671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1086,
+      "num_tokens": 112092172.0,
+      "reward": 0.9707589745521545,
+      "reward_std": 0.23582224547863007,
+      "rewards/code_format_reward/mean": 0.7946428656578064,
+      "rewards/code_format_reward/std": 0.40441393852233887,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17611606419086456,
+      "rewards/curriculum_aware_reward_fn/std": 0.14302924275398254,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2388392857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4064.0,
+      "completions/mean_length": 1971.482177734375,
+      "completions/mean_terminated_length": 1304.8446044921875,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.3671995874161939,
+      "grad_norm": 0.43521085381507874,
+      "kl": 0.01739501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1102,
+      "num_tokens": 113258112.0,
+      "reward": 0.9188616275787354,
+      "reward_std": 0.2197439968585968,
+      "rewards/code_format_reward/mean": 0.7611607313156128,
+      "rewards/code_format_reward/std": 0.4268510043621063,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15770088136196136,
+      "rewards/curriculum_aware_reward_fn/std": 0.14327529072761536,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1830357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4060.0,
+      "completions/mean_length": 1960.5224609375,
+      "completions/mean_terminated_length": 1482.0819091796875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 0.37132542547705005,
+      "grad_norm": 1.3905837535858154,
+      "kl": 0.028656005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0801,
+      "num_tokens": 114416783.0,
+      "reward": 1.009151816368103,
+      "reward_std": 0.2122829705476761,
+      "rewards/code_format_reward/mean": 0.8169642686843872,
+      "rewards/code_format_reward/std": 0.387128084897995,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19218751788139343,
+      "rewards/curriculum_aware_reward_fn/std": 0.13840459287166595,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4034.0,
+      "completions/mean_length": 1792.435302734375,
+      "completions/mean_terminated_length": 1213.3267822265625,
+      "completions/min_length": 264.0,
+      "completions/min_terminated_length": 264.0,
+      "epoch": 0.37545126353790614,
+      "grad_norm": 0.19705531001091003,
+      "kl": 0.017974853515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0856,
+      "num_tokens": 115498786.0,
+      "reward": 0.9724330902099609,
+      "reward_std": 0.17091991007328033,
+      "rewards/code_format_reward/mean": 0.8013392686843872,
+      "rewards/code_format_reward/std": 0.3994380533695221,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17109374701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.14368249475955963,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2209821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4092.0,
+      "completions/mean_length": 1975.6273193359375,
+      "completions/mean_terminated_length": 1374.146240234375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.37957710159876223,
+      "grad_norm": 0.43463876843452454,
+      "kl": 0.0468292236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0951,
+      "num_tokens": 116643371.0,
+      "reward": 0.9460937976837158,
+      "reward_std": 0.21486541628837585,
+      "rewards/code_format_reward/mean": 0.7790178656578064,
+      "rewards/code_format_reward/std": 0.4153723120689392,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16707590222358704,
+      "rewards/curriculum_aware_reward_fn/std": 0.1629006415605545,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1674107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3997.0,
+      "completions/mean_length": 1699.790283203125,
+      "completions/mean_terminated_length": 1217.9786376953125,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.3837029396596184,
+      "grad_norm": 0.2277679443359375,
+      "kl": 0.0191802978515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1171,
+      "num_tokens": 117679158.0,
+      "reward": 1.0132813453674316,
+      "reward_std": 0.2020494043827057,
+      "rewards/code_format_reward/mean": 0.8348214030265808,
+      "rewards/code_format_reward/std": 0.37175676226615906,
+      "rewards/curriculum_aware_reward_fn/mean": 0.17845983803272247,
+      "rewards/curriculum_aware_reward_fn/std": 0.1565471738576889,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4056.0,
+      "completions/mean_length": 1753.99560546875,
+      "completions/mean_terminated_length": 1377.818603515625,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.38782877772047447,
+      "grad_norm": 0.22877489030361176,
+      "kl": 0.019775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0487,
+      "num_tokens": 118731312.0,
+      "reward": 1.0579241514205933,
+      "reward_std": 0.16506682336330414,
+      "rewards/code_format_reward/mean": 0.859375,
+      "rewards/code_format_reward/std": 0.3480229377746582,
+      "rewards/curriculum_aware_reward_fn/mean": 0.19854912161827087,
+      "rewards/curriculum_aware_reward_fn/std": 0.13795046508312225,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1495535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4084.0,
+      "completions/mean_length": 1703.1160888671875,
+      "completions/mean_terminated_length": 1282.3201904296875,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.39195461578133056,
+      "grad_norm": 0.2043149769306183,
+      "kl": 0.0197296142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0728,
+      "num_tokens": 119770878.0,
+      "reward": 1.0570311546325684,
+      "reward_std": 0.18754906952381134,
+      "rewards/code_format_reward/mean": 0.8504464030265808,
+      "rewards/code_format_reward/std": 0.3570319712162018,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.1388920247554779,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1004464285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4057.0,
+      "completions/mean_length": 1530.4085693359375,
+      "completions/mean_terminated_length": 1243.927978515625,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.3960804538421867,
+      "grad_norm": 0.30418333411216736,
+      "kl": 0.0242156982421875,
+      "learning_rate": 1e-06,
+      "loss": 0.1313,
+      "num_tokens": 120728556.0,
+      "reward": 1.0689733028411865,
+      "reward_std": 0.18825943768024445,
+      "rewards/code_format_reward/mean": 0.8995535969734192,
+      "rewards/code_format_reward/std": 0.30093035101890564,
+      "rewards/curriculum_aware_reward_fn/mean": 0.16941964626312256,
+      "rewards/curriculum_aware_reward_fn/std": 0.14304180443286896,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4045.0,
+      "completions/mean_length": 1664.134033203125,
+      "completions/mean_terminated_length": 1251.4151611328125,
+      "completions/min_length": 200.0,
+      "completions/min_terminated_length": 200.0,
+      "epoch": 0.4002062919030428,
+      "grad_norm": 0.6233256459236145,
+      "kl": 0.0251312255859375,
+      "learning_rate": 1e-06,
+      "loss": 0.074,
+      "num_tokens": 121740935.0,
+      "reward": 1.0614955425262451,
+      "reward_std": 0.16232284903526306,
+      "rewards/code_format_reward/mean": 0.8549107313156128,
+      "rewards/code_format_reward/std": 0.3525845408439636,
+      "rewards/curriculum_aware_reward_fn/mean": 0.20658482611179352,
+      "rewards/curriculum_aware_reward_fn/std": 0.14282289147377014,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4048.0,
+      "completions/mean_length": 1824.4576416015625,
+      "completions/mean_terminated_length": 1292.5537109375,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.4043321299638989,
+      "grad_norm": 0.22889189422130585,
+      "kl": 0.01766204833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.104,
+      "num_tokens": 122834834.0,
+      "reward": 1.1388393640518188,
+      "reward_std": 0.28884097933769226,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252743124961853,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3285714089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.318551242351532,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4000.0,
+      "completions/mean_length": 1661.8616943359375,
+      "completions/mean_terminated_length": 1248.7572021484375,
+      "completions/min_length": 265.0,
+      "completions/min_terminated_length": 265.0,
+      "epoch": 0.40845796802475504,
+      "grad_norm": 0.7959763407707214,
+      "kl": 0.0198974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0787,
+      "num_tokens": 123850430.0,
+      "reward": 1.2450892925262451,
+      "reward_std": 0.2383604198694229,
+      "rewards/code_format_reward/mean": 0.8571428656578064,
+      "rewards/code_format_reward/std": 0.3503182828426361,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3879464268684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.3007444441318512,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2410714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4068.0,
+      "completions/mean_length": 2082.6005859375,
+      "completions/mean_terminated_length": 1443.050048828125,
+      "completions/min_length": 165.0,
+      "completions/min_terminated_length": 165.0,
+      "epoch": 0.4125838060856111,
+      "grad_norm": 0.22675098478794098,
+      "kl": 0.016845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1181,
+      "num_tokens": 125048924.0,
+      "reward": 1.1112724542617798,
+      "reward_std": 0.3455579876899719,
+      "rewards/code_format_reward/mean": 0.7589285969734192,
+      "rewards/code_format_reward/std": 0.4282117187976837,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35234373807907104,
+      "rewards/curriculum_aware_reward_fn/std": 0.30698445439338684,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1517857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3952.0,
+      "completions/mean_length": 1744.680908203125,
+      "completions/mean_terminated_length": 1323.91845703125,
+      "completions/min_length": 269.0,
+      "completions/min_terminated_length": 269.0,
+      "epoch": 0.4167096441464673,
+      "grad_norm": 0.22585518658161163,
+      "kl": 0.0187835693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1183,
+      "num_tokens": 126114405.0,
+      "reward": 1.2287946939468384,
+      "reward_std": 0.2684269845485687,
+      "rewards/code_format_reward/mean": 0.8482142686843872,
+      "rewards/code_format_reward/std": 0.3592142164707184,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3805803656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.29996660351753235,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2098214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3967.0,
+      "completions/mean_length": 1884.2835693359375,
+      "completions/mean_terminated_length": 1296.9915771484375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.42083548220732336,
+      "grad_norm": 0.3677929639816284,
+      "kl": 0.0188140869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.1043,
+      "num_tokens": 127244766.0,
+      "reward": 1.1637277603149414,
+      "reward_std": 0.3110141158103943,
+      "rewards/code_format_reward/mean": 0.7901785969734192,
+      "rewards/code_format_reward/std": 0.40763622522354126,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37354913353919983,
+      "rewards/curriculum_aware_reward_fn/std": 0.3067074716091156,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1964285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4051.0,
+      "completions/mean_length": 1755.8148193359375,
+      "completions/mean_terminated_length": 1183.76953125,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.42496132026817945,
+      "grad_norm": 0.229450061917305,
+      "kl": 0.0190887451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.1335,
+      "num_tokens": 128312631.0,
+      "reward": 1.1440848112106323,
+      "reward_std": 0.22956174612045288,
+      "rewards/code_format_reward/mean": 0.8035714030265808,
+      "rewards/code_format_reward/std": 0.39774051308631897,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3405133783817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.2985129952430725,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1964285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4094.0,
+      "completions/mean_length": 1909.044677734375,
+      "completions/mean_terminated_length": 1374.45556640625,
+      "completions/min_length": 302.0,
+      "completions/min_terminated_length": 302.0,
+      "epoch": 0.4290871583290356,
+      "grad_norm": 0.24083521962165833,
+      "kl": 0.017913818359375,
+      "learning_rate": 1e-06,
+      "loss": 0.1011,
+      "num_tokens": 129424734.0,
+      "reward": 1.1671875715255737,
+      "reward_std": 0.3163076937198639,
+      "rewards/code_format_reward/mean": 0.8035714030265808,
+      "rewards/code_format_reward/std": 0.3977404832839966,
+      "rewards/curriculum_aware_reward_fn/mean": 0.36361604928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.29430437088012695,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0758928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4094.0,
+      "completions/mean_length": 1389.8460693359375,
+      "completions/mean_terminated_length": 1167.6014404296875,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.4332129963898917,
+      "grad_norm": 0.2619832158088684,
+      "kl": 0.0225067138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1098,
+      "num_tokens": 130308331.0,
+      "reward": 1.3381696939468384,
+      "reward_std": 0.24706150591373444,
+      "rewards/code_format_reward/mean": 0.9241071343421936,
+      "rewards/code_format_reward/std": 0.265122652053833,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4140624701976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.2952509820461273,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4042.0,
+      "completions/mean_length": 1655.8326416015625,
+      "completions/mean_terminated_length": 1307.2371826171875,
+      "completions/min_length": 213.0,
+      "completions/min_terminated_length": 213.0,
+      "epoch": 0.4373388344507478,
+      "grad_norm": 0.24595658481121063,
+      "kl": 0.0212249755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0767,
+      "num_tokens": 131325034.0,
+      "reward": 1.2672991752624512,
+      "reward_std": 0.24971547722816467,
+      "rewards/code_format_reward/mean": 0.875,
+      "rewards/code_format_reward/std": 0.3310886323451996,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3922991156578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.2960248589515686,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4057.0,
+      "completions/mean_length": 1656.15185546875,
+      "completions/mean_terminated_length": 1188.94677734375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.44146467251160393,
+      "grad_norm": 0.3704790771007538,
+      "kl": 0.020477294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.1244,
+      "num_tokens": 132320691.0,
+      "reward": 1.2327009439468384,
+      "reward_std": 0.2673065662384033,
+      "rewards/code_format_reward/mean": 0.8370535969734192,
+      "rewards/code_format_reward/std": 0.3697296679019928,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3956473171710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.29913750290870667,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1361607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 1538.372802734375,
+      "completions/mean_terminated_length": 1135.2325439453125,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.44559051057246,
+      "grad_norm": 0.32088541984558105,
+      "kl": 0.0222625732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0987,
+      "num_tokens": 133278850.0,
+      "reward": 1.200446605682373,
+      "reward_std": 0.1880311220884323,
+      "rewards/code_format_reward/mean": 0.8660714030265808,
+      "rewards/code_format_reward/std": 0.34095627069473267,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313,
+      "rewards/curriculum_aware_reward_fn/std": 0.31286945939064026,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1852678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4089.0,
+      "completions/mean_length": 1741.90185546875,
+      "completions/mean_terminated_length": 1206.5863037109375,
+      "completions/min_length": 251.0,
+      "completions/min_terminated_length": 251.0,
+      "epoch": 0.44971634863331617,
+      "grad_norm": 0.22313106060028076,
+      "kl": 0.0215301513671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0738,
+      "num_tokens": 134327455.0,
+      "reward": 1.2027901411056519,
+      "reward_std": 0.25760817527770996,
+      "rewards/code_format_reward/mean": 0.8147321343421936,
+      "rewards/code_format_reward/std": 0.38894903659820557,
+      "rewards/curriculum_aware_reward_fn/mean": 0.38805803656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3171059191226959,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1339285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 1705.8482666015625,
+      "completions/mean_terminated_length": 1336.237060546875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.45384218669417226,
+      "grad_norm": 0.5322986245155334,
+      "kl": 0.064666748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.1118,
+      "num_tokens": 135364563.0,
+      "reward": 1.2229912281036377,
+      "reward_std": 0.2728971242904663,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334254264831543,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35915178060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.29508450627326965,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0669642857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3892.0,
+      "completions/mean_length": 1267.446533203125,
+      "completions/mean_terminated_length": 1064.440185546875,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.45796802475502835,
+      "grad_norm": 0.2848234474658966,
+      "kl": 0.0265960693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0998,
+      "num_tokens": 136178695.0,
+      "reward": 1.4200893640518188,
+      "reward_std": 0.19471189379692078,
+      "rewards/code_format_reward/mean": 0.9308035969734192,
+      "rewards/code_format_reward/std": 0.25407159328460693,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4892856776714325,
+      "rewards/curriculum_aware_reward_fn/std": 0.28257983922958374,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0825892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4065.0,
+      "completions/mean_length": 1507.4710693359375,
+      "completions/mean_terminated_length": 1274.4404296875,
+      "completions/min_length": 206.0,
+      "completions/min_terminated_length": 206.0,
+      "epoch": 0.4620938628158845,
+      "grad_norm": 0.24630674719810486,
+      "kl": 0.023223876953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0871,
+      "num_tokens": 137136779.0,
+      "reward": 1.3398438692092896,
+      "reward_std": 0.24005301296710968,
+      "rewards/code_format_reward/mean": 0.9174107313156128,
+      "rewards/code_format_reward/std": 0.2755681276321411,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.2932768166065216,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2209821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4051.0,
+      "completions/mean_length": 1992.060302734375,
+      "completions/mean_terminated_length": 1395.24072265625,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.4662197008767406,
+      "grad_norm": 4.038174629211426,
+      "kl": 0.0250091552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1024,
+      "num_tokens": 138315249.0,
+      "reward": 1.1348215341567993,
+      "reward_std": 0.31040358543395996,
+      "rewards/code_format_reward/mean": 0.78125,
+      "rewards/code_format_reward/std": 0.4138607978820801,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35357141494750977,
+      "rewards/curriculum_aware_reward_fn/std": 0.2992880642414093,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4021.0,
+      "completions/mean_length": 1731.1160888671875,
+      "completions/mean_terminated_length": 1177.3553466796875,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.4703455389375967,
+      "grad_norm": 0.21624675393104553,
+      "kl": 0.021209716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1069,
+      "num_tokens": 139343846.0,
+      "reward": 1.1998885869979858,
+      "reward_std": 0.27098482847213745,
+      "rewards/code_format_reward/mean": 0.8125,
+      "rewards/code_format_reward/std": 0.3907487094402313,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3873883783817291,
+      "rewards/curriculum_aware_reward_fn/std": 0.30484986305236816,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0892857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 1448.509033203125,
+      "completions/mean_terminated_length": 1188.9510498046875,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 0.4744713769984528,
+      "grad_norm": 0.717583179473877,
+      "kl": 0.0406646728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0648,
+      "num_tokens": 140255075.0,
+      "reward": 1.3869420289993286,
+      "reward_std": 0.22124750912189484,
+      "rewards/code_format_reward/mean": 0.9107142686843872,
+      "rewards/code_format_reward/std": 0.2854745090007782,
+      "rewards/curriculum_aware_reward_fn/mean": 0.47622767090797424,
+      "rewards/curriculum_aware_reward_fn/std": 0.29526206851005554,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1696428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4090.0,
+      "completions/mean_length": 1866.904052734375,
+      "completions/mean_terminated_length": 1411.497314453125,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 0.4785972150593089,
+      "grad_norm": 0.25914421677589417,
+      "kl": 0.0201873779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.104,
+      "num_tokens": 141357756.0,
+      "reward": 1.1671875715255737,
+      "reward_std": 0.29631105065345764,
+      "rewards/code_format_reward/mean": 0.8303571343421936,
+      "rewards/code_format_reward/std": 0.37573832273483276,
+      "rewards/curriculum_aware_reward_fn/mean": 0.33683034777641296,
+      "rewards/curriculum_aware_reward_fn/std": 0.30225586891174316,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1361607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4062.0,
+      "completions/mean_length": 1616.9376220703125,
+      "completions/mean_terminated_length": 1226.180908203125,
+      "completions/min_length": 164.0,
+      "completions/min_terminated_length": 164.0,
+      "epoch": 0.48272305312016506,
+      "grad_norm": 0.3568706810474396,
+      "kl": 0.0214080810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.1059,
+      "num_tokens": 142361990.0,
+      "reward": 1.2168527841567993,
+      "reward_std": 0.20818667113780975,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334251284599304,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35301336646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.30549928545951843,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1428571428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 1734.7567138671875,
+      "completions/mean_terminated_length": 1341.2161865234375,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.48684889118102115,
+      "grad_norm": 0.2393302172422409,
+      "kl": 0.0227508544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0651,
+      "num_tokens": 143411237.0,
+      "reward": 1.2437500953674316,
+      "reward_std": 0.2932717204093933,
+      "rewards/code_format_reward/mean": 0.8526785969734192,
+      "rewards/code_format_reward/std": 0.3548222780227661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3910714089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.3008536696434021,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4092.0,
+      "completions/mean_length": 1937.966552734375,
+      "completions/mean_terminated_length": 1395.444091796875,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.49097472924187724,
+      "grad_norm": 144.79115295410156,
+      "kl": 11.142135620117188,
+      "learning_rate": 1e-06,
+      "loss": 0.2232,
+      "num_tokens": 144552754.0,
+      "reward": 1.1873886585235596,
+      "reward_std": 0.27385786175727844,
+      "rewards/code_format_reward/mean": 0.7991071343421936,
+      "rewards/code_format_reward/std": 0.4011159837245941,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3882812559604645,
+      "rewards/curriculum_aware_reward_fn/std": 0.3559999465942383,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2142857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4052.0,
+      "completions/mean_length": 2016.8951416015625,
+      "completions/mean_terminated_length": 1449.8665771484375,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.4951005673027334,
+      "grad_norm": 0.3591211438179016,
+      "kl": 0.019683837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.1078,
+      "num_tokens": 145719796.0,
+      "reward": 1.1426339149475098,
+      "reward_std": 0.36062249541282654,
+      "rewards/code_format_reward/mean": 0.7857142686843872,
+      "rewards/code_format_reward/std": 0.41078460216522217,
+      "rewards/curriculum_aware_reward_fn/mean": 0.35691961646080017,
+      "rewards/curriculum_aware_reward_fn/std": 0.3005719482898712,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1584821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4015.0,
+      "completions/mean_length": 1744.2857666015625,
+      "completions/mean_terminated_length": 1301.389892578125,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.4992264053635895,
+      "grad_norm": 0.3126465976238251,
+      "kl": 0.047943115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0751,
+      "num_tokens": 146769224.0,
+      "reward": 1.2447545528411865,
+      "reward_std": 0.2551679313182831,
+      "rewards/code_format_reward/mean": 0.84375,
+      "rewards/code_format_reward/std": 0.36349809169769287,
+      "rewards/curriculum_aware_reward_fn/mean": 0.40100446343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.30527445673942566,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1205357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4042.0,
+      "completions/mean_length": 1506.857177734375,
+      "completions/mean_terminated_length": 1152.0,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.5033522434244456,
+      "grad_norm": 0.2568581998348236,
+      "kl": 0.0231475830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0733,
+      "num_tokens": 147711045.0,
+      "reward": 1.3008930683135986,
+      "reward_std": 0.24989202618598938,
+      "rewards/code_format_reward/mean": 0.8794642686843872,
+      "rewards/code_format_reward/std": 0.3259509205818176,
+      "rewards/curriculum_aware_reward_fn/mean": 0.42142853140830994,
+      "rewards/curriculum_aware_reward_fn/std": 0.30747321248054504,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1964285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4052.0,
+      "completions/mean_length": 1926.27685546875,
+      "completions/mean_terminated_length": 1395.9000244140625,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.5074780814853017,
+      "grad_norm": 0.23189201951026917,
+      "kl": 0.01876068115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0917,
+      "num_tokens": 148856558.0,
+      "reward": 1.1736607551574707,
+      "reward_std": 0.2991105318069458,
+      "rewards/code_format_reward/mean": 0.8035714030265808,
+      "rewards/code_format_reward/std": 0.3977404832839966,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37008926272392273,
+      "rewards/curriculum_aware_reward_fn/std": 0.3080544173717499,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 1860.857177734375,
+      "completions/mean_terminated_length": 1432.85107421875,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.5116039195461578,
+      "grad_norm": 0.2435297667980194,
+      "kl": 0.0208282470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.1266,
+      "num_tokens": 149947157.0,
+      "reward": 1.2025669813156128,
+      "reward_std": 0.3079543709754944,
+      "rewards/code_format_reward/mean": 0.8415178656578064,
+      "rewards/code_format_reward/std": 0.36560073494911194,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3610491156578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.30205413699150085,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3839.0,
+      "completions/mean_length": 1687.22998046875,
+      "completions/mean_terminated_length": 1225.97607421875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.5157297576070139,
+      "grad_norm": 1.005789875984192,
+      "kl": 0.0277099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0732,
+      "num_tokens": 150966362.0,
+      "reward": 1.2402902841567993,
+      "reward_std": 0.2671992778778076,
+      "rewards/code_format_reward/mean": 0.8415178656578064,
+      "rewards/code_format_reward/std": 0.36560073494911194,
+      "rewards/curriculum_aware_reward_fn/mean": 0.39877229928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.29583922028541565,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1696428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4089.0,
+      "completions/mean_length": 1716.3304443359375,
+      "completions/mean_terminated_length": 1230.1612548828125,
+      "completions/min_length": 148.0,
+      "completions/min_terminated_length": 148.0,
+      "epoch": 0.51985559566787,
+      "grad_norm": 0.23711848258972168,
+      "kl": 0.0218658447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.1152,
+      "num_tokens": 151995531.0,
+      "reward": 1.2325893640518188,
+      "reward_std": 0.2793101370334625,
+      "rewards/code_format_reward/mean": 0.8303571343421936,
+      "rewards/code_format_reward/std": 0.37573832273483276,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4022321403026581,
+      "rewards/curriculum_aware_reward_fn/std": 0.296654611825943,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0982142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4071.0,
+      "completions/mean_length": 1562.4285888671875,
+      "completions/mean_terminated_length": 1286.4949951171875,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 0.5239814337287262,
+      "grad_norm": 0.25690746307373047,
+      "kl": 0.0216522216796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0878,
+      "num_tokens": 152969745.0,
+      "reward": 1.32421875,
+      "reward_std": 0.25885266065597534,
+      "rewards/code_format_reward/mean": 0.9017857313156128,
+      "rewards/code_format_reward/std": 0.29793688654899597,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4224330484867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.3098207712173462,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1361607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 1784.1585693359375,
+      "completions/mean_terminated_length": 1419.759765625,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.5281072717895823,
+      "grad_norm": 0.2524595558643341,
+      "kl": 0.0201263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.073,
+      "num_tokens": 154036553.0,
+      "reward": 1.2431920766830444,
+      "reward_std": 0.2910597026348114,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334254264831543,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3793526291847229,
+      "rewards/curriculum_aware_reward_fn/std": 0.3008619546890259,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1830357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4034.0,
+      "completions/mean_length": 1931.9130859375,
+      "completions/mean_terminated_length": 1447.062744140625,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.5322331098504384,
+      "grad_norm": 0.620236337184906,
+      "kl": 0.0193634033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.093,
+      "num_tokens": 155200845.0,
+      "reward": 1.173437476158142,
+      "reward_std": 0.2876041829586029,
+      "rewards/code_format_reward/mean": 0.8214285969734192,
+      "rewards/code_format_reward/std": 0.3834212124347687,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3520089089870453,
+      "rewards/curriculum_aware_reward_fn/std": 0.30668607354164124,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1763392857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3971.0,
+      "completions/mean_length": 1707.0491943359375,
+      "completions/mean_terminated_length": 1195.593505859375,
+      "completions/min_length": 170.0,
+      "completions/min_terminated_length": 170.0,
+      "epoch": 0.5363589479112945,
+      "grad_norm": 0.3680073022842407,
+      "kl": 0.05645751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1403,
+      "num_tokens": 156233115.0,
+      "reward": 1.2227678298950195,
+      "reward_std": 0.28796523809432983,
+      "rewards/code_format_reward/mean": 0.8258928656578064,
+      "rewards/code_format_reward/std": 0.37962549924850464,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3968749940395355,
+      "rewards/curriculum_aware_reward_fn/std": 0.3155217170715332,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1808035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4006.0,
+      "completions/mean_length": 1826.3773193359375,
+      "completions/mean_terminated_length": 1325.4522705078125,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.5404847859721505,
+      "grad_norm": 0.22368091344833374,
+      "kl": 0.0381317138671875,
+      "learning_rate": 1e-06,
+      "loss": 0.064,
+      "num_tokens": 157330149.0,
+      "reward": 1.1661831140518188,
+      "reward_std": 0.26647037267684937,
+      "rewards/code_format_reward/mean": 0.8147321343421936,
+      "rewards/code_format_reward/std": 0.38894903659820557,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3514508605003357,
+      "rewards/curriculum_aware_reward_fn/std": 0.3114034831523895,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4093.0,
+      "completions/mean_length": 1739.4263916015625,
+      "completions/mean_terminated_length": 1360.9093017578125,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.5446106240330068,
+      "grad_norm": 5.251157283782959,
+      "kl": 1.00634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0779,
+      "num_tokens": 158392779.0,
+      "reward": 1.2868304252624512,
+      "reward_std": 0.26909175515174866,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4252232015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.2903171479701996,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4067.0,
+      "completions/mean_length": 1873.0045166015625,
+      "completions/mean_terminated_length": 1352.4683837890625,
+      "completions/min_length": 272.0,
+      "completions/min_terminated_length": 272.0,
+      "epoch": 0.5487364620938628,
+      "grad_norm": 0.2221236675977707,
+      "kl": 0.0202484130859375,
+      "learning_rate": 1e-06,
+      "loss": 0.078,
+      "num_tokens": 159511220.0,
+      "reward": 1.130357265472412,
+      "reward_std": 0.2697470784187317,
+      "rewards/code_format_reward/mean": 0.8147321343421936,
+      "rewards/code_format_reward/std": 0.38894903659820557,
+      "rewards/curriculum_aware_reward_fn/mean": 0.31562498211860657,
+      "rewards/curriculum_aware_reward_fn/std": 0.30488425493240356,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1049107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4046.0,
+      "completions/mean_length": 1628.3438720703125,
+      "completions/mean_terminated_length": 1339.1173095703125,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 0.5528623001547189,
+      "grad_norm": 0.2322208136320114,
+      "kl": 0.0207061767578125,
+      "learning_rate": 1e-06,
+      "loss": 0.093,
+      "num_tokens": 160512094.0,
+      "reward": 1.2918527126312256,
+      "reward_std": 0.24635407328605652,
+      "rewards/code_format_reward/mean": 0.8973214030265808,
+      "rewards/code_format_reward/std": 0.30387791991233826,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3945312201976776,
+      "rewards/curriculum_aware_reward_fn/std": 0.28814804553985596,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1540178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4014.0,
+      "completions/mean_length": 1850.7724609375,
+      "completions/mean_terminated_length": 1442.0106201171875,
+      "completions/min_length": 197.0,
+      "completions/min_terminated_length": 197.0,
+      "epoch": 0.556988138215575,
+      "grad_norm": 0.2326047718524933,
+      "kl": 0.02008056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.102,
+      "num_tokens": 161625734.0,
+      "reward": 1.2510045766830444,
+      "reward_std": 0.3010588586330414,
+      "rewards/code_format_reward/mean": 0.8459821343421936,
+      "rewards/code_format_reward/std": 0.36136940121650696,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4050223231315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.3193120062351227,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1272321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 1741.94873046875,
+      "completions/mean_terminated_length": 1398.77490234375,
+      "completions/min_length": 287.0,
+      "completions/min_terminated_length": 287.0,
+      "epoch": 0.5611139762764311,
+      "grad_norm": 0.2658071219921112,
+      "kl": 0.0209503173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0679,
+      "num_tokens": 162668816.0,
+      "reward": 1.3080357313156128,
+      "reward_std": 0.2536565661430359,
+      "rewards/code_format_reward/mean": 0.875,
+      "rewards/code_format_reward/std": 0.3310886323451996,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4330357015132904,
+      "rewards/curriculum_aware_reward_fn/std": 0.28444406390190125,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4061.0,
+      "completions/mean_length": 1614.5938720703125,
+      "completions/mean_terminated_length": 1193.4674072265625,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.5652398143372873,
+      "grad_norm": 0.24400874972343445,
+      "kl": 0.022796630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0584,
+      "num_tokens": 163658504.0,
+      "reward": 1.2035715579986572,
+      "reward_std": 0.22330662608146667,
+      "rewards/code_format_reward/mean": 0.8549107313156128,
+      "rewards/code_format_reward/std": 0.3525845408439636,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3486607074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.31307879090309143,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3776.0,
+      "completions/mean_length": 1638.0938720703125,
+      "completions/mean_terminated_length": 1243.300537109375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.5693656523981434,
+      "grad_norm": 0.217624232172966,
+      "kl": 0.01983642578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0868,
+      "num_tokens": 164669143.0,
+      "reward": 1.2754465341567993,
+      "reward_std": 0.2384774088859558,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.41383928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.32119113206863403,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0870535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4006.0,
+      "completions/mean_length": 1413.9129638671875,
+      "completions/mean_terminated_length": 1158.163818359375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.5734914904589995,
+      "grad_norm": 0.2783685624599457,
+      "kl": 0.0218353271484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0606,
+      "num_tokens": 165554219.0,
+      "reward": 1.3580358028411865,
+      "reward_std": 0.22280895709991455,
+      "rewards/code_format_reward/mean": 0.9129464030265808,
+      "rewards/code_format_reward/std": 0.2822287082672119,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44508928060531616,
+      "rewards/curriculum_aware_reward_fn/std": 0.2796177864074707,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1116071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3983.0,
+      "completions/mean_length": 1759.0826416015625,
+      "completions/mean_terminated_length": 1465.5,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 0.5776173285198556,
+      "grad_norm": 0.2369956076145172,
+      "kl": 0.020294189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.074,
+      "num_tokens": 166618536.0,
+      "reward": 1.2844866514205933,
+      "reward_std": 0.29120200872421265,
+      "rewards/code_format_reward/mean": 0.890625,
+      "rewards/code_format_reward/std": 0.3124580383300781,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3938615620136261,
+      "rewards/curriculum_aware_reward_fn/std": 0.29871317744255066,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3962.0,
+      "completions/mean_length": 1629.52685546875,
+      "completions/mean_terminated_length": 1117.6171875,
+      "completions/min_length": 304.0,
+      "completions/min_terminated_length": 304.0,
+      "epoch": 0.5817431665807117,
+      "grad_norm": 0.22888554632663727,
+      "kl": 0.02154541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0965,
+      "num_tokens": 167620136.0,
+      "reward": 1.2700893878936768,
+      "reward_std": 0.22284676134586334,
+      "rewards/code_format_reward/mean": 0.828125,
+      "rewards/code_format_reward/std": 0.3776935040950775,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4419642984867096,
+      "rewards/curriculum_aware_reward_fn/std": 0.2943010926246643,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1808035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4022.0,
+      "completions/mean_length": 1929.825927734375,
+      "completions/mean_terminated_length": 1451.73291015625,
+      "completions/min_length": 296.0,
+      "completions/min_terminated_length": 296.0,
+      "epoch": 0.5858690046415678,
+      "grad_norm": 0.31053388118743896,
+      "kl": 0.0206146240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0926,
+      "num_tokens": 168775254.0,
+      "reward": 1.1685268878936768,
+      "reward_std": 0.2711971700191498,
+      "rewards/code_format_reward/mean": 0.8169642686843872,
+      "rewards/code_format_reward/std": 0.387128084897995,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3515625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3206498324871063,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1964285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4066.0,
+      "completions/mean_length": 2099.91748046875,
+      "completions/mean_terminated_length": 1611.9862060546875,
+      "completions/min_length": 208.0,
+      "completions/min_terminated_length": 208.0,
+      "epoch": 0.589994842702424,
+      "grad_norm": 1.0171209573745728,
+      "kl": 0.0308685302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.092,
+      "num_tokens": 169998306.0,
+      "reward": 1.1707589626312256,
+      "reward_std": 0.32189029455184937,
+      "rewards/code_format_reward/mean": 0.8058035969734192,
+      "rewards/code_format_reward/std": 0.3960230052471161,
+      "rewards/curriculum_aware_reward_fn/mean": 0.3649553656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.3043855130672455,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1540178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 1779.51123046875,
+      "completions/mean_terminated_length": 1357.7757568359375,
+      "completions/min_length": 331.0,
+      "completions/min_terminated_length": 331.0,
+      "epoch": 0.5941206807632801,
+      "grad_norm": 0.2087247222661972,
+      "kl": 0.0209197998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0873,
+      "num_tokens": 171055451.0,
+      "reward": 1.2494419813156128,
+      "reward_std": 0.22824469208717346,
+      "rewards/code_format_reward/mean": 0.8459821343421936,
+      "rewards/code_format_reward/std": 0.36136940121650696,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4034597873687744,
+      "rewards/curriculum_aware_reward_fn/std": 0.2976873219013214,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1808035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4040.0,
+      "completions/mean_length": 1915.65185546875,
+      "completions/mean_terminated_length": 1434.4305419921875,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.5982465188241362,
+      "grad_norm": 0.22233060002326965,
+      "kl": 0.021392822265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0797,
+      "num_tokens": 172183898.0,
+      "reward": 1.1960937976837158,
+      "reward_std": 0.28867703676223755,
+      "rewards/code_format_reward/mean": 0.8191964030265808,
+      "rewards/code_format_reward/std": 0.38528555631637573,
+      "rewards/curriculum_aware_reward_fn/mean": 0.37689733505249023,
+      "rewards/curriculum_aware_reward_fn/std": 0.3042333126068115,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1272321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4037.0,
+      "completions/mean_length": 1554.509033203125,
+      "completions/mean_terminated_length": 1184.01025390625,
+      "completions/min_length": 246.0,
+      "completions/min_terminated_length": 246.0,
+      "epoch": 0.6023723568849922,
+      "grad_norm": 0.25177621841430664,
+      "kl": 0.0221405029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0628,
+      "num_tokens": 173136882.0,
+      "reward": 1.3223215341567993,
+      "reward_std": 0.2333114594221115,
+      "rewards/code_format_reward/mean": 0.8727678656578064,
+      "rewards/code_format_reward/std": 0.3336053788661957,
+      "rewards/curriculum_aware_reward_fn/mean": 0.44955354928970337,
+      "rewards/curriculum_aware_reward_fn/std": 0.2995332181453705,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1026785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4040.0,
+      "completions/mean_length": 1709.7098388671875,
+      "completions/mean_terminated_length": 1436.6517333984375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.6064981949458483,
+      "grad_norm": 0.23244866728782654,
+      "kl": 0.02008056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0525,
+      "num_tokens": 174169469.0,
+      "reward": 1.5930804014205933,
+      "reward_std": 0.3070758581161499,
+      "rewards/code_format_reward/mean": 0.8995535969734192,
+      "rewards/code_format_reward/std": 0.30093035101890564,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6935268044471741,
+      "rewards/curriculum_aware_reward_fn/std": 0.43586739897727966,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1852678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4094.0,
+      "completions/mean_length": 2015.8974609375,
+      "completions/mean_terminated_length": 1542.8876953125,
+      "completions/min_length": 268.0,
+      "completions/min_terminated_length": 268.0,
+      "epoch": 0.6106240330067045,
+      "grad_norm": 0.22977960109710693,
+      "kl": 0.020538330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0895,
+      "num_tokens": 175345031.0,
+      "reward": 1.3625000715255737,
+      "reward_std": 0.37156882882118225,
+      "rewards/code_format_reward/mean": 0.8191964030265808,
+      "rewards/code_format_reward/std": 0.38528555631637573,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5433035492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.44534605741500854,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1741071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4055.0,
+      "completions/mean_length": 1978.6429443359375,
+      "completions/mean_terminated_length": 1532.2811279296875,
+      "completions/min_length": 167.0,
+      "completions/min_terminated_length": 167.0,
+      "epoch": 0.6147498710675606,
+      "grad_norm": 54.07054138183594,
+      "kl": 5.5802459716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.1208,
+      "num_tokens": 176520752.0,
+      "reward": 1.4250000715255737,
+      "reward_std": 0.3232128322124481,
+      "rewards/code_format_reward/mean": 0.8258928656578064,
+      "rewards/code_format_reward/std": 0.37962549924850464,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5991071462631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4462988078594208,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1473214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3912.0,
+      "completions/mean_length": 1759.966552734375,
+      "completions/mean_terminated_length": 1356.358642578125,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.6188757091284167,
+      "grad_norm": 0.2159195989370346,
+      "kl": 0.0220794677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0831,
+      "num_tokens": 177595706.0,
+      "reward": 1.485267996788025,
+      "reward_std": 0.31410348415374756,
+      "rewards/code_format_reward/mean": 0.8504464030265808,
+      "rewards/code_format_reward/std": 0.3570319712162018,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6348214745521545,
+      "rewards/curriculum_aware_reward_fn/std": 0.43525242805480957,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1227678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4063.0,
+      "completions/mean_length": 1610.05810546875,
+      "completions/mean_terminated_length": 1262.1527099609375,
+      "completions/min_length": 241.0,
+      "completions/min_terminated_length": 241.0,
+      "epoch": 0.6230015471892728,
+      "grad_norm": 0.26271775364875793,
+      "kl": 0.0230865478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0389,
+      "num_tokens": 178580982.0,
+      "reward": 1.5331473350524902,
+      "reward_std": 0.29026809334754944,
+      "rewards/code_format_reward/mean": 0.8772321343421936,
+      "rewards/code_format_reward/std": 0.3285374045372009,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6559152007102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.42833590507507324,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0758928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4065.0,
+      "completions/mean_length": 1439.6920166015625,
+      "completions/mean_terminated_length": 1221.541015625,
+      "completions/min_length": 158.0,
+      "completions/min_terminated_length": 158.0,
+      "epoch": 0.6271273852501289,
+      "grad_norm": 0.2625259459018707,
+      "kl": 0.0258941650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0202,
+      "num_tokens": 179483746.0,
+      "reward": 1.6277902126312256,
+      "reward_std": 0.24596910178661346,
+      "rewards/code_format_reward/mean": 0.9241071343421936,
+      "rewards/code_format_reward/std": 0.2651226818561554,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7036830186843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.3924238681793213,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.140625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3992.0,
+      "completions/mean_length": 1682.0826416015625,
+      "completions/mean_terminated_length": 1287.077880859375,
+      "completions/min_length": 188.0,
+      "completions/min_terminated_length": 188.0,
+      "epoch": 0.631253223310985,
+      "grad_norm": 0.22368377447128296,
+      "kl": 0.0244598388671875,
+      "learning_rate": 1e-06,
+      "loss": 0.056,
+      "num_tokens": 180512524.0,
+      "reward": 1.5142858028411865,
+      "reward_std": 0.2848253846168518,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6526784896850586,
+      "rewards/curriculum_aware_reward_fn/std": 0.4242819547653198,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1473214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3971.0,
+      "completions/mean_length": 1798.2724609375,
+      "completions/mean_terminated_length": 1401.28271484375,
+      "completions/min_length": 223.0,
+      "completions/min_terminated_length": 223.0,
+      "epoch": 0.6353790613718412,
+      "grad_norm": 3.2692782878875732,
+      "kl": 0.0210113525390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0781,
+      "num_tokens": 181588987.0,
+      "reward": 1.4170759916305542,
+      "reward_std": 0.3211572766304016,
+      "rewards/code_format_reward/mean": 0.8549107313156128,
+      "rewards/code_format_reward/std": 0.3525845408439636,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5621652007102966,
+      "rewards/curriculum_aware_reward_fn/std": 0.4653093218803406,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1495535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4062.0,
+      "completions/mean_length": 1706.3795166015625,
+      "completions/mean_terminated_length": 1286.157470703125,
+      "completions/min_length": 266.0,
+      "completions/min_terminated_length": 266.0,
+      "epoch": 0.6395048994326973,
+      "grad_norm": 0.20925132930278778,
+      "kl": 0.02191162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0643,
+      "num_tokens": 182636882.0,
+      "reward": 1.430915355682373,
+      "reward_std": 0.28560274839401245,
+      "rewards/code_format_reward/mean": 0.8504464030265808,
+      "rewards/code_format_reward/std": 0.3570319712162018,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5804687142372131,
+      "rewards/curriculum_aware_reward_fn/std": 0.4736019968986511,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 1834.4130859375,
+      "completions/mean_terminated_length": 1265.8575439453125,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.6436307374935534,
+      "grad_norm": 0.2284567803144455,
+      "kl": 0.0224151611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.061,
+      "num_tokens": 183734975.0,
+      "reward": 1.3992189168930054,
+      "reward_std": 0.2940734326839447,
+      "rewards/code_format_reward/mean": 0.8013392686843872,
+      "rewards/code_format_reward/std": 0.3994380831718445,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5978794693946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4464387595653534,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2120535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 1867.7857666015625,
+      "completions/mean_terminated_length": 1268.1246337890625,
+      "completions/min_length": 171.0,
+      "completions/min_terminated_length": 171.0,
+      "epoch": 0.6477565755544095,
+      "grad_norm": 0.24589860439300537,
+      "kl": 0.0221405029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1204,
+      "num_tokens": 184850196.0,
+      "reward": 1.4158483743667603,
+      "reward_std": 0.31856080889701843,
+      "rewards/code_format_reward/mean": 0.7879464030265808,
+      "rewards/code_format_reward/std": 0.40921953320503235,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6279017329216003,
+      "rewards/curriculum_aware_reward_fn/std": 0.4837891459465027,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1808035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4061.0,
+      "completions/mean_length": 1900.57373046875,
+      "completions/mean_terminated_length": 1416.0245361328125,
+      "completions/min_length": 278.0,
+      "completions/min_terminated_length": 278.0,
+      "epoch": 0.6518824136152656,
+      "grad_norm": 0.23172283172607422,
+      "kl": 0.02203369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.094,
+      "num_tokens": 185946744.0,
+      "reward": 1.4448662996292114,
+      "reward_std": 0.3671303391456604,
+      "rewards/code_format_reward/mean": 0.828125,
+      "rewards/code_format_reward/std": 0.3776935040950775,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6167410612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4445529878139496,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4059.0,
+      "completions/mean_length": 1822.5179443359375,
+      "completions/mean_terminated_length": 1457.34716796875,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.6560082516761218,
+      "grad_norm": 0.21886523067951202,
+      "kl": 0.023040771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.029,
+      "num_tokens": 187044471.0,
+      "reward": 1.4851562976837158,
+      "reward_std": 0.27218568325042725,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334251284599304,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6213169693946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.43654102087020874,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1428571428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4056.0,
+      "completions/mean_length": 1853.3037109375,
+      "completions/mean_terminated_length": 1479.5208740234375,
+      "completions/min_length": 153.0,
+      "completions/min_terminated_length": 153.0,
+      "epoch": 0.6601340897369778,
+      "grad_norm": 0.25297442078590393,
+      "kl": 0.022857666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0641,
+      "num_tokens": 188175443.0,
+      "reward": 1.4321428537368774,
+      "reward_std": 0.3305543065071106,
+      "rewards/code_format_reward/mean": 0.859375,
+      "rewards/code_format_reward/std": 0.3480229377746582,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5727678537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.4538741111755371,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2120535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 2163.453125,
+      "completions/mean_terminated_length": 1643.362548828125,
+      "completions/min_length": 249.0,
+      "completions/min_terminated_length": 249.0,
+      "epoch": 0.6642599277978339,
+      "grad_norm": 15.494752883911133,
+      "kl": 0.241455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.1041,
+      "num_tokens": 189418118.0,
+      "reward": 1.4010045528411865,
+      "reward_std": 0.42570188641548157,
+      "rewards/code_format_reward/mean": 0.7879464030265808,
+      "rewards/code_format_reward/std": 0.40921953320503235,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.46330952644348145,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.109375,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4063.0,
+      "completions/mean_length": 1600.154052734375,
+      "completions/mean_terminated_length": 1293.6466064453125,
+      "completions/min_length": 228.0,
+      "completions/min_terminated_length": 228.0,
+      "epoch": 0.66838576585869,
+      "grad_norm": 0.23208573460578918,
+      "kl": 0.02618408203125,
+      "learning_rate": 1e-06,
+      "loss": 0.041,
+      "num_tokens": 190404746.0,
+      "reward": 1.5679688453674316,
+      "reward_std": 0.28690239787101746,
+      "rewards/code_format_reward/mean": 0.8928571343421936,
+      "rewards/code_format_reward/std": 0.3096405565738678,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6751116514205933,
+      "rewards/curriculum_aware_reward_fn/std": 0.43228229880332947,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4031.0,
+      "completions/mean_length": 1678.0067138671875,
+      "completions/mean_terminated_length": 1332.5791015625,
+      "completions/min_length": 202.0,
+      "completions/min_terminated_length": 202.0,
+      "epoch": 0.6725116039195461,
+      "grad_norm": 0.21358643472194672,
+      "kl": 0.02490234375,
+      "learning_rate": 1e-06,
+      "loss": 0.079,
+      "num_tokens": 191424173.0,
+      "reward": 1.4213169813156128,
+      "reward_std": 0.3107840418815613,
+      "rewards/code_format_reward/mean": 0.8727678656578064,
+      "rewards/code_format_reward/std": 0.3336053788661957,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5485491156578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4599668085575104,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1071428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4073.0,
+      "completions/mean_length": 1544.6473388671875,
+      "completions/mean_terminated_length": 1238.4849853515625,
+      "completions/min_length": 282.0,
+      "completions/min_terminated_length": 282.0,
+      "epoch": 0.6766374419804023,
+      "grad_norm": 0.22339744865894318,
+      "kl": 0.027435302734375,
+      "learning_rate": 1e-06,
+      "loss": 0.1136,
+      "num_tokens": 192387628.0,
+      "reward": 1.5645090341567993,
+      "reward_std": 0.3053089678287506,
+      "rewards/code_format_reward/mean": 0.8928571343421936,
+      "rewards/code_format_reward/std": 0.3096405565738678,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6716518402099609,
+      "rewards/curriculum_aware_reward_fn/std": 0.42687925696372986,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 1722.247802734375,
+      "completions/mean_terminated_length": 1340.971435546875,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.6807632800412584,
+      "grad_norm": 0.22346089780330658,
+      "kl": 0.026641845703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0703,
+      "num_tokens": 193439790.0,
+      "reward": 1.5340402126312256,
+      "reward_std": 0.3189355731010437,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6724330186843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4248095750808716,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1473214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4091.0,
+      "completions/mean_length": 1860.029052734375,
+      "completions/mean_terminated_length": 1473.70947265625,
+      "completions/min_length": 248.0,
+      "completions/min_terminated_length": 248.0,
+      "epoch": 0.6848891181021145,
+      "grad_norm": 0.4491024613380432,
+      "kl": 0.0242767333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0949,
+      "num_tokens": 194559180.0,
+      "reward": 1.485602855682373,
+      "reward_std": 0.37783434987068176,
+      "rewards/code_format_reward/mean": 0.8571428656578064,
+      "rewards/code_format_reward/std": 0.3503182828426361,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6284598112106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4338955879211426,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4080.0,
+      "completions/mean_length": 1602.30810546875,
+      "completions/mean_terminated_length": 1246.0662841796875,
+      "completions/min_length": 198.0,
+      "completions/min_terminated_length": 198.0,
+      "epoch": 0.6890149561629706,
+      "grad_norm": 0.2428234964609146,
+      "kl": 0.031341552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0549,
+      "num_tokens": 195544246.0,
+      "reward": 1.5504463911056519,
+      "reward_std": 0.2811921536922455,
+      "rewards/code_format_reward/mean": 0.8772321343421936,
+      "rewards/code_format_reward/std": 0.3285374045372009,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6732142567634583,
+      "rewards/curriculum_aware_reward_fn/std": 0.4268362522125244,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4023.0,
+      "completions/mean_length": 1865.9130859375,
+      "completions/mean_terminated_length": 1507.71240234375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.6931407942238267,
+      "grad_norm": 0.2817108929157257,
+      "kl": 0.025543212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0211,
+      "num_tokens": 196671669.0,
+      "reward": 1.4904019832611084,
+      "reward_std": 0.3173461854457855,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334254264831543,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6265625357627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.43362295627593994,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1674107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4082.0,
+      "completions/mean_length": 1881.388427734375,
+      "completions/mean_terminated_length": 1436.0911865234375,
+      "completions/min_length": 229.0,
+      "completions/min_terminated_length": 229.0,
+      "epoch": 0.6972666322846828,
+      "grad_norm": 0.3142848312854767,
+      "kl": 0.0292205810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0543,
+      "num_tokens": 197800782.0,
+      "reward": 1.4146206378936768,
+      "reward_std": 0.3862914443016052,
+      "rewards/code_format_reward/mean": 0.8348214030265808,
+      "rewards/code_format_reward/std": 0.37175676226615906,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5797991156578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4677990674972534,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2142857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4075.0,
+      "completions/mean_length": 2074.52685546875,
+      "completions/mean_terminated_length": 1523.2159423828125,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.701392470345539,
+      "grad_norm": 0.2322724312543869,
+      "kl": 0.026275634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0639,
+      "num_tokens": 199023176.0,
+      "reward": 1.3515626192092896,
+      "reward_std": 0.3802144229412079,
+      "rewards/code_format_reward/mean": 0.7857142686843872,
+      "rewards/code_format_reward/std": 0.41078460216522217,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5658482313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4406941831111908,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1696428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4061.0,
+      "completions/mean_length": 1990.6407470703125,
+      "completions/mean_terminated_length": 1560.513427734375,
+      "completions/min_length": 281.0,
+      "completions/min_terminated_length": 281.0,
+      "epoch": 0.7055183084063951,
+      "grad_norm": 0.2236924022436142,
+      "kl": 0.026702880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0675,
+      "num_tokens": 200189656.0,
+      "reward": 1.4627233743667603,
+      "reward_std": 0.32308143377304077,
+      "rewards/code_format_reward/mean": 0.8303571343421936,
+      "rewards/code_format_reward/std": 0.37573832273483276,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6323660612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.43709608912467957,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.15625,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4032.0,
+      "completions/mean_length": 1724.93310546875,
+      "completions/mean_terminated_length": 1285.8465576171875,
+      "completions/min_length": 194.0,
+      "completions/min_terminated_length": 194.0,
+      "epoch": 0.7096441464672512,
+      "grad_norm": 0.22402040660381317,
+      "kl": 0.0262603759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0855,
+      "num_tokens": 201223736.0,
+      "reward": 1.447767972946167,
+      "reward_std": 0.3190096616744995,
+      "rewards/code_format_reward/mean": 0.84375,
+      "rewards/code_format_reward/std": 0.36349809169769287,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6040178537368774,
+      "rewards/curriculum_aware_reward_fn/std": 0.48629483580589294,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1741071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 1807.4287109375,
+      "completions/mean_terminated_length": 1324.9730224609375,
+      "completions/min_length": 291.0,
+      "completions/min_terminated_length": 291.0,
+      "epoch": 0.7137699845281072,
+      "grad_norm": 0.20310117304325104,
+      "kl": 0.026702880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0933,
+      "num_tokens": 202291253.0,
+      "reward": 1.5252233743667603,
+      "reward_std": 0.3091839551925659,
+      "rewards/code_format_reward/mean": 0.828125,
+      "rewards/code_format_reward/std": 0.3776935040950775,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6970981955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.44606462121009827,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0892857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4081.0,
+      "completions/mean_length": 1324.3170166015625,
+      "completions/mean_terminated_length": 1052.5833740234375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.7178958225889633,
+      "grad_norm": 0.25287967920303345,
+      "kl": 0.0323333740234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0458,
+      "num_tokens": 203151912.0,
+      "reward": 1.601562738418579,
+      "reward_std": 0.264218270778656,
+      "rewards/code_format_reward/mean": 0.9107142686843872,
+      "rewards/code_format_reward/std": 0.2854744791984558,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6908482313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4577873945236206,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4046.0,
+      "completions/mean_length": 1976.7076416015625,
+      "completions/mean_terminated_length": 1487.64013671875,
+      "completions/min_length": 176.0,
+      "completions/min_terminated_length": 176.0,
+      "epoch": 0.7220216606498195,
+      "grad_norm": 0.22337962687015533,
+      "kl": 0.0275726318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0868,
+      "num_tokens": 204336881.0,
+      "reward": 1.3909599781036377,
+      "reward_std": 0.35256850719451904,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252740144729614,
+      "rewards/curriculum_aware_reward_fn/mean": 0.580691933631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.4459373652935028,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0982142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1631.310302734375,
+      "completions/mean_terminated_length": 1362.878662109375,
+      "completions/min_length": 225.0,
+      "completions/min_terminated_length": 225.0,
+      "epoch": 0.7261474987106756,
+      "grad_norm": 0.22130419313907623,
+      "kl": 0.028656005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0352,
+      "num_tokens": 205337467.0,
+      "reward": 1.6895090341567993,
+      "reward_std": 0.2801817059516907,
+      "rewards/code_format_reward/mean": 0.9040178656578064,
+      "rewards/code_format_reward/std": 0.29489603638648987,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7854910492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.40007179975509644,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1785714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4021.0,
+      "completions/mean_length": 1944.5826416015625,
+      "completions/mean_terminated_length": 1476.8831787109375,
+      "completions/min_length": 203.0,
+      "completions/min_terminated_length": 203.0,
+      "epoch": 0.7302733367715317,
+      "grad_norm": 0.2612294852733612,
+      "kl": 0.027130126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0948,
+      "num_tokens": 206477860.0,
+      "reward": 1.4131697416305542,
+      "reward_std": 0.355887770652771,
+      "rewards/code_format_reward/mean": 0.8214285969734192,
+      "rewards/code_format_reward/std": 0.3834212124347687,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4985761046409607,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1941964285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4091.0,
+      "completions/mean_length": 1801.9443359375,
+      "completions/mean_terminated_length": 1249.0830078125,
+      "completions/min_length": 224.0,
+      "completions/min_terminated_length": 224.0,
+      "epoch": 0.7343991748323878,
+      "grad_norm": 0.2323751598596573,
+      "kl": 0.0294036865234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0978,
+      "num_tokens": 207566363.0,
+      "reward": 1.3904019594192505,
+      "reward_std": 0.31305989623069763,
+      "rewards/code_format_reward/mean": 0.8080357313156128,
+      "rewards/code_format_reward/std": 0.3942854404449463,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5823659896850586,
+      "rewards/curriculum_aware_reward_fn/std": 0.44665220379829407,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1071428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4059.0,
+      "completions/mean_length": 1608.8773193359375,
+      "completions/mean_terminated_length": 1310.4224853515625,
+      "completions/min_length": 192.0,
+      "completions/min_terminated_length": 192.0,
+      "epoch": 0.7385250128932439,
+      "grad_norm": 0.24387770891189575,
+      "kl": 0.0294342041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0693,
+      "num_tokens": 208537103.0,
+      "reward": 1.486830472946167,
+      "reward_std": 0.3190726041793823,
+      "rewards/code_format_reward/mean": 0.8950892686843872,
+      "rewards/code_format_reward/std": 0.3067808747291565,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5917410850524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4914354383945465,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1785714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4007.0,
+      "completions/mean_length": 1846.66748046875,
+      "completions/mean_terminated_length": 1357.68212890625,
+      "completions/min_length": 215.0,
+      "completions/min_terminated_length": 215.0,
+      "epoch": 0.7426508509541001,
+      "grad_norm": 0.42223504185676575,
+      "kl": 0.0312347412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0936,
+      "num_tokens": 209624936.0,
+      "reward": 1.4097100496292114,
+      "reward_std": 0.31943628191947937,
+      "rewards/code_format_reward/mean": 0.8236607313156128,
+      "rewards/code_format_reward/std": 0.3815346360206604,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.4472143352031708,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4026.0,
+      "completions/mean_length": 1833.060302734375,
+      "completions/mean_terminated_length": 1303.1707763671875,
+      "completions/min_length": 240.0,
+      "completions/min_terminated_length": 240.0,
+      "epoch": 0.7467766890149562,
+      "grad_norm": 0.22536613047122955,
+      "kl": 0.029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0963,
+      "num_tokens": 210728513.0,
+      "reward": 1.450334906578064,
+      "reward_std": 0.2970465123653412,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252740144729614,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6400669813156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4401248097419739,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1517857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4071.0,
+      "completions/mean_length": 1873.4866943359375,
+      "completions/mean_terminated_length": 1475.7738037109375,
+      "completions/min_length": 222.0,
+      "completions/min_terminated_length": 222.0,
+      "epoch": 0.7509025270758123,
+      "grad_norm": 0.25091269612312317,
+      "kl": 0.0291290283203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0639,
+      "num_tokens": 211823881.0,
+      "reward": 1.4522322416305542,
+      "reward_std": 0.3361148536205292,
+      "rewards/code_format_reward/mean": 0.8504464030265808,
+      "rewards/code_format_reward/std": 0.3570319712162018,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6017856597900391,
+      "rewards/curriculum_aware_reward_fn/std": 0.46313437819480896,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1651785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4022.0,
+      "completions/mean_length": 1746.0023193359375,
+      "completions/mean_terminated_length": 1281.0294189453125,
+      "completions/min_length": 219.0,
+      "completions/min_terminated_length": 219.0,
+      "epoch": 0.7550283651366684,
+      "grad_norm": 0.24613533914089203,
+      "kl": 0.03045654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0629,
+      "num_tokens": 212866385.0,
+      "reward": 1.489174246788025,
+      "reward_std": 0.297740638256073,
+      "rewards/code_format_reward/mean": 0.8348214030265808,
+      "rewards/code_format_reward/std": 0.37175676226615906,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6543526649475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.4452936053276062,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1674107142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4068.0,
+      "completions/mean_length": 2007.6295166015625,
+      "completions/mean_terminated_length": 1587.7158203125,
+      "completions/min_length": 233.0,
+      "completions/min_terminated_length": 233.0,
+      "epoch": 0.7591542031975245,
+      "grad_norm": 0.23514395952224731,
+      "kl": 0.0292510986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.046,
+      "num_tokens": 214051270.0,
+      "reward": 1.4108260869979858,
+      "reward_std": 0.36220675706863403,
+      "rewards/code_format_reward/mean": 0.8370535969734192,
+      "rewards/code_format_reward/std": 0.3697296679019928,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5737723112106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4432297646999359,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1361607142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4082.0,
+      "completions/mean_length": 1622.7344970703125,
+      "completions/mean_terminated_length": 1232.8914794921875,
+      "completions/min_length": 212.0,
+      "completions/min_terminated_length": 212.0,
+      "epoch": 0.7632800412583806,
+      "grad_norm": 0.2874307930469513,
+      "kl": 0.0306396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.1072,
+      "num_tokens": 215048952.0,
+      "reward": 1.3916295766830444,
+      "reward_std": 0.27101579308509827,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5300223231315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4591200649738312,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2276785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4077.0,
+      "completions/mean_length": 2090.618408203125,
+      "completions/mean_terminated_length": 1499.4364013671875,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.7674058793192368,
+      "grad_norm": 0.22228585183620453,
+      "kl": 0.0281829833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.1194,
+      "num_tokens": 216246425.0,
+      "reward": 1.2716518640518188,
+      "reward_std": 0.4031871259212494,
+      "rewards/code_format_reward/mean": 0.78125,
+      "rewards/code_format_reward/std": 0.4138607978820801,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4904017746448517,
+      "rewards/curriculum_aware_reward_fn/std": 0.4519253373146057,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0870535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4049.0,
+      "completions/mean_length": 1512.0535888671875,
+      "completions/mean_terminated_length": 1265.66259765625,
+      "completions/min_length": 322.0,
+      "completions/min_terminated_length": 322.0,
+      "epoch": 0.7715317173800929,
+      "grad_norm": 0.22118829190731049,
+      "kl": 0.031341552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0398,
+      "num_tokens": 217192268.0,
+      "reward": 1.527009129524231,
+      "reward_std": 0.26555386185646057,
+      "rewards/code_format_reward/mean": 0.9196428656578064,
+      "rewards/code_format_reward/std": 0.2721492052078247,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6073660254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.46336886286735535,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1183035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1622.5201416015625,
+      "completions/mean_terminated_length": 1290.635498046875,
+      "completions/min_length": 333.0,
+      "completions/min_terminated_length": 333.0,
+      "epoch": 0.7756575554409489,
+      "grad_norm": 0.22401097416877747,
+      "kl": 0.0296630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0562,
+      "num_tokens": 218182868.0,
+      "reward": 1.4532368183135986,
+      "reward_std": 0.2841085195541382,
+      "rewards/code_format_reward/mean": 0.8839285969734192,
+      "rewards/code_format_reward/std": 0.32066863775253296,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5693081021308899,
+      "rewards/curriculum_aware_reward_fn/std": 0.4530544579029083,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4048.0,
+      "completions/mean_length": 1664.4554443359375,
+      "completions/mean_terminated_length": 1273.8963623046875,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.779783393501805,
+      "grad_norm": 0.2380165308713913,
+      "kl": 0.0315399169921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0685,
+      "num_tokens": 219187502.0,
+      "reward": 1.4876116514205933,
+      "reward_std": 0.32384008169174194,
+      "rewards/code_format_reward/mean": 0.859375,
+      "rewards/code_format_reward/std": 0.3480229377746582,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6282365918159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.4722626507282257,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1138392857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4096.0,
+      "completions/mean_length": 1659.7723388671875,
+      "completions/mean_terminated_length": 1346.8060302734375,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 0.7839092315626611,
+      "grad_norm": 0.34679973125457764,
+      "kl": 0.0302276611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.1048,
+      "num_tokens": 220184697.0,
+      "reward": 1.539955496788025,
+      "reward_std": 0.3225671648979187,
+      "rewards/code_format_reward/mean": 0.8861607313156128,
+      "rewards/code_format_reward/std": 0.31797102093696594,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6537945866584778,
+      "rewards/curriculum_aware_reward_fn/std": 0.4496917724609375,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1183035714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3969.0,
+      "completions/mean_length": 1594.0648193359375,
+      "completions/mean_terminated_length": 1258.362060546875,
+      "completions/min_length": 180.0,
+      "completions/min_terminated_length": 180.0,
+      "epoch": 0.7880350696235173,
+      "grad_norm": 0.347991406917572,
+      "kl": 0.0323486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.11,
+      "num_tokens": 221148836.0,
+      "reward": 1.6414064168930054,
+      "reward_std": 0.328239381313324,
+      "rewards/code_format_reward/mean": 0.8861607313156128,
+      "rewards/code_format_reward/std": 0.31797102093696594,
+      "rewards/curriculum_aware_reward_fn/mean": 0.755245566368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.41560760140419006,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4076.0,
+      "completions/mean_length": 1887.9844970703125,
+      "completions/mean_terminated_length": 1513.255859375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.7921609076843734,
+      "grad_norm": 0.7645904421806335,
+      "kl": 0.0301361083984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0756,
+      "num_tokens": 222288485.0,
+      "reward": 1.4526787996292114,
+      "reward_std": 0.30151334404945374,
+      "rewards/code_format_reward/mean": 0.8571428656578064,
+      "rewards/code_format_reward/std": 0.3503182828426361,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5955356955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.44083017110824585,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1339285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1725.9442138671875,
+      "completions/mean_terminated_length": 1359.440673828125,
+      "completions/min_length": 184.0,
+      "completions/min_terminated_length": 184.0,
+      "epoch": 0.7962867457452295,
+      "grad_norm": 0.2471107840538025,
+      "kl": 0.0307464599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0607,
+      "num_tokens": 223335947.0,
+      "reward": 1.5162948369979858,
+      "reward_std": 0.32580631971359253,
+      "rewards/code_format_reward/mean": 0.8660714030265808,
+      "rewards/code_format_reward/std": 0.34095630049705505,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6502231955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.4345405101776123,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2254464285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4078.0,
+      "completions/mean_length": 2046.5001220703125,
+      "completions/mean_terminated_length": 1449.9595947265625,
+      "completions/min_length": 183.0,
+      "completions/min_terminated_length": 183.0,
+      "epoch": 0.8004125838060856,
+      "grad_norm": 0.2045373022556305,
+      "kl": 0.0279083251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.1047,
+      "num_tokens": 224518009.0,
+      "reward": 1.3463170528411865,
+      "reward_std": 0.3773540258407593,
+      "rewards/code_format_reward/mean": 0.7745535969734192,
+      "rewards/code_format_reward/std": 0.41834309697151184,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5717633962631226,
+      "rewards/curriculum_aware_reward_fn/std": 0.4517683684825897,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4078.0,
+      "completions/mean_length": 1744.388427734375,
+      "completions/mean_terminated_length": 1345.289794921875,
+      "completions/min_length": 298.0,
+      "completions/min_terminated_length": 298.0,
+      "epoch": 0.8045384218669417,
+      "grad_norm": 0.2941077947616577,
+      "kl": 0.0295867919921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0859,
+      "num_tokens": 225559174.0,
+      "reward": 1.5177457332611084,
+      "reward_std": 0.3033301830291748,
+      "rewards/code_format_reward/mean": 0.8549107313156128,
+      "rewards/code_format_reward/std": 0.3525845408439636,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6628348231315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.4127408564090729,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1205357142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1809.15185546875,
+      "completions/mean_terminated_length": 1495.725830078125,
+      "completions/min_length": 186.0,
+      "completions/min_terminated_length": 186.0,
+      "epoch": 0.8086642599277978,
+      "grad_norm": 0.2557990550994873,
+      "kl": 0.031646728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.1066,
+      "num_tokens": 226641974.0,
+      "reward": 1.4722100496292114,
+      "reward_std": 0.37684550881385803,
+      "rewards/code_format_reward/mean": 0.8861607313156128,
+      "rewards/code_format_reward/std": 0.31797102093696594,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5860490798950195,
+      "rewards/curriculum_aware_reward_fn/std": 0.44295454025268555,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 1995.5982666015625,
+      "completions/mean_terminated_length": 1503.7686767578125,
+      "completions/min_length": 263.0,
+      "completions/min_terminated_length": 263.0,
+      "epoch": 0.812790097988654,
+      "grad_norm": 0.20381715893745422,
+      "kl": 0.0287933349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0545,
+      "num_tokens": 227803885.0,
+      "reward": 1.3908482789993286,
+      "reward_std": 0.3567146360874176,
+      "rewards/code_format_reward/mean": 0.8147321343421936,
+      "rewards/code_format_reward/std": 0.38894903659820557,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5761160254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.47179847955703735,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4059.0,
+      "completions/mean_length": 1600.185302734375,
+      "completions/mean_terminated_length": 1199.3031005859375,
+      "completions/min_length": 236.0,
+      "completions/min_terminated_length": 236.0,
+      "epoch": 0.8169159360495101,
+      "grad_norm": 0.29694321751594543,
+      "kl": 0.0296630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0737,
+      "num_tokens": 228775491.0,
+      "reward": 1.4993302822113037,
+      "reward_std": 0.23577921092510223,
+      "rewards/code_format_reward/mean": 0.8638392686843872,
+      "rewards/code_format_reward/std": 0.34334254264831543,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6354910731315613,
+      "rewards/curriculum_aware_reward_fn/std": 0.42602863907814026,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4077.0,
+      "completions/mean_length": 1745.0513916015625,
+      "completions/mean_terminated_length": 1346.0653076171875,
+      "completions/min_length": 175.0,
+      "completions/min_terminated_length": 175.0,
+      "epoch": 0.8210417741103662,
+      "grad_norm": 0.2424536943435669,
+      "kl": 0.0304412841796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0717,
+      "num_tokens": 229817282.0,
+      "reward": 1.483035683631897,
+      "reward_std": 0.3085847795009613,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6214285492897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.45782846212387085,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0915178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4066.0,
+      "completions/mean_length": 1753.384033203125,
+      "completions/mean_terminated_length": 1517.3955078125,
+      "completions/min_length": 293.0,
+      "completions/min_terminated_length": 293.0,
+      "epoch": 0.8251676121712223,
+      "grad_norm": 0.2286684364080429,
+      "kl": 0.0298004150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0483,
+      "num_tokens": 230885598.0,
+      "reward": 1.54676353931427,
+      "reward_std": 0.31968414783477783,
+      "rewards/code_format_reward/mean": 0.9129464030265808,
+      "rewards/code_format_reward/std": 0.2822287082672119,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6338168978691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.44459256529808044,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 1600.5157470703125,
+      "completions/mean_terminated_length": 1244.017822265625,
+      "completions/min_length": 214.0,
+      "completions/min_terminated_length": 214.0,
+      "epoch": 0.8292934502320783,
+      "grad_norm": 0.22150000929832458,
+      "kl": 0.03021240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.094,
+      "num_tokens": 231872199.0,
+      "reward": 1.434598445892334,
+      "reward_std": 0.298090398311615,
+      "rewards/code_format_reward/mean": 0.875,
+      "rewards/code_format_reward/std": 0.3310886323451996,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5595981478691101,
+      "rewards/curriculum_aware_reward_fn/std": 0.46301397681236267,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1495535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4066.0,
+      "completions/mean_length": 1712.7254638671875,
+      "completions/mean_terminated_length": 1293.619384765625,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.8334192882929345,
+      "grad_norm": 0.24653927981853485,
+      "kl": 0.0312347412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0796,
+      "num_tokens": 232899585.0,
+      "reward": 1.3952010869979858,
+      "reward_std": 0.31651803851127625,
+      "rewards/code_format_reward/mean": 0.8549107313156128,
+      "rewards/code_format_reward/std": 0.3525845408439636,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5402902364730835,
+      "rewards/curriculum_aware_reward_fn/std": 0.45736637711524963,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1741071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4076.0,
+      "completions/mean_length": 2037.8304443359375,
+      "completions/mean_terminated_length": 1603.946044921875,
+      "completions/min_length": 352.0,
+      "completions/min_terminated_length": 352.0,
+      "epoch": 0.8375451263537906,
+      "grad_norm": 0.22224228084087372,
+      "kl": 0.029876708984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0668,
+      "num_tokens": 234095361.0,
+      "reward": 1.3809152841567993,
+      "reward_std": 0.37990495562553406,
+      "rewards/code_format_reward/mean": 0.828125,
+      "rewards/code_format_reward/std": 0.3776935040950775,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5527901649475098,
+      "rewards/curriculum_aware_reward_fn/std": 0.43541234731674194,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3876.0,
+      "completions/mean_length": 1682.0201416015625,
+      "completions/mean_terminated_length": 1337.165771484375,
+      "completions/min_length": 201.0,
+      "completions/min_terminated_length": 201.0,
+      "epoch": 0.8416709644146467,
+      "grad_norm": 0.25886648893356323,
+      "kl": 0.0328826904296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0476,
+      "num_tokens": 235114591.0,
+      "reward": 1.5552457571029663,
+      "reward_std": 0.3339327573776245,
+      "rewards/code_format_reward/mean": 0.8839285969734192,
+      "rewards/code_format_reward/std": 0.32066863775253296,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6713169813156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.4169880151748657,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4068.0,
+      "completions/mean_length": 1614.8438720703125,
+      "completions/mean_terminated_length": 1260.392822265625,
+      "completions/min_length": 204.0,
+      "completions/min_terminated_length": 204.0,
+      "epoch": 0.8457968024755028,
+      "grad_norm": 0.25539520382881165,
+      "kl": 0.03082275390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0544,
+      "num_tokens": 236103705.0,
+      "reward": 1.507924199104309,
+      "reward_std": 0.25643816590309143,
+      "rewards/code_format_reward/mean": 0.8794642686843872,
+      "rewards/code_format_reward/std": 0.3259509205818176,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6284598708152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.5155189037322998,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0959821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4074.0,
+      "completions/mean_length": 1644.69873046875,
+      "completions/mean_terminated_length": 1384.4371337890625,
+      "completions/min_length": 199.0,
+      "completions/min_terminated_length": 199.0,
+      "epoch": 0.8499226405363589,
+      "grad_norm": 0.2182522863149643,
+      "kl": 0.032928466796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0571,
+      "num_tokens": 237102526.0,
+      "reward": 1.6372768878936768,
+      "reward_std": 0.2757372558116913,
+      "rewards/code_format_reward/mean": 0.9151785969734192,
+      "rewards/code_format_reward/std": 0.2789272665977478,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7220982313156128,
+      "rewards/curriculum_aware_reward_fn/std": 0.3957159221172333,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4093.0,
+      "completions/mean_length": 1881.337158203125,
+      "completions/mean_terminated_length": 1370.260986328125,
+      "completions/min_length": 189.0,
+      "completions/min_terminated_length": 189.0,
+      "epoch": 0.8540484785972151,
+      "grad_norm": 0.20540083944797516,
+      "kl": 0.03094482421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0761,
+      "num_tokens": 238202951.0,
+      "reward": 1.3666294813156128,
+      "reward_std": 0.3059476613998413,
+      "rewards/code_format_reward/mean": 0.8147321343421936,
+      "rewards/code_format_reward/std": 0.38894903659820557,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5518973469734192,
+      "rewards/curriculum_aware_reward_fn/std": 0.4562782943248749,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4072.0,
+      "completions/mean_length": 1735.805908203125,
+      "completions/mean_terminated_length": 1398.6351318359375,
+      "completions/min_length": 317.0,
+      "completions/min_terminated_length": 317.0,
+      "epoch": 0.8581743166580712,
+      "grad_norm": 0.2484840750694275,
+      "kl": 0.0309600830078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0979,
+      "num_tokens": 239242658.0,
+      "reward": 1.5085939168930054,
+      "reward_std": 0.34762099385261536,
+      "rewards/code_format_reward/mean": 0.8794642686843872,
+      "rewards/code_format_reward/std": 0.3259509205818176,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6291294097900391,
+      "rewards/curriculum_aware_reward_fn/std": 0.4308629035949707,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2008928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4067.0,
+      "completions/mean_length": 2012.274658203125,
+      "completions/mean_terminated_length": 1488.432861328125,
+      "completions/min_length": 234.0,
+      "completions/min_terminated_length": 234.0,
+      "epoch": 0.8623001547189273,
+      "grad_norm": 0.19853706657886505,
+      "kl": 0.030975341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0884,
+      "num_tokens": 240423849.0,
+      "reward": 1.4322545528411865,
+      "reward_std": 0.3431403338909149,
+      "rewards/code_format_reward/mean": 0.8191964030265808,
+      "rewards/code_format_reward/std": 0.38528555631637573,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6130580306053162,
+      "rewards/curriculum_aware_reward_fn/std": 0.4514111578464508,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2165178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4089.0,
+      "completions/mean_length": 2111.489013671875,
+      "completions/mean_terminated_length": 1563.0626220703125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.8664259927797834,
+      "grad_norm": 0.24466735124588013,
+      "kl": 0.030792236328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0651,
+      "num_tokens": 241637357.0,
+      "reward": 1.4133929014205933,
+      "reward_std": 0.33380740880966187,
+      "rewards/code_format_reward/mean": 0.796875,
+      "rewards/code_format_reward/std": 0.4027745723724365,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6165178418159485,
+      "rewards/curriculum_aware_reward_fn/std": 0.46240732073783875,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1785714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4071.0,
+      "completions/mean_length": 1902.3037109375,
+      "completions/mean_terminated_length": 1425.4130859375,
+      "completions/min_length": 227.0,
+      "completions/min_terminated_length": 227.0,
+      "epoch": 0.8705518308406395,
+      "grad_norm": 0.1979178786277771,
+      "kl": 0.032989501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0419,
+      "num_tokens": 242745187.0,
+      "reward": 1.4494421482086182,
+      "reward_std": 0.26812899112701416,
+      "rewards/code_format_reward/mean": 0.8258928656578064,
+      "rewards/code_format_reward/std": 0.37962549924850464,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6235490441322327,
+      "rewards/curriculum_aware_reward_fn/std": 0.47366687655448914,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.078125,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4085.0,
+      "completions/mean_length": 1524.2188720703125,
+      "completions/mean_terminated_length": 1306.271240234375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.8746776689014956,
+      "grad_norm": 0.23214657604694366,
+      "kl": 0.03704833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.054,
+      "num_tokens": 243692420.0,
+      "reward": 1.6546876430511475,
+      "reward_std": 0.2663484513759613,
+      "rewards/code_format_reward/mean": 0.9263392686843872,
+      "rewards/code_format_reward/std": 0.2615099549293518,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7283481955528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.40330010652542114,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.171875,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4064.0,
+      "completions/mean_length": 1860.7412109375,
+      "completions/mean_terminated_length": 1396.8193359375,
+      "completions/min_length": 193.0,
+      "completions/min_terminated_length": 193.0,
+      "epoch": 0.8788035069623518,
+      "grad_norm": 0.2359783798456192,
+      "kl": 0.0379638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.1015,
+      "num_tokens": 244798745.0,
+      "reward": 1.4878350496292114,
+      "reward_std": 0.3436294496059418,
+      "rewards/code_format_reward/mean": 0.8392857313156128,
+      "rewards/code_format_reward/std": 0.3676777780056,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6485491394996643,
+      "rewards/curriculum_aware_reward_fn/std": 0.4317037761211395,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1294642857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3974.0,
+      "completions/mean_length": 1827.13623046875,
+      "completions/mean_terminated_length": 1489.7154541015625,
+      "completions/min_length": 323.0,
+      "completions/min_terminated_length": 323.0,
+      "epoch": 0.8829293450232079,
+      "grad_norm": 0.43438419699668884,
+      "kl": 0.032135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0871,
+      "num_tokens": 245898247.0,
+      "reward": 1.5524554252624512,
+      "reward_std": 0.309803307056427,
+      "rewards/code_format_reward/mean": 0.8861607313156128,
+      "rewards/code_format_reward/std": 0.31797102093696594,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4090176224708557,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0959821428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4083.0,
+      "completions/mean_length": 1742.2366943359375,
+      "completions/mean_terminated_length": 1492.3309326171875,
+      "completions/min_length": 309.0,
+      "completions/min_terminated_length": 309.0,
+      "epoch": 0.887055183084064,
+      "grad_norm": 0.29153332114219666,
+      "kl": 0.0323944091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0339,
+      "num_tokens": 246938763.0,
+      "reward": 1.610714316368103,
+      "reward_std": 0.3009406626224518,
+      "rewards/code_format_reward/mean": 0.9174107313156128,
+      "rewards/code_format_reward/std": 0.2755681276321411,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6933035254478455,
+      "rewards/curriculum_aware_reward_fn/std": 0.40388187766075134,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1495535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3960.0,
+      "completions/mean_length": 1818.85498046875,
+      "completions/mean_terminated_length": 1418.412109375,
+      "completions/min_length": 289.0,
+      "completions/min_terminated_length": 289.0,
+      "epoch": 0.89118102114492,
+      "grad_norm": 0.21342267096042633,
+      "kl": 0.0300750732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.083,
+      "num_tokens": 248023628.0,
+      "reward": 1.3243303298950195,
+      "reward_std": 0.2561975419521332,
+      "rewards/code_format_reward/mean": 0.8660714030265808,
+      "rewards/code_format_reward/std": 0.34095627069473267,
+      "rewards/curriculum_aware_reward_fn/mean": 0.45825889706611633,
+      "rewards/curriculum_aware_reward_fn/std": 0.4662056863307953,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1629464285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4095.0,
+      "completions/mean_length": 1759.1407470703125,
+      "completions/mean_terminated_length": 1304.23193359375,
+      "completions/min_length": 290.0,
+      "completions/min_terminated_length": 290.0,
+      "epoch": 0.8953068592057761,
+      "grad_norm": 0.20606671273708344,
+      "kl": 0.033477783203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0774,
+      "num_tokens": 249074727.0,
+      "reward": 1.3753349781036377,
+      "reward_std": 0.2294098436832428,
+      "rewards/code_format_reward/mean": 0.8683035969734192,
+      "rewards/code_format_reward/std": 0.3385384678840637,
+      "rewards/curriculum_aware_reward_fn/mean": 0.507031261920929,
+      "rewards/curriculum_aware_reward_fn/std": 0.46656760573387146,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1071428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4040.0,
+      "completions/mean_length": 1633.0782470703125,
+      "completions/mean_terminated_length": 1337.5274658203125,
+      "completions/min_length": 156.0,
+      "completions/min_terminated_length": 156.0,
+      "epoch": 0.8994326972666323,
+      "grad_norm": 0.26139992475509644,
+      "kl": 0.03546142578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0694,
+      "num_tokens": 250064611.0,
+      "reward": 1.6659599542617798,
+      "reward_std": 0.2920212149620056,
+      "rewards/code_format_reward/mean": 0.9040178656578064,
+      "rewards/code_format_reward/std": 0.29489603638648987,
+      "rewards/curriculum_aware_reward_fn/mean": 0.7619419693946838,
+      "rewards/curriculum_aware_reward_fn/std": 0.4175769090652466,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.2098214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4019.0,
+      "completions/mean_length": 1939.6005859375,
+      "completions/mean_terminated_length": 1366.9971923828125,
+      "completions/min_length": 191.0,
+      "completions/min_terminated_length": 191.0,
+      "epoch": 0.9035585353274884,
+      "grad_norm": 0.2301202118396759,
+      "kl": 0.03155517578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0979,
+      "num_tokens": 251204739.0,
+      "reward": 1.364174246788025,
+      "reward_std": 0.3046819865703583,
+      "rewards/code_format_reward/mean": 0.8169642686843872,
+      "rewards/code_format_reward/std": 0.387128084897995,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5472097992897034,
+      "rewards/curriculum_aware_reward_fn/std": 0.4933087229728699,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1964285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4022.0,
+      "completions/mean_length": 1786.0157470703125,
+      "completions/mean_terminated_length": 1221.352783203125,
+      "completions/min_length": 210.0,
+      "completions/min_terminated_length": 210.0,
+      "epoch": 0.9076843733883445,
+      "grad_norm": 0.5878376960754395,
+      "kl": 0.13665771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0782,
+      "num_tokens": 252260367.0,
+      "reward": 1.51551353931427,
+      "reward_std": 0.3167440593242645,
+      "rewards/code_format_reward/mean": 0.828125,
+      "rewards/code_format_reward/std": 0.3776935040950775,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6873884201049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.45862656831741333,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1651785714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4074.0,
+      "completions/mean_length": 1775.884033203125,
+      "completions/mean_terminated_length": 1316.8236083984375,
+      "completions/min_length": 262.0,
+      "completions/min_terminated_length": 262.0,
+      "epoch": 0.9118102114492006,
+      "grad_norm": 0.22542373836040497,
+      "kl": 0.0326995849609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0955,
+      "num_tokens": 253308551.0,
+      "reward": 1.4619419574737549,
+      "reward_std": 0.32789382338523865,
+      "rewards/code_format_reward/mean": 0.8392857313156128,
+      "rewards/code_format_reward/std": 0.3676777780056,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6226562857627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.44398173689842224,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1450892857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4030.0,
+      "completions/mean_length": 1829.544677734375,
+      "completions/mean_terminated_length": 1444.898193359375,
+      "completions/min_length": 159.0,
+      "completions/min_terminated_length": 159.0,
+      "epoch": 0.9159360495100567,
+      "grad_norm": 0.24280433356761932,
+      "kl": 0.032867431640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0575,
+      "num_tokens": 254392535.0,
+      "reward": 1.5252233743667603,
+      "reward_std": 0.3197804391384125,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6636161208152771,
+      "rewards/curriculum_aware_reward_fn/std": 0.4355611503124237,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0982142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4086.0,
+      "completions/mean_length": 1836.180908203125,
+      "completions/mean_terminated_length": 1590.0618896484375,
+      "completions/min_length": 207.0,
+      "completions/min_terminated_length": 207.0,
+      "epoch": 0.9200618875709129,
+      "grad_norm": 0.24582676589488983,
+      "kl": 0.0306854248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0706,
+      "num_tokens": 255481207.0,
+      "reward": 1.5697544813156128,
+      "reward_std": 0.3073837459087372,
+      "rewards/code_format_reward/mean": 0.9174107313156128,
+      "rewards/code_format_reward/std": 0.2755681276321411,
+      "rewards/curriculum_aware_reward_fn/mean": 0.65234375,
+      "rewards/curriculum_aware_reward_fn/std": 0.4572802186012268,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1116071428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4023.0,
+      "completions/mean_length": 1784.857177734375,
+      "completions/mean_terminated_length": 1494.5125732421875,
+      "completions/min_length": 270.0,
+      "completions/min_terminated_length": 270.0,
+      "epoch": 0.924187725631769,
+      "grad_norm": 0.32395270466804504,
+      "kl": 0.029998779296875,
+      "learning_rate": 1e-06,
+      "loss": 0.1009,
+      "num_tokens": 256559521.0,
+      "reward": 1.5242189168930054,
+      "reward_std": 0.3307342529296875,
+      "rewards/code_format_reward/mean": 0.8928571343421936,
+      "rewards/code_format_reward/std": 0.3096405565738678,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6313616633415222,
+      "rewards/curriculum_aware_reward_fn/std": 0.43223538994789124,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0803571428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4072.0,
+      "completions/mean_length": 1617.3192138671875,
+      "completions/mean_terminated_length": 1400.7354736328125,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.9283135636926251,
+      "grad_norm": 0.21705108880996704,
+      "kl": 0.032135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0599,
+      "num_tokens": 257553552.0,
+      "reward": 1.4319196939468384,
+      "reward_std": 0.19349054992198944,
+      "rewards/code_format_reward/mean": 0.9196428656578064,
+      "rewards/code_format_reward/std": 0.2721492052078247,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5122767686843872,
+      "rewards/curriculum_aware_reward_fn/std": 0.4762067496776581,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1227678571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4022.0,
+      "completions/mean_length": 1747.665283203125,
+      "completions/mean_terminated_length": 1419.017822265625,
+      "completions/min_length": 168.0,
+      "completions/min_terminated_length": 168.0,
+      "epoch": 0.9324394017534812,
+      "grad_norm": 0.3296845555305481,
+      "kl": 0.0321044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0784,
+      "num_tokens": 258600945.0,
+      "reward": 1.543861746788025,
+      "reward_std": 0.30385205149650574,
+      "rewards/code_format_reward/mean": 0.8772321343421936,
+      "rewards/code_format_reward/std": 0.3285374045372009,
+      "rewards/curriculum_aware_reward_fn/mean": 0.666629433631897,
+      "rewards/curriculum_aware_reward_fn/std": 0.4261413514614105,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1785714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4091.0,
+      "completions/mean_length": 1931.638427734375,
+      "completions/mean_terminated_length": 1461.125,
+      "completions/min_length": 226.0,
+      "completions/min_terminated_length": 226.0,
+      "epoch": 0.9365652398143373,
+      "grad_norm": 0.23225350677967072,
+      "kl": 0.0313873291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.1046,
+      "num_tokens": 259729778.0,
+      "reward": 1.358147382736206,
+      "reward_std": 0.32802098989486694,
+      "rewards/code_format_reward/mean": 0.8214285969734192,
+      "rewards/code_format_reward/std": 0.3834212124347687,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5367187857627869,
+      "rewards/curriculum_aware_reward_fn/std": 0.46693459153175354,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1540178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4088.0,
+      "completions/mean_length": 1851.51123046875,
+      "completions/mean_terminated_length": 1442.8839111328125,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.9406910778751933,
+      "grad_norm": 0.21108375489711761,
+      "kl": 0.03179931640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0731,
+      "num_tokens": 260825178.0,
+      "reward": 1.5152901411056519,
+      "reward_std": 0.33156508207321167,
+      "rewards/code_format_reward/mean": 0.84375,
+      "rewards/code_format_reward/std": 0.36349809169769287,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6715401411056519,
+      "rewards/curriculum_aware_reward_fn/std": 0.4522121250629425,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1428571428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4065.0,
+      "completions/mean_length": 1721.6451416015625,
+      "completions/mean_terminated_length": 1325.9193115234375,
+      "completions/min_length": 209.0,
+      "completions/min_terminated_length": 209.0,
+      "epoch": 0.9448169159360496,
+      "grad_norm": 0.3119603097438812,
+      "kl": 0.035125732421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0564,
+      "num_tokens": 261878428.0,
+      "reward": 1.5291296243667603,
+      "reward_std": 0.29912132024765015,
+      "rewards/code_format_reward/mean": 0.859375,
+      "rewards/code_format_reward/std": 0.3480229377746582,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6697544455528259,
+      "rewards/curriculum_aware_reward_fn/std": 0.42858609557151794,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4075.0,
+      "completions/mean_length": 1753.6607666015625,
+      "completions/mean_terminated_length": 1305.1275634765625,
+      "completions/min_length": 232.0,
+      "completions/min_terminated_length": 232.0,
+      "epoch": 0.9489427539969056,
+      "grad_norm": 0.22429104149341583,
+      "kl": 0.0317535400390625,
+      "learning_rate": 1e-06,
+      "loss": 0.1006,
+      "num_tokens": 262934310.0,
+      "reward": 1.381361722946167,
+      "reward_std": 0.3329118490219116,
+      "rewards/code_format_reward/mean": 0.8392857313156128,
+      "rewards/code_format_reward/std": 0.3676777780056,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5420759320259094,
+      "rewards/curriculum_aware_reward_fn/std": 0.4664093852043152,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4090.0,
+      "completions/mean_length": 1739.4263916015625,
+      "completions/mean_terminated_length": 1360.9093017578125,
+      "completions/min_length": 313.0,
+      "completions/min_terminated_length": 313.0,
+      "epoch": 0.9530685920577617,
+      "grad_norm": 0.22260624170303345,
+      "kl": 0.032623291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.065,
+      "num_tokens": 263976059.0,
+      "reward": 1.5345982313156128,
+      "reward_std": 0.28138771653175354,
+      "rewards/code_format_reward/mean": 0.8683035969734192,
+      "rewards/code_format_reward/std": 0.3385384678840637,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6662946343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4627295732498169,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0892857142857143,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4026.0,
+      "completions/mean_length": 1576.6473388671875,
+      "completions/mean_terminated_length": 1329.6519775390625,
+      "completions/min_length": 155.0,
+      "completions/min_terminated_length": 155.0,
+      "epoch": 0.9571944301186178,
+      "grad_norm": 0.22776390612125397,
+      "kl": 0.03472900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.041,
+      "num_tokens": 264949986.0,
+      "reward": 1.6268973350524902,
+      "reward_std": 0.2715020775794983,
+      "rewards/code_format_reward/mean": 0.9107142686843872,
+      "rewards/code_format_reward/std": 0.2854745090007782,
+      "rewards/curriculum_aware_reward_fn/mean": 0.716183066368103,
+      "rewards/curriculum_aware_reward_fn/std": 0.4235256612300873,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1607142857142857,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4092.0,
+      "completions/mean_length": 1813.0826416015625,
+      "completions/mean_terminated_length": 1375.9281005859375,
+      "completions/min_length": 221.0,
+      "completions/min_terminated_length": 221.0,
+      "epoch": 0.9613202681794739,
+      "grad_norm": 0.5938295125961304,
+      "kl": 0.03570556640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0627,
+      "num_tokens": 266030928.0,
+      "reward": 1.4172991514205933,
+      "reward_std": 0.27423596382141113,
+      "rewards/code_format_reward/mean": 0.8392857313156128,
+      "rewards/code_format_reward/std": 0.3676777780056,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5780134201049805,
+      "rewards/curriculum_aware_reward_fn/std": 0.4604513943195343,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1540178571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4043.0,
+      "completions/mean_length": 1860.4107666015625,
+      "completions/mean_terminated_length": 1453.40380859375,
+      "completions/min_length": 247.0,
+      "completions/min_terminated_length": 247.0,
+      "epoch": 0.9654461062403301,
+      "grad_norm": 0.2252698838710785,
+      "kl": 0.032440185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0919,
+      "num_tokens": 267137231.0,
+      "reward": 1.4508929252624512,
+      "reward_std": 0.35087722539901733,
+      "rewards/code_format_reward/mean": 0.84375,
+      "rewards/code_format_reward/std": 0.36349809169769287,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6071428656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.4518719017505646,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1495535714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4072.0,
+      "completions/mean_length": 1817.727783203125,
+      "completions/mean_terminated_length": 1417.086669921875,
+      "completions/min_length": 161.0,
+      "completions/min_terminated_length": 161.0,
+      "epoch": 0.9695719443011862,
+      "grad_norm": 0.2286522537469864,
+      "kl": 0.0334320068359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0584,
+      "num_tokens": 268223397.0,
+      "reward": 1.3939732313156128,
+      "reward_std": 0.27675938606262207,
+      "rewards/code_format_reward/mean": 0.8526785969734192,
+      "rewards/code_format_reward/std": 0.3548222780227661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5412946343421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.48773789405822754,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1897321428571429,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4053.0,
+      "completions/mean_length": 1985.325927734375,
+      "completions/mean_terminated_length": 1491.0909423828125,
+      "completions/min_length": 217.0,
+      "completions/min_terminated_length": 217.0,
+      "epoch": 0.9736977823620423,
+      "grad_norm": 0.20148524641990662,
+      "kl": 0.03509521484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0899,
+      "num_tokens": 269388133.0,
+      "reward": 1.4079241752624512,
+      "reward_std": 0.3953123390674591,
+      "rewards/code_format_reward/mean": 0.8080357313156128,
+      "rewards/code_format_reward/std": 0.3942854106426239,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5998883843421936,
+      "rewards/curriculum_aware_reward_fn/std": 0.4576530158519745,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1383928571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4057.0,
+      "completions/mean_length": 1784.509033203125,
+      "completions/mean_terminated_length": 1413.233154296875,
+      "completions/min_length": 174.0,
+      "completions/min_terminated_length": 174.0,
+      "epoch": 0.9778236204228984,
+      "grad_norm": 0.21030192077159882,
+      "kl": 0.036651611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0617,
+      "num_tokens": 270444745.0,
+      "reward": 1.5034600496292114,
+      "reward_std": 0.2465612292289734,
+      "rewards/code_format_reward/mean": 0.8616071343421936,
+      "rewards/code_format_reward/std": 0.34569787979125977,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6418526768684387,
+      "rewards/curriculum_aware_reward_fn/std": 0.4315384030342102,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1071428571428571,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4069.0,
+      "completions/mean_length": 1658.04248046875,
+      "completions/mean_terminated_length": 1365.4874267578125,
+      "completions/min_length": 182.0,
+      "completions/min_terminated_length": 182.0,
+      "epoch": 0.9819494584837545,
+      "grad_norm": 0.22513382136821747,
+      "kl": 0.03680419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0645,
+      "num_tokens": 271464328.0,
+      "reward": 1.5197545289993286,
+      "reward_std": 0.30367511510849,
+      "rewards/code_format_reward/mean": 0.8928571343421936,
+      "rewards/code_format_reward/std": 0.3096405565738678,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6268973350524902,
+      "rewards/curriculum_aware_reward_fn/std": 0.4268620014190674,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1473214285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4087.0,
+      "completions/mean_length": 1725.8326416015625,
+      "completions/mean_terminated_length": 1316.3272705078125,
+      "completions/min_length": 205.0,
+      "completions/min_terminated_length": 205.0,
+      "epoch": 0.9860752965446106,
+      "grad_norm": 0.2153979241847992,
+      "kl": 0.036102294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0744,
+      "num_tokens": 272505594.0,
+      "reward": 1.3497768640518188,
+      "reward_std": 0.31243571639060974,
+      "rewards/code_format_reward/mean": 0.8526785969734192,
+      "rewards/code_format_reward/std": 0.3548222780227661,
+      "rewards/curriculum_aware_reward_fn/mean": 0.4970982074737549,
+      "rewards/curriculum_aware_reward_fn/std": 0.46139857172966003,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1339285714285714,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4089.0,
+      "completions/mean_length": 1712.7344970703125,
+      "completions/mean_terminated_length": 1344.1881103515625,
+      "completions/min_length": 299.0,
+      "completions/min_terminated_length": 299.0,
+      "epoch": 0.9902011346054668,
+      "grad_norm": 0.22380459308624268,
+      "kl": 0.0360107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.058,
+      "num_tokens": 273546494.0,
+      "reward": 1.521875023841858,
+      "reward_std": 0.3063981533050537,
+      "rewards/code_format_reward/mean": 0.8660714030265808,
+      "rewards/code_format_reward/std": 0.34095630049705505,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.42365700006484985,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.1160714285714286,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4063.0,
+      "completions/mean_length": 1675.0223388671875,
+      "completions/mean_terminated_length": 1357.1162109375,
+      "completions/min_length": 220.0,
+      "completions/min_terminated_length": 220.0,
+      "epoch": 0.9943269726663229,
+      "grad_norm": 0.22325699031352997,
+      "kl": 0.036895751953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0827,
+      "num_tokens": 274574076.0,
+      "reward": 1.5726563930511475,
+      "reward_std": 0.253899484872818,
+      "rewards/code_format_reward/mean": 0.8816964030265808,
+      "rewards/code_format_reward/std": 0.32332828640937805,
+      "rewards/curriculum_aware_reward_fn/mean": 0.6909598112106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153672456741333,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0948905109489051,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 4078.0,
+      "completions/mean_length": 1584.3941650390625,
+      "completions/mean_terminated_length": 1321.08056640625,
+      "completions/min_length": 244.0,
+      "completions/min_terminated_length": 244.0,
+      "epoch": 0.998452810727179,
+      "grad_norm": 0.24855820834636688,
+      "kl": 0.037872314453125,
+      "learning_rate": 1e-06,
+      "loss": 0.1187,
+      "num_tokens": 275609321.0,
+      "reward": 1.4710938930511475,
+      "reward_std": 0.3146342933177948,
+      "rewards/code_format_reward/mean": 0.8816964030265808,
+      "rewards/code_format_reward/std": 0.32332828640937805,
+      "rewards/curriculum_aware_reward_fn/mean": 0.5893973112106323,
+      "rewards/curriculum_aware_reward_fn/std": 0.4410484731197357,
+      "step": 242
+    },
+    {
+      "epoch": 0.998452810727179,
+      "step": 242,
+      "total_flos": 0.0,
+      "train_loss": 0.08443626471047583,
+      "train_runtime": 92210.0342,
+      "train_samples_per_second": 0.168,
+      "train_steps_per_second": 0.003
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 242,
+  "num_input_tokens_seen": 275609321,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}