{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998452810727179, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4241071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2706.9443359375, "completions/mean_terminated_length": 1683.99609375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.0041258380608561115, "grad_norm": 0.09953349083662033, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1480384.0, "reward": 0.04453125223517418, "reward_std": 0.07316319644451141, "rewards/code_format_reward/mean": 0.03125, "rewards/code_format_reward/std": 0.17418713867664337, "rewards/curriculum_aware_reward_fn/mean": 0.013281249441206455, "rewards/curriculum_aware_reward_fn/std": 0.06694811582565308, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4151785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 2685.828125, "completions/mean_terminated_length": 1684.7137451171875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.008251676121712223, "grad_norm": 0.48230159282684326, "kl": 0.0005209445953369141, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 2933843.0, "reward": 0.05323661491274834, "reward_std": 0.07855270802974701, "rewards/code_format_reward/mean": 0.02901785634458065, "rewards/code_format_reward/std": 0.16804419457912445, "rewards/curriculum_aware_reward_fn/mean": 0.02421875111758709, "rewards/curriculum_aware_reward_fn/std": 0.08892504870891571, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4263392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 2555.9130859375, "completions/mean_terminated_length": 1411.3345947265625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.012377514182568335, "grad_norm": 0.14204378426074982, "kl": 0.0005156993865966797, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 4332173.0, "reward": 0.07834821939468384, "reward_std": 0.11501716077327728, "rewards/code_format_reward/mean": 0.0424107126891613, "rewards/code_format_reward/std": 0.20174959301948547, "rewards/curriculum_aware_reward_fn/mean": 0.03593749552965164, "rewards/curriculum_aware_reward_fn/std": 0.12083236873149872, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 3068.216552734375, "completions/mean_terminated_length": 1607.0972900390625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.016503352243424446, "grad_norm": 0.11061254143714905, "kl": 0.0005185604095458984, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 5987581.0, "reward": 0.07901786267757416, "reward_std": 0.10961197316646576, "rewards/code_format_reward/mean": 0.0446428582072258, "rewards/code_format_reward/std": 0.2067493349313736, "rewards/curriculum_aware_reward_fn/mean": 0.03437500074505806, "rewards/curriculum_aware_reward_fn/std": 0.10427789390087128, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5669642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2925.247802734375, "completions/mean_terminated_length": 1392.4072265625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.020629190304280558, "grad_norm": 27.4289493560791, "kl": 2.3755736351013184, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 7577875.0, "reward": 0.1395089328289032, "reward_std": 0.1650572568178177, "rewards/code_format_reward/mean": 0.0848214253783226, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.0546875, "rewards/curriculum_aware_reward_fn/std": 0.1355670541524887, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 2588.587158203125, "completions/mean_terminated_length": 1576.1455078125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.02475502836513667, "grad_norm": 0.22997821867465973, "kl": 0.0009093284606933594, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 9013705.0, "reward": 0.240513414144516, "reward_std": 0.2777326703071594, "rewards/code_format_reward/mean": 0.1584821492433548, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.08203125, "rewards/curriculum_aware_reward_fn/std": 0.14842838048934937, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4151785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 2580.649658203125, "completions/mean_terminated_length": 1504.866455078125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.02888086642599278, "grad_norm": 0.17054465413093567, "kl": 0.0014181137084960938, "learning_rate": 1e-06, "loss": 0.0514, "num_tokens": 10451404.0, "reward": 0.3081473708152771, "reward_std": 0.285627156496048, "rewards/code_format_reward/mean": 0.1964285671710968, "rewards/code_format_reward/std": 0.39774051308631897, "rewards/curriculum_aware_reward_fn/mean": 0.11171875149011612, "rewards/curriculum_aware_reward_fn/std": 0.1633402407169342, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4665178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 2633.1005859375, "completions/mean_terminated_length": 1353.828369140625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.03300670448684889, "grad_norm": 0.16662493348121643, "kl": 0.0016994476318359375, "learning_rate": 1e-06, "loss": 0.0372, "num_tokens": 11895234.0, "reward": 0.3180803656578064, "reward_std": 0.2946862578392029, "rewards/code_format_reward/mean": 0.2008928507566452, "rewards/code_format_reward/std": 0.4011159837245941, "rewards/curriculum_aware_reward_fn/mean": 0.1171875, "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5022321428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2903.497802734375, "completions/mean_terminated_length": 1700.300537109375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.037132542547705004, "grad_norm": 0.16252301633358002, "kl": 0.0015096664428710938, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 13458326.0, "reward": 0.34631696343421936, "reward_std": 0.35680821537971497, "rewards/code_format_reward/mean": 0.2299107164144516, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.11640624701976776, "rewards/curriculum_aware_reward_fn/std": 0.1650836169719696, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4508928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2713.796875, "completions/mean_terminated_length": 1578.8170166015625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.041258380608561115, "grad_norm": 0.15428705513477325, "kl": 0.0021228790283203125, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 14945420.0, "reward": 0.5176340341567993, "reward_std": 0.32653772830963135, "rewards/code_format_reward/mean": 0.3504464328289032, "rewards/code_format_reward/std": 0.47764310240745544, "rewards/curriculum_aware_reward_fn/mean": 0.16718748211860657, "rewards/curriculum_aware_reward_fn/std": 0.1750209629535675, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5022321428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2780.732177734375, "completions/mean_terminated_length": 1453.668212890625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.04538421866941723, "grad_norm": 0.15486416220664978, "kl": 0.002979278564453125, "learning_rate": 1e-06, "loss": 0.0472, "num_tokens": 16457062.0, "reward": 0.5746652483940125, "reward_std": 0.2564198970794678, "rewards/code_format_reward/mean": 0.4129464328289032, "rewards/code_format_reward/std": 0.49291378259658813, "rewards/curriculum_aware_reward_fn/mean": 0.16171874105930328, "rewards/curriculum_aware_reward_fn/std": 0.1746903657913208, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4709821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2697.83935546875, "completions/mean_terminated_length": 1453.063232421875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.04951005673027334, "grad_norm": 0.16553913056850433, "kl": 0.0033550262451171875, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 17934487.0, "reward": 0.6356027722358704, "reward_std": 0.23742982745170593, "rewards/code_format_reward/mean": 0.4754464328289032, "rewards/code_format_reward/std": 0.4999549984931946, "rewards/curriculum_aware_reward_fn/mean": 0.16015625, "rewards/curriculum_aware_reward_fn/std": 0.17456425726413727, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 3273.172119140625, "completions/mean_terminated_length": 1989.5599365234375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.05363589479112945, "grad_norm": 0.16705065965652466, "kl": 0.005349159240722656, "learning_rate": 1e-06, "loss": 0.044, "num_tokens": 19670249.0, "reward": 0.4877232611179352, "reward_std": 0.23604732751846313, "rewards/code_format_reward/mean": 0.3705357015132904, "rewards/code_format_reward/std": 0.4834881126880646, "rewards/curriculum_aware_reward_fn/mean": 0.1171875, "rewards/curriculum_aware_reward_fn/std": 0.16535945236682892, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4977678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 2728.977783203125, "completions/mean_terminated_length": 1374.106689453125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.05776173285198556, "grad_norm": 5.121767997741699, "kl": 0.0045318603515625, "learning_rate": 1e-06, "loss": 0.0457, "num_tokens": 21178732.0, "reward": 0.6415179371833801, "reward_std": 0.22798539698123932, "rewards/code_format_reward/mean": 0.4821428656578064, "rewards/code_format_reward/std": 0.5002396702766418, "rewards/curriculum_aware_reward_fn/mean": 0.15937499701976776, "rewards/curriculum_aware_reward_fn/std": 0.1851634681224823, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4866071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 2634.060302734375, "completions/mean_terminated_length": 1248.3956298828125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06188757091284167, "grad_norm": 0.15185804665088654, "kl": 0.004016876220703125, "learning_rate": 1e-06, "loss": 0.0493, "num_tokens": 22614221.0, "reward": 0.6506697535514832, "reward_std": 0.2517409324645996, "rewards/code_format_reward/mean": 0.4866071343421936, "rewards/code_format_reward/std": 0.5003793835639954, "rewards/curriculum_aware_reward_fn/mean": 0.1640625, "rewards/curriculum_aware_reward_fn/std": 0.17485311627388, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3995535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2429.9375, "completions/mean_terminated_length": 1321.293701171875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.06601340897369778, "grad_norm": 0.14529380202293396, "kl": 0.004016876220703125, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 23988265.0, "reward": 0.7601563334465027, "reward_std": 0.19341711699962616, "rewards/code_format_reward/mean": 0.578125, "rewards/code_format_reward/std": 0.4944108724594116, "rewards/curriculum_aware_reward_fn/mean": 0.18203125894069672, "rewards/curriculum_aware_reward_fn/std": 0.1750541627407074, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2562.794677734375, "completions/mean_terminated_length": 1443.9691162109375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.07013924703455389, "grad_norm": 0.17270275950431824, "kl": 0.003620147705078125, "learning_rate": 1e-06, "loss": 0.095, "num_tokens": 25396314.0, "reward": 0.7247769236564636, "reward_std": 0.30583035945892334, "rewards/code_format_reward/mean": 0.5513392686843872, "rewards/code_format_reward/std": 0.49791330099105835, "rewards/curriculum_aware_reward_fn/mean": 0.17343749105930328, "rewards/curriculum_aware_reward_fn/std": 0.1858164519071579, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4107142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 2482.625, "completions/mean_terminated_length": 1358.151611328125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.07426508509541001, "grad_norm": 0.1558626890182495, "kl": 0.0041179656982421875, "learning_rate": 1e-06, "loss": 0.0645, "num_tokens": 26778709.0, "reward": 0.7610491514205933, "reward_std": 0.21481552720069885, "rewards/code_format_reward/mean": 0.5758928656578064, "rewards/code_format_reward/std": 0.4947591722011566, "rewards/curriculum_aware_reward_fn/mean": 0.18515624105930328, "rewards/curriculum_aware_reward_fn/std": 0.1749003380537033, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4888392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2817.154052734375, "completions/mean_terminated_length": 1594.15283203125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.07839092315626611, "grad_norm": 0.1250939816236496, "kl": 0.0029201507568359375, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 28332113.0, "reward": 0.6272321939468384, "reward_std": 0.19977706670761108, "rewards/code_format_reward/mean": 0.4866071343421936, "rewards/code_format_reward/std": 0.5003793835639954, "rewards/curriculum_aware_reward_fn/mean": 0.140625, "rewards/curriculum_aware_reward_fn/std": 0.1717824935913086, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5178571428571428, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 2882.270263671875, "completions/mean_terminated_length": 1578.63427734375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.08251676121712223, "grad_norm": 0.14638496935367584, "kl": 0.003143310546875, "learning_rate": 1e-06, "loss": 0.0602, "num_tokens": 29893423.0, "reward": 0.6104912161827087, "reward_std": 0.22606003284454346, "rewards/code_format_reward/mean": 0.4776785671710968, "rewards/code_format_reward/std": 0.5000599026679993, "rewards/curriculum_aware_reward_fn/mean": 0.1328125, "rewards/curriculum_aware_reward_fn/std": 0.1700286567211151, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4441964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2700.0693359375, "completions/mean_terminated_length": 1584.4456787109375, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.08664259927797834, "grad_norm": 0.1355072557926178, "kl": 0.0038661956787109375, "learning_rate": 1e-06, "loss": 0.0566, "num_tokens": 31364917.0, "reward": 0.7122768759727478, "reward_std": 0.1853857785463333, "rewards/code_format_reward/mean": 0.5513392686843872, "rewards/code_format_reward/std": 0.49791327118873596, "rewards/curriculum_aware_reward_fn/mean": 0.16093750298023224, "rewards/curriculum_aware_reward_fn/std": 0.1746290773153305, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4553571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 2679.185302734375, "completions/mean_terminated_length": 1494.6351318359375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.09076843733883445, "grad_norm": 0.9295708537101746, "kl": 0.00348663330078125, "learning_rate": 1e-06, "loss": 0.0999, "num_tokens": 32823191.0, "reward": 0.6949778199195862, "reward_std": 0.26760414242744446, "rewards/code_format_reward/mean": 0.5379464030265808, "rewards/code_format_reward/std": 0.49911534786224365, "rewards/curriculum_aware_reward_fn/mean": 0.15703125298023224, "rewards/curriculum_aware_reward_fn/std": 0.17583517730236053, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 2562.45556640625, "completions/mean_terminated_length": 1532.462646484375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.09489427539969056, "grad_norm": 0.21064940094947815, "kl": 0.0037479400634765625, "learning_rate": 1e-06, "loss": 0.0993, "num_tokens": 34234574.0, "reward": 0.7625001072883606, "reward_std": 0.23642751574516296, "rewards/code_format_reward/mean": 0.59375, "rewards/code_format_reward/std": 0.49168136715888977, "rewards/curriculum_aware_reward_fn/mean": 0.16875000298023224, "rewards/curriculum_aware_reward_fn/std": 0.18123672902584076, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3660714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2425.8037109375, "completions/mean_terminated_length": 1461.323974609375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.09902011346054668, "grad_norm": 26.764358520507812, "kl": 1.740875244140625, "learning_rate": 1e-06, "loss": 0.1108, "num_tokens": 35594383.0, "reward": 0.7958706021308899, "reward_std": 0.2806268334388733, "rewards/code_format_reward/mean": 0.6294642686843872, "rewards/code_format_reward/std": 0.48348814249038696, "rewards/curriculum_aware_reward_fn/mean": 0.16640625894069672, "rewards/curriculum_aware_reward_fn/std": 0.17498427629470825, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3727678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 2481.263427734375, "completions/mean_terminated_length": 1521.6156005859375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.10314595152140278, "grad_norm": 0.15260478854179382, "kl": 0.0043849945068359375, "learning_rate": 1e-06, "loss": 0.0532, "num_tokens": 36977246.0, "reward": 0.8117188215255737, "reward_std": 0.20510579645633698, "rewards/code_format_reward/mean": 0.625, "rewards/code_format_reward/std": 0.48466411232948303, "rewards/curriculum_aware_reward_fn/mean": 0.18671874701976776, "rewards/curriculum_aware_reward_fn/std": 0.17480237782001495, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3883928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 2414.290283203125, "completions/mean_terminated_length": 1346.343017578125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.1072717895822589, "grad_norm": 0.16675879061222076, "kl": 0.004638671875, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 38337007.0, "reward": 0.7741072177886963, "reward_std": 0.18030057847499847, "rewards/code_format_reward/mean": 0.5959821343421936, "rewards/code_format_reward/std": 0.49124953150749207, "rewards/curriculum_aware_reward_fn/mean": 0.17812500894069672, "rewards/curriculum_aware_reward_fn/std": 0.17516769468784332, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 2726.602783203125, "completions/mean_terminated_length": 1539.791748046875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.111397627643115, "grad_norm": 0.15529079735279083, "kl": 0.0050945281982421875, "learning_rate": 1e-06, "loss": 0.0957, "num_tokens": 39834843.0, "reward": 0.7099331021308899, "reward_std": 0.26638171076774597, "rewards/code_format_reward/mean": 0.5357142686843872, "rewards/code_format_reward/std": 0.4992803931236267, "rewards/curriculum_aware_reward_fn/mean": 0.17421874403953552, "rewards/curriculum_aware_reward_fn/std": 0.18582138419151306, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4508928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 2655.837158203125, "completions/mean_terminated_length": 1473.26416015625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.11552346570397112, "grad_norm": 0.14765554666519165, "kl": 0.004909515380859375, "learning_rate": 1e-06, "loss": 0.0619, "num_tokens": 41296734.0, "reward": 0.7210938334465027, "reward_std": 0.2006417065858841, "rewards/code_format_reward/mean": 0.546875, "rewards/code_format_reward/std": 0.49835437536239624, "rewards/curriculum_aware_reward_fn/mean": 0.17421875894069672, "rewards/curriculum_aware_reward_fn/std": 0.1751938909292221, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4174107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2603.03125, "completions/mean_terminated_length": 1533.3563232421875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.11964930376482723, "grad_norm": 0.20744574069976807, "kl": 0.004703521728515625, "learning_rate": 1e-06, "loss": 0.0741, "num_tokens": 42740750.0, "reward": 0.7460938096046448, "reward_std": 0.24164797365665436, "rewards/code_format_reward/mean": 0.578125, "rewards/code_format_reward/std": 0.4944108724594116, "rewards/curriculum_aware_reward_fn/mean": 0.16796875, "rewards/curriculum_aware_reward_fn/std": 0.17815767228603363, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2604.0849609375, "completions/mean_terminated_length": 1525.3154296875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.12377514182568335, "grad_norm": 0.1693020910024643, "kl": 0.005634307861328125, "learning_rate": 1e-06, "loss": 0.1051, "num_tokens": 44171602.0, "reward": 0.7592634558677673, "reward_std": 0.28111451864242554, "rewards/code_format_reward/mean": 0.5803571343421936, "rewards/code_format_reward/std": 0.4940521717071533, "rewards/curriculum_aware_reward_fn/mean": 0.17890623211860657, "rewards/curriculum_aware_reward_fn/std": 0.1751519739627838, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4263392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 2548.7099609375, "completions/mean_terminated_length": 1398.7781982421875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.12790097988653945, "grad_norm": 1.6184011697769165, "kl": 0.004535675048828125, "learning_rate": 1e-06, "loss": 0.0995, "num_tokens": 45596210.0, "reward": 0.7340403199195862, "reward_std": 0.2579808235168457, "rewards/code_format_reward/mean": 0.5691964030265808, "rewards/code_format_reward/std": 0.4957422614097595, "rewards/curriculum_aware_reward_fn/mean": 0.16484375298023224, "rewards/curriculum_aware_reward_fn/std": 0.18406172096729279, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4040178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 2474.984375, "completions/mean_terminated_length": 1376.0936279296875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.13202681794739557, "grad_norm": 0.1796254962682724, "kl": 0.0054779052734375, "learning_rate": 1e-06, "loss": 0.0946, "num_tokens": 46976449.0, "reward": 0.7672991156578064, "reward_std": 0.24826881289482117, "rewards/code_format_reward/mean": 0.5915178656578064, "rewards/code_format_reward/std": 0.49210265278816223, "rewards/curriculum_aware_reward_fn/mean": 0.17578125, "rewards/curriculum_aware_reward_fn/std": 0.1751938909292221, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3995535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3867.0, "completions/mean_length": 2514.62060546875, "completions/mean_terminated_length": 1462.3271484375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.1361526560082517, "grad_norm": 0.17813006043434143, "kl": 0.005443572998046875, "learning_rate": 1e-06, "loss": 0.0981, "num_tokens": 48381438.0, "reward": 0.777901828289032, "reward_std": 0.252145916223526, "rewards/code_format_reward/mean": 0.5982142686843872, "rewards/code_format_reward/std": 0.49080711603164673, "rewards/curriculum_aware_reward_fn/mean": 0.1796875, "rewards/curriculum_aware_reward_fn/std": 0.17513276636600494, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2869.640869140625, "completions/mean_terminated_length": 1806.7958984375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.14027849406910778, "grad_norm": 0.21324868500232697, "kl": 0.0045490264892578125, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 49929075.0, "reward": 0.6772322058677673, "reward_std": 0.3161991536617279, "rewards/code_format_reward/mean": 0.5334821343421936, "rewards/code_format_reward/std": 0.4994353950023651, "rewards/curriculum_aware_reward_fn/mean": 0.14374999701976776, "rewards/curriculum_aware_reward_fn/std": 0.17237970232963562, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 2433.560302734375, "completions/mean_terminated_length": 1140.5516357421875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1444043321299639, "grad_norm": 0.1527215540409088, "kl": 0.00621795654296875, "learning_rate": 1e-06, "loss": 0.0569, "num_tokens": 51304546.0, "reward": 0.724330484867096, "reward_std": 0.20448964834213257, "rewards/code_format_reward/mean": 0.5602678656578064, "rewards/code_format_reward/std": 0.49690937995910645, "rewards/curriculum_aware_reward_fn/mean": 0.1640625, "rewards/curriculum_aware_reward_fn/std": 0.17485311627388, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 2354.341552734375, "completions/mean_terminated_length": 1520.8746337890625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.14853017019082002, "grad_norm": 1.0937434434890747, "kl": 0.04998016357421875, "learning_rate": 1e-06, "loss": 0.1111, "num_tokens": 52629421.0, "reward": 0.8491072058677673, "reward_std": 0.24866290390491486, "rewards/code_format_reward/mean": 0.6741071343421936, "rewards/code_format_reward/std": 0.4692314565181732, "rewards/curriculum_aware_reward_fn/mean": 0.17499998211860657, "rewards/curriculum_aware_reward_fn/std": 0.1751956343650818, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 2291.004638671875, "completions/mean_terminated_length": 1409.4949951171875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.15265600825167613, "grad_norm": 1.7211939096450806, "kl": 0.0144500732421875, "learning_rate": 1e-06, "loss": 0.1017, "num_tokens": 53924717.0, "reward": 0.8410715460777283, "reward_std": 0.2989359498023987, "rewards/code_format_reward/mean": 0.6629464030265808, "rewards/code_format_reward/std": 0.47323182225227356, "rewards/curriculum_aware_reward_fn/mean": 0.17812500894069672, "rewards/curriculum_aware_reward_fn/std": 0.17516770958900452, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 2116.78369140625, "completions/mean_terminated_length": 1350.832763671875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.15678184631253222, "grad_norm": 0.1926206350326538, "kl": 0.007659912109375, "learning_rate": 1e-06, "loss": 0.1211, "num_tokens": 55131378.0, "reward": 0.9237724542617798, "reward_std": 0.24951638281345367, "rewards/code_format_reward/mean": 0.7120535969734192, "rewards/code_format_reward/std": 0.4533122181892395, "rewards/curriculum_aware_reward_fn/mean": 0.21171875298023224, "rewards/curriculum_aware_reward_fn/std": 0.17129571735858917, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3303571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2307.48681640625, "completions/mean_terminated_length": 1425.1533203125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.16090768437338834, "grad_norm": 0.19070778787136078, "kl": 0.00756072998046875, "learning_rate": 1e-06, "loss": 0.126, "num_tokens": 56436641.0, "reward": 0.8602679371833801, "reward_std": 0.29408273100852966, "rewards/code_format_reward/mean": 0.6696428656578064, "rewards/code_format_reward/std": 0.4708675146102905, "rewards/curriculum_aware_reward_fn/mean": 0.19062499701976776, "rewards/curriculum_aware_reward_fn/std": 0.1744959056377411, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4776785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 2806.250244140625, "completions/mean_terminated_length": 1626.735107421875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.16503352243424446, "grad_norm": 0.1646624356508255, "kl": 0.00652313232421875, "learning_rate": 1e-06, "loss": 0.1101, "num_tokens": 57958364.0, "reward": 0.6613839864730835, "reward_std": 0.3108566403388977, "rewards/code_format_reward/mean": 0.5223214030265808, "rewards/code_format_reward/std": 0.5000599026679993, "rewards/curriculum_aware_reward_fn/mean": 0.13906249403953552, "rewards/curriculum_aware_reward_fn/std": 0.17146170139312744, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3950892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2462.138427734375, "completions/mean_terminated_length": 1395.003662109375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.16915936049510058, "grad_norm": 0.18411129713058472, "kl": 0.0084075927734375, "learning_rate": 1e-06, "loss": 0.0975, "num_tokens": 59332904.0, "reward": 0.770647406578064, "reward_std": 0.2517809271812439, "rewards/code_format_reward/mean": 0.6026785969734192, "rewards/code_format_reward/std": 0.48989057540893555, "rewards/curriculum_aware_reward_fn/mean": 0.16796875, "rewards/curriculum_aware_reward_fn/std": 0.1812080442905426, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 1976.6943359375, "completions/mean_terminated_length": 962.5050048828125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.17328519855595667, "grad_norm": 0.16723960638046265, "kl": 0.01207733154296875, "learning_rate": 1e-06, "loss": 0.093, "num_tokens": 60460705.0, "reward": 0.8786831498146057, "reward_std": 0.18310710787773132, "rewards/code_format_reward/mean": 0.6763392686843872, "rewards/code_format_reward/std": 0.46839532256126404, "rewards/curriculum_aware_reward_fn/mean": 0.20234373211860657, "rewards/curriculum_aware_reward_fn/std": 0.17304380238056183, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3571428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 2386.9375, "completions/mean_terminated_length": 1437.4583740234375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.1774110366168128, "grad_norm": 0.6888564825057983, "kl": 0.029937744140625, "learning_rate": 1e-06, "loss": 0.056, "num_tokens": 61791601.0, "reward": 0.8256697058677673, "reward_std": 0.24204161763191223, "rewards/code_format_reward/mean": 0.6428571343421936, "rewards/code_format_reward/std": 0.47969305515289307, "rewards/curriculum_aware_reward_fn/mean": 0.18281248211860657, "rewards/curriculum_aware_reward_fn/std": 0.18417634069919586, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2290.140625, "completions/mean_terminated_length": 1425.9505615234375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.1815368746776689, "grad_norm": 0.1773161143064499, "kl": 0.00868988037109375, "learning_rate": 1e-06, "loss": 0.109, "num_tokens": 63063935.0, "reward": 0.8530134558677673, "reward_std": 0.2758612036705017, "rewards/code_format_reward/mean": 0.6584821343421936, "rewards/code_format_reward/std": 0.4747488796710968, "rewards/curriculum_aware_reward_fn/mean": 0.19453124701976776, "rewards/curriculum_aware_reward_fn/std": 0.1847914308309555, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3928571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 2577.075927734375, "completions/mean_terminated_length": 1594.24267578125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.18566271273852503, "grad_norm": 0.2087288498878479, "kl": 0.00749969482421875, "learning_rate": 1e-06, "loss": 0.1336, "num_tokens": 64508713.0, "reward": 0.7774555087089539, "reward_std": 0.30489087104797363, "rewards/code_format_reward/mean": 0.6071428656578064, "rewards/code_format_reward/std": 0.48893147706985474, "rewards/curriculum_aware_reward_fn/mean": 0.17031249403953552, "rewards/curriculum_aware_reward_fn/std": 0.17823491990566254, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1938.587158203125, "completions/mean_terminated_length": 1184.7921142578125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.18978855079938112, "grad_norm": 17.614559173583984, "kl": 1.5489120483398438, "learning_rate": 1e-06, "loss": 0.1382, "num_tokens": 65646203.0, "reward": 0.9474331736564636, "reward_std": 0.20855267345905304, "rewards/code_format_reward/mean": 0.7388392686843872, "rewards/code_format_reward/std": 0.43975841999053955, "rewards/curriculum_aware_reward_fn/mean": 0.20859374105930328, "rewards/curriculum_aware_reward_fn/std": 0.17509609460830688, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3638392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 2346.118408203125, "completions/mean_terminated_length": 1345.308837890625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.19391438886023724, "grad_norm": 0.16382618248462677, "kl": 0.009052276611328125, "learning_rate": 1e-06, "loss": 0.1029, "num_tokens": 66968234.0, "reward": 0.8010045886039734, "reward_std": 0.22550004720687866, "rewards/code_format_reward/mean": 0.6361607313156128, "rewards/code_format_reward/std": 0.4816409945487976, "rewards/curriculum_aware_reward_fn/mean": 0.16484372317790985, "rewards/curriculum_aware_reward_fn/std": 0.1749003380537033, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 1981.419677734375, "completions/mean_terminated_length": 1207.7926025390625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.19804022692109335, "grad_norm": 0.6225883960723877, "kl": 0.0121917724609375, "learning_rate": 1e-06, "loss": 0.1302, "num_tokens": 68111473.0, "reward": 0.927455484867096, "reward_std": 0.25366178154945374, "rewards/code_format_reward/mean": 0.7321428656578064, "rewards/code_format_reward/std": 0.4433377683162689, "rewards/curriculum_aware_reward_fn/mean": 0.1953125, "rewards/curriculum_aware_reward_fn/std": 0.1740114688873291, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2433035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3951.0, "completions/mean_length": 1870.482177734375, "completions/mean_terminated_length": 1154.9027099609375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.20216606498194944, "grad_norm": 0.20527565479278564, "kl": 0.0138092041015625, "learning_rate": 1e-06, "loss": 0.0978, "num_tokens": 69220251.0, "reward": 0.9450894594192505, "reward_std": 0.23362573981285095, "rewards/code_format_reward/mean": 0.7544642686843872, "rewards/code_format_reward/std": 0.43088552355766296, "rewards/curriculum_aware_reward_fn/mean": 0.19062499701976776, "rewards/curriculum_aware_reward_fn/std": 0.1851634681224823, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2098.78369140625, "completions/mean_terminated_length": 1228.201904296875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.20629190304280556, "grad_norm": 0.19599811732769012, "kl": 0.01262664794921875, "learning_rate": 1e-06, "loss": 0.1151, "num_tokens": 70421844.0, "reward": 1.0685268640518188, "reward_std": 0.3160112500190735, "rewards/code_format_reward/mean": 0.6919642686843872, "rewards/code_format_reward/std": 0.46219751238822937, "rewards/curriculum_aware_reward_fn/mean": 0.3765625059604645, "rewards/curriculum_aware_reward_fn/std": 0.33167564868927, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1994.2545166015625, "completions/mean_terminated_length": 1326.6412353515625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21041774110366168, "grad_norm": 0.21723540127277374, "kl": 0.014495849609375, "learning_rate": 1e-06, "loss": 0.1318, "num_tokens": 71580038.0, "reward": 1.153571605682373, "reward_std": 0.3419857323169708, "rewards/code_format_reward/mean": 0.7566964030265808, "rewards/code_format_reward/std": 0.42955654859542847, "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313, "rewards/curriculum_aware_reward_fn/std": 0.32353827357292175, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2369.58056640625, "completions/mean_terminated_length": 1333.7286376953125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.2145435791645178, "grad_norm": 0.18016190826892853, "kl": 0.01081085205078125, "learning_rate": 1e-06, "loss": 0.095, "num_tokens": 72933503.0, "reward": 0.9376117587089539, "reward_std": 0.27316465973854065, "rewards/code_format_reward/mean": 0.6227678656578064, "rewards/code_format_reward/std": 0.4852356016635895, "rewards/curriculum_aware_reward_fn/mean": 0.31484371423721313, "rewards/curriculum_aware_reward_fn/std": 0.33045893907546997, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3013392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 2235.310302734375, "completions/mean_terminated_length": 1432.7763671875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.2186694172253739, "grad_norm": 0.25433245301246643, "kl": 0.012481689453125, "learning_rate": 1e-06, "loss": 0.1146, "num_tokens": 74199634.0, "reward": 1.0901787281036377, "reward_std": 0.38967522978782654, "rewards/code_format_reward/mean": 0.6964285969734192, "rewards/code_format_reward/std": 0.4603137671947479, "rewards/curriculum_aware_reward_fn/mean": 0.39374998211860657, "rewards/curriculum_aware_reward_fn/std": 0.34764304757118225, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 2147.55810546875, "completions/mean_terminated_length": 1205.5960693359375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.22279525528623, "grad_norm": 0.19524136185646057, "kl": 0.01263427734375, "learning_rate": 1e-06, "loss": 0.1253, "num_tokens": 75416106.0, "reward": 1.0398439168930054, "reward_std": 0.33224788308143616, "rewards/code_format_reward/mean": 0.65625, "rewards/code_format_reward/std": 0.47548985481262207, "rewards/curriculum_aware_reward_fn/mean": 0.38359376788139343, "rewards/curriculum_aware_reward_fn/std": 0.32645100355148315, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2924107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 2168.8349609375, "completions/mean_terminated_length": 1372.435302734375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.22692109334708613, "grad_norm": 207.28619384765625, "kl": 13.938743591308594, "learning_rate": 1e-06, "loss": 0.288, "num_tokens": 76682920.0, "reward": 1.1044644117355347, "reward_std": 0.32954928278923035, "rewards/code_format_reward/mean": 0.7075892686843872, "rewards/code_format_reward/std": 0.4553784728050232, "rewards/curriculum_aware_reward_fn/mean": 0.39687496423721313, "rewards/curriculum_aware_reward_fn/std": 0.35115858912467957, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 2275.6630859375, "completions/mean_terminated_length": 1395.6324462890625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.23104693140794225, "grad_norm": 0.3154338598251343, "kl": 0.01227569580078125, "learning_rate": 1e-06, "loss": 0.1528, "num_tokens": 77992973.0, "reward": 1.031026840209961, "reward_std": 0.3192930221557617, "rewards/code_format_reward/mean": 0.6763392686843872, "rewards/code_format_reward/std": 0.46839529275894165, "rewards/curriculum_aware_reward_fn/mean": 0.35468748211860657, "rewards/curriculum_aware_reward_fn/std": 0.32011306285858154, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1915.055908203125, "completions/mean_terminated_length": 1247.419921875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.23517276946879834, "grad_norm": 0.2013625055551529, "kl": 0.01450347900390625, "learning_rate": 1e-06, "loss": 0.1054, "num_tokens": 79129462.0, "reward": 1.2039062976837158, "reward_std": 0.28518518805503845, "rewards/code_format_reward/mean": 0.765625, "rewards/code_format_reward/std": 0.42408111691474915, "rewards/curriculum_aware_reward_fn/mean": 0.43828126788139343, "rewards/curriculum_aware_reward_fn/std": 0.32963600754737854, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 2174.3818359375, "completions/mean_terminated_length": 1254.79541015625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.23929860752965446, "grad_norm": 0.2084730714559555, "kl": 0.0130462646484375, "learning_rate": 1e-06, "loss": 0.1749, "num_tokens": 80365322.0, "reward": 1.0460938215255737, "reward_std": 0.369284451007843, "rewards/code_format_reward/mean": 0.671875, "rewards/code_format_reward/std": 0.470055490732193, "rewards/curriculum_aware_reward_fn/mean": 0.37421873211860657, "rewards/curriculum_aware_reward_fn/std": 0.3289523720741272, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3013392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 2060.160888671875, "completions/mean_terminated_length": 1182.0830078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.24342444559051057, "grad_norm": 14.467960357666016, "kl": 1.0142898559570312, "learning_rate": 1e-06, "loss": 0.1016, "num_tokens": 81558908.0, "reward": 1.046540379524231, "reward_std": 0.3372170627117157, "rewards/code_format_reward/mean": 0.6941964030265808, "rewards/code_format_reward/std": 0.461262047290802, "rewards/curriculum_aware_reward_fn/mean": 0.35234376788139343, "rewards/curriculum_aware_reward_fn/std": 0.330666184425354, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3080357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 2118.216552734375, "completions/mean_terminated_length": 1237.7838134765625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.2475502836513667, "grad_norm": 0.18600618839263916, "kl": 0.01309967041015625, "learning_rate": 1e-06, "loss": 0.0934, "num_tokens": 82790200.0, "reward": 1.0263394117355347, "reward_std": 0.2972944974899292, "rewards/code_format_reward/mean": 0.6919642686843872, "rewards/code_format_reward/std": 0.46219751238822937, "rewards/curriculum_aware_reward_fn/mean": 0.33437496423721313, "rewards/curriculum_aware_reward_fn/std": 0.3290574252605438, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 2027.305908203125, "completions/mean_terminated_length": 1312.8919677734375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2516761217122228, "grad_norm": 0.19900083541870117, "kl": 0.014739990234375, "learning_rate": 1e-06, "loss": 0.0958, "num_tokens": 83959526.0, "reward": 1.1183037757873535, "reward_std": 0.2880536615848541, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.4372987747192383, "rewards/curriculum_aware_reward_fn/mean": 0.375, "rewards/curriculum_aware_reward_fn/std": 0.3326222598552704, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1918.6763916015625, "completions/mean_terminated_length": 1276.8062744140625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.2558019597730789, "grad_norm": 0.33191752433776855, "kl": 0.01528167724609375, "learning_rate": 1e-06, "loss": 0.104, "num_tokens": 85103741.0, "reward": 1.136495590209961, "reward_std": 0.3333035707473755, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.36640623211860657, "rewards/curriculum_aware_reward_fn/std": 0.3360246419906616, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2924107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 2018.4219970703125, "completions/mean_terminated_length": 1159.8643798828125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.259927797833935, "grad_norm": 0.27109867334365845, "kl": 0.0150299072265625, "learning_rate": 1e-06, "loss": 0.1268, "num_tokens": 86271362.0, "reward": 1.080357313156128, "reward_std": 0.29518193006515503, "rewards/code_format_reward/mean": 0.7053571343421936, "rewards/code_format_reward/std": 0.45639169216156006, "rewards/curriculum_aware_reward_fn/mean": 0.375, "rewards/curriculum_aware_reward_fn/std": 0.34475940465927124, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2879464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2164.3974609375, "completions/mean_terminated_length": 1383.2789306640625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.26405363589479114, "grad_norm": 3.1654841899871826, "kl": 0.05632781982421875, "learning_rate": 1e-06, "loss": 0.1225, "num_tokens": 87521705.0, "reward": 1.0496653318405151, "reward_std": 0.35806527733802795, "rewards/code_format_reward/mean": 0.7098214030265808, "rewards/code_format_reward/std": 0.4543519914150238, "rewards/curriculum_aware_reward_fn/mean": 0.33984375, "rewards/curriculum_aware_reward_fn/std": 0.3263460695743561, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1941.49560546875, "completions/mean_terminated_length": 1116.93212890625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.26817947395564723, "grad_norm": 0.19728857278823853, "kl": 0.01558685302734375, "learning_rate": 1e-06, "loss": 0.1281, "num_tokens": 88661744.0, "reward": 1.1483259201049805, "reward_std": 0.2981908917427063, "rewards/code_format_reward/mean": 0.7209821343421936, "rewards/code_format_reward/std": 0.449017733335495, "rewards/curriculum_aware_reward_fn/mean": 0.4273437559604645, "rewards/curriculum_aware_reward_fn/std": 0.328230082988739, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1825.915283203125, "completions/mean_terminated_length": 1113.6011962890625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.2723053120165034, "grad_norm": 0.2051296830177307, "kl": 0.01531219482421875, "learning_rate": 1e-06, "loss": 0.0892, "num_tokens": 89750675.0, "reward": 1.1573662757873535, "reward_std": 0.25977766513824463, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.3984375, "rewards/curriculum_aware_reward_fn/std": 0.3216077387332916, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 1863.5826416015625, "completions/mean_terminated_length": 1222.0833740234375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.27643115007735947, "grad_norm": 0.2858634889125824, "kl": 0.0157470703125, "learning_rate": 1e-06, "loss": 0.1198, "num_tokens": 90843545.0, "reward": 1.2074779272079468, "reward_std": 0.2963373363018036, "rewards/code_format_reward/mean": 0.7723214030265808, "rewards/code_format_reward/std": 0.41980284452438354, "rewards/curriculum_aware_reward_fn/mean": 0.4351562559604645, "rewards/curriculum_aware_reward_fn/std": 0.32205918431282043, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 1650.2254638671875, "completions/mean_terminated_length": 1181.8856201171875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.28055698813821556, "grad_norm": 0.2547251284122467, "kl": 0.0168304443359375, "learning_rate": 1e-06, "loss": 0.0937, "num_tokens": 91847978.0, "reward": 1.2831475734710693, "reward_std": 0.26948824524879456, "rewards/code_format_reward/mean": 0.8370535969734192, "rewards/code_format_reward/std": 0.3697296679019928, "rewards/curriculum_aware_reward_fn/mean": 0.44609373807907104, "rewards/curriculum_aware_reward_fn/std": 0.29762545228004456, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 1962.4287109375, "completions/mean_terminated_length": 1292.9501953125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2846828261990717, "grad_norm": 0.3637019693851471, "kl": 0.01421356201171875, "learning_rate": 1e-06, "loss": 0.1214, "num_tokens": 92985978.0, "reward": 1.1479911804199219, "reward_std": 0.3081384599208832, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.38906246423721313, "rewards/curriculum_aware_reward_fn/std": 0.3262607753276825, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1691.930908203125, "completions/mean_terminated_length": 1246.7327880859375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2888086642599278, "grad_norm": 0.2200901359319687, "kl": 0.015411376953125, "learning_rate": 1e-06, "loss": 0.101, "num_tokens": 94000464.0, "reward": 1.282142996788025, "reward_std": 0.28132227063179016, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, "rewards/curriculum_aware_reward_fn/std": 0.31145405769348145, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1723.560302734375, "completions/mean_terminated_length": 1176.07421875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.2929345023207839, "grad_norm": 61471350784.0, "kl": 6488064.012535095, "learning_rate": 1e-06, "loss": 64954.4414, "num_tokens": 95047145.0, "reward": 1.2531250715255737, "reward_std": 0.3082839846611023, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.44062498211860657, "rewards/curriculum_aware_reward_fn/std": 0.3243841826915741, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 2050.629638671875, "completions/mean_terminated_length": 1241.40185546875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.29706034038164003, "grad_norm": 0.21607491374015808, "kl": 0.01206207275390625, "learning_rate": 1e-06, "loss": 0.144, "num_tokens": 96247122.0, "reward": 1.1008930206298828, "reward_std": 0.31440243124961853, "rewards/code_format_reward/mean": 0.7165178656578064, "rewards/code_format_reward/std": 0.4511922299861908, "rewards/curriculum_aware_reward_fn/mean": 0.3843749463558197, "rewards/curriculum_aware_reward_fn/std": 0.32846200466156006, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2095.33056640625, "completions/mean_terminated_length": 1250.6031494140625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3011861784424961, "grad_norm": 0.22686128318309784, "kl": 0.011993408203125, "learning_rate": 1e-06, "loss": 0.1328, "num_tokens": 97451977.0, "reward": 1.0945314168930054, "reward_std": 0.35809198021888733, "rewards/code_format_reward/mean": 0.703125, "rewards/code_format_reward/std": 0.45739173889160156, "rewards/curriculum_aware_reward_fn/mean": 0.39140623807907104, "rewards/curriculum_aware_reward_fn/std": 0.34116989374160767, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3013392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 2236.4443359375, "completions/mean_terminated_length": 1434.3992919921875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.30531201650335227, "grad_norm": 0.19187304377555847, "kl": 0.01161956787109375, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 98747547.0, "reward": 1.0845983028411865, "reward_std": 0.25117871165275574, "rewards/code_format_reward/mean": 0.6986607313156128, "rewards/code_format_reward/std": 0.45935267210006714, "rewards/curriculum_aware_reward_fn/mean": 0.38593748211860657, "rewards/curriculum_aware_reward_fn/std": 0.3332653343677521, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1820.8482666015625, "completions/mean_terminated_length": 1071.4658203125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.30943785456420836, "grad_norm": 0.28201088309288025, "kl": 0.0150909423828125, "learning_rate": 1e-06, "loss": 0.0819, "num_tokens": 99830470.0, "reward": 1.205357313156128, "reward_std": 0.23357708752155304, "rewards/code_format_reward/mean": 0.7522321343421936, "rewards/code_format_reward/std": 0.4321989119052887, "rewards/curriculum_aware_reward_fn/mean": 0.453125, "rewards/curriculum_aware_reward_fn/std": 0.340517520904541, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3169642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2171.47998046875, "completions/mean_terminated_length": 1278.4019775390625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.31356369262506445, "grad_norm": 0.47655731439590454, "kl": 0.01276397705078125, "learning_rate": 1e-06, "loss": 0.1489, "num_tokens": 101089096.0, "reward": 1.0447545051574707, "reward_std": 0.3714196979999542, "rewards/code_format_reward/mean": 0.6830357313156128, "rewards/code_format_reward/std": 0.4658135175704956, "rewards/curriculum_aware_reward_fn/mean": 0.36171871423721313, "rewards/curriculum_aware_reward_fn/std": 0.33129456639289856, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 1797.259033203125, "completions/mean_terminated_length": 1178.6175537109375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.3176895306859206, "grad_norm": 0.8108563423156738, "kl": 0.01357269287109375, "learning_rate": 1e-06, "loss": 0.082, "num_tokens": 102174436.0, "reward": 1.262834906578064, "reward_std": 0.2654839754104614, "rewards/code_format_reward/mean": 0.7901785969734192, "rewards/code_format_reward/std": 0.40763622522354126, "rewards/curriculum_aware_reward_fn/mean": 0.47265625, "rewards/curriculum_aware_reward_fn/std": 0.3376373052597046, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 2021.7724609375, "completions/mean_terminated_length": 1219.0526123046875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.3218153687467767, "grad_norm": 0.1962243765592575, "kl": 0.01082611083984375, "learning_rate": 1e-06, "loss": 0.0701, "num_tokens": 103356578.0, "reward": 1.101562738418579, "reward_std": 0.25508174300193787, "rewards/code_format_reward/mean": 0.71875, "rewards/code_format_reward/std": 0.45011183619499207, "rewards/curriculum_aware_reward_fn/mean": 0.3828124701976776, "rewards/curriculum_aware_reward_fn/std": 0.3344087302684784, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2299107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1888.2210693359375, "completions/mean_terminated_length": 1229.0870361328125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3259412068076328, "grad_norm": 2.6811349391937256, "kl": 0.03322601318359375, "learning_rate": 1e-06, "loss": 0.1252, "num_tokens": 104474640.0, "reward": 1.1739957332611084, "reward_std": 0.3065490126609802, "rewards/code_format_reward/mean": 0.7700892686843872, "rewards/code_format_reward/std": 0.42124560475349426, "rewards/curriculum_aware_reward_fn/mean": 0.4039062559604645, "rewards/curriculum_aware_reward_fn/std": 0.32201358675956726, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 1970.294677734375, "completions/mean_terminated_length": 1156.7530517578125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3300670448684889, "grad_norm": 0.29853591322898865, "kl": 0.013824462890625, "learning_rate": 1e-06, "loss": 0.0999, "num_tokens": 105620490.0, "reward": 1.0888394117355347, "reward_std": 0.28020328283309937, "rewards/code_format_reward/mean": 0.7232142686843872, "rewards/code_format_reward/std": 0.44790980219841003, "rewards/curriculum_aware_reward_fn/mean": 0.36562496423721313, "rewards/curriculum_aware_reward_fn/std": 0.3340170383453369, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1940.4732666015625, "completions/mean_terminated_length": 1187.3372802734375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.334192882929345, "grad_norm": 0.1921972632408142, "kl": 0.013336181640625, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 106762066.0, "reward": 1.1690850257873535, "reward_std": 0.26687270402908325, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.4372987747192383, "rewards/curriculum_aware_reward_fn/mean": 0.42578125, "rewards/curriculum_aware_reward_fn/std": 0.3811108469963074, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2092.8974609375, "completions/mean_terminated_length": 1360.0548095703125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.33831872099020116, "grad_norm": 0.2101062387228012, "kl": 0.01308441162109375, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 107968876.0, "reward": 1.0719866752624512, "reward_std": 0.2806049585342407, "rewards/code_format_reward/mean": 0.7321428656578064, "rewards/code_format_reward/std": 0.4433377683162689, "rewards/curriculum_aware_reward_fn/mean": 0.33984375, "rewards/curriculum_aware_reward_fn/std": 0.3288556635379791, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2158.26123046875, "completions/mean_terminated_length": 1408.3621826171875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.34244455905105725, "grad_norm": 0.39877673983573914, "kl": 0.01123046875, "learning_rate": 1e-06, "loss": 0.1138, "num_tokens": 109228587.0, "reward": 1.1007813215255737, "reward_std": 0.3238913118839264, "rewards/code_format_reward/mean": 0.71875, "rewards/code_format_reward/std": 0.45011183619499207, "rewards/curriculum_aware_reward_fn/mean": 0.38203123211860657, "rewards/curriculum_aware_reward_fn/std": 0.3381514549255371, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3080357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2274.317138671875, "completions/mean_terminated_length": 1463.3741455078125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.34657039711191334, "grad_norm": 23064898.0, "kl": 282624.3104248047, "learning_rate": 1e-06, "loss": 2820.5662, "num_tokens": 110531226.0, "reward": 1.04285728931427, "reward_std": 0.3372339904308319, "rewards/code_format_reward/mean": 0.6897321343421936, "rewards/code_format_reward/std": 0.46312037110328674, "rewards/curriculum_aware_reward_fn/mean": 0.3531249463558197, "rewards/curriculum_aware_reward_fn/std": 0.32774609327316284, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3932.0, "completions/mean_length": 2078.774658203125, "completions/mean_terminated_length": 1468.9156494140625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.3506962351727695, "grad_norm": 0.1948719620704651, "kl": 0.0124359130859375, "learning_rate": 1e-06, "loss": 0.1045, "num_tokens": 111744078.0, "reward": 1.1530135869979858, "reward_std": 0.3290906846523285, "rewards/code_format_reward/mean": 0.7678571343421936, "rewards/code_format_reward/std": 0.4226716458797455, "rewards/curriculum_aware_reward_fn/mean": 0.38515621423721313, "rewards/curriculum_aware_reward_fn/std": 0.32120710611343384, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 2249.6005859375, "completions/mean_terminated_length": 1444.7596435546875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3548220732336256, "grad_norm": 0.18358947336673737, "kl": 0.01184844970703125, "learning_rate": 1e-06, "loss": 0.1097, "num_tokens": 113010325.0, "reward": 1.0815849304199219, "reward_std": 0.3133898377418518, "rewards/code_format_reward/mean": 0.6964285969734192, "rewards/code_format_reward/std": 0.4603137969970703, "rewards/curriculum_aware_reward_fn/mean": 0.38515621423721313, "rewards/curriculum_aware_reward_fn/std": 0.3287961184978485, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1831.122802734375, "completions/mean_terminated_length": 1213.428955078125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.35894791129448167, "grad_norm": 0.21182571351528168, "kl": 0.0136566162109375, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 114115848.0, "reward": 1.218526840209961, "reward_std": 0.23270158469676971, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078463196754456, "rewards/curriculum_aware_reward_fn/mean": 0.43281248211860657, "rewards/curriculum_aware_reward_fn/std": 0.31362658739089966, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 2075.180908203125, "completions/mean_terminated_length": 1377.30029296875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.3630737493553378, "grad_norm": 0.19150634109973907, "kl": 0.01174163818359375, "learning_rate": 1e-06, "loss": 0.0729, "num_tokens": 115322071.0, "reward": 1.1378350257873535, "reward_std": 0.3045061230659485, "rewards/code_format_reward/mean": 0.7433035969734192, "rewards/code_format_reward/std": 0.4372987747192383, "rewards/curriculum_aware_reward_fn/mean": 0.39453125, "rewards/curriculum_aware_reward_fn/std": 0.3367302417755127, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 2124.5625, "completions/mean_terminated_length": 1335.987548828125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3671995874161939, "grad_norm": 0.2297547310590744, "kl": 0.05477142333984375, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 116556591.0, "reward": 1.0736608505249023, "reward_std": 0.32245033979415894, "rewards/code_format_reward/mean": 0.7142857313156128, "rewards/code_format_reward/std": 0.45225897431373596, "rewards/curriculum_aware_reward_fn/mean": 0.359375, "rewards/curriculum_aware_reward_fn/std": 0.33994215726852417, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 2154.805908203125, "completions/mean_terminated_length": 1476.55712890625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.37132542547705005, "grad_norm": 16.685142517089844, "kl": 2.131072998046875, "learning_rate": 1e-06, "loss": 0.1646, "num_tokens": 117802301.0, "reward": 1.1551340818405151, "reward_std": 0.3826047480106354, "rewards/code_format_reward/mean": 0.7410714030265808, "rewards/code_format_reward/std": 0.43853598833084106, "rewards/curriculum_aware_reward_fn/mean": 0.4140625, "rewards/curriculum_aware_reward_fn/std": 0.32142511010169983, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 1905.962158203125, "completions/mean_terminated_length": 1218.765380859375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.37545126353790614, "grad_norm": 0.21024605631828308, "kl": 0.0134735107421875, "learning_rate": 1e-06, "loss": 0.1014, "num_tokens": 118935164.0, "reward": 1.1588170528411865, "reward_std": 0.25268182158470154, "rewards/code_format_reward/mean": 0.7611607313156128, "rewards/code_format_reward/std": 0.4268510043621063, "rewards/curriculum_aware_reward_fn/mean": 0.39765623211860657, "rewards/curriculum_aware_reward_fn/std": 0.3490960896015167, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 2146.58056640625, "completions/mean_terminated_length": 1375.3145751953125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.37957710159876223, "grad_norm": 2.7486345767974854, "kl": 0.46689605712890625, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 120156336.0, "reward": 1.0909600257873535, "reward_std": 0.2893161177635193, "rewards/code_format_reward/mean": 0.7120535969734192, "rewards/code_format_reward/std": 0.4533122181892395, "rewards/curriculum_aware_reward_fn/mean": 0.37890625, "rewards/curriculum_aware_reward_fn/std": 0.37675216794013977, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1860.40185546875, "completions/mean_terminated_length": 1313.9222412109375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3837029396596184, "grad_norm": 0.22773477435112, "kl": 0.0144805908203125, "learning_rate": 1e-06, "loss": 0.1312, "num_tokens": 121264077.0, "reward": 1.2443082332611084, "reward_std": 0.32754769921302795, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380533695221, "rewards/curriculum_aware_reward_fn/mean": 0.4429686963558197, "rewards/curriculum_aware_reward_fn/std": 0.3349525034427643, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1907.8907470703125, "completions/mean_terminated_length": 1342.4241943359375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.38782877772047447, "grad_norm": 0.2255377620458603, "kl": 0.0151214599609375, "learning_rate": 1e-06, "loss": 0.0897, "num_tokens": 122385176.0, "reward": 1.2165180444717407, "reward_std": 0.3012862205505371, "rewards/code_format_reward/mean": 0.7946428656578064, "rewards/code_format_reward/std": 0.40441393852233887, "rewards/curriculum_aware_reward_fn/mean": 0.421875, "rewards/curriculum_aware_reward_fn/std": 0.3257090747356415, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 1828.5179443359375, "completions/mean_terminated_length": 1350.5081787109375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.39195461578133056, "grad_norm": 0.22716814279556274, "kl": 0.01505279541015625, "learning_rate": 1e-06, "loss": 0.0892, "num_tokens": 123480922.0, "reward": 1.2835938930511475, "reward_std": 0.3164493143558502, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.45546871423721313, "rewards/curriculum_aware_reward_fn/std": 0.3253774046897888, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 1769.3438720703125, "completions/mean_terminated_length": 1294.00537109375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3960804538421867, "grad_norm": 0.2642739713191986, "kl": 0.01670074462890625, "learning_rate": 1e-06, "loss": 0.1178, "num_tokens": 124545643.0, "reward": 1.2764509916305542, "reward_std": 0.36772122979164124, "rewards/code_format_reward/mean": 0.8303571343421936, "rewards/code_format_reward/std": 0.37573832273483276, "rewards/curriculum_aware_reward_fn/mean": 0.44609373807907104, "rewards/curriculum_aware_reward_fn/std": 0.3084770441055298, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1793.30810546875, "completions/mean_terminated_length": 1322.8656005859375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4002062919030428, "grad_norm": 0.19620507955551147, "kl": 0.0139312744140625, "learning_rate": 1e-06, "loss": 0.088, "num_tokens": 125615892.0, "reward": 1.2727680206298828, "reward_std": 0.2771356701850891, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962546944618225, "rewards/curriculum_aware_reward_fn/mean": 0.4468750059604645, "rewards/curriculum_aware_reward_fn/std": 0.3334304988384247, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2433035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1908.727783203125, "completions/mean_terminated_length": 1205.4454345703125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4043321299638989, "grad_norm": 0.2542637586593628, "kl": 0.0135955810546875, "learning_rate": 1e-06, "loss": 0.0952, "num_tokens": 126747544.0, "reward": 1.2448662519454956, "reward_std": 0.3202023208141327, "rewards/code_format_reward/mean": 0.7566964030265808, "rewards/code_format_reward/std": 0.4295565187931061, "rewards/curriculum_aware_reward_fn/mean": 0.48816967010498047, "rewards/curriculum_aware_reward_fn/std": 0.402654230594635, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1787.3662109375, "completions/mean_terminated_length": 1285.4891357421875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.40845796802475504, "grad_norm": 0.30569538474082947, "kl": 0.01509857177734375, "learning_rate": 1e-06, "loss": 0.1071, "num_tokens": 127819366.0, "reward": 1.3381696939468384, "reward_std": 0.3241099417209625, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.5167410969734192, "rewards/curriculum_aware_reward_fn/std": 0.36987948417663574, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2946428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 2188.973388671875, "completions/mean_terminated_length": 1392.3670654296875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4125838060856111, "grad_norm": 0.3979156017303467, "kl": 0.01331329345703125, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 129065515.0, "reward": 1.162834882736206, "reward_std": 0.35521644353866577, "rewards/code_format_reward/mean": 0.7098214030265808, "rewards/code_format_reward/std": 0.4543519914150238, "rewards/curriculum_aware_reward_fn/mean": 0.45301342010498047, "rewards/curriculum_aware_reward_fn/std": 0.39779433608055115, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2165178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 1926.3974609375, "completions/mean_terminated_length": 1326.820556640625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.4167096441464673, "grad_norm": 596.6317138671875, "kl": 2.7932891845703125, "learning_rate": 1e-06, "loss": 0.1426, "num_tokens": 130212405.0, "reward": 1.2630581855773926, "reward_std": 0.3349149823188782, "rewards/code_format_reward/mean": 0.7834821343421936, "rewards/code_format_reward/std": 0.41233164072036743, "rewards/curriculum_aware_reward_fn/mean": 0.47957590222358704, "rewards/curriculum_aware_reward_fn/std": 0.38721349835395813, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 2113.082763671875, "completions/mean_terminated_length": 1345.69970703125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.42083548220732336, "grad_norm": 0.2139917016029358, "kl": 0.0146026611328125, "learning_rate": 1e-06, "loss": 0.1284, "num_tokens": 131445268.0, "reward": 1.1873886585235596, "reward_std": 0.40018337965011597, "rewards/code_format_reward/mean": 0.71875, "rewards/code_format_reward/std": 0.45011183619499207, "rewards/curriculum_aware_reward_fn/mean": 0.46863842010498047, "rewards/curriculum_aware_reward_fn/std": 0.3954916000366211, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1869.3482666015625, "completions/mean_terminated_length": 1309.575439453125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.42496132026817945, "grad_norm": 0.5777689814567566, "kl": 0.01442718505859375, "learning_rate": 1e-06, "loss": 0.1126, "num_tokens": 132563996.0, "reward": 1.2658483982086182, "reward_std": 0.2914493680000305, "rewards/code_format_reward/mean": 0.7991071343421936, "rewards/code_format_reward/std": 0.4011160135269165, "rewards/curriculum_aware_reward_fn/mean": 0.46674108505249023, "rewards/curriculum_aware_reward_fn/std": 0.37757155299186707, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2611607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 2056.27685546875, "completions/mean_terminated_length": 1335.2869873046875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.4290871583290356, "grad_norm": 0.20045840740203857, "kl": 0.0137481689453125, "learning_rate": 1e-06, "loss": 0.0953, "num_tokens": 133742059.0, "reward": 1.1110491752624512, "reward_std": 0.29968130588531494, "rewards/code_format_reward/mean": 0.7388392686843872, "rewards/code_format_reward/std": 0.43975841999053955, "rewards/curriculum_aware_reward_fn/mean": 0.3722098469734192, "rewards/curriculum_aware_reward_fn/std": 0.3889741897583008, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 1529.3192138671875, "completions/mean_terminated_length": 1155.1483154296875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.4332129963898917, "grad_norm": 0.33533167839050293, "kl": 0.019073486328125, "learning_rate": 1e-06, "loss": 0.1169, "num_tokens": 134688140.0, "reward": 1.4626117944717407, "reward_std": 0.2607215642929077, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.5876116156578064, "rewards/curriculum_aware_reward_fn/std": 0.3578576445579529, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 1897.821533203125, "completions/mean_terminated_length": 1329.7528076171875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.4373388344507478, "grad_norm": 0.2289149910211563, "kl": 0.01496124267578125, "learning_rate": 1e-06, "loss": 0.0809, "num_tokens": 135813254.0, "reward": 1.2926340103149414, "reward_std": 0.3256075978279114, "rewards/code_format_reward/mean": 0.7946428656578064, "rewards/code_format_reward/std": 0.40441396832466125, "rewards/curriculum_aware_reward_fn/mean": 0.49799108505249023, "rewards/curriculum_aware_reward_fn/std": 0.38303855061531067, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2120535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1880.9241943359375, "completions/mean_terminated_length": 1284.798828125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.44146467251160393, "grad_norm": 0.21886080503463745, "kl": 0.01514434814453125, "learning_rate": 1e-06, "loss": 0.14, "num_tokens": 136909609.0, "reward": 1.293973445892334, "reward_std": 0.3218976557254791, "rewards/code_format_reward/mean": 0.7857142686843872, "rewards/code_format_reward/std": 0.41078460216522217, "rewards/curriculum_aware_reward_fn/mean": 0.5082589387893677, "rewards/curriculum_aware_reward_fn/std": 0.3749256134033203, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1758.7232666015625, "completions/mean_terminated_length": 1187.388916015625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.44559051057246, "grad_norm": 0.23373863101005554, "kl": 0.01739501953125, "learning_rate": 1e-06, "loss": 0.0946, "num_tokens": 137966485.0, "reward": 1.2852680683135986, "reward_std": 0.31339845061302185, "rewards/code_format_reward/mean": 0.8035714030265808, "rewards/code_format_reward/std": 0.3977404832839966, "rewards/curriculum_aware_reward_fn/mean": 0.4816964268684387, "rewards/curriculum_aware_reward_fn/std": 0.3875146210193634, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 1916.2210693359375, "completions/mean_terminated_length": 1223.820556640625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.44971634863331617, "grad_norm": 0.20984645187854767, "kl": 0.01628875732421875, "learning_rate": 1e-06, "loss": 0.0996, "num_tokens": 139093185.0, "reward": 1.2770090103149414, "reward_std": 0.30474624037742615, "rewards/code_format_reward/mean": 0.7611607313156128, "rewards/code_format_reward/std": 0.4268510043621063, "rewards/curriculum_aware_reward_fn/mean": 0.5158482193946838, "rewards/curriculum_aware_reward_fn/std": 0.409445196390152, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2075892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 1859.4576416015625, "completions/mean_terminated_length": 1273.54638671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.45384218669417226, "grad_norm": 0.5128569006919861, "kl": 0.0165252685546875, "learning_rate": 1e-06, "loss": 0.1196, "num_tokens": 140199110.0, "reward": 1.2483259439468384, "reward_std": 0.32321012020111084, "rewards/code_format_reward/mean": 0.7924107313156128, "rewards/code_format_reward/std": 0.4060344398021698, "rewards/curriculum_aware_reward_fn/mean": 0.4559151828289032, "rewards/curriculum_aware_reward_fn/std": 0.37757861614227295, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 1393.9219970703125, "completions/mean_terminated_length": 1121.7222900390625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.45796802475502835, "grad_norm": 0.21977221965789795, "kl": 0.02191162109375, "learning_rate": 1e-06, "loss": 0.1095, "num_tokens": 141069903.0, "reward": 1.5670760869979858, "reward_std": 0.23180526494979858, "rewards/code_format_reward/mean": 0.9084821343421936, "rewards/code_format_reward/std": 0.2886664867401123, "rewards/curriculum_aware_reward_fn/mean": 0.6585937738418579, "rewards/curriculum_aware_reward_fn/std": 0.3625333905220032, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3896.0, "completions/mean_length": 1532.5045166015625, "completions/mean_terminated_length": 1288.0635986328125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4620938628158845, "grad_norm": 0.29682549834251404, "kl": 0.019744873046875, "learning_rate": 1e-06, "loss": 0.1197, "num_tokens": 142039202.0, "reward": 1.4710938930511475, "reward_std": 0.27673768997192383, "rewards/code_format_reward/mean": 0.9107142686843872, "rewards/code_format_reward/std": 0.2854745090007782, "rewards/curriculum_aware_reward_fn/mean": 0.5603795051574707, "rewards/curriculum_aware_reward_fn/std": 0.36809030175209045, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 2081.504638671875, "completions/mean_terminated_length": 1344.493896484375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4662197008767406, "grad_norm": 0.19321902096271515, "kl": 0.01572418212890625, "learning_rate": 1e-06, "loss": 0.0758, "num_tokens": 143257743.0, "reward": 1.1703126430511475, "reward_std": 0.3091709613800049, "rewards/code_format_reward/mean": 0.7321428656578064, "rewards/code_format_reward/std": 0.4433377683162689, "rewards/curriculum_aware_reward_fn/mean": 0.4381696581840515, "rewards/curriculum_aware_reward_fn/std": 0.3880119025707245, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2254464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1851.54248046875, "completions/mean_terminated_length": 1198.2564697265625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4703455389375967, "grad_norm": 0.21663129329681396, "kl": 0.0176849365234375, "learning_rate": 1e-06, "loss": 0.0966, "num_tokens": 144340291.0, "reward": 1.2756696939468384, "reward_std": 0.3172203600406647, "rewards/code_format_reward/mean": 0.7723214030265808, "rewards/code_format_reward/std": 0.41980284452438354, "rewards/curriculum_aware_reward_fn/mean": 0.5033482313156128, "rewards/curriculum_aware_reward_fn/std": 0.3864597976207733, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1572.5782470703125, "completions/mean_terminated_length": 1204.7135009765625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4744713769984528, "grad_norm": 0.49621084332466125, "kl": 0.0229034423828125, "learning_rate": 1e-06, "loss": 0.1239, "num_tokens": 145307103.0, "reward": 1.4524556398391724, "reward_std": 0.2906745374202728, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.5841518044471741, "rewards/curriculum_aware_reward_fn/std": 0.3829342722892761, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 1971.4866943359375, "completions/mean_terminated_length": 1376.622802734375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4785972150593089, "grad_norm": 0.23582282662391663, "kl": 0.01757049560546875, "learning_rate": 1e-06, "loss": 0.1193, "num_tokens": 146456637.0, "reward": 1.2378349304199219, "reward_std": 0.3621983230113983, "rewards/code_format_reward/mean": 0.7834821343421936, "rewards/code_format_reward/std": 0.41233161091804504, "rewards/curriculum_aware_reward_fn/mean": 0.4543526768684387, "rewards/curriculum_aware_reward_fn/std": 0.38843461871147156, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1808035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 1749.3907470703125, "completions/mean_terminated_length": 1231.47412109375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.48272305312016506, "grad_norm": 0.22831237316131592, "kl": 0.0184326171875, "learning_rate": 1e-06, "loss": 0.1331, "num_tokens": 147520210.0, "reward": 1.3574777841567993, "reward_std": 0.32930701971054077, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.538281261920929, "rewards/curriculum_aware_reward_fn/std": 0.372250497341156, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1811.6407470703125, "completions/mean_terminated_length": 1276.735595703125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.48684889118102115, "grad_norm": 0.23114219307899475, "kl": 0.01885986328125, "learning_rate": 1e-06, "loss": 0.1114, "num_tokens": 148603901.0, "reward": 1.3013395071029663, "reward_std": 0.3289732038974762, "rewards/code_format_reward/mean": 0.8080357313156128, "rewards/code_format_reward/std": 0.3942854404449463, "rewards/curriculum_aware_reward_fn/mean": 0.4933035671710968, "rewards/curriculum_aware_reward_fn/std": 0.39247652888298035, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1975.04248046875, "completions/mean_terminated_length": 1251.122802734375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.49097472924187724, "grad_norm": 0.20781484246253967, "kl": 0.0164947509765625, "learning_rate": 1e-06, "loss": 0.1171, "num_tokens": 149762028.0, "reward": 1.2449778318405151, "reward_std": 0.34720858931541443, "rewards/code_format_reward/mean": 0.7455357313156128, "rewards/code_format_reward/std": 0.4360465705394745, "rewards/curriculum_aware_reward_fn/mean": 0.4994419515132904, "rewards/curriculum_aware_reward_fn/std": 0.45315784215927124, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 2032.3326416015625, "completions/mean_terminated_length": 1376.814697265625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4951005673027334, "grad_norm": 0.25265783071517944, "kl": 0.0178985595703125, "learning_rate": 1e-06, "loss": 0.154, "num_tokens": 150935986.0, "reward": 1.0713170766830444, "reward_std": 0.39104020595550537, "rewards/code_format_reward/mean": 0.7589285969734192, "rewards/code_format_reward/std": 0.4282117187976837, "rewards/curriculum_aware_reward_fn/mean": 0.31238842010498047, "rewards/curriculum_aware_reward_fn/std": 0.391317218542099, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1820.6898193359375, "completions/mean_terminated_length": 1310.9207763671875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4992264053635895, "grad_norm": 0.19575944542884827, "kl": 0.018218994140625, "learning_rate": 1e-06, "loss": 0.0795, "num_tokens": 152019643.0, "reward": 1.3352679014205933, "reward_std": 0.2855936288833618, "rewards/code_format_reward/mean": 0.8147321343421936, "rewards/code_format_reward/std": 0.38894903659820557, "rewards/curriculum_aware_reward_fn/mean": 0.5205357670783997, "rewards/curriculum_aware_reward_fn/std": 0.37896621227264404, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1652.0782470703125, "completions/mean_terminated_length": 1222.30712890625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.5033522434244456, "grad_norm": 0.2649840712547302, "kl": 0.0213165283203125, "learning_rate": 1e-06, "loss": 0.158, "num_tokens": 153026523.0, "reward": 1.3676341772079468, "reward_std": 0.3618910014629364, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.5171875357627869, "rewards/curriculum_aware_reward_fn/std": 0.40689706802368164, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1938.0693359375, "completions/mean_terminated_length": 1333.8485107421875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5074780814853017, "grad_norm": 0.2801723778247833, "kl": 0.017120361328125, "learning_rate": 1e-06, "loss": 0.1482, "num_tokens": 154177319.0, "reward": 1.2242188453674316, "reward_std": 0.37541112303733826, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.44520092010498047, "rewards/curriculum_aware_reward_fn/std": 0.3993484079837799, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1873.0648193359375, "completions/mean_terminated_length": 1375.030029296875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5116039195461578, "grad_norm": 0.24661336839199066, "kl": 0.020294189453125, "learning_rate": 1e-06, "loss": 0.0898, "num_tokens": 155273387.0, "reward": 1.2632813453674316, "reward_std": 0.34729352593421936, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.44631698727607727, "rewards/curriculum_aware_reward_fn/std": 0.3932637870311737, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4000.0, "completions/mean_length": 1740.415283203125, "completions/mean_terminated_length": 1251.5201416015625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5157297576070139, "grad_norm": 0.2343399077653885, "kl": 0.021331787109375, "learning_rate": 1e-06, "loss": 0.0798, "num_tokens": 156316419.0, "reward": 1.3706475496292114, "reward_std": 0.3081948757171631, "rewards/code_format_reward/mean": 0.8303571343421936, "rewards/code_format_reward/std": 0.37573832273483276, "rewards/curriculum_aware_reward_fn/mean": 0.5402902364730835, "rewards/curriculum_aware_reward_fn/std": 0.3639678955078125, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1885.4376220703125, "completions/mean_terminated_length": 1250.2183837890625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.51985559566787, "grad_norm": 0.2045391947031021, "kl": 0.01904296875, "learning_rate": 1e-06, "loss": 0.1082, "num_tokens": 157421348.0, "reward": 1.2103794813156128, "reward_std": 0.30370932817459106, "rewards/code_format_reward/mean": 0.7767857313156128, "rewards/code_format_reward/std": 0.41686633229255676, "rewards/curriculum_aware_reward_fn/mean": 0.43359375, "rewards/curriculum_aware_reward_fn/std": 0.40424734354019165, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1622.99560546875, "completions/mean_terminated_length": 1203.2950439453125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5239814337287262, "grad_norm": 0.27579593658447266, "kl": 0.0200347900390625, "learning_rate": 1e-06, "loss": 0.1294, "num_tokens": 158422696.0, "reward": 1.3474332094192505, "reward_std": 0.329692542552948, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.49252229928970337, "rewards/curriculum_aware_reward_fn/std": 0.3930818438529968, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1901.243408203125, "completions/mean_terminated_length": 1394.760986328125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5281072717895823, "grad_norm": 0.2577130198478699, "kl": 0.0180206298828125, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 159541958.0, "reward": 1.313392996788025, "reward_std": 0.2995237708091736, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5008928775787354, "rewards/curriculum_aware_reward_fn/std": 0.3886667490005493, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1964285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1948.7210693359375, "completions/mean_terminated_length": 1423.83056640625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5322331098504384, "grad_norm": 0.24326616525650024, "kl": 0.017669677734375, "learning_rate": 1e-06, "loss": 0.0939, "num_tokens": 160713780.0, "reward": 1.2681920528411865, "reward_std": 0.30479398369789124, "rewards/code_format_reward/mean": 0.8013392686843872, "rewards/code_format_reward/std": 0.3994380831718445, "rewards/curriculum_aware_reward_fn/mean": 0.46685269474983215, "rewards/curriculum_aware_reward_fn/std": 0.38321977853775024, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1919642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1751.0001220703125, "completions/mean_terminated_length": 1193.900634765625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5363589479112945, "grad_norm": 0.23594826459884644, "kl": 0.0184783935546875, "learning_rate": 1e-06, "loss": 0.1056, "num_tokens": 161765740.0, "reward": 1.3129466772079468, "reward_std": 0.3430105745792389, "rewards/code_format_reward/mean": 0.8080357313156128, "rewards/code_format_reward/std": 0.3942854404449463, "rewards/curriculum_aware_reward_fn/mean": 0.5049107670783997, "rewards/curriculum_aware_reward_fn/std": 0.40329793095588684, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1794.8438720703125, "completions/mean_terminated_length": 1339.5347900390625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5404847859721505, "grad_norm": 0.2709508240222931, "kl": 0.0223236083984375, "learning_rate": 1e-06, "loss": 0.0885, "num_tokens": 162848647.0, "reward": 1.329017996788025, "reward_std": 0.3155292272567749, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.49419647455215454, "rewards/curriculum_aware_reward_fn/std": 0.38577884435653687, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1712.9398193359375, "completions/mean_terminated_length": 1293.871337890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5446106240330068, "grad_norm": 1.2333673238754272, "kl": 0.0237884521484375, "learning_rate": 1e-06, "loss": 0.0916, "num_tokens": 163899411.0, "reward": 1.3271206617355347, "reward_std": 0.25644823908805847, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.4766741394996643, "rewards/curriculum_aware_reward_fn/std": 0.38201674818992615, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1741071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1827.3170166015625, "completions/mean_terminated_length": 1349.0540771484375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5487364620938628, "grad_norm": 0.24530315399169922, "kl": 0.019317626953125, "learning_rate": 1e-06, "loss": 0.0958, "num_tokens": 164997384.0, "reward": 1.1158483028411865, "reward_std": 0.2780385911464691, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.28995537757873535, "rewards/curriculum_aware_reward_fn/std": 0.37317731976509094, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1726.6317138671875, "completions/mean_terminated_length": 1302.6395263671875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5528623001547189, "grad_norm": 0.22773702442646027, "kl": 0.019500732421875, "learning_rate": 1e-06, "loss": 0.0971, "num_tokens": 166042291.0, "reward": 1.3700894117355347, "reward_std": 0.2921365797519684, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.5196428894996643, "rewards/curriculum_aware_reward_fn/std": 0.3764224350452423, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1779.3438720703125, "completions/mean_terminated_length": 1407.23828125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.556988138215575, "grad_norm": 0.197788268327713, "kl": 0.0192108154296875, "learning_rate": 1e-06, "loss": 0.0974, "num_tokens": 167123931.0, "reward": 1.001227855682373, "reward_std": 0.20402835309505463, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.14185267686843872, "rewards/curriculum_aware_reward_fn/std": 0.3079221248626709, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 1738.149658203125, "completions/mean_terminated_length": 1301.510498046875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5611139762764311, "grad_norm": 0.24345508217811584, "kl": 0.0193328857421875, "learning_rate": 1e-06, "loss": 0.1148, "num_tokens": 168165311.0, "reward": 1.3750001192092896, "reward_std": 0.34778735041618347, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.53125, "rewards/curriculum_aware_reward_fn/std": 0.3785865008831024, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1681.07373046875, "completions/mean_terminated_length": 1187.70166015625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5652398143372873, "grad_norm": 0.2188788205385208, "kl": 0.018768310546875, "learning_rate": 1e-06, "loss": 0.0821, "num_tokens": 169184782.0, "reward": 1.2645089626312256, "reward_std": 0.2465325891971588, "rewards/code_format_reward/mean": 0.8348214030265808, "rewards/code_format_reward/std": 0.37175676226615906, "rewards/curriculum_aware_reward_fn/mean": 0.4296875, "rewards/curriculum_aware_reward_fn/std": 0.41227027773857117, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 1603.0023193359375, "completions/mean_terminated_length": 1268.498779296875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5693656523981434, "grad_norm": 0.23375459015369415, "kl": 0.019622802734375, "learning_rate": 1e-06, "loss": 0.0413, "num_tokens": 170179700.0, "reward": 1.4341518878936768, "reward_std": 0.25356078147888184, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.5524553656578064, "rewards/curriculum_aware_reward_fn/std": 0.3973832130432129, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1455.1295166015625, "completions/mean_terminated_length": 1160.2431640625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5734914904589995, "grad_norm": 0.23874713480472565, "kl": 0.0207366943359375, "learning_rate": 1e-06, "loss": 0.0539, "num_tokens": 171083241.0, "reward": 1.454017996788025, "reward_std": 0.2525254786014557, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.29793688654899597, "rewards/curriculum_aware_reward_fn/mean": 0.5522321462631226, "rewards/curriculum_aware_reward_fn/std": 0.3668530285358429, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1741.8638916015625, "completions/mean_terminated_length": 1398.677734375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5776173285198556, "grad_norm": 0.247205451130867, "kl": 0.018646240234375, "learning_rate": 1e-06, "loss": 0.1183, "num_tokens": 172139844.0, "reward": 1.3781250715255737, "reward_std": 0.3515448272228241, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.5053571462631226, "rewards/curriculum_aware_reward_fn/std": 0.37020501494407654, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 1658.8482666015625, "completions/mean_terminated_length": 1129.0325927734375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5817431665807117, "grad_norm": 0.44727662205696106, "kl": 0.0311126708984375, "learning_rate": 1e-06, "loss": 0.0728, "num_tokens": 173154580.0, "reward": 1.4180805683135986, "reward_std": 0.27598464488983154, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.5966517329216003, "rewards/curriculum_aware_reward_fn/std": 0.37194687128067017, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1975.12060546875, "completions/mean_terminated_length": 1478.495849609375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5858690046415678, "grad_norm": 0.2354760318994522, "kl": 0.0256195068359375, "learning_rate": 1e-06, "loss": 0.1228, "num_tokens": 174329990.0, "reward": 1.2723214626312256, "reward_std": 0.34845033288002014, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252740144729614, "rewards/curriculum_aware_reward_fn/mean": 0.4620535671710968, "rewards/curriculum_aware_reward_fn/std": 0.40496060252189636, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 2030.4107666015625, "completions/mean_terminated_length": 1553.736328125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.589994842702424, "grad_norm": 0.2157117873430252, "kl": 0.0220184326171875, "learning_rate": 1e-06, "loss": 0.1281, "num_tokens": 175521903.0, "reward": 1.3176339864730835, "reward_std": 0.3941625952720642, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5051339268684387, "rewards/curriculum_aware_reward_fn/std": 0.3888208270072937, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1629464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1813.7076416015625, "completions/mean_terminated_length": 1369.4212646484375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.5941206807632801, "grad_norm": 0.22565767168998718, "kl": 0.0193634033203125, "learning_rate": 1e-06, "loss": 0.114, "num_tokens": 176594368.0, "reward": 1.37488853931427, "reward_std": 0.3494480848312378, "rewards/code_format_reward/mean": 0.8370535969734192, "rewards/code_format_reward/std": 0.3697296679019928, "rewards/curriculum_aware_reward_fn/mean": 0.5378348231315613, "rewards/curriculum_aware_reward_fn/std": 0.37038344144821167, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 1881.357177734375, "completions/mean_terminated_length": 1377.75341796875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5982465188241362, "grad_norm": 0.22476831078529358, "kl": 0.02020263671875, "learning_rate": 1e-06, "loss": 0.1128, "num_tokens": 177707451.0, "reward": 1.3183037042617798, "reward_std": 0.36240866780281067, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5058035850524902, "rewards/curriculum_aware_reward_fn/std": 0.3833063542842865, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1339285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 1536.790283203125, "completions/mean_terminated_length": 1141.0360107421875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6023723568849922, "grad_norm": 0.26155343651771545, "kl": 0.02178955078125, "learning_rate": 1e-06, "loss": 0.0659, "num_tokens": 178652497.0, "reward": 1.4607144594192505, "reward_std": 0.22950772941112518, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095630049705505, "rewards/curriculum_aware_reward_fn/mean": 0.5946429371833801, "rewards/curriculum_aware_reward_fn/std": 0.3741273283958435, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 1788.0068359375, "completions/mean_terminated_length": 1382.13916015625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6064981949458483, "grad_norm": 1.0388872623443604, "kl": 0.0193939208984375, "learning_rate": 1e-06, "loss": 0.1008, "num_tokens": 179720161.0, "reward": 1.5102678537368774, "reward_std": 0.3413771986961365, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.662053644657135, "rewards/curriculum_aware_reward_fn/std": 0.46296849846839905, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 2100.16748046875, "completions/mean_terminated_length": 1466.1971435546875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6106240330067045, "grad_norm": 0.21630822122097015, "kl": 0.019805908203125, "learning_rate": 1e-06, "loss": 0.0918, "num_tokens": 180933476.0, "reward": 1.2727679014205933, "reward_std": 0.39400404691696167, "rewards/code_format_reward/mean": 0.7566964030265808, "rewards/code_format_reward/std": 0.4295565187931061, "rewards/curriculum_aware_reward_fn/mean": 0.5160713791847229, "rewards/curriculum_aware_reward_fn/std": 0.4506998658180237, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 2091.69873046875, "completions/mean_terminated_length": 1478.1370849609375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6147498710675606, "grad_norm": 0.22199887037277222, "kl": 0.019317626953125, "learning_rate": 1e-06, "loss": 0.099, "num_tokens": 182159846.0, "reward": 1.3185268640518188, "reward_std": 0.3951811194419861, "rewards/code_format_reward/mean": 0.765625, "rewards/code_format_reward/std": 0.42408111691474915, "rewards/curriculum_aware_reward_fn/mean": 0.5529018044471741, "rewards/curriculum_aware_reward_fn/std": 0.4583703875541687, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1787.446533203125, "completions/mean_terminated_length": 1409.68310546875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6188757091284167, "grad_norm": 0.2299708127975464, "kl": 0.021514892578125, "learning_rate": 1e-06, "loss": 0.1087, "num_tokens": 183247111.0, "reward": 1.4218751192092896, "reward_std": 0.3345509469509125, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.5625, "rewards/curriculum_aware_reward_fn/std": 0.4604127109050751, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1138392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1599.435302734375, "completions/mean_terminated_length": 1278.7177734375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6230015471892728, "grad_norm": 0.24896688759326935, "kl": 0.022369384765625, "learning_rate": 1e-06, "loss": 0.0958, "num_tokens": 184227628.0, "reward": 1.5521206855773926, "reward_std": 0.30572912096977234, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.6659597754478455, "rewards/curriculum_aware_reward_fn/std": 0.4227410852909088, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 1433.6273193359375, "completions/mean_terminated_length": 1136.3399658203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6271273852501289, "grad_norm": 0.4775170087814331, "kl": 0.02508544921875, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 185127675.0, "reward": 1.3047993183135986, "reward_std": 0.23483391106128693, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.4052455425262451, "rewards/curriculum_aware_reward_fn/std": 0.4659194052219391, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1643.2879638671875, "completions/mean_terminated_length": 1256.684814453125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.631253223310985, "grad_norm": 0.22107750177383423, "kl": 0.02374267578125, "learning_rate": 1e-06, "loss": 0.0531, "num_tokens": 186139073.0, "reward": 1.5174108743667603, "reward_std": 0.31378087401390076, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6558035612106323, "rewards/curriculum_aware_reward_fn/std": 0.4248434603214264, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 1793.2857666015625, "completions/mean_terminated_length": 1352.34033203125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6353790613718412, "grad_norm": 213.0500030517578, "kl": 4.206878662109375, "learning_rate": 1e-06, "loss": 0.0975, "num_tokens": 187213302.0, "reward": 1.495759129524231, "reward_std": 0.2779863178730011, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6564732193946838, "rewards/curriculum_aware_reward_fn/std": 0.4365084767341614, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 1663.466552734375, "completions/mean_terminated_length": 1280.0439453125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6395048994326973, "grad_norm": 0.22665973007678986, "kl": 0.0211181640625, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 188241972.0, "reward": 1.4443081617355347, "reward_std": 0.3500446081161499, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.5871651768684387, "rewards/curriculum_aware_reward_fn/std": 0.4618165194988251, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2008928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1768.6273193359375, "completions/mean_terminated_length": 1183.533447265625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6436307374935534, "grad_norm": 0.20810863375663757, "kl": 0.02264404296875, "learning_rate": 1e-06, "loss": 0.0891, "num_tokens": 189310593.0, "reward": 1.3198660612106323, "reward_std": 0.2814616560935974, "rewards/code_format_reward/mean": 0.7991071343421936, "rewards/code_format_reward/std": 0.4011160135269165, "rewards/curriculum_aware_reward_fn/mean": 0.5207589268684387, "rewards/curriculum_aware_reward_fn/std": 0.4600163996219635, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1768.450927734375, "completions/mean_terminated_length": 1285.3746337890625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6477565755544095, "grad_norm": 0.239214688539505, "kl": 0.0230560302734375, "learning_rate": 1e-06, "loss": 0.1321, "num_tokens": 190381312.0, "reward": 1.4726563692092896, "reward_std": 0.3661116063594818, "rewards/code_format_reward/mean": 0.8258928656578064, "rewards/code_format_reward/std": 0.37962549924850464, "rewards/curriculum_aware_reward_fn/mean": 0.6467633843421936, "rewards/curriculum_aware_reward_fn/std": 0.4802655875682831, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 1846.18310546875, "completions/mean_terminated_length": 1422.4774169921875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6518824136152656, "grad_norm": 0.2303915023803711, "kl": 0.022308349609375, "learning_rate": 1e-06, "loss": 0.094, "num_tokens": 191453493.0, "reward": 1.449330449104309, "reward_std": 0.3553183376789093, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6100445985794067, "rewards/curriculum_aware_reward_fn/std": 0.4413088262081146, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1718.0201416015625, "completions/mean_terminated_length": 1343.1964111328125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6560082516761218, "grad_norm": 0.2290109395980835, "kl": 0.0232391357421875, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 192504405.0, "reward": 1.4784599542617798, "reward_std": 0.28895553946495056, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334251284599304, "rewards/curriculum_aware_reward_fn/mean": 0.614620566368103, "rewards/curriculum_aware_reward_fn/std": 0.4428318738937378, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1801.8438720703125, "completions/mean_terminated_length": 1391.310546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6601340897369778, "grad_norm": 0.24925446510314941, "kl": 0.023712158203125, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 193612323.0, "reward": 1.440178632736206, "reward_std": 0.30794984102249146, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.5941964387893677, "rewards/curriculum_aware_reward_fn/std": 0.45045316219329834, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2066.785888671875, "completions/mean_terminated_length": 1491.163330078125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.6642599277978339, "grad_norm": 2.0607409477233887, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": 0.0947, "num_tokens": 194811691.0, "reward": 1.395424246788025, "reward_std": 0.37766921520233154, "rewards/code_format_reward/mean": 0.7790178656578064, "rewards/code_format_reward/std": 0.4153723120689392, "rewards/curriculum_aware_reward_fn/mean": 0.616406261920929, "rewards/curriculum_aware_reward_fn/std": 0.4849223494529724, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1484.1607666015625, "completions/mean_terminated_length": 1199.7030029296875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.66838576585869, "grad_norm": 0.30098825693130493, "kl": 0.0265960693359375, "learning_rate": 1e-06, "loss": 0.0433, "num_tokens": 195746354.0, "reward": 1.5621651411056519, "reward_std": 0.28447994589805603, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.6626116037368774, "rewards/curriculum_aware_reward_fn/std": 0.43061330914497375, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 1539.4710693359375, "completions/mean_terminated_length": 1246.932861328125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6725116039195461, "grad_norm": 0.23855797946453094, "kl": 0.031982421875, "learning_rate": 1e-06, "loss": 0.1266, "num_tokens": 196703717.0, "reward": 1.447656273841858, "reward_std": 0.3021145462989807, "rewards/code_format_reward/mean": 0.8973214030265808, "rewards/code_format_reward/std": 0.30387791991233826, "rewards/curriculum_aware_reward_fn/mean": 0.5503348112106323, "rewards/curriculum_aware_reward_fn/std": 0.45792168378829956, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1385.15185546875, "completions/mean_terminated_length": 1082.45166015625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6766374419804023, "grad_norm": 0.25144195556640625, "kl": 0.0297088623046875, "learning_rate": 1e-06, "loss": 0.1121, "num_tokens": 197595718.0, "reward": 1.587388515472412, "reward_std": 0.26974406838417053, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.6878347396850586, "rewards/curriculum_aware_reward_fn/std": 0.41807910799980164, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1294642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 1562.8148193359375, "completions/mean_terminated_length": 1186.0845947265625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6807632800412584, "grad_norm": 0.2671045958995819, "kl": 0.0441436767578125, "learning_rate": 1e-06, "loss": 0.0354, "num_tokens": 198576454.0, "reward": 1.5322545766830444, "reward_std": 0.2892707884311676, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.6617187857627869, "rewards/curriculum_aware_reward_fn/std": 0.42239058017730713, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 1682.310302734375, "completions/mean_terminated_length": 1344.5166015625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6848891181021145, "grad_norm": 0.24003268778324127, "kl": 0.0265960693359375, "learning_rate": 1e-06, "loss": 0.0905, "num_tokens": 199616226.0, "reward": 1.5041295289993286, "reward_std": 0.3453480899333954, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374045372009, "rewards/curriculum_aware_reward_fn/mean": 0.6268973350524902, "rewards/curriculum_aware_reward_fn/std": 0.45591655373573303, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1336.94873046875, "completions/mean_terminated_length": 1124.7139892578125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6890149561629706, "grad_norm": 0.263192355632782, "kl": 0.0506591796875, "learning_rate": 1e-06, "loss": 0.0515, "num_tokens": 200482411.0, "reward": 1.6373885869979858, "reward_std": 0.26034244894981384, "rewards/code_format_reward/mean": 0.9263392686843872, "rewards/code_format_reward/std": 0.2615099549293518, "rewards/curriculum_aware_reward_fn/mean": 0.7110490798950195, "rewards/curriculum_aware_reward_fn/std": 0.40943244099617004, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1605.6629638671875, "completions/mean_terminated_length": 1299.83203125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6931407942238267, "grad_norm": 0.2897297143936157, "kl": 0.02880859375, "learning_rate": 1e-06, "loss": 0.088, "num_tokens": 201493242.0, "reward": 1.4989956617355347, "reward_std": 0.3241947889328003, "rewards/code_format_reward/mean": 0.890625, "rewards/code_format_reward/std": 0.3124580383300781, "rewards/curriculum_aware_reward_fn/mean": 0.6083706021308899, "rewards/curriculum_aware_reward_fn/std": 0.41958779096603394, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1116071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1574.837158203125, "completions/mean_terminated_length": 1258.1080322265625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6972666322846828, "grad_norm": 0.2755286693572998, "kl": 0.0564727783203125, "learning_rate": 1e-06, "loss": 0.1192, "num_tokens": 202485020.0, "reward": 1.5125000476837158, "reward_std": 0.3663952052593231, "rewards/code_format_reward/mean": 0.8883928656578064, "rewards/code_format_reward/std": 0.315234512090683, "rewards/curriculum_aware_reward_fn/mean": 0.6241070628166199, "rewards/curriculum_aware_reward_fn/std": 0.45831769704818726, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 1939.821533203125, "completions/mean_terminated_length": 1449.5123291015625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.701392470345539, "grad_norm": 0.2619319558143616, "kl": 0.027374267578125, "learning_rate": 1e-06, "loss": 0.1405, "num_tokens": 203647066.0, "reward": 1.3794643878936768, "reward_std": 0.46084168553352356, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.5625, "rewards/curriculum_aware_reward_fn/std": 0.4440005123615265, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1811.2032470703125, "completions/mean_terminated_length": 1395.237548828125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.7055183084063951, "grad_norm": 0.34664231538772583, "kl": 0.0272216796875, "learning_rate": 1e-06, "loss": 0.1296, "num_tokens": 204733158.0, "reward": 1.4449776411056519, "reward_std": 0.3556159734725952, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.5989955067634583, "rewards/curriculum_aware_reward_fn/std": 0.439473032951355, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4004.0, "completions/mean_length": 1604.6920166015625, "completions/mean_terminated_length": 1143.338623046875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7096441464672512, "grad_norm": 0.3994239866733551, "kl": 0.0269927978515625, "learning_rate": 1e-06, "loss": 0.126, "num_tokens": 205713370.0, "reward": 1.4523438215255737, "reward_std": 0.2969651222229004, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.608593761920929, "rewards/curriculum_aware_reward_fn/std": 0.47555315494537354, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1617.3282470703125, "completions/mean_terminated_length": 1219.199462890625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7137699845281072, "grad_norm": 0.2154603898525238, "kl": 0.027008056640625, "learning_rate": 1e-06, "loss": 0.0961, "num_tokens": 206695722.0, "reward": 1.5544644594192505, "reward_std": 0.31067079305648804, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.6950892806053162, "rewards/curriculum_aware_reward_fn/std": 0.44053351879119873, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 3907.0, "completions/mean_length": 1281.1473388671875, "completions/mean_terminated_length": 1020.258544921875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7178958225889633, "grad_norm": 0.3718879520893097, "kl": 0.031036376953125, "learning_rate": 1e-06, "loss": 0.0903, "num_tokens": 207537041.0, "reward": 1.6243306398391724, "reward_std": 0.2838735282421112, "rewards/code_format_reward/mean": 0.9151785969734192, "rewards/code_format_reward/std": 0.2789272665977478, "rewards/curriculum_aware_reward_fn/mean": 0.7091518044471741, "rewards/curriculum_aware_reward_fn/std": 0.4553619921207428, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4041.0, "completions/mean_length": 1884.2724609375, "completions/mean_terminated_length": 1373.8736572265625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7220216606498195, "grad_norm": 0.2386198192834854, "kl": 0.0254669189453125, "learning_rate": 1e-06, "loss": 0.0984, "num_tokens": 208680599.0, "reward": 1.3593751192092896, "reward_std": 0.31741124391555786, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.546875, "rewards/curriculum_aware_reward_fn/std": 0.45284491777420044, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 1552.321533203125, "completions/mean_terminated_length": 1275.287109375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7261474987106756, "grad_norm": 0.24585580825805664, "kl": 0.02642822265625, "learning_rate": 1e-06, "loss": 0.0588, "num_tokens": 209645798.0, "reward": 1.6802456378936768, "reward_std": 0.30681973695755005, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.29793688654899597, "rewards/curriculum_aware_reward_fn/mean": 0.7784598469734192, "rewards/curriculum_aware_reward_fn/std": 0.41498667001724243, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 1800.9420166015625, "completions/mean_terminated_length": 1499.5706787109375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.7302733367715317, "grad_norm": 0.22753238677978516, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.1122, "num_tokens": 210721840.0, "reward": 1.4109376668930054, "reward_std": 0.3212467432022095, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.5292410850524902, "rewards/curriculum_aware_reward_fn/std": 0.5174082517623901, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 1630.0357666015625, "completions/mean_terminated_length": 1241.34375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7343991748323878, "grad_norm": 0.2533700466156006, "kl": 0.027496337890625, "learning_rate": 1e-06, "loss": 0.1076, "num_tokens": 211733328.0, "reward": 1.4767858982086182, "reward_std": 0.3183857798576355, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.6129464507102966, "rewards/curriculum_aware_reward_fn/std": 0.43392208218574524, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4002.0, "completions/mean_length": 1448.7523193359375, "completions/mean_terminated_length": 1153.15380859375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7385250128932439, "grad_norm": 0.2421787679195404, "kl": 0.0265960693359375, "learning_rate": 1e-06, "loss": 0.0773, "num_tokens": 212632332.0, "reward": 1.4965404272079468, "reward_std": 0.270067036151886, "rewards/code_format_reward/mean": 0.8995535969734192, "rewards/code_format_reward/std": 0.30093035101890564, "rewards/curriculum_aware_reward_fn/mean": 0.5969866514205933, "rewards/curriculum_aware_reward_fn/std": 0.4876974821090698, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1789.571533203125, "completions/mean_terminated_length": 1383.97900390625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7426508509541001, "grad_norm": 0.31309306621551514, "kl": 0.0263519287109375, "learning_rate": 1e-06, "loss": 0.0679, "num_tokens": 213694586.0, "reward": 1.4526787996292114, "reward_std": 0.3394067883491516, "rewards/code_format_reward/mean": 0.8504464030265808, "rewards/code_format_reward/std": 0.3570319712162018, "rewards/curriculum_aware_reward_fn/mean": 0.6022320985794067, "rewards/curriculum_aware_reward_fn/std": 0.43776625394821167, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1725.638427734375, "completions/mean_terminated_length": 1241.3709716796875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7467766890149562, "grad_norm": 0.2224123626947403, "kl": 0.0270233154296875, "learning_rate": 1e-06, "loss": 0.0689, "num_tokens": 214750038.0, "reward": 1.4532368183135986, "reward_std": 0.28235524892807007, "rewards/code_format_reward/mean": 0.828125, "rewards/code_format_reward/std": 0.3776935040950775, "rewards/curriculum_aware_reward_fn/mean": 0.6251115798950195, "rewards/curriculum_aware_reward_fn/std": 0.4479008913040161, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1770.430908203125, "completions/mean_terminated_length": 1354.2763671875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7509025270758123, "grad_norm": 0.2114259898662567, "kl": 0.0256195068359375, "learning_rate": 1e-06, "loss": 0.0638, "num_tokens": 215799237.0, "reward": 1.434933066368103, "reward_std": 0.2999463677406311, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.5867186784744263, "rewards/curriculum_aware_reward_fn/std": 0.4654589891433716, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1701.075927734375, "completions/mean_terminated_length": 1309.17919921875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7550283651366684, "grad_norm": 0.22379787266254425, "kl": 0.0278778076171875, "learning_rate": 1e-06, "loss": 0.0706, "num_tokens": 216821614.0, "reward": 1.5437501668930054, "reward_std": 0.31739112734794617, "rewards/code_format_reward/mean": 0.859375, "rewards/code_format_reward/std": 0.3480229377746582, "rewards/curriculum_aware_reward_fn/mean": 0.684374988079071, "rewards/curriculum_aware_reward_fn/std": 0.4320380389690399, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1990.634033203125, "completions/mean_terminated_length": 1532.9456787109375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7591542031975245, "grad_norm": 0.24986539781093597, "kl": 0.0315704345703125, "learning_rate": 1e-06, "loss": 0.0654, "num_tokens": 217998885.0, "reward": 1.3780137300491333, "reward_std": 0.3857751786708832, "rewards/code_format_reward/mean": 0.8191964030265808, "rewards/code_format_reward/std": 0.38528555631637573, "rewards/curriculum_aware_reward_fn/mean": 0.5588169693946838, "rewards/curriculum_aware_reward_fn/std": 0.45324498414993286, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 1551.680908203125, "completions/mean_terminated_length": 1165.781494140625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7632800412583806, "grad_norm": 0.2209348976612091, "kl": 0.0284576416015625, "learning_rate": 1e-06, "loss": 0.0693, "num_tokens": 218964735.0, "reward": 1.4757813215255737, "reward_std": 0.2281898558139801, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.6074776649475098, "rewards/curriculum_aware_reward_fn/std": 0.44274526834487915, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 2035.74560546875, "completions/mean_terminated_length": 1458.874267578125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7674058793192368, "grad_norm": 0.22559720277786255, "kl": 0.0242767333984375, "learning_rate": 1e-06, "loss": 0.0761, "num_tokens": 220137625.0, "reward": 1.2690849304199219, "reward_std": 0.32984551787376404, "rewards/code_format_reward/mean": 0.7767857313156128, "rewards/code_format_reward/std": 0.41686633229255676, "rewards/curriculum_aware_reward_fn/mean": 0.4922991096973419, "rewards/curriculum_aware_reward_fn/std": 0.4548543393611908, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0647321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1462.5023193359375, "completions/mean_terminated_length": 1280.2315673828125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.7715317173800929, "grad_norm": 0.25089648365974426, "kl": 0.0278167724609375, "learning_rate": 1e-06, "loss": 0.0374, "num_tokens": 221061269.0, "reward": 1.526116132736206, "reward_std": 0.21711213886737823, "rewards/code_format_reward/mean": 0.9352678656578064, "rewards/code_format_reward/std": 0.24632768332958221, "rewards/curriculum_aware_reward_fn/mean": 0.5908482670783997, "rewards/curriculum_aware_reward_fn/std": 0.4568212032318115, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 1583.49560546875, "completions/mean_terminated_length": 1224.5662841796875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.7756575554409489, "grad_norm": 0.24917955696582794, "kl": 0.0264434814453125, "learning_rate": 1e-06, "loss": 0.1206, "num_tokens": 222034386.0, "reward": 1.5512278079986572, "reward_std": 0.31712469458580017, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.6784598231315613, "rewards/curriculum_aware_reward_fn/std": 0.4226253032684326, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1450892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1637.4732666015625, "completions/mean_terminated_length": 1220.229736328125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.779783393501805, "grad_norm": 0.3060227632522583, "kl": 0.0265045166015625, "learning_rate": 1e-06, "loss": 0.0848, "num_tokens": 223026932.0, "reward": 1.469866156578064, "reward_std": 0.33409756422042847, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.6149553656578064, "rewards/curriculum_aware_reward_fn/std": 0.46824148297309875, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1383928571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1643.5201416015625, "completions/mean_terminated_length": 1249.598388671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7839092315626611, "grad_norm": 0.8266240954399109, "kl": 0.201629638671875, "learning_rate": 1e-06, "loss": 0.1109, "num_tokens": 224016846.0, "reward": 1.5034598112106323, "reward_std": 0.34203994274139404, "rewards/code_format_reward/mean": 0.8616071343421936, "rewards/code_format_reward/std": 0.34569787979125977, "rewards/curriculum_aware_reward_fn/mean": 0.6418526768684387, "rewards/curriculum_aware_reward_fn/std": 0.44388148188591003, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1560.919677734375, "completions/mean_terminated_length": 1228.0302734375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7880350696235173, "grad_norm": 0.2337990254163742, "kl": 0.0277099609375, "learning_rate": 1e-06, "loss": 0.1017, "num_tokens": 224966136.0, "reward": 1.6110491752624512, "reward_std": 0.31097590923309326, "rewards/code_format_reward/mean": 0.8839285969734192, "rewards/code_format_reward/std": 0.32066863775253296, "rewards/curriculum_aware_reward_fn/mean": 0.7271205186843872, "rewards/curriculum_aware_reward_fn/std": 0.420710951089859, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1915.5804443359375, "completions/mean_terminated_length": 1504.9442138671875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7921609076843734, "grad_norm": 1.356719970703125, "kl": 0.0239105224609375, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 226118148.0, "reward": 1.4401787519454956, "reward_std": 0.3068878650665283, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.5986607670783997, "rewards/curriculum_aware_reward_fn/std": 0.4407366216182709, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1294642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1712.5045166015625, "completions/mean_terminated_length": 1358.035888671875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7962867457452295, "grad_norm": 0.23961399495601654, "kl": 0.0257110595703125, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 227159589.0, "reward": 1.4156250953674316, "reward_std": 0.30595844984054565, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.5428571701049805, "rewards/curriculum_aware_reward_fn/std": 0.45850682258605957, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 2042.5982666015625, "completions/mean_terminated_length": 1414.005859375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8004125838060856, "grad_norm": 0.24642403423786163, "kl": 0.02288818359375, "learning_rate": 1e-06, "loss": 0.082, "num_tokens": 228339903.0, "reward": 1.3223215341567993, "reward_std": 0.37197059392929077, "rewards/code_format_reward/mean": 0.7633928656578064, "rewards/code_format_reward/std": 0.4254741966724396, "rewards/curriculum_aware_reward_fn/mean": 0.5589285492897034, "rewards/curriculum_aware_reward_fn/std": 0.45412421226501465, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1674107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1769.7857666015625, "completions/mean_terminated_length": 1302.04833984375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8045384218669417, "grad_norm": 0.2354077249765396, "kl": 0.0252838134765625, "learning_rate": 1e-06, "loss": 0.1052, "num_tokens": 229392446.0, "reward": 1.4168527126312256, "reward_std": 0.27983659505844116, "rewards/code_format_reward/mean": 0.8325892686843872, "rewards/code_format_reward/std": 0.37375950813293457, "rewards/curriculum_aware_reward_fn/mean": 0.5842633843421936, "rewards/curriculum_aware_reward_fn/std": 0.4433867931365967, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1763.1719970703125, "completions/mean_terminated_length": 1423.092041015625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8086642599277978, "grad_norm": 0.2680366039276123, "kl": 0.02752685546875, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 230454647.0, "reward": 1.4303573369979858, "reward_std": 0.3536536395549774, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.5598214268684387, "rewards/curriculum_aware_reward_fn/std": 0.4470679461956024, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1830357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 1976.7857666015625, "completions/mean_terminated_length": 1501.989013671875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.812790097988654, "grad_norm": 0.2032593935728073, "kl": 0.02392578125, "learning_rate": 1e-06, "loss": 0.0803, "num_tokens": 231608130.0, "reward": 1.3766741752624512, "reward_std": 0.35376885533332825, "rewards/code_format_reward/mean": 0.8169642686843872, "rewards/code_format_reward/std": 0.387128084897995, "rewards/curriculum_aware_reward_fn/mean": 0.5597098469734192, "rewards/curriculum_aware_reward_fn/std": 0.4766124188899994, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1616.7857666015625, "completions/mean_terminated_length": 1142.04248046875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8169159360495101, "grad_norm": 0.2183210849761963, "kl": 0.0261688232421875, "learning_rate": 1e-06, "loss": 0.0869, "num_tokens": 232587173.0, "reward": 1.3754465579986572, "reward_std": 0.24333688616752625, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.5339285731315613, "rewards/curriculum_aware_reward_fn/std": 0.4529650807380676, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 1754.21435546875, "completions/mean_terminated_length": 1327.8734130859375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8210417741103662, "grad_norm": 0.23124267160892487, "kl": 0.0271148681640625, "learning_rate": 1e-06, "loss": 0.062, "num_tokens": 233633069.0, "reward": 1.4648438692092896, "reward_std": 0.3172808289527893, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.6188616156578064, "rewards/curriculum_aware_reward_fn/std": 0.46663418412208557, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1722.7857666015625, "completions/mean_terminated_length": 1411.1514892578125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8251676121712223, "grad_norm": 0.21612757444381714, "kl": 0.0260009765625, "learning_rate": 1e-06, "loss": 0.0876, "num_tokens": 234687677.0, "reward": 1.399330496788025, "reward_std": 0.3321603238582611, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.5176339149475098, "rewards/curriculum_aware_reward_fn/std": 0.485944926738739, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1553.4420166015625, "completions/mean_terminated_length": 1197.61328125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8292934502320783, "grad_norm": 0.34310203790664673, "kl": 0.027313232421875, "learning_rate": 1e-06, "loss": 0.0583, "num_tokens": 235653189.0, "reward": 1.5735491514205933, "reward_std": 0.28810420632362366, "rewards/code_format_reward/mean": 0.8772321343421936, "rewards/code_format_reward/std": 0.3285374343395233, "rewards/curriculum_aware_reward_fn/mean": 0.6963170170783997, "rewards/curriculum_aware_reward_fn/std": 0.4250316917896271, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1697.712158203125, "completions/mean_terminated_length": 1253.5845947265625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8334192882929345, "grad_norm": 0.25292858481407166, "kl": 0.0266571044921875, "learning_rate": 1e-06, "loss": 0.0926, "num_tokens": 236673849.0, "reward": 1.477455496788025, "reward_std": 0.34063848853111267, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.635937511920929, "rewards/curriculum_aware_reward_fn/std": 0.4406279921531677, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1852678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 2054.513427734375, "completions/mean_terminated_length": 1590.2850341796875, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8375451263537906, "grad_norm": 0.21040566265583038, "kl": 0.0247650146484375, "learning_rate": 1e-06, "loss": 0.0471, "num_tokens": 237877099.0, "reward": 1.3340402841567993, "reward_std": 0.4357722997665405, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5215401649475098, "rewards/curriculum_aware_reward_fn/std": 0.44028064608573914, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 1691.3594970703125, "completions/mean_terminated_length": 1268.49609375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8416709644146467, "grad_norm": 0.2229364961385727, "kl": 0.027587890625, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 238900513.0, "reward": 1.5187500715255737, "reward_std": 0.2642645239830017, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.6705357432365417, "rewards/curriculum_aware_reward_fn/std": 0.42630526423454285, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1294642857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 1630.462158203125, "completions/mean_terminated_length": 1263.7923583984375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8457968024755028, "grad_norm": 0.1862471103668213, "kl": 0.0264129638671875, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 239896624.0, "reward": 1.5054689645767212, "reward_std": 0.20619100332260132, "rewards/code_format_reward/mean": 0.8705357313156128, "rewards/code_format_reward/std": 0.3360883891582489, "rewards/curriculum_aware_reward_fn/mean": 0.6349330544471741, "rewards/curriculum_aware_reward_fn/std": 0.5093549489974976, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1643.2232666015625, "completions/mean_terminated_length": 1307.0557861328125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.8499226405363589, "grad_norm": 0.25529512763023376, "kl": 0.0282745361328125, "learning_rate": 1e-06, "loss": 0.0841, "num_tokens": 240894784.0, "reward": 1.5532366037368774, "reward_std": 0.30373936891555786, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.6737723350524902, "rewards/curriculum_aware_reward_fn/std": 0.42407768964767456, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1728.2813720703125, "completions/mean_terminated_length": 1282.371337890625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8540484785972151, "grad_norm": 0.23259170353412628, "kl": 0.026702880859375, "learning_rate": 1e-06, "loss": 0.0704, "num_tokens": 241926640.0, "reward": 1.4595983028411865, "reward_std": 0.2936285734176636, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.6180803179740906, "rewards/curriculum_aware_reward_fn/std": 0.42976123094558716, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1657.9442138671875, "completions/mean_terminated_length": 1273.6512451171875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.8581743166580712, "grad_norm": 0.2791071832180023, "kl": 0.0261993408203125, "learning_rate": 1e-06, "loss": 0.1156, "num_tokens": 242931465.0, "reward": 1.4529019594192505, "reward_std": 0.3942738473415375, "rewards/code_format_reward/mean": 0.8660714030265808, "rewards/code_format_reward/std": 0.34095630049705505, "rewards/curriculum_aware_reward_fn/mean": 0.5868303179740906, "rewards/curriculum_aware_reward_fn/std": 0.44126445055007935, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 1816.040283203125, "completions/mean_terminated_length": 1282.165283203125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8623001547189273, "grad_norm": 0.24899612367153168, "kl": 0.0265350341796875, "learning_rate": 1e-06, "loss": 0.0658, "num_tokens": 244024743.0, "reward": 1.4465402364730835, "reward_std": 0.3748583197593689, "rewards/code_format_reward/mean": 0.8102678656578064, "rewards/code_format_reward/std": 0.39252740144729614, "rewards/curriculum_aware_reward_fn/mean": 0.6362723112106323, "rewards/curriculum_aware_reward_fn/std": 0.4409621059894562, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 2029.493408203125, "completions/mean_terminated_length": 1465.900634765625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8664259927797834, "grad_norm": 0.19784729182720184, "kl": 0.0258941650390625, "learning_rate": 1e-06, "loss": 0.0864, "num_tokens": 245201517.0, "reward": 1.281250238418579, "reward_std": 0.2926739752292633, "rewards/code_format_reward/mean": 0.7879464030265808, "rewards/code_format_reward/std": 0.40921956300735474, "rewards/curriculum_aware_reward_fn/mean": 0.4933035373687744, "rewards/curriculum_aware_reward_fn/std": 0.4879530668258667, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1820.88623046875, "completions/mean_terminated_length": 1385.22607421875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.8705518308406395, "grad_norm": 0.23332077264785767, "kl": 0.0264129638671875, "learning_rate": 1e-06, "loss": 0.0944, "num_tokens": 246272872.0, "reward": 1.468415379524231, "reward_std": 0.32783567905426025, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6291294097900391, "rewards/curriculum_aware_reward_fn/std": 0.47205427289009094, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1453.85498046875, "completions/mean_terminated_length": 1158.8262939453125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8746776689014956, "grad_norm": 0.4520083963871002, "kl": 0.029754638671875, "learning_rate": 1e-06, "loss": 0.0828, "num_tokens": 247188582.0, "reward": 1.6032366752624512, "reward_std": 0.2740907371044159, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.29793688654899597, "rewards/curriculum_aware_reward_fn/mean": 0.7014508843421936, "rewards/curriculum_aware_reward_fn/std": 0.42149075865745544, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1540178571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 1736.3148193359375, "completions/mean_terminated_length": 1306.715087890625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8788035069623518, "grad_norm": 0.22580303251743317, "kl": 0.027374267578125, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 248239164.0, "reward": 1.4832589626312256, "reward_std": 0.3306499719619751, "rewards/code_format_reward/mean": 0.8459821343421936, "rewards/code_format_reward/std": 0.36136940121650696, "rewards/curriculum_aware_reward_fn/mean": 0.6372767686843872, "rewards/curriculum_aware_reward_fn/std": 0.43575507402420044, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1183035714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 1742.529052734375, "completions/mean_terminated_length": 1426.746826171875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.8829293450232079, "grad_norm": 0.24393945932388306, "kl": 0.02630615234375, "learning_rate": 1e-06, "loss": 0.0638, "num_tokens": 249300762.0, "reward": 1.5498884916305542, "reward_std": 0.3314237594604492, "rewards/code_format_reward/mean": 0.8816964030265808, "rewards/code_format_reward/std": 0.32332828640937805, "rewards/curriculum_aware_reward_fn/mean": 0.6681919097900391, "rewards/curriculum_aware_reward_fn/std": 0.4118741750717163, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1049107142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 1637.3348388671875, "completions/mean_terminated_length": 1349.162109375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.887055183084064, "grad_norm": 0.22524137794971466, "kl": 0.0273284912109375, "learning_rate": 1e-06, "loss": 0.0634, "num_tokens": 250294282.0, "reward": 1.5626118183135986, "reward_std": 0.2888850271701813, "rewards/code_format_reward/mean": 0.8950892686843872, "rewards/code_format_reward/std": 0.3067808747291565, "rewards/curriculum_aware_reward_fn/mean": 0.6675223112106323, "rewards/curriculum_aware_reward_fn/std": 0.41152289509773254, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1709.44873046875, "completions/mean_terminated_length": 1382.3577880859375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.89118102114492, "grad_norm": 0.274906188249588, "kl": 0.0260162353515625, "learning_rate": 1e-06, "loss": 0.1016, "num_tokens": 251330133.0, "reward": 1.5188616514205933, "reward_std": 0.2989726960659027, "rewards/code_format_reward/mean": 0.8794642686843872, "rewards/code_format_reward/std": 0.3259509205818176, "rewards/curriculum_aware_reward_fn/mean": 0.6393973231315613, "rewards/curriculum_aware_reward_fn/std": 0.43986761569976807, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1560.7410888671875, "completions/mean_terminated_length": 1284.623779296875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8953068592057761, "grad_norm": 0.35695627331733704, "kl": 0.0279083251953125, "learning_rate": 1e-06, "loss": 0.0427, "num_tokens": 252292349.0, "reward": 1.5180803537368774, "reward_std": 0.2546882629394531, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.29793688654899597, "rewards/curriculum_aware_reward_fn/mean": 0.6162945628166199, "rewards/curriculum_aware_reward_fn/std": 0.4382215440273285, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1227678571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1619.3013916015625, "completions/mean_terminated_length": 1272.6895751953125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8994326972666323, "grad_norm": 4.665146350860596, "kl": 0.5009918212890625, "learning_rate": 1e-06, "loss": 0.1163, "num_tokens": 253276061.0, "reward": 1.59363853931427, "reward_std": 0.3767644762992859, "rewards/code_format_reward/mean": 0.8727678656578064, "rewards/code_format_reward/std": 0.3336053788661957, "rewards/curriculum_aware_reward_fn/mean": 0.7208705544471741, "rewards/curriculum_aware_reward_fn/std": 0.44314906001091003, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1841.732177734375, "completions/mean_terminated_length": 1351.6739501953125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9035585353274884, "grad_norm": 0.22911052405834198, "kl": 0.026397705078125, "learning_rate": 1e-06, "loss": 0.0755, "num_tokens": 254372344.0, "reward": 1.4458706378936768, "reward_std": 0.3519558310508728, "rewards/code_format_reward/mean": 0.8236607313156128, "rewards/code_format_reward/std": 0.3815346360206604, "rewards/curriculum_aware_reward_fn/mean": 0.6222098469734192, "rewards/curriculum_aware_reward_fn/std": 0.47246676683425903, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1644.13623046875, "completions/mean_terminated_length": 1235.4921875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9076843733883445, "grad_norm": 1.5455594062805176, "kl": 0.195648193359375, "learning_rate": 1e-06, "loss": 0.0786, "num_tokens": 255364410.0, "reward": 1.5593750476837158, "reward_std": 0.31317561864852905, "rewards/code_format_reward/mean": 0.8549107313156128, "rewards/code_format_reward/std": 0.3525845408439636, "rewards/curriculum_aware_reward_fn/mean": 0.704464316368103, "rewards/curriculum_aware_reward_fn/std": 0.4480034410953522, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 1678.4376220703125, "completions/mean_terminated_length": 1297.374755859375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.9118102114492006, "grad_norm": 0.28620752692222595, "kl": 0.0301666259765625, "learning_rate": 1e-06, "loss": 0.0814, "num_tokens": 256368938.0, "reward": 1.4873883724212646, "reward_std": 0.33688098192214966, "rewards/code_format_reward/mean": 0.8638392686843872, "rewards/code_format_reward/std": 0.34334254264831543, "rewards/curriculum_aware_reward_fn/mean": 0.6235490441322327, "rewards/curriculum_aware_reward_fn/std": 0.43734773993492126, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1803.087158203125, "completions/mean_terminated_length": 1364.0185546875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.9159360495100567, "grad_norm": 0.3427606225013733, "kl": 0.0285491943359375, "learning_rate": 1e-06, "loss": 0.0813, "num_tokens": 257441069.0, "reward": 1.4981027841567993, "reward_std": 0.3513133227825165, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6588169932365417, "rewards/curriculum_aware_reward_fn/std": 0.4329485297203064, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1689.680908203125, "completions/mean_terminated_length": 1434.195068359375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9200618875709129, "grad_norm": 0.2436770796775818, "kl": 0.027801513671875, "learning_rate": 1e-06, "loss": 0.1213, "num_tokens": 258464109.0, "reward": 1.5488840341567993, "reward_std": 0.3513474762439728, "rewards/code_format_reward/mean": 0.9040178656578064, "rewards/code_format_reward/std": 0.29489603638648987, "rewards/curriculum_aware_reward_fn/mean": 0.6448661088943481, "rewards/curriculum_aware_reward_fn/std": 0.4577891230583191, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1707.9063720703125, "completions/mean_terminated_length": 1447.8167724609375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.924187725631769, "grad_norm": 216.50401306152344, "kl": 1.946990966796875, "learning_rate": 1e-06, "loss": 0.1121, "num_tokens": 259507949.0, "reward": 1.539955496788025, "reward_std": 0.31543269753456116, "rewards/code_format_reward/mean": 0.9017857313156128, "rewards/code_format_reward/std": 0.2979368567466736, "rewards/curriculum_aware_reward_fn/mean": 0.6381697058677673, "rewards/curriculum_aware_reward_fn/std": 0.42519187927246094, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 1472.774658203125, "completions/mean_terminated_length": 1250.4674072265625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9283135636926251, "grad_norm": 0.24840383231639862, "kl": 0.03076171875, "learning_rate": 1e-06, "loss": 0.077, "num_tokens": 260437224.0, "reward": 1.5959821939468384, "reward_std": 0.28009700775146484, "rewards/code_format_reward/mean": 0.921875, "rewards/code_format_reward/std": 0.26866820454597473, "rewards/curriculum_aware_reward_fn/mean": 0.6741071343421936, "rewards/curriculum_aware_reward_fn/std": 0.4229956865310669, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1116071428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 1640.9910888671875, "completions/mean_terminated_length": 1332.5728759765625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9324394017534812, "grad_norm": 0.23082372546195984, "kl": 0.0304107666015625, "learning_rate": 1e-06, "loss": 0.0609, "num_tokens": 261436827.0, "reward": 1.5507813692092896, "reward_std": 0.3076208531856537, "rewards/code_format_reward/mean": 0.8883928656578064, "rewards/code_format_reward/std": 0.31523454189300537, "rewards/curriculum_aware_reward_fn/mean": 0.6623883843421936, "rewards/curriculum_aware_reward_fn/std": 0.4220667779445648, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 1829.3460693359375, "completions/mean_terminated_length": 1306.27197265625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9365652398143373, "grad_norm": 0.21185402572155, "kl": 0.0299072265625, "learning_rate": 1e-06, "loss": 0.0719, "num_tokens": 262519833.0, "reward": 1.3339287042617798, "reward_std": 0.2924160361289978, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5214285850524902, "rewards/curriculum_aware_reward_fn/std": 0.47082507610321045, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 1815.2835693359375, "completions/mean_terminated_length": 1378.550537109375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9406910778751933, "grad_norm": 0.22714629769325256, "kl": 0.028564453125, "learning_rate": 1e-06, "loss": 0.0899, "num_tokens": 263599003.0, "reward": 1.4619420766830444, "reward_std": 0.34914925694465637, "rewards/code_format_reward/mean": 0.8392857313156128, "rewards/code_format_reward/std": 0.3676777780056, "rewards/curriculum_aware_reward_fn/mean": 0.6226562857627869, "rewards/curriculum_aware_reward_fn/std": 0.4631439447402954, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 1637.77685546875, "completions/mean_terminated_length": 1286.60205078125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.9448169159360496, "grad_norm": 1.5117418766021729, "kl": 0.036773681640625, "learning_rate": 1e-06, "loss": 0.0746, "num_tokens": 264614680.0, "reward": 1.5395091772079468, "reward_std": 0.26127544045448303, "rewards/code_format_reward/mean": 0.875, "rewards/code_format_reward/std": 0.3310886323451996, "rewards/curriculum_aware_reward_fn/mean": 0.6645089387893677, "rewards/curriculum_aware_reward_fn/std": 0.425923615694046, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1686.8170166015625, "completions/mean_terminated_length": 1285.2864990234375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9489427539969056, "grad_norm": 0.2744755446910858, "kl": 0.0299835205078125, "learning_rate": 1e-06, "loss": 0.0701, "num_tokens": 265640616.0, "reward": 1.5354912281036377, "reward_std": 0.28136691451072693, "rewards/code_format_reward/mean": 0.8571428656578064, "rewards/code_format_reward/std": 0.3503182828426361, "rewards/curriculum_aware_reward_fn/mean": 0.6783482432365417, "rewards/curriculum_aware_reward_fn/std": 0.4348878562450409, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 1739.966552734375, "completions/mean_terminated_length": 1296.25732421875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.9530685920577617, "grad_norm": 0.26357001066207886, "kl": 0.0283203125, "learning_rate": 1e-06, "loss": 0.0814, "num_tokens": 266682607.0, "reward": 1.4681919813156128, "reward_std": 0.3145821988582611, "rewards/code_format_reward/mean": 0.8415178656578064, "rewards/code_format_reward/std": 0.36560073494911194, "rewards/curriculum_aware_reward_fn/mean": 0.6266741156578064, "rewards/curriculum_aware_reward_fn/std": 0.4754810333251953, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 3944.0, "completions/mean_length": 1573.41748046875, "completions/mean_terminated_length": 1270.70751953125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9571944301186178, "grad_norm": 0.2473004311323166, "kl": 0.0310516357421875, "learning_rate": 1e-06, "loss": 0.0644, "num_tokens": 267655087.0, "reward": 1.5863840579986572, "reward_std": 0.28958457708358765, "rewards/code_format_reward/mean": 0.8928571343421936, "rewards/code_format_reward/std": 0.3096405565738678, "rewards/curriculum_aware_reward_fn/mean": 0.6935268044471741, "rewards/curriculum_aware_reward_fn/std": 0.4322984218597412, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 1770.4844970703125, "completions/mean_terminated_length": 1417.771240234375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9613202681794739, "grad_norm": 0.2097795307636261, "kl": 0.02978515625, "learning_rate": 1e-06, "loss": 0.0473, "num_tokens": 268716945.0, "reward": 1.4421876668930054, "reward_std": 0.2832888960838318, "rewards/code_format_reward/mean": 0.8683035969734192, "rewards/code_format_reward/std": 0.3385384678840637, "rewards/curriculum_aware_reward_fn/mean": 0.5738838911056519, "rewards/curriculum_aware_reward_fn/std": 0.4533480703830719, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1986607142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1953.888427734375, "completions/mean_terminated_length": 1422.8355712890625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9654461062403301, "grad_norm": 0.2278265655040741, "kl": 0.0291748046875, "learning_rate": 1e-06, "loss": 0.0964, "num_tokens": 269865126.0, "reward": 1.3801339864730835, "reward_std": 0.3481306731700897, "rewards/code_format_reward/mean": 0.796875, "rewards/code_format_reward/std": 0.4027745723724365, "rewards/curriculum_aware_reward_fn/mean": 0.5832589268684387, "rewards/curriculum_aware_reward_fn/std": 0.46434247493743896, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1584821428571429, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 1825.3170166015625, "completions/mean_terminated_length": 1397.681640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9695719443011862, "grad_norm": 0.22196418046951294, "kl": 0.0289154052734375, "learning_rate": 1e-06, "loss": 0.0819, "num_tokens": 270954692.0, "reward": 1.405022382736206, "reward_std": 0.28231561183929443, "rewards/code_format_reward/mean": 0.84375, "rewards/code_format_reward/std": 0.36349809169769287, "rewards/curriculum_aware_reward_fn/mean": 0.5612723231315613, "rewards/curriculum_aware_reward_fn/std": 0.48800045251846313, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1936.357177734375, "completions/mean_terminated_length": 1437.97802734375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9736977823620423, "grad_norm": 0.23538300395011902, "kl": 0.029541015625, "learning_rate": 1e-06, "loss": 0.0935, "num_tokens": 272097490.0, "reward": 1.344642996788025, "reward_std": 0.3809891939163208, "rewards/code_format_reward/mean": 0.8125, "rewards/code_format_reward/std": 0.3907487094402313, "rewards/curriculum_aware_reward_fn/mean": 0.5321428179740906, "rewards/curriculum_aware_reward_fn/std": 0.46611618995666504, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4000.0, "completions/mean_length": 1853.134033203125, "completions/mean_terminated_length": 1365.5543212890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9778236204228984, "grad_norm": 0.2134256660938263, "kl": 0.0307464599609375, "learning_rate": 1e-06, "loss": 0.109, "num_tokens": 273184846.0, "reward": 1.4496653079986572, "reward_std": 0.3251318335533142, "rewards/code_format_reward/mean": 0.8214285969734192, "rewards/code_format_reward/std": 0.3834212124347687, "rewards/curriculum_aware_reward_fn/mean": 0.6282366514205933, "rewards/curriculum_aware_reward_fn/std": 0.4360702633857727, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1657.5313720703125, "completions/mean_terminated_length": 1364.9149169921875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.9819494584837545, "grad_norm": 359.472900390625, "kl": 42.028839111328125, "learning_rate": 1e-06, "loss": 0.4948, "num_tokens": 274204200.0, "reward": 1.5136160850524902, "reward_std": 0.3207034468650818, "rewards/code_format_reward/mean": 0.8950892686843872, "rewards/code_format_reward/std": 0.3067808747291565, "rewards/curriculum_aware_reward_fn/mean": 0.618526816368103, "rewards/curriculum_aware_reward_fn/std": 0.42753708362579346, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1473214285714286, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1698.7679443359375, "completions/mean_terminated_length": 1284.58642578125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9860752965446106, "grad_norm": 0.23639072477817535, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": 0.0896, "num_tokens": 275233341.0, "reward": 1.4335938692092896, "reward_std": 0.3041447401046753, "rewards/code_format_reward/mean": 0.8482142686843872, "rewards/code_format_reward/std": 0.3592142164707184, "rewards/curriculum_aware_reward_fn/mean": 0.5853794813156128, "rewards/curriculum_aware_reward_fn/std": 0.4475054442882538, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1138392857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1666.40185546875, "completions/mean_terminated_length": 1354.287109375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9902011346054668, "grad_norm": 0.24273402988910675, "kl": 0.0313262939453125, "learning_rate": 1e-06, "loss": 0.0602, "num_tokens": 276253484.0, "reward": 1.526116132736206, "reward_std": 0.3411323130130768, "rewards/code_format_reward/mean": 0.8861607313156128, "rewards/code_format_reward/std": 0.31797102093696594, "rewards/curriculum_aware_reward_fn/mean": 0.6399553418159485, "rewards/curriculum_aware_reward_fn/std": 0.42472463846206665, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 1558.727783203125, "completions/mean_terminated_length": 1296.251220703125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9943269726663229, "grad_norm": 0.33681198954582214, "kl": 0.0325164794921875, "learning_rate": 1e-06, "loss": 0.0342, "num_tokens": 277228966.0, "reward": 1.6078126430511475, "reward_std": 0.2525419592857361, "rewards/code_format_reward/mean": 0.90625, "rewards/code_format_reward/std": 0.2918064594268799, "rewards/curriculum_aware_reward_fn/mean": 0.7015624642372131, "rewards/curriculum_aware_reward_fn/std": 0.4125516712665558, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10948905109489049, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 1572.927001953125, "completions/mean_terminated_length": 1262.7130126953125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.998452810727179, "grad_norm": 0.24264025688171387, "kl": 0.0335235595703125, "learning_rate": 1e-06, "loss": 0.0709, "num_tokens": 278247365.0, "reward": 1.497544765472412, "reward_std": 0.283774197101593, "rewards/code_format_reward/mean": 0.8973214030265808, "rewards/code_format_reward/std": 0.30387789011001587, "rewards/curriculum_aware_reward_fn/mean": 0.6002232432365417, "rewards/curriculum_aware_reward_fn/std": 0.4450950026512146, "step": 242 }, { "epoch": 0.998452810727179, "step": 242, "total_flos": 0.0, "train_loss": 280.1541930416698, "train_runtime": 97295.5274, "train_samples_per_second": 0.159, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 242, "num_input_tokens_seen": 278247365, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }