{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.005813953488372093, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5501.0, "completions/max_terminated_length": 5501.0, "completions/mean_length": 3558.09375, "completions/mean_terminated_length": 3558.09375, "completions/min_length": 2215.0, "completions/min_terminated_length": 2215.0, "epoch": 2.4224806201550387e-05, "grad_norm": 0.00640977891147893, "kl": 0.0007143020629882812, "learning_rate": 0.0, "loss": 0.0006, "num_tokens": 568407.0, "reward": 0.4926603138446808, "reward_std": 0.08448069542646408, "rewards/avg_thinking_length_func": 157.22222900390625, "rewards/confidence_score_reward_func": 0.7339284420013428, "rewards/correct_answer_reward_func": 0.640625, "rewards/efficient_thinking_reward_func": 0.9699548628723149, "rewards/format_and_efficient_reward_func": 0.5214560031890869, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.318666696548462, "rewards/tool_execution_reward_func": 1.983011245727539, "rewards/visit_tool_reward_func": 0.9305298328399658, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 4.8449612403100775e-05, "grad_norm": 0.0064083920341846115, "kl": 0.0007143020629882812, "learning_rate": 6.25e-08, "loss": 0.0006, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 7.267441860465116e-05, "grad_norm": 0.006447812260611595, "kl": 0.0007295608520507812, "learning_rate": 1.25e-07, "loss": 0.0006, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 9.689922480620155e-05, "grad_norm": 0.0066225031847143186, "kl": 0.0007305145263671875, "learning_rate": 1.875e-07, "loss": 0.0006, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7891.0, "completions/max_terminated_length": 7891.0, "completions/mean_length": 3465.828125, "completions/mean_terminated_length": 3465.828125, "completions/min_length": 1264.0, "completions/min_terminated_length": 1264.0, "epoch": 0.00012112403100775194, "grad_norm": 0.011221982806523546, "kl": 0.0008029937744140625, "learning_rate": 2.5e-07, "loss": 0.0003, "num_tokens": 1050218.0, "reward": 0.35228461027145386, "reward_std": 0.11903564631938934, "rewards/avg_thinking_length_func": 172.3975830078125, "rewards/confidence_score_reward_func": 0.7573737502098083, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.8796035517984737, "rewards/format_and_efficient_reward_func": 0.3536693751811981, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.489912509918213, "rewards/tool_execution_reward_func": 1.9884867668151855, "rewards/visit_tool_reward_func": 0.9384097456932068, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00014534883720930232, "grad_norm": 0.011369566083514073, "kl": 0.0008258819580078125, "learning_rate": 3.1249999999999997e-07, "loss": 0.0003, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001695736434108527, "grad_norm": 0.011325781329231437, "kl": 0.000820159912109375, "learning_rate": 3.75e-07, "loss": 0.0003, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0001937984496124031, "grad_norm": 0.011468177438620898, "kl": 0.0008134841918945312, "learning_rate": 4.375e-07, "loss": 0.0003, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9790.0, "completions/max_terminated_length": 9790.0, "completions/mean_length": 4101.421875, "completions/mean_terminated_length": 4101.421875, "completions/min_length": 1141.0, "completions/min_terminated_length": 1141.0, "epoch": 0.00021802325581395349, "grad_norm": 0.008533015854175789, "kl": 0.00080108642578125, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 1636681.0, "reward": 0.4183655381202698, "reward_std": 0.0931699275970459, "rewards/avg_thinking_length_func": 176.92233276367188, "rewards/confidence_score_reward_func": 0.7306747436523438, "rewards/correct_answer_reward_func": 0.546875, "rewards/efficient_thinking_reward_func": 0.8954936332818751, "rewards/format_and_efficient_reward_func": 0.4208581745624542, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.53083336353302, "rewards/tool_execution_reward_func": 1.9508955478668213, "rewards/visit_tool_reward_func": 0.8424738645553589, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00024224806201550387, "grad_norm": 0.009520985221391949, "kl": 0.0007925033569335938, "learning_rate": 5.625e-07, "loss": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00026647286821705426, "grad_norm": 0.010085270290120536, "kl": 0.0011835098266601562, "learning_rate": 6.249999999999999e-07, "loss": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00029069767441860465, "grad_norm": 0.008472445513601271, "kl": 0.0008249282836914062, "learning_rate": 6.875e-07, "loss": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7445.0, "completions/max_terminated_length": 7445.0, "completions/mean_length": 3379.234375, "completions/mean_terminated_length": 3379.234375, "completions/min_length": 1491.0, "completions/min_terminated_length": 1491.0, "epoch": 0.00031492248062015503, "grad_norm": 0.01258823765843166, "kl": 0.0009145736694335938, "learning_rate": 7.5e-07, "loss": -0.0001, "num_tokens": 2110165.0, "reward": 0.4067286550998688, "reward_std": 0.18041250109672546, "rewards/avg_thinking_length_func": 170.76950073242188, "rewards/confidence_score_reward_func": 0.763248085975647, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.8802126246942265, "rewards/format_and_efficient_reward_func": 0.4241780936717987, "rewards/format_reward_func": 0.99958336353302, "rewards/num_xml_reward_func": 1.6099066734313965, "rewards/tool_execution_reward_func": 1.9751970767974854, "rewards/visit_tool_reward_func": 0.9391972422599792, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003391472868217054, "grad_norm": 0.012500551984662189, "kl": 0.0009927749633789062, "learning_rate": 8.125e-07, "loss": -0.0001, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003633720930232558, "grad_norm": 0.012416715432446976, "kl": 0.0010967254638671875, "learning_rate": 8.75e-07, "loss": -0.0001, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003875968992248062, "grad_norm": 0.01288145978177755, "kl": 0.001140594482421875, "learning_rate": 9.374999999999999e-07, "loss": -0.0001, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10205.0, "completions/max_terminated_length": 10205.0, "completions/mean_length": 4119.96875, "completions/mean_terminated_length": 4119.96875, "completions/min_length": 1159.0, "completions/min_terminated_length": 1159.0, "epoch": 0.0004118217054263566, "grad_norm": 0.009407611593031055, "kl": 0.0011768341064453125, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 2691117.0, "reward": 0.4201432466506958, "reward_std": 0.0907188206911087, "rewards/avg_thinking_length_func": 171.4025115966797, "rewards/confidence_score_reward_func": 0.7308521270751953, "rewards/correct_answer_reward_func": 0.546875, "rewards/efficient_thinking_reward_func": 0.861229582956026, "rewards/format_and_efficient_reward_func": 0.37111079692840576, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.3203115463256836, "rewards/tool_execution_reward_func": 1.9717044830322266, "rewards/visit_tool_reward_func": 0.8859716653823853, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00043604651162790697, "grad_norm": 0.009347834781139657, "kl": 0.00139617919921875, "learning_rate": 1.0625e-06, "loss": 0.0004, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00046027131782945736, "grad_norm": 0.00928664951165006, "kl": 0.00167083740234375, "learning_rate": 1.125e-06, "loss": 0.0004, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00048449612403100775, "grad_norm": 0.009342230945576057, "kl": 0.00212860107421875, "learning_rate": 1.1874999999999999e-06, "loss": 0.0004, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7792.0, "completions/max_terminated_length": 7792.0, "completions/mean_length": 3474.78125, "completions/mean_terminated_length": 3474.78125, "completions/min_length": 1307.0, "completions/min_terminated_length": 1307.0, "epoch": 0.0005087209302325581, "grad_norm": 0.010737570621935045, "kl": 0.002620697021484375, "learning_rate": 1.2499999999999999e-06, "loss": -0.0, "num_tokens": 3182962.0, "reward": 0.3430381715297699, "reward_std": 0.15257038176059723, "rewards/avg_thinking_length_func": 163.67486572265625, "rewards/confidence_score_reward_func": 0.7590060234069824, "rewards/correct_answer_reward_func": 0.4375, "rewards/efficient_thinking_reward_func": 0.9094734969614356, "rewards/format_and_efficient_reward_func": 0.354397177696228, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.4648277759552002, "rewards/tool_execution_reward_func": 1.9753289222717285, "rewards/visit_tool_reward_func": 0.9633350968360901, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005329457364341085, "grad_norm": 0.010610611287841326, "kl": 0.003204345703125, "learning_rate": 1.3125e-06, "loss": -0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005571705426356589, "grad_norm": 0.010883725821996518, "kl": 0.003814697265625, "learning_rate": 1.375e-06, "loss": -0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0005813953488372093, "grad_norm": 0.010728950563018041, "kl": 0.00518798828125, "learning_rate": 1.4375e-06, "loss": -0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9082.0, "completions/max_terminated_length": 9082.0, "completions/mean_length": 4205.453125, "completions/mean_terminated_length": 4205.453125, "completions/min_length": 1188.0, "completions/min_terminated_length": 1188.0, "epoch": 0.0006056201550387597, "grad_norm": 0.011708703395331976, "kl": 0.0054779052734375, "learning_rate": 1.5e-06, "loss": 0.0016, "num_tokens": 3788330.0, "reward": 0.4100201725959778, "reward_std": 0.12962010502815247, "rewards/avg_thinking_length_func": 167.64987182617188, "rewards/confidence_score_reward_func": 0.7269817590713501, "rewards/correct_answer_reward_func": 0.53125, "rewards/efficient_thinking_reward_func": 0.894405090660734, "rewards/format_and_efficient_reward_func": 0.4734077453613281, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.2609543800354004, "rewards/tool_execution_reward_func": 1.9624817371368408, "rewards/visit_tool_reward_func": 0.8933978080749512, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006298449612403101, "grad_norm": 0.012029441405275343, "kl": 0.00725555419921875, "learning_rate": 1.5624999999999999e-06, "loss": 0.0016, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006540697674418605, "grad_norm": 0.011965973272488425, "kl": 0.010589599609375, "learning_rate": 1.625e-06, "loss": 0.0016, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0006782945736434108, "grad_norm": 0.018054158629818226, "kl": 0.017059326171875, "learning_rate": 1.6875e-06, "loss": 0.0016, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7466.0, "completions/max_terminated_length": 7466.0, "completions/mean_length": 3525.1875, "completions/mean_terminated_length": 3525.1875, "completions/min_length": 1458.0, "completions/min_terminated_length": 1458.0, "epoch": 0.0007025193798449612, "grad_norm": 0.011398719184495674, "kl": 0.013671875, "learning_rate": 1.75e-06, "loss": 0.0001, "num_tokens": 4289196.0, "reward": 0.3574071526527405, "reward_std": 0.09749965369701385, "rewards/avg_thinking_length_func": 163.65969848632812, "rewards/confidence_score_reward_func": 0.7581030130386353, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.9089163330381327, "rewards/format_and_efficient_reward_func": 0.3653510808944702, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.4366014003753662, "rewards/tool_execution_reward_func": 1.9675538539886475, "rewards/visit_tool_reward_func": 0.960380494594574, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007267441860465116, "grad_norm": 0.010833682609318612, "kl": 0.015838623046875, "learning_rate": 1.8125e-06, "loss": 0.0001, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.000750968992248062, "grad_norm": 0.0231097533212703, "kl": 0.022918701171875, "learning_rate": 1.8749999999999998e-06, "loss": 0.0001, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0007751937984496124, "grad_norm": 0.011257228008738334, "kl": 0.021575927734375, "learning_rate": 1.9375e-06, "loss": 0.0001, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8888.0, "completions/max_terminated_length": 8888.0, "completions/mean_length": 3804.265625, "completions/mean_terminated_length": 3804.265625, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "epoch": 0.0007994186046511628, "grad_norm": 0.02927961829217277, "kl": 0.030914306640625, "learning_rate": 2e-06, "loss": 0.0005, "num_tokens": 4848644.0, "reward": 0.46360254287719727, "reward_std": 0.10140425711870193, "rewards/avg_thinking_length_func": 168.85345458984375, "rewards/confidence_score_reward_func": 0.7187485694885254, "rewards/correct_answer_reward_func": 0.609375, "rewards/efficient_thinking_reward_func": 0.8848315117904739, "rewards/format_and_efficient_reward_func": 0.46383440494537354, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.3748114109039307, "rewards/tool_execution_reward_func": 1.9836355447769165, "rewards/visit_tool_reward_func": 0.8981889486312866, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008236434108527132, "grad_norm": 0.00984263633299767, "kl": 0.026763916015625, "learning_rate": 2e-06, "loss": 0.0005, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008478682170542636, "grad_norm": 0.022916321346866338, "kl": 0.03643798828125, "learning_rate": 2e-06, "loss": 0.0005, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0008720930232558139, "grad_norm": 0.010968578899761567, "kl": 0.03680419921875, "learning_rate": 2e-06, "loss": 0.0005, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7119.0, "completions/max_terminated_length": 7119.0, "completions/mean_length": 3045.9375, "completions/mean_terminated_length": 3045.9375, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "epoch": 0.0008963178294573643, "grad_norm": 0.11556192957878203, "kl": 0.066314697265625, "learning_rate": 2e-06, "loss": 0.0011, "num_tokens": 5286042.0, "reward": 0.38059696555137634, "reward_std": 0.20472648739814758, "rewards/avg_thinking_length_func": 171.9969024658203, "rewards/confidence_score_reward_func": 0.7361885905265808, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 0.8792661921309781, "rewards/format_and_efficient_reward_func": 0.4069961905479431, "rewards/format_reward_func": 0.9985389709472656, "rewards/num_xml_reward_func": 1.7584354877471924, "rewards/tool_execution_reward_func": 1.9876766204833984, "rewards/visit_tool_reward_func": 0.926859438419342, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009205426356589147, "grad_norm": 0.013991455687567742, "kl": 0.034393310546875, "learning_rate": 2e-06, "loss": 0.001, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009447674418604651, "grad_norm": 0.01433251116157902, "kl": 0.0352783203125, "learning_rate": 2e-06, "loss": 0.001, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0009689922480620155, "grad_norm": 0.01682769595676241, "kl": 0.0426025390625, "learning_rate": 2e-06, "loss": 0.001, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6696.0, "completions/max_terminated_length": 6696.0, "completions/mean_length": 3054.78125, "completions/mean_terminated_length": 3054.78125, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.0009932170542635659, "grad_norm": 0.034243564194623544, "kl": 0.0538330078125, "learning_rate": 2e-06, "loss": 0.0003, "num_tokens": 5728827.0, "reward": 0.5321023464202881, "reward_std": 0.07992984354496002, "rewards/avg_thinking_length_func": 185.18777465820312, "rewards/confidence_score_reward_func": 0.699253261089325, "rewards/correct_answer_reward_func": 0.734375, "rewards/efficient_thinking_reward_func": 0.8423659179880447, "rewards/format_and_efficient_reward_func": 0.5654621124267578, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.836081624031067, "rewards/tool_execution_reward_func": 1.9795209169387817, "rewards/visit_tool_reward_func": 0.8331901431083679, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010174418604651163, "grad_norm": 0.008357434600682397, "kl": 0.0467529296875, "learning_rate": 2e-06, "loss": 0.0003, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0010416666666666667, "grad_norm": 0.009143109288946688, "kl": 0.05499267578125, "learning_rate": 2e-06, "loss": 0.0003, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001065891472868217, "grad_norm": 0.018383062802239766, "kl": 0.07135009765625, "learning_rate": 2e-06, "loss": 0.0003, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5912.0, "completions/max_terminated_length": 5912.0, "completions/mean_length": 2513.34375, "completions/mean_terminated_length": 2513.34375, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.0010901162790697674, "grad_norm": 1.2449457515517797, "kl": 0.5546875, "learning_rate": 2e-06, "loss": 0.0011, "num_tokens": 6121552.0, "reward": 0.41406646370887756, "reward_std": 0.1448429971933365, "rewards/avg_thinking_length_func": 159.43849182128906, "rewards/confidence_score_reward_func": 0.7091017961502075, "rewards/correct_answer_reward_func": 0.5625, "rewards/efficient_thinking_reward_func": 0.9100999417514477, "rewards/format_and_efficient_reward_func": 0.40307265520095825, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.7179009914398193, "rewards/tool_execution_reward_func": 1.9982638359069824, "rewards/visit_tool_reward_func": 0.8534926772117615, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011143410852713178, "grad_norm": 0.04725193167363872, "kl": 0.0830078125, "learning_rate": 2e-06, "loss": 0.0007, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011385658914728682, "grad_norm": 0.01076799271094143, "kl": 0.0728759765625, "learning_rate": 2e-06, "loss": 0.0007, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0011627906976744186, "grad_norm": 1.2844930338395364, "kl": 0.594970703125, "learning_rate": 2e-06, "loss": 0.0012, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6441.0, "completions/max_terminated_length": 6441.0, "completions/mean_length": 2998.25, "completions/mean_terminated_length": 2998.25, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.001187015503875969, "grad_norm": 0.07335407885154412, "kl": 0.0946044921875, "learning_rate": 2e-06, "loss": 0.0003, "num_tokens": 6632198.0, "reward": 0.4027416408061981, "reward_std": 0.18368688225746155, "rewards/avg_thinking_length_func": 144.1616668701172, "rewards/confidence_score_reward_func": 0.6523082852363586, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.8715830269761213, "rewards/format_and_efficient_reward_func": 0.30888205766677856, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.2804265022277832, "rewards/tool_execution_reward_func": 1.9967105388641357, "rewards/visit_tool_reward_func": 0.777007520198822, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012112403100775194, "grad_norm": 3382.951158532336, "kl": 386.0513916015625, "learning_rate": 2e-06, "loss": 0.1955, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012354651162790698, "grad_norm": 0.04763618115574692, "kl": 0.111083984375, "learning_rate": 2e-06, "loss": 0.0003, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0012596899224806201, "grad_norm": 0.011361146003702229, "kl": 0.0693359375, "learning_rate": 2e-06, "loss": 0.0002, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5699.0, "completions/max_terminated_length": 5699.0, "completions/mean_length": 2702.609375, "completions/mean_terminated_length": 2702.609375, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.0012839147286821705, "grad_norm": 641.0279979496779, "kl": 340.21875, "learning_rate": 2e-06, "loss": 0.3029, "num_tokens": 7071302.0, "reward": 0.38491296768188477, "reward_std": 0.20615670084953308, "rewards/avg_thinking_length_func": 144.03466796875, "rewards/confidence_score_reward_func": 0.6775128841400146, "rewards/correct_answer_reward_func": 0.546875, "rewards/efficient_thinking_reward_func": 0.8956235775099276, "rewards/format_and_efficient_reward_func": 0.298817902803421, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.156145691871643, "rewards/tool_execution_reward_func": 2.0, "rewards/visit_tool_reward_func": 0.8991793990135193, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001308139534883721, "grad_norm": 10.07283016494114, "kl": 6.52294921875, "learning_rate": 2e-06, "loss": 0.0054, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0013323643410852713, "grad_norm": 0.024178305719161252, "kl": 0.1031494140625, "learning_rate": 2e-06, "loss": -0.0001, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0013565891472868217, "grad_norm": 0.010123659301215143, "kl": 0.0853271484375, "learning_rate": 2e-06, "loss": -0.0001, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7974.0, "completions/max_terminated_length": 7974.0, "completions/mean_length": 3418.6875, "completions/mean_terminated_length": 3418.6875, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 0.001380813953488372, "grad_norm": 0.011918704634373778, "kl": 0.0814208984375, "learning_rate": 2e-06, "loss": 0.0006, "num_tokens": 7618083.0, "reward": 0.33670923113822937, "reward_std": 0.2170744389295578, "rewards/avg_thinking_length_func": 162.87310791015625, "rewards/confidence_score_reward_func": 0.6380267143249512, "rewards/correct_answer_reward_func": 0.484375, "rewards/efficient_thinking_reward_func": 0.8769457565983968, "rewards/format_and_efficient_reward_func": 0.15387150645256042, "rewards/format_reward_func": 0.9937499761581421, "rewards/num_xml_reward_func": 0.7425504326820374, "rewards/tool_execution_reward_func": 1.984375, "rewards/visit_tool_reward_func": 0.7900611162185669, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014050387596899225, "grad_norm": 0.012656826141930118, "kl": 0.0870361328125, "learning_rate": 2e-06, "loss": 0.0006, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014292635658914728, "grad_norm": 0.01963879028272825, "kl": 0.102783203125, "learning_rate": 2e-06, "loss": 0.0006, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0014534883720930232, "grad_norm": 0.023803010795812877, "kl": 0.111328125, "learning_rate": 2e-06, "loss": 0.0006, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6602.0, "completions/max_terminated_length": 6602.0, "completions/mean_length": 2937.375, "completions/mean_terminated_length": 2937.375, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "epoch": 0.0014777131782945736, "grad_norm": 0.6399010033168665, "kl": 0.328857421875, "learning_rate": 2e-06, "loss": 0.0008, "num_tokens": 8081395.0, "reward": 0.41028502583503723, "reward_std": 0.1911381632089615, "rewards/avg_thinking_length_func": 154.1159210205078, "rewards/confidence_score_reward_func": 0.6654285192489624, "rewards/correct_answer_reward_func": 0.59375, "rewards/efficient_thinking_reward_func": 0.8800399071963756, "rewards/format_and_efficient_reward_func": 0.1847984343767166, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 0.921923816204071, "rewards/tool_execution_reward_func": 1.9983552694320679, "rewards/visit_tool_reward_func": 0.883500337600708, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001501937984496124, "grad_norm": 0.011193139638749735, "kl": 0.091552734375, "learning_rate": 2e-06, "loss": 0.0006, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0015261627906976744, "grad_norm": 0.010209194017758182, "kl": 0.086181640625, "learning_rate": 2e-06, "loss": 0.0006, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0015503875968992248, "grad_norm": 0.14653936372168078, "kl": 0.1170654296875, "learning_rate": 2e-06, "loss": 0.0006, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8850.0, "completions/max_terminated_length": 8850.0, "completions/mean_length": 3542.96875, "completions/mean_terminated_length": 3542.96875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.0015746124031007752, "grad_norm": 0.48126334443141955, "kl": 0.248046875, "learning_rate": 2e-06, "loss": -0.0001, "num_tokens": 8636201.0, "reward": 0.39273786544799805, "reward_std": 0.12296080589294434, "rewards/avg_thinking_length_func": 150.4586639404297, "rewards/confidence_score_reward_func": 0.6261853575706482, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.8429494490638886, "rewards/format_and_efficient_reward_func": 0.26941436529159546, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 0.9826317429542542, "rewards/tool_execution_reward_func": 1.9983552694320679, "rewards/visit_tool_reward_func": 0.8202804327011108, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0015988372093023256, "grad_norm": 0.011505230665385266, "kl": 0.087646484375, "learning_rate": 2e-06, "loss": -0.0003, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001623062015503876, "grad_norm": 0.011219221768431348, "kl": 0.0863037109375, "learning_rate": 2e-06, "loss": -0.0003, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0016472868217054263, "grad_norm": 0.013493117517357446, "kl": 0.0845947265625, "learning_rate": 2e-06, "loss": -0.0003, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6164.0, "completions/max_terminated_length": 6164.0, "completions/mean_length": 3045.984375, "completions/mean_terminated_length": 3045.984375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.0016715116279069767, "grad_norm": 0.015474671509878156, "kl": 0.085205078125, "learning_rate": 2e-06, "loss": -0.0001, "num_tokens": 9080324.0, "reward": 0.3584170639514923, "reward_std": 0.2464786320924759, "rewards/avg_thinking_length_func": 171.05947875976562, "rewards/confidence_score_reward_func": 0.6698201298713684, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.9022617067768229, "rewards/format_and_efficient_reward_func": 0.18420693278312683, "rewards/format_reward_func": 0.999218761920929, "rewards/num_xml_reward_func": 0.9476650953292847, "rewards/tool_execution_reward_func": 1.9967105388641357, "rewards/visit_tool_reward_func": 0.922633707523346, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001695736434108527, "grad_norm": 0.01314183789857302, "kl": 0.082275390625, "learning_rate": 2e-06, "loss": -0.0001, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0017199612403100775, "grad_norm": 0.012255008742171034, "kl": 0.0802001953125, "learning_rate": 2e-06, "loss": -0.0001, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0017441860465116279, "grad_norm": 0.016022338448163764, "kl": 0.0791015625, "learning_rate": 2e-06, "loss": -0.0001, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9687.0, "completions/max_terminated_length": 9687.0, "completions/mean_length": 4153.765625, "completions/mean_terminated_length": 4153.765625, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.0017684108527131783, "grad_norm": 0.009771623312563241, "kl": 0.07470703125, "learning_rate": 2e-06, "loss": 0.0005, "num_tokens": 9647713.0, "reward": 0.39447835087776184, "reward_std": 0.1022053211927414, "rewards/avg_thinking_length_func": 180.9823455810547, "rewards/confidence_score_reward_func": 0.6325613260269165, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.8102246632773766, "rewards/format_and_efficient_reward_func": 0.31101614236831665, "rewards/format_reward_func": 0.9996874928474426, "rewards/num_xml_reward_func": 1.1014292240142822, "rewards/tool_execution_reward_func": 1.9983552694320679, "rewards/visit_tool_reward_func": 0.9176727533340454, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0017926356589147287, "grad_norm": 0.009518866209493148, "kl": 0.0743408203125, "learning_rate": 2e-06, "loss": 0.0005, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001816860465116279, "grad_norm": 0.01107061263145856, "kl": 0.074462890625, "learning_rate": 2e-06, "loss": 0.0005, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0018410852713178294, "grad_norm": 0.010455700609646703, "kl": 0.0758056640625, "learning_rate": 2e-06, "loss": 0.0005, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 6094.0, "completions/max_terminated_length": 6094.0, "completions/mean_length": 3365.65625, "completions/mean_terminated_length": 3386.5714285714284, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "epoch": 0.0018653100775193798, "grad_norm": 0.010697094262847633, "kl": 0.07177734375, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 10096064.0, "reward": 0.4402870833873749, "reward_std": 0.17748701572418213, "rewards/avg_thinking_length_func": 184.61854553222656, "rewards/confidence_score_reward_func": 0.6924824714660645, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.8674089768653666, "rewards/format_and_efficient_reward_func": 0.46700799465179443, "rewards/format_reward_func": 0.9821969866752625, "rewards/num_xml_reward_func": 1.4879558086395264, "rewards/tool_execution_reward_func": 1.9514802694320679, "rewards/visit_tool_reward_func": 0.9262524843215942, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0018895348837209302, "grad_norm": 0.010757613261067228, "kl": 0.0716552734375, "learning_rate": 2e-06, "loss": 0.0002, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0019137596899224806, "grad_norm": 0.010687573666984099, "kl": 0.0711669921875, "learning_rate": 2e-06, "loss": 0.0002, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.001937984496124031, "grad_norm": 0.010774872814522038, "kl": 0.07177734375, "learning_rate": 2e-06, "loss": 0.0002, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9500.0, "completions/max_terminated_length": 9500.0, "completions/mean_length": 4230.6875, "completions/mean_terminated_length": 4230.6875, "completions/min_length": 1095.0, "completions/min_terminated_length": 1095.0, "epoch": 0.0019622093023255816, "grad_norm": 46.52685366161902, "kl": 28.5504150390625, "learning_rate": 2e-06, "loss": 0.0212, "num_tokens": 10633304.0, "reward": 0.4479905962944031, "reward_std": 0.11886347830295563, "rewards/avg_thinking_length_func": 196.62542724609375, "rewards/confidence_score_reward_func": 0.6686310768127441, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.8074578120916676, "rewards/format_and_efficient_reward_func": 0.4098377823829651, "rewards/format_reward_func": 0.9993749856948853, "rewards/num_xml_reward_func": 1.3076300621032715, "rewards/tool_execution_reward_func": 1.9934210777282715, "rewards/visit_tool_reward_func": 0.9281606674194336, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0019864341085271318, "grad_norm": 0.011024444662647613, "kl": 0.0682373046875, "learning_rate": 2e-06, "loss": 0.0007, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0020106589147286824, "grad_norm": 0.0110905273039609, "kl": 0.0682373046875, "learning_rate": 2e-06, "loss": 0.0007, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0020348837209302325, "grad_norm": 0.011161056303561772, "kl": 0.068359375, "learning_rate": 2e-06, "loss": 0.0007, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7332.0, "completions/max_terminated_length": 7332.0, "completions/mean_length": 3184.28125, "completions/mean_terminated_length": 3184.28125, "completions/min_length": 1380.0, "completions/min_terminated_length": 1380.0, "epoch": 0.002059108527131783, "grad_norm": 0.007262566160814956, "kl": 0.0670166015625, "learning_rate": 2e-06, "loss": -0.0, "num_tokens": 11072119.0, "reward": 0.48964226245880127, "reward_std": 0.09526845812797546, "rewards/avg_thinking_length_func": 183.27981567382812, "rewards/confidence_score_reward_func": 0.7107405066490173, "rewards/correct_answer_reward_func": 0.671875, "rewards/efficient_thinking_reward_func": 0.8552614079949872, "rewards/format_and_efficient_reward_func": 0.509292721748352, "rewards/format_reward_func": 0.9996874928474426, "rewards/num_xml_reward_func": 1.630164384841919, "rewards/tool_execution_reward_func": 1.9862616062164307, "rewards/visit_tool_reward_func": 0.9241018295288086, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0020833333333333333, "grad_norm": 0.007239493682926299, "kl": 0.0675048828125, "learning_rate": 2e-06, "loss": -0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002107558139534884, "grad_norm": 0.007565680492649283, "kl": 0.06787109375, "learning_rate": 2e-06, "loss": -0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002131782945736434, "grad_norm": 0.007407335837345995, "kl": 0.0682373046875, "learning_rate": 2e-06, "loss": -0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9640.0, "completions/max_terminated_length": 9640.0, "completions/mean_length": 3956.140625, "completions/mean_terminated_length": 3956.140625, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.0021560077519379847, "grad_norm": 0.009630461090198177, "kl": 0.06591796875, "learning_rate": 2e-06, "loss": 0.0004, "num_tokens": 11589191.0, "reward": 0.4685676693916321, "reward_std": 0.08529931306838989, "rewards/avg_thinking_length_func": 185.34999084472656, "rewards/confidence_score_reward_func": 0.673518717288971, "rewards/correct_answer_reward_func": 0.65625, "rewards/efficient_thinking_reward_func": 0.8117772322905137, "rewards/format_and_efficient_reward_func": 0.4981999397277832, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.5498807430267334, "rewards/tool_execution_reward_func": 1.9884867668151855, "rewards/visit_tool_reward_func": 0.9419025182723999, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002180232558139535, "grad_norm": 0.010035272389521673, "kl": 0.0660400390625, "learning_rate": 2e-06, "loss": 0.0004, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0022044573643410855, "grad_norm": 0.009886020878154878, "kl": 0.0653076171875, "learning_rate": 2e-06, "loss": 0.0004, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0022286821705426356, "grad_norm": 0.010179048111382292, "kl": 0.0648193359375, "learning_rate": 2e-06, "loss": 0.0004, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5889.0, "completions/max_terminated_length": 5889.0, "completions/mean_length": 3288.046875, "completions/mean_terminated_length": 3288.046875, "completions/min_length": 1106.0, "completions/min_terminated_length": 1106.0, "epoch": 0.0022529069767441862, "grad_norm": 0.36462018525457834, "kl": 0.1248779296875, "learning_rate": 2e-06, "loss": 0.001, "num_tokens": 12047117.0, "reward": 0.5035778284072876, "reward_std": 0.09110674262046814, "rewards/avg_thinking_length_func": 180.05084228515625, "rewards/confidence_score_reward_func": 0.7095786333084106, "rewards/correct_answer_reward_func": 0.6875, "rewards/efficient_thinking_reward_func": 0.865053232533276, "rewards/format_and_efficient_reward_func": 0.5739701986312866, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.6645023822784424, "rewards/tool_execution_reward_func": 1.9736841917037964, "rewards/visit_tool_reward_func": 0.9475066065788269, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0022771317829457364, "grad_norm": 0.010049427947341465, "kl": 0.0684814453125, "learning_rate": 2e-06, "loss": 0.001, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002301356589147287, "grad_norm": 0.008406367137924373, "kl": 0.067138671875, "learning_rate": 2e-06, "loss": 0.001, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002325581395348837, "grad_norm": 0.008646991679074768, "kl": 0.0682373046875, "learning_rate": 2e-06, "loss": 0.001, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9717.0, "completions/max_terminated_length": 9717.0, "completions/mean_length": 4042.09375, "completions/mean_terminated_length": 4042.09375, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.0023498062015503878, "grad_norm": 0.36433347439984676, "kl": 0.266357421875, "learning_rate": 2e-06, "loss": 0.0004, "num_tokens": 12594675.0, "reward": 0.4354441165924072, "reward_std": 0.10702547430992126, "rewards/avg_thinking_length_func": 178.31576538085938, "rewards/confidence_score_reward_func": 0.6778514385223389, "rewards/correct_answer_reward_func": 0.59375, "rewards/efficient_thinking_reward_func": 0.8262231594607177, "rewards/format_and_efficient_reward_func": 0.4731639623641968, "rewards/format_reward_func": 0.9996874928474426, "rewards/num_xml_reward_func": 1.5230944156646729, "rewards/tool_execution_reward_func": 1.977658987045288, "rewards/visit_tool_reward_func": 0.90561443567276, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002374031007751938, "grad_norm": 0.017653090743062046, "kl": 0.07763671875, "learning_rate": 2e-06, "loss": 0.0001, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0023982558139534886, "grad_norm": 0.009650143183516308, "kl": 0.066650390625, "learning_rate": 2e-06, "loss": 0.0001, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0024224806201550387, "grad_norm": 0.009666383934140476, "kl": 0.066650390625, "learning_rate": 2e-06, "loss": 0.0001, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7009.0, "completions/max_terminated_length": 7009.0, "completions/mean_length": 3569.1875, "completions/mean_terminated_length": 3569.1875, "completions/min_length": 1350.0, "completions/min_terminated_length": 1350.0, "epoch": 0.0024467054263565893, "grad_norm": 0.012628187028225836, "kl": 0.0160369873046875, "learning_rate": 2e-06, "loss": 0.0013, "num_tokens": 13095521.0, "reward": 0.4694232642650604, "reward_std": 0.11920525133609772, "rewards/avg_thinking_length_func": 166.68763732910156, "rewards/confidence_score_reward_func": 0.693173885345459, "rewards/correct_answer_reward_func": 0.640625, "rewards/efficient_thinking_reward_func": 0.8890269113384983, "rewards/format_and_efficient_reward_func": 0.52373868227005, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.4931187629699707, "rewards/tool_execution_reward_func": 1.9407894611358643, "rewards/visit_tool_reward_func": 0.9543420076370239, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0024709302325581395, "grad_norm": 0.013764666926511201, "kl": 0.016693115234375, "learning_rate": 2e-06, "loss": 0.0013, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00249515503875969, "grad_norm": 0.015582325932853322, "kl": 0.017730712890625, "learning_rate": 2e-06, "loss": 0.0013, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0025193798449612403, "grad_norm": 0.017864538067072777, "kl": 0.01995849609375, "learning_rate": 2e-06, "loss": 0.0013, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11718.0, "completions/max_terminated_length": 11718.0, "completions/mean_length": 4337.8125, "completions/mean_terminated_length": 4337.8125, "completions/min_length": 1402.0, "completions/min_terminated_length": 1402.0, "epoch": 0.002543604651162791, "grad_norm": 0.011715145428905095, "kl": 0.023681640625, "learning_rate": 2e-06, "loss": 0.0003, "num_tokens": 13691037.0, "reward": 0.4581317901611328, "reward_std": 0.07780471444129944, "rewards/avg_thinking_length_func": 141.15011596679688, "rewards/confidence_score_reward_func": 0.6525664925575256, "rewards/correct_answer_reward_func": 0.65625, "rewards/efficient_thinking_reward_func": 0.7593332235923487, "rewards/format_and_efficient_reward_func": 0.45769202709198, "rewards/format_reward_func": 0.9993749856948853, "rewards/num_xml_reward_func": 1.3809731006622314, "rewards/tool_execution_reward_func": 1.9640991687774658, "rewards/visit_tool_reward_func": 0.9199192523956299, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002567829457364341, "grad_norm": 0.012478280222631418, "kl": 0.03009033203125, "learning_rate": 2e-06, "loss": 0.0003, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0025920542635658917, "grad_norm": 0.013305867700430574, "kl": 0.0390625, "learning_rate": 2e-06, "loss": 0.0004, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002616279069767442, "grad_norm": 0.0183428412461533, "kl": 0.0509033203125, "learning_rate": 2e-06, "loss": 0.0004, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6386.0, "completions/max_terminated_length": 6386.0, "completions/mean_length": 3297.5625, "completions/mean_terminated_length": 3297.5625, "completions/min_length": 1296.0, "completions/min_terminated_length": 1296.0, "epoch": 0.0026405038759689924, "grad_norm": 0.02337332236690807, "kl": 0.0616455078125, "learning_rate": 2e-06, "loss": 0.001, "num_tokens": 14184466.0, "reward": 0.40722835063934326, "reward_std": 0.14360609650611877, "rewards/avg_thinking_length_func": 138.28097534179688, "rewards/confidence_score_reward_func": 0.644202470779419, "rewards/correct_answer_reward_func": 0.59375, "rewards/efficient_thinking_reward_func": 0.7607926960767375, "rewards/format_and_efficient_reward_func": 0.46497124433517456, "rewards/format_reward_func": 1.0, "rewards/num_xml_reward_func": 1.4057281017303467, "rewards/tool_execution_reward_func": 1.9434621334075928, "rewards/visit_tool_reward_func": 0.9184768199920654, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0026647286821705426, "grad_norm": 0.012698799447402773, "kl": 0.06781005859375, "learning_rate": 2e-06, "loss": 0.001, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002688953488372093, "grad_norm": 0.012619226324675306, "kl": 0.0758056640625, "learning_rate": 2e-06, "loss": 0.001, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0027131782945736434, "grad_norm": 0.013347372933753418, "kl": 0.0892333984375, "learning_rate": 2e-06, "loss": 0.001, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10254.0, "completions/max_terminated_length": 10254.0, "completions/mean_length": 4017.296875, "completions/mean_terminated_length": 4017.296875, "completions/min_length": 1163.0, "completions/min_terminated_length": 1163.0, "epoch": 0.002737403100775194, "grad_norm": 0.8482856229199331, "kl": 0.163818359375, "learning_rate": 2e-06, "loss": 0.0003, "num_tokens": 14783302.0, "reward": 0.3793744742870331, "reward_std": 0.08317889273166656, "rewards/avg_thinking_length_func": 96.98873901367188, "rewards/confidence_score_reward_func": 0.5890461206436157, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.4956153760102844, "rewards/format_and_efficient_reward_func": 0.3040567636489868, "rewards/format_reward_func": 0.991857647895813, "rewards/num_xml_reward_func": 0.9565892815589905, "rewards/tool_execution_reward_func": 1.883992075920105, "rewards/visit_tool_reward_func": 0.6309776306152344, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002761627906976744, "grad_norm": 2.892668930951565, "kl": 0.87744140625, "learning_rate": 2e-06, "loss": 0.002, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0027858527131782948, "grad_norm": 0.11540075032392746, "kl": 0.258544921875, "learning_rate": 2e-06, "loss": 0.0005, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002810077519379845, "grad_norm": 0.03602102455529362, "kl": 0.205078125, "learning_rate": 2e-06, "loss": 0.0003, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7956.0, "completions/max_terminated_length": 7956.0, "completions/mean_length": 3060.15625, "completions/mean_terminated_length": 3060.15625, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "epoch": 0.0028343023255813955, "grad_norm": 0.011947818843430099, "kl": 0.1031494140625, "learning_rate": 2e-06, "loss": 0.0013, "num_tokens": 15239738.0, "reward": 0.4257793724536896, "reward_std": 0.15445315837860107, "rewards/avg_thinking_length_func": 111.71697235107422, "rewards/confidence_score_reward_func": 0.6188951730728149, "rewards/correct_answer_reward_func": 0.671875, "rewards/efficient_thinking_reward_func": 0.7151743089595498, "rewards/format_and_efficient_reward_func": 0.3122476637363434, "rewards/format_reward_func": 0.9918689727783203, "rewards/num_xml_reward_func": 1.2823729515075684, "rewards/tool_execution_reward_func": 1.9500064849853516, "rewards/visit_tool_reward_func": 0.8597963452339172, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0028585271317829457, "grad_norm": 0.01184782503909529, "kl": 0.0999755859375, "learning_rate": 2e-06, "loss": 0.0013, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0028827519379844963, "grad_norm": 0.01222442223239816, "kl": 0.099365234375, "learning_rate": 2e-06, "loss": 0.0013, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0029069767441860465, "grad_norm": 0.01288408566646706, "kl": 0.1002197265625, "learning_rate": 2e-06, "loss": 0.0013, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11579.0, "completions/max_terminated_length": 11579.0, "completions/mean_length": 3778.484375, "completions/mean_terminated_length": 3778.484375, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.002931201550387597, "grad_norm": 0.013127560986285324, "kl": 0.170654296875, "learning_rate": 2e-06, "loss": 0.0006, "num_tokens": 15802870.0, "reward": 0.35960614681243896, "reward_std": 0.09336411207914352, "rewards/avg_thinking_length_func": 120.11376953125, "rewards/confidence_score_reward_func": 0.5505574941635132, "rewards/correct_answer_reward_func": 0.609375, "rewards/efficient_thinking_reward_func": 0.5848998658707487, "rewards/format_and_efficient_reward_func": 0.09069697558879852, "rewards/format_reward_func": 0.9635053873062134, "rewards/num_xml_reward_func": 0.6183948516845703, "rewards/tool_execution_reward_func": 1.921267032623291, "rewards/visit_tool_reward_func": 0.408791184425354, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0029554263565891472, "grad_norm": 0.011473925349363189, "kl": 0.173828125, "learning_rate": 2e-06, "loss": 0.0006, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.002979651162790698, "grad_norm": 0.010667583254555548, "kl": 0.1767578125, "learning_rate": 2e-06, "loss": 0.0006, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003003875968992248, "grad_norm": 0.010839236682357098, "kl": 0.1826171875, "learning_rate": 2e-06, "loss": 0.0006, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7543.0, "completions/max_terminated_length": 7543.0, "completions/mean_length": 3495.625, "completions/mean_terminated_length": 3495.625, "completions/min_length": 1272.0, "completions/min_terminated_length": 1272.0, "epoch": 0.0030281007751937986, "grad_norm": 0.014134486127382969, "kl": 0.136474609375, "learning_rate": 2e-06, "loss": 0.0011, "num_tokens": 16267847.0, "reward": 0.40116244554519653, "reward_std": 0.11558952927589417, "rewards/avg_thinking_length_func": 171.4405975341797, "rewards/confidence_score_reward_func": 0.592523455619812, "rewards/correct_answer_reward_func": 0.65625, "rewards/efficient_thinking_reward_func": 0.78887382548876, "rewards/format_and_efficient_reward_func": -0.007415967993438244, "rewards/format_reward_func": 0.9569429159164429, "rewards/num_xml_reward_func": 0.533742368221283, "rewards/tool_execution_reward_func": 1.984920620918274, "rewards/visit_tool_reward_func": 0.8972762823104858, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003052325581395349, "grad_norm": 0.01438304535919498, "kl": 0.140625, "learning_rate": 2e-06, "loss": 0.0011, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0030765503875968994, "grad_norm": 0.014656756114246808, "kl": 0.14794921875, "learning_rate": 2e-06, "loss": 0.0011, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0031007751937984496, "grad_norm": 0.015042904271731165, "kl": 0.15869140625, "learning_rate": 2e-06, "loss": 0.0012, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12404.0, "completions/max_terminated_length": 12404.0, "completions/mean_length": 4594.015625, "completions/mean_terminated_length": 4594.015625, "completions/min_length": 1214.0, "completions/min_terminated_length": 1214.0, "epoch": 0.003125, "grad_norm": 0.01590013423348445, "kl": 0.26904296875, "learning_rate": 2e-06, "loss": 0.0009, "num_tokens": 16831957.0, "reward": 0.3612688183784485, "reward_std": 0.08134222030639648, "rewards/avg_thinking_length_func": 189.8800048828125, "rewards/confidence_score_reward_func": 0.5268421173095703, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.6692969275756135, "rewards/format_and_efficient_reward_func": -0.032691895961761475, "rewards/format_reward_func": 0.9466335773468018, "rewards/num_xml_reward_func": 0.4149753153324127, "rewards/tool_execution_reward_func": 1.9272011518478394, "rewards/visit_tool_reward_func": 0.7673778533935547, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0031492248062015503, "grad_norm": 0.01646874780720208, "kl": 0.29443359375, "learning_rate": 2e-06, "loss": 0.0009, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003173449612403101, "grad_norm": 0.01694506623714648, "kl": 0.314453125, "learning_rate": 2e-06, "loss": 0.0009, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003197674418604651, "grad_norm": 0.016867539615718644, "kl": 0.3271484375, "learning_rate": 2e-06, "loss": 0.001, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6944.0, "completions/max_terminated_length": 6944.0, "completions/mean_length": 2774.265625, "completions/mean_terminated_length": 2774.265625, "completions/min_length": 1204.0, "completions/min_terminated_length": 1204.0, "epoch": 0.0032218992248062017, "grad_norm": 0.022617229528507702, "kl": 0.26953125, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 17249404.0, "reward": 0.20413580536842346, "reward_std": 0.05481432378292084, "rewards/avg_thinking_length_func": 129.03866577148438, "rewards/confidence_score_reward_func": 0.49319422245025635, "rewards/correct_answer_reward_func": 0.34375, "rewards/efficient_thinking_reward_func": 0.7432039407243382, "rewards/format_and_efficient_reward_func": 0.17171993851661682, "rewards/format_reward_func": 0.9746097326278687, "rewards/num_xml_reward_func": 0.8615504503250122, "rewards/tool_execution_reward_func": 1.9303656816482544, "rewards/visit_tool_reward_func": 0.9013795852661133, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003246124031007752, "grad_norm": 0.020143739711233816, "kl": 0.252685546875, "learning_rate": 2e-06, "loss": 0.0002, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0032703488372093025, "grad_norm": 0.01785809415589292, "kl": 0.227294921875, "learning_rate": 2e-06, "loss": 0.0001, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0032945736434108527, "grad_norm": 0.015380281270199666, "kl": 0.199462890625, "learning_rate": 2e-06, "loss": 0.0001, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7166.0, "completions/max_terminated_length": 7166.0, "completions/mean_length": 3239.453125, "completions/mean_terminated_length": 3239.453125, "completions/min_length": 1458.0, "completions/min_terminated_length": 1458.0, "epoch": 0.0033187984496124033, "grad_norm": 0.012800365899215092, "kl": 0.138427734375, "learning_rate": 2e-06, "loss": 0.0004, "num_tokens": 17686794.0, "reward": 0.3108579218387604, "reward_std": 0.13888844847679138, "rewards/avg_thinking_length_func": 171.369384765625, "rewards/confidence_score_reward_func": 0.5435695648193359, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.802592893497664, "rewards/format_and_efficient_reward_func": 0.2916308343410492, "rewards/format_reward_func": 0.9913173913955688, "rewards/num_xml_reward_func": 1.4043910503387451, "rewards/tool_execution_reward_func": 1.8357443809509277, "rewards/visit_tool_reward_func": 0.8753163814544678, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0033430232558139534, "grad_norm": 0.014285839855776115, "kl": 0.1318359375, "learning_rate": 2e-06, "loss": 0.0004, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003367248062015504, "grad_norm": 0.015433812962718682, "kl": 0.128173828125, "learning_rate": 2e-06, "loss": 0.0004, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003391472868217054, "grad_norm": 0.015720560114809618, "kl": 0.1229248046875, "learning_rate": 2e-06, "loss": 0.0004, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5424.0, "completions/max_terminated_length": 5424.0, "completions/mean_length": 3209.875, "completions/mean_terminated_length": 3209.875, "completions/min_length": 1301.0, "completions/min_terminated_length": 1301.0, "epoch": 0.003415697674418605, "grad_norm": 0.009160832793565006, "kl": 0.089599609375, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 18156998.0, "reward": 0.2771710753440857, "reward_std": 0.10209451615810394, "rewards/avg_thinking_length_func": 144.3570556640625, "rewards/confidence_score_reward_func": 0.5883906483650208, "rewards/correct_answer_reward_func": 0.421875, "rewards/efficient_thinking_reward_func": 0.9227171305298694, "rewards/format_and_efficient_reward_func": 0.303905189037323, "rewards/format_reward_func": 0.9965387582778931, "rewards/num_xml_reward_func": 1.6496015787124634, "rewards/tool_execution_reward_func": 1.9101753234863281, "rewards/visit_tool_reward_func": 1.0097795724868774, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003439922480620155, "grad_norm": 0.009348804877622782, "kl": 0.0853271484375, "learning_rate": 2e-06, "loss": 0.0002, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0034641472868217056, "grad_norm": 0.009332442022472659, "kl": 0.080322265625, "learning_rate": 2e-06, "loss": 0.0002, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0034883720930232558, "grad_norm": 0.009512893821144673, "kl": 0.0767822265625, "learning_rate": 2e-06, "loss": 0.0002, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7366.0, "completions/max_terminated_length": 7366.0, "completions/mean_length": 4435.890625, "completions/mean_terminated_length": 4435.890625, "completions/min_length": 1397.0, "completions/min_terminated_length": 1397.0, "epoch": 0.0035125968992248064, "grad_norm": 0.013329760690267301, "kl": 0.055419921875, "learning_rate": 2e-06, "loss": -0.0002, "num_tokens": 18712707.0, "reward": 0.4343380331993103, "reward_std": 0.1319217085838318, "rewards/avg_thinking_length_func": 213.60223388671875, "rewards/confidence_score_reward_func": 0.6497268080711365, "rewards/correct_answer_reward_func": 0.625, "rewards/efficient_thinking_reward_func": 0.8139017177985812, "rewards/format_and_efficient_reward_func": 0.4802235960960388, "rewards/format_reward_func": 0.9989955425262451, "rewards/num_xml_reward_func": 1.751387119293213, "rewards/tool_execution_reward_func": 1.9038957357406616, "rewards/visit_tool_reward_func": 0.9324563145637512, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0035368217054263565, "grad_norm": 0.013975012703647748, "kl": 0.0540771484375, "learning_rate": 2e-06, "loss": -0.0002, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003561046511627907, "grad_norm": 0.014076489547319788, "kl": 0.0531005859375, "learning_rate": 2e-06, "loss": -0.0002, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0035852713178294573, "grad_norm": 0.014165636449546886, "kl": 0.0531005859375, "learning_rate": 2e-06, "loss": -0.0002, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5721.0, "completions/max_terminated_length": 5721.0, "completions/mean_length": 3476.890625, "completions/mean_terminated_length": 3476.890625, "completions/min_length": 1375.0, "completions/min_terminated_length": 1375.0, "epoch": 0.003609496124031008, "grad_norm": 0.007532410662647794, "kl": 0.06439208984375, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 19224629.0, "reward": 0.3097182512283325, "reward_std": 0.06608685851097107, "rewards/avg_thinking_length_func": 155.74346923828125, "rewards/confidence_score_reward_func": 0.6070291996002197, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.9227627606272979, "rewards/format_and_efficient_reward_func": 0.3381012976169586, "rewards/format_reward_func": 0.9996874928474426, "rewards/num_xml_reward_func": 1.6836090087890625, "rewards/tool_execution_reward_func": 1.8510758876800537, "rewards/visit_tool_reward_func": 0.8944061994552612, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003633720930232558, "grad_norm": 0.007379430347015788, "kl": 0.0645751953125, "learning_rate": 2e-06, "loss": 0.0001, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0036579457364341087, "grad_norm": 0.008138518366845196, "kl": 0.0657958984375, "learning_rate": 2e-06, "loss": 0.0001, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003682170542635659, "grad_norm": 0.008284296957527382, "kl": 0.0673828125, "learning_rate": 2e-06, "loss": 0.0001, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9618.0, "completions/max_terminated_length": 9618.0, "completions/mean_length": 4875.078125, "completions/mean_terminated_length": 4875.078125, "completions/min_length": 1847.0, "completions/min_terminated_length": 1847.0, "epoch": 0.0037063953488372095, "grad_norm": 0.014704297852595168, "kl": 0.05523681640625, "learning_rate": 2e-06, "loss": 0.0015, "num_tokens": 19820623.0, "reward": 0.428906112909317, "reward_std": 0.16942133009433746, "rewards/avg_thinking_length_func": 210.6763916015625, "rewards/confidence_score_reward_func": 0.6548709869384766, "rewards/correct_answer_reward_func": 0.609375, "rewards/efficient_thinking_reward_func": 0.7212743512877299, "rewards/format_and_efficient_reward_func": 0.4301028251647949, "rewards/format_reward_func": 0.9975892305374146, "rewards/num_xml_reward_func": 1.4759665727615356, "rewards/tool_execution_reward_func": 1.8980989456176758, "rewards/visit_tool_reward_func": 0.9375091791152954, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0037306201550387596, "grad_norm": 0.015023473705486283, "kl": 0.0567626953125, "learning_rate": 2e-06, "loss": 0.0015, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0037548449612403102, "grad_norm": 0.015217500076281755, "kl": 0.05841064453125, "learning_rate": 2e-06, "loss": 0.0015, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0037790697674418604, "grad_norm": 0.016114636489248848, "kl": 0.0614013671875, "learning_rate": 2e-06, "loss": 0.0015, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6823.0, "completions/max_terminated_length": 6823.0, "completions/mean_length": 4384.296875, "completions/mean_terminated_length": 4384.296875, "completions/min_length": 1697.0, "completions/min_terminated_length": 1697.0, "epoch": 0.003803294573643411, "grad_norm": 0.006588369322686691, "kl": 0.0643310546875, "learning_rate": 2e-06, "loss": 0.0005, "num_tokens": 20425222.0, "reward": 0.34698012471199036, "reward_std": 0.03517330437898636, "rewards/avg_thinking_length_func": 178.83392333984375, "rewards/confidence_score_reward_func": 0.6313294172286987, "rewards/correct_answer_reward_func": 0.484375, "rewards/efficient_thinking_reward_func": 0.8650427095882729, "rewards/format_and_efficient_reward_func": 0.37807154655456543, "rewards/format_reward_func": 0.9995312690734863, "rewards/num_xml_reward_func": 1.323744297027588, "rewards/tool_execution_reward_func": 1.96144700050354, "rewards/visit_tool_reward_func": 0.9631377458572388, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003827519379844961, "grad_norm": 0.006972139333963718, "kl": 0.0670166015625, "learning_rate": 2e-06, "loss": 0.0005, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003851744186046512, "grad_norm": 0.0071318562836598836, "kl": 0.06884765625, "learning_rate": 2e-06, "loss": 0.0005, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003875968992248062, "grad_norm": 0.007113091376284595, "kl": 0.06982421875, "learning_rate": 2e-06, "loss": 0.0005, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11894.0, "completions/max_terminated_length": 11894.0, "completions/mean_length": 5685.0625, "completions/mean_terminated_length": 5685.0625, "completions/min_length": 1886.0, "completions/min_terminated_length": 1886.0, "epoch": 0.0039001937984496126, "grad_norm": 0.01558937344658329, "kl": 0.06414794921875, "learning_rate": 2e-06, "loss": 0.0018, "num_tokens": 21086786.0, "reward": 0.4025996923446655, "reward_std": 0.13449470698833466, "rewards/avg_thinking_length_func": 254.32508850097656, "rewards/confidence_score_reward_func": 0.6495309472084045, "rewards/correct_answer_reward_func": 0.578125, "rewards/efficient_thinking_reward_func": 0.6637161596148502, "rewards/format_and_efficient_reward_func": 0.458422988653183, "rewards/format_reward_func": 0.9998437166213989, "rewards/num_xml_reward_func": 1.5073208808898926, "rewards/tool_execution_reward_func": 1.9572367668151855, "rewards/visit_tool_reward_func": 0.9573923349380493, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003924418604651163, "grad_norm": 0.016638056430155885, "kl": 0.064453125, "learning_rate": 2e-06, "loss": 0.0018, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.003948643410852713, "grad_norm": 0.01813854752521658, "kl": 0.06536865234375, "learning_rate": 2e-06, "loss": 0.0018, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0039728682170542635, "grad_norm": 0.01938490985845502, "kl": 0.06988525390625, "learning_rate": 2e-06, "loss": 0.0019, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9060.0, "completions/max_terminated_length": 9060.0, "completions/mean_length": 4403.21875, "completions/mean_terminated_length": 4403.21875, "completions/min_length": 1390.0, "completions/min_terminated_length": 1390.0, "epoch": 0.003997093023255814, "grad_norm": 0.005449273513524992, "kl": 0.0662841796875, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 21662894.0, "reward": 0.35001087188720703, "reward_std": 0.009927155449986458, "rewards/avg_thinking_length_func": 188.15765380859375, "rewards/confidence_score_reward_func": 0.6182008981704712, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 0.8001981107519069, "rewards/format_and_efficient_reward_func": 0.36673688888549805, "rewards/format_reward_func": 0.9998437166213989, "rewards/num_xml_reward_func": 1.4394086599349976, "rewards/tool_execution_reward_func": 1.993227481842041, "rewards/visit_tool_reward_func": 0.936252236366272, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004021317829457365, "grad_norm": 0.00568312787735846, "kl": 0.068115234375, "learning_rate": 2e-06, "loss": 0.0001, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0040455426356589145, "grad_norm": 0.005806971085578714, "kl": 0.069580078125, "learning_rate": 2e-06, "loss": 0.0001, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004069767441860465, "grad_norm": 0.00592190722180043, "kl": 0.070556640625, "learning_rate": 2e-06, "loss": 0.0001, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9992.0, "completions/max_terminated_length": 9992.0, "completions/mean_length": 5377.4375, "completions/mean_terminated_length": 5377.4375, "completions/min_length": 1809.0, "completions/min_terminated_length": 1809.0, "epoch": 0.004093992248062016, "grad_norm": 0.359099649617951, "kl": 0.1207275390625, "learning_rate": 2e-06, "loss": 0.0029, "num_tokens": 22286431.0, "reward": 0.40037134289741516, "reward_std": 0.12838459014892578, "rewards/avg_thinking_length_func": 245.9459228515625, "rewards/confidence_score_reward_func": 0.6141020059585571, "rewards/correct_answer_reward_func": 0.609375, "rewards/efficient_thinking_reward_func": 0.6361426555187852, "rewards/format_and_efficient_reward_func": 0.45017051696777344, "rewards/format_reward_func": 0.9981250166893005, "rewards/num_xml_reward_func": 1.532149076461792, "rewards/tool_execution_reward_func": 1.9983552694320679, "rewards/visit_tool_reward_func": 0.9713033437728882, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004118217054263566, "grad_norm": 0.0312847460920415, "kl": 0.08642578125, "learning_rate": 2e-06, "loss": 0.0028, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004142441860465116, "grad_norm": 0.5587996108011728, "kl": 0.2386474609375, "learning_rate": 2e-06, "loss": 0.003, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004166666666666667, "grad_norm": 0.03228792794627183, "kl": 0.092529296875, "learning_rate": 2e-06, "loss": 0.0028, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7094.0, "completions/max_terminated_length": 7094.0, "completions/mean_length": 4163.640625, "completions/mean_terminated_length": 4163.640625, "completions/min_length": 1385.0, "completions/min_terminated_length": 1385.0, "epoch": 0.004190891472868217, "grad_norm": 0.008141555436627606, "kl": 0.1014404296875, "learning_rate": 2e-06, "loss": 0.0007, "num_tokens": 22851695.0, "reward": 0.31291523575782776, "reward_std": 0.0387241393327713, "rewards/avg_thinking_length_func": 150.9978485107422, "rewards/confidence_score_reward_func": 0.5685818195343018, "rewards/correct_answer_reward_func": 0.46875, "rewards/efficient_thinking_reward_func": 0.8065696148258371, "rewards/format_and_efficient_reward_func": 0.30031993985176086, "rewards/format_reward_func": 0.9996874928474426, "rewards/num_xml_reward_func": 1.2274867296218872, "rewards/tool_execution_reward_func": 1.9928336143493652, "rewards/visit_tool_reward_func": 0.9787203073501587, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004215116279069768, "grad_norm": 0.008733677069632446, "kl": 0.1131591796875, "learning_rate": 2e-06, "loss": 0.0007, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0042393410852713176, "grad_norm": 0.009638540295346257, "kl": 0.12744140625, "learning_rate": 2e-06, "loss": 0.0007, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004263565891472868, "grad_norm": 0.010992556993855552, "kl": 0.142822265625, "learning_rate": 2e-06, "loss": 0.0007, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8227.0, "completions/max_terminated_length": 8227.0, "completions/mean_length": 4541.953125, "completions/mean_terminated_length": 4541.953125, "completions/min_length": 1507.0, "completions/min_terminated_length": 1507.0, "epoch": 0.004287790697674419, "grad_norm": 0.1409188461026278, "kl": 0.265625, "learning_rate": 2e-06, "loss": 0.0037, "num_tokens": 23436250.0, "reward": 0.3243735730648041, "reward_std": 0.15356436371803284, "rewards/avg_thinking_length_func": 171.99826049804688, "rewards/confidence_score_reward_func": 0.5453901290893555, "rewards/correct_answer_reward_func": 0.53125, "rewards/efficient_thinking_reward_func": 0.6924963364887087, "rewards/format_and_efficient_reward_func": 0.3312879800796509, "rewards/format_reward_func": 0.998577356338501, "rewards/num_xml_reward_func": 1.3812510967254639, "rewards/tool_execution_reward_func": 1.9967105388641357, "rewards/visit_tool_reward_func": 0.9554424285888672, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004312015503875969, "grad_norm": 0.05228415250398885, "kl": 0.201904296875, "learning_rate": 2e-06, "loss": 0.0037, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004336240310077519, "grad_norm": 0.060068767522700996, "kl": 0.2451171875, "learning_rate": 2e-06, "loss": 0.0037, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00436046511627907, "grad_norm": 0.2730620784971272, "kl": 0.4716796875, "learning_rate": 2e-06, "loss": 0.0041, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8120.0, "completions/max_terminated_length": 8120.0, "completions/mean_length": 3361.40625, "completions/mean_terminated_length": 3361.40625, "completions/min_length": 1075.0, "completions/min_terminated_length": 1075.0, "epoch": 0.00438468992248062, "grad_norm": 0.05853393969832367, "kl": 0.5302734375, "learning_rate": 2e-06, "loss": 0.0007, "num_tokens": 23987107.0, "reward": 0.24436859786510468, "reward_std": 0.04949303716421127, "rewards/avg_thinking_length_func": 81.72256469726562, "rewards/confidence_score_reward_func": 0.45580577850341797, "rewards/correct_answer_reward_func": 0.453125, "rewards/efficient_thinking_reward_func": 0.573834842856046, "rewards/format_and_efficient_reward_func": 0.22879377007484436, "rewards/format_reward_func": 0.995830774307251, "rewards/num_xml_reward_func": 1.104771614074707, "rewards/tool_execution_reward_func": 1.9899488687515259, "rewards/visit_tool_reward_func": 0.8998211622238159, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004408914728682171, "grad_norm": 0.07516497327438276, "kl": 0.6845703125, "learning_rate": 2e-06, "loss": 0.0009, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004433139534883721, "grad_norm": 0.05997132496622212, "kl": 0.626953125, "learning_rate": 2e-06, "loss": 0.0008, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004457364341085271, "grad_norm": 0.037671767248184135, "kl": 0.48681640625, "learning_rate": 2e-06, "loss": 0.0007, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8178.0, "completions/max_terminated_length": 8178.0, "completions/mean_length": 3659.640625, "completions/mean_terminated_length": 3659.640625, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.004481589147286822, "grad_norm": 0.016069114631093232, "kl": 0.34326171875, "learning_rate": 2e-06, "loss": 0.0009, "num_tokens": 24496297.0, "reward": 0.2305455505847931, "reward_std": 0.06948232650756836, "rewards/avg_thinking_length_func": 111.37628936767578, "rewards/confidence_score_reward_func": 0.37327370047569275, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.48277143466617184, "rewards/format_and_efficient_reward_func": 0.1522754281759262, "rewards/format_reward_func": 0.9647905230522156, "rewards/num_xml_reward_func": 0.8915370106697083, "rewards/tool_execution_reward_func": 1.9581143856048584, "rewards/visit_tool_reward_func": 0.5689894556999207, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0045058139534883725, "grad_norm": 0.014918137972398021, "kl": 0.31640625, "learning_rate": 2e-06, "loss": 0.0008, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004530038759689922, "grad_norm": 0.014560290660972823, "kl": 0.2958984375, "learning_rate": 2e-06, "loss": 0.0008, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004554263565891473, "grad_norm": 0.014191965162457063, "kl": 0.27880859375, "learning_rate": 2e-06, "loss": 0.0008, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11027.0, "completions/max_terminated_length": 11027.0, "completions/mean_length": 2904.71875, "completions/mean_terminated_length": 2904.71875, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "epoch": 0.004578488372093023, "grad_norm": 0.02465674538761865, "kl": 0.27099609375, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 24960803.0, "reward": 0.22261814773082733, "reward_std": 0.04196429252624512, "rewards/avg_thinking_length_func": 79.28602600097656, "rewards/confidence_score_reward_func": 0.40539172291755676, "rewards/correct_answer_reward_func": 0.46875, "rewards/efficient_thinking_reward_func": 0.4911669222941917, "rewards/format_and_efficient_reward_func": 0.14570605754852295, "rewards/format_reward_func": 0.9741340279579163, "rewards/num_xml_reward_func": 0.884125292301178, "rewards/tool_execution_reward_func": 1.9560561180114746, "rewards/visit_tool_reward_func": 0.7019689083099365, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004602713178294574, "grad_norm": 0.01002216998448175, "kl": 0.24951171875, "learning_rate": 2e-06, "loss": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004626937984496124, "grad_norm": 0.009283017573166963, "kl": 0.234619140625, "learning_rate": 2e-06, "loss": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004651162790697674, "grad_norm": 0.00871351171533654, "kl": 0.221435546875, "learning_rate": 2e-06, "loss": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6760.0, "completions/max_terminated_length": 6760.0, "completions/mean_length": 3175.625, "completions/mean_terminated_length": 3175.625, "completions/min_length": 1279.0, "completions/min_terminated_length": 1279.0, "epoch": 0.004675387596899225, "grad_norm": 0.01864801542206714, "kl": 0.200927734375, "learning_rate": 2e-06, "loss": 0.0013, "num_tokens": 25421310.0, "reward": 0.3337632417678833, "reward_std": 0.1033831387758255, "rewards/avg_thinking_length_func": 144.4852752685547, "rewards/confidence_score_reward_func": 0.5157345533370972, "rewards/correct_answer_reward_func": 0.609375, "rewards/efficient_thinking_reward_func": 0.6954727584239813, "rewards/format_and_efficient_reward_func": 0.2803717255592346, "rewards/format_reward_func": 0.9838045835494995, "rewards/num_xml_reward_func": 1.244771957397461, "rewards/tool_execution_reward_func": 1.9927083253860474, "rewards/visit_tool_reward_func": 0.8324298858642578, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0046996124031007756, "grad_norm": 0.018411722840556213, "kl": 0.193603515625, "learning_rate": 2e-06, "loss": 0.0013, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004723837209302325, "grad_norm": 0.018380172856358755, "kl": 0.189208984375, "learning_rate": 2e-06, "loss": 0.0013, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004748062015503876, "grad_norm": 0.018655645496485265, "kl": 0.1875, "learning_rate": 2e-06, "loss": 0.0013, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5256.0, "completions/max_terminated_length": 5256.0, "completions/mean_length": 2635.984375, "completions/mean_terminated_length": 2635.984375, "completions/min_length": 1134.0, "completions/min_terminated_length": 1134.0, "epoch": 0.0047722868217054265, "grad_norm": 0.004219005229441154, "kl": 0.1275634765625, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 25833510.0, "reward": 0.30341458320617676, "reward_std": 0.014322971925139427, "rewards/avg_thinking_length_func": 147.2517852783203, "rewards/confidence_score_reward_func": 0.5635701417922974, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 0.8586018615751865, "rewards/format_and_efficient_reward_func": 0.28311923146247864, "rewards/format_reward_func": 0.9866694808006287, "rewards/num_xml_reward_func": 1.2634769678115845, "rewards/tool_execution_reward_func": 1.9635450839996338, "rewards/visit_tool_reward_func": 0.80121248960495, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004796511627906977, "grad_norm": 0.004672728095639017, "kl": 0.130615234375, "learning_rate": 2e-06, "loss": 0.0002, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004820736434108527, "grad_norm": 0.004950768699918263, "kl": 0.1329345703125, "learning_rate": 2e-06, "loss": 0.0002, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0048449612403100775, "grad_norm": 0.005160418640186133, "kl": 0.1343994140625, "learning_rate": 2e-06, "loss": 0.0002, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5779.0, "completions/max_terminated_length": 5779.0, "completions/mean_length": 2860.046875, "completions/mean_terminated_length": 2860.046875, "completions/min_length": 1125.0, "completions/min_terminated_length": 1125.0, "epoch": 0.004869186046511628, "grad_norm": 0.008829648064201757, "kl": 0.0411376953125, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 26237865.0, "reward": 0.45401930809020996, "reward_std": 0.09410357475280762, "rewards/avg_thinking_length_func": 182.534423828125, "rewards/confidence_score_reward_func": 0.5977352857589722, "rewards/correct_answer_reward_func": 0.734375, "rewards/efficient_thinking_reward_func": 0.784292215730239, "rewards/format_and_efficient_reward_func": 0.41676729917526245, "rewards/format_reward_func": 0.9937513470649719, "rewards/num_xml_reward_func": 1.5355236530303955, "rewards/tool_execution_reward_func": 1.9931985139846802, "rewards/visit_tool_reward_func": 0.8612196445465088, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004893410852713179, "grad_norm": 0.009196641903985264, "kl": 0.0400390625, "learning_rate": 2e-06, "loss": 0.0002, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004917635658914728, "grad_norm": 0.009490032359266305, "kl": 0.038818359375, "learning_rate": 2e-06, "loss": 0.0002, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.004941860465116279, "grad_norm": 0.009682454113754367, "kl": 0.03753662109375, "learning_rate": 2e-06, "loss": 0.0002, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6331.0, "completions/max_terminated_length": 6331.0, "completions/mean_length": 2659.453125, "completions/mean_terminated_length": 2659.453125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 0.00496608527131783, "grad_norm": 0.0037363827479903167, "kl": 0.03375244140625, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 26646632.0, "reward": 0.3066054582595825, "reward_std": 0.03825566917657852, "rewards/avg_thinking_length_func": 136.1707763671875, "rewards/confidence_score_reward_func": 0.5777994990348816, "rewards/correct_answer_reward_func": 0.484375, "rewards/efficient_thinking_reward_func": 0.786608708417682, "rewards/format_and_efficient_reward_func": 0.3019195795059204, "rewards/format_reward_func": 0.9903415441513062, "rewards/num_xml_reward_func": 1.3805111646652222, "rewards/tool_execution_reward_func": 1.9650006294250488, "rewards/visit_tool_reward_func": 0.8477368354797363, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00499031007751938, "grad_norm": 0.0037822209816054495, "kl": 0.03350830078125, "learning_rate": 2e-06, "loss": 0.0002, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00501453488372093, "grad_norm": 0.0038040246120938713, "kl": 0.033477783203125, "learning_rate": 2e-06, "loss": 0.0002, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0050387596899224806, "grad_norm": 0.0038540122892837783, "kl": 0.03350830078125, "learning_rate": 2e-06, "loss": 0.0002, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5045.0, "completions/max_terminated_length": 5045.0, "completions/mean_length": 2763.5, "completions/mean_terminated_length": 2763.5, "completions/min_length": 1119.0, "completions/min_terminated_length": 1119.0, "epoch": 0.005062984496124031, "grad_norm": 0.00548683325475162, "kl": 0.035003662109375, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 27041016.0, "reward": 0.45102459192276, "reward_std": 0.06410035490989685, "rewards/avg_thinking_length_func": 186.88746643066406, "rewards/confidence_score_reward_func": 0.6191459894180298, "rewards/correct_answer_reward_func": 0.6875, "rewards/efficient_thinking_reward_func": 0.8100582820862734, "rewards/format_and_efficient_reward_func": 0.44868165254592896, "rewards/format_reward_func": 0.9952791929244995, "rewards/num_xml_reward_func": 1.649810552597046, "rewards/tool_execution_reward_func": 1.9959295988082886, "rewards/visit_tool_reward_func": 0.8671329021453857, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005087209302325582, "grad_norm": 0.005414160917662644, "kl": 0.03399658203125, "learning_rate": 2e-06, "loss": 0.0002, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0051114341085271315, "grad_norm": 0.005397000227956369, "kl": 0.033294677734375, "learning_rate": 2e-06, "loss": 0.0002, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005135658914728682, "grad_norm": 0.005329822482164869, "kl": 0.03271484375, "learning_rate": 2e-06, "loss": 0.0002, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5622.0, "completions/max_terminated_length": 5622.0, "completions/mean_length": 2689.5, "completions/mean_terminated_length": 2689.5, "completions/min_length": 1080.0, "completions/min_terminated_length": 1080.0, "epoch": 0.005159883720930233, "grad_norm": 0.004583885118409577, "kl": 0.027679443359375, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 27444785.0, "reward": 0.3377038240432739, "reward_std": 0.03283514827489853, "rewards/avg_thinking_length_func": 156.95558166503906, "rewards/confidence_score_reward_func": 0.6069622039794922, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.8533607950008524, "rewards/format_and_efficient_reward_func": 0.3490750193595886, "rewards/format_reward_func": 0.9963964819908142, "rewards/num_xml_reward_func": 1.565781831741333, "rewards/tool_execution_reward_func": 1.9799107313156128, "rewards/visit_tool_reward_func": 0.886849582195282, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005184108527131783, "grad_norm": 0.004553415372891503, "kl": 0.02728271484375, "learning_rate": 2e-06, "loss": 0.0001, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005208333333333333, "grad_norm": 0.004416753047475649, "kl": 0.026763916015625, "learning_rate": 2e-06, "loss": 0.0001, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005232558139534884, "grad_norm": 0.004302097167180992, "kl": 0.02606201171875, "learning_rate": 2e-06, "loss": 0.0001, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6509.0, "completions/max_terminated_length": 6509.0, "completions/mean_length": 3170.359375, "completions/mean_terminated_length": 3170.359375, "completions/min_length": 1316.0, "completions/min_terminated_length": 1316.0, "epoch": 0.005256782945736434, "grad_norm": 0.00874079090702132, "kl": 0.03131103515625, "learning_rate": 2e-06, "loss": 0.0002, "num_tokens": 27886438.0, "reward": 0.4546785354614258, "reward_std": 0.13061311841011047, "rewards/avg_thinking_length_func": 185.58987426757812, "rewards/confidence_score_reward_func": 0.6329280138015747, "rewards/correct_answer_reward_func": 0.671875, "rewards/efficient_thinking_reward_func": 0.7895888587130873, "rewards/format_and_efficient_reward_func": 0.43139761686325073, "rewards/format_reward_func": 0.9971143007278442, "rewards/num_xml_reward_func": 1.6065764427185059, "rewards/tool_execution_reward_func": 1.9975961446762085, "rewards/visit_tool_reward_func": 0.8967168927192688, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005281007751937985, "grad_norm": 0.009254919184464793, "kl": 0.031158447265625, "learning_rate": 2e-06, "loss": 0.0002, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005305232558139535, "grad_norm": 0.008540278295280325, "kl": 0.03131103515625, "learning_rate": 2e-06, "loss": 0.0002, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005329457364341085, "grad_norm": 0.009027249196409619, "kl": 0.031463623046875, "learning_rate": 2e-06, "loss": 0.0002, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6125.0, "completions/max_terminated_length": 6125.0, "completions/mean_length": 2700.765625, "completions/mean_terminated_length": 2700.765625, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 0.005353682170542636, "grad_norm": 0.001653975947803042, "kl": 0.0260009765625, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 28309543.0, "reward": 0.3285777270793915, "reward_std": 0.013459177687764168, "rewards/avg_thinking_length_func": 149.52700805664062, "rewards/confidence_score_reward_func": 0.6043996214866638, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 0.8903335916310755, "rewards/format_and_efficient_reward_func": 0.3600352108478546, "rewards/format_reward_func": 0.996889591217041, "rewards/num_xml_reward_func": 1.5710426568984985, "rewards/tool_execution_reward_func": 1.9776184558868408, "rewards/visit_tool_reward_func": 0.9032177925109863, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005377906976744186, "grad_norm": 0.001652035863632615, "kl": 0.0264892578125, "learning_rate": 2e-06, "loss": 0.0001, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005402131782945736, "grad_norm": 0.0016513159446787636, "kl": 0.0269775390625, "learning_rate": 2e-06, "loss": 0.0001, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005426356589147287, "grad_norm": 0.0020335905228311916, "kl": 0.027557373046875, "learning_rate": 2e-06, "loss": 0.0001, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4748.0, "completions/max_terminated_length": 4748.0, "completions/mean_length": 2978.90625, "completions/mean_terminated_length": 2978.90625, "completions/min_length": 1244.0, "completions/min_terminated_length": 1244.0, "epoch": 0.005450581395348837, "grad_norm": 0.006026935047182901, "kl": 0.03179931640625, "learning_rate": 2e-06, "loss": 0.0003, "num_tokens": 28740848.0, "reward": 0.4945339560508728, "reward_std": 0.0744490772485733, "rewards/avg_thinking_length_func": 172.45849609375, "rewards/confidence_score_reward_func": 0.6167193651199341, "rewards/correct_answer_reward_func": 0.765625, "rewards/efficient_thinking_reward_func": 0.7966197226027097, "rewards/format_and_efficient_reward_func": 0.512791097164154, "rewards/format_reward_func": 0.9983228445053101, "rewards/num_xml_reward_func": 1.6630462408065796, "rewards/tool_execution_reward_func": 1.9971591234207153, "rewards/visit_tool_reward_func": 0.9000678062438965, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005474806201550388, "grad_norm": 0.005801070538806677, "kl": 0.0323486328125, "learning_rate": 2e-06, "loss": 0.0003, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005499031007751938, "grad_norm": 0.005789539677805553, "kl": 0.03302001953125, "learning_rate": 2e-06, "loss": 0.0003, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005523255813953488, "grad_norm": 0.005731300295942885, "kl": 0.033935546875, "learning_rate": 2e-06, "loss": 0.0003, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5315.0, "completions/max_terminated_length": 5315.0, "completions/mean_length": 2718.109375, "completions/mean_terminated_length": 2718.109375, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "epoch": 0.005547480620155039, "grad_norm": 0.0027604900450052977, "kl": 0.03369140625, "learning_rate": 2e-06, "loss": 0.0, "num_tokens": 29177563.0, "reward": 0.33125773072242737, "reward_std": 0.012095385231077671, "rewards/avg_thinking_length_func": 138.28082275390625, "rewards/confidence_score_reward_func": 0.588701605796814, "rewards/correct_answer_reward_func": 0.5, "rewards/efficient_thinking_reward_func": 0.8968424695250805, "rewards/format_and_efficient_reward_func": 0.36526361107826233, "rewards/format_reward_func": 0.9942506551742554, "rewards/num_xml_reward_func": 1.484344720840454, "rewards/tool_execution_reward_func": 1.9658281803131104, "rewards/visit_tool_reward_func": 0.9050877094268799, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0055717054263565895, "grad_norm": 0.0028469369049688264, "kl": 0.03424072265625, "learning_rate": 2e-06, "loss": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005595930232558139, "grad_norm": 0.0029207200987881226, "kl": 0.03466796875, "learning_rate": 2e-06, "loss": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00562015503875969, "grad_norm": 0.002891989345093088, "kl": 0.03436279296875, "learning_rate": 2e-06, "loss": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5430.0, "completions/max_terminated_length": 5430.0, "completions/mean_length": 3147.21875, "completions/mean_terminated_length": 3147.21875, "completions/min_length": 1208.0, "completions/min_terminated_length": 1208.0, "epoch": 0.0056443798449612404, "grad_norm": 0.008009912903442006, "kl": 0.039306640625, "learning_rate": 2e-06, "loss": 0.0004, "num_tokens": 29641355.0, "reward": 0.45486128330230713, "reward_std": 0.10010581463575363, "rewards/avg_thinking_length_func": 154.7548828125, "rewards/confidence_score_reward_func": 0.5910084247589111, "rewards/correct_answer_reward_func": 0.71875, "rewards/efficient_thinking_reward_func": 0.79141897353926, "rewards/format_and_efficient_reward_func": 0.4532102346420288, "rewards/format_reward_func": 0.9973268508911133, "rewards/num_xml_reward_func": 1.6137380599975586, "rewards/tool_execution_reward_func": 1.9840686321258545, "rewards/visit_tool_reward_func": 0.9216470718383789, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005668604651162791, "grad_norm": 0.008010434434161435, "kl": 0.03924560546875, "learning_rate": 2e-06, "loss": 0.0004, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005692829457364341, "grad_norm": 0.008059617739522514, "kl": 0.03936767578125, "learning_rate": 2e-06, "loss": 0.0004, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005717054263565891, "grad_norm": 0.008321692756210844, "kl": 0.0400390625, "learning_rate": 2e-06, "loss": 0.0004, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5409.0, "completions/max_terminated_length": 5409.0, "completions/mean_length": 2747.0, "completions/mean_terminated_length": 2747.0, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "epoch": 0.005741279069767442, "grad_norm": 0.00607005113841099, "kl": 0.0380859375, "learning_rate": 2e-06, "loss": 0.0001, "num_tokens": 30090756.0, "reward": 0.3245881199836731, "reward_std": 0.030338726937770844, "rewards/avg_thinking_length_func": 118.96601867675781, "rewards/confidence_score_reward_func": 0.5715887546539307, "rewards/correct_answer_reward_func": 0.515625, "rewards/efficient_thinking_reward_func": 0.7931376609790313, "rewards/format_and_efficient_reward_func": 0.3051683306694031, "rewards/format_reward_func": 0.9918498396873474, "rewards/num_xml_reward_func": 1.335392713546753, "rewards/tool_execution_reward_func": 1.956681728363037, "rewards/visit_tool_reward_func": 0.8923399448394775, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005765503875968993, "grad_norm": 0.006076971580972504, "kl": 0.03839111328125, "learning_rate": 2e-06, "loss": 0.0001, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005789728682170542, "grad_norm": 0.005795692009836339, "kl": 0.0380859375, "learning_rate": 2e-06, "loss": 0.0001, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.005813953488372093, "grad_norm": 0.005478655391819232, "kl": 0.0377197265625, "learning_rate": 2e-06, "loss": 0.0001, "step": 240 } ], "logging_steps": 1, "max_steps": 640, "num_input_tokens_seen": 30090756, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }