diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8049 @@ +{ + "best_metric": 0.34333334282040595, + "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/qwen2.5vl-7b-grpo_new_v20_5k/v13-20250325-021847/checkpoint-2475", + "epoch": 1.0, + "eval_steps": 250, + "global_step": 2475, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 359.125, + "epoch": 0.00040404040404040404, + "grad_norm": 1.364031546421686, + "kl": 0.0, + "learning_rate": 1.6129032258064515e-09, + "loss": -0.0474996417760849, + "memory(GiB)": 81.93, + "response_clip_ratio": 0.0, + "reward": 0.2083333432674408, + "reward_std": 0.25746434926986694, + "rewards/MultiModalAccuracyORM": 0.2083333432674408, + "step": 1, + "train_speed(iter/s)": 0.005983 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.95833945274353, + "epoch": 0.00202020202020202, + "grad_norm": 1.6130071483346196, + "kl": 0.00015279650688171387, + "learning_rate": 8.064516129032257e-09, + "loss": -0.0010303221642971039, + "memory(GiB)": 86.73, + "response_clip_ratio": 0.0, + "reward": 0.052083334885537624, + "reward_std": 0.13339675217866898, + "rewards/MultiModalAccuracyORM": 0.052083334885537624, + "step": 5, + "train_speed(iter/s)": 0.019266 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.46667594909667, + "epoch": 0.00404040404040404, + "grad_norm": 1.760454082663187, + "kl": 0.000270843505859375, + "learning_rate": 1.6129032258064514e-08, + "loss": 0.005405974388122558, + "memory(GiB)": 87.09, + "response_clip_ratio": 0.0, + "reward": 0.14166667312383652, + "reward_std": 0.26492767333984374, + "rewards/MultiModalAccuracyORM": 0.14166667312383652, + "step": 10, + "train_speed(iter/s)": 0.026623 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.308349609375, + "epoch": 0.006060606060606061, + "grad_norm": 1.1507264780517972, + "kl": 0.0002508640289306641, + "learning_rate": 2.4193548387096773e-08, + "loss": 0.013352996110916138, + "memory(GiB)": 87.09, + "response_clip_ratio": 0.02500000074505806, + "reward": 0.34166667610406876, + "reward_std": 0.36744636595249175, + "rewards/MultiModalAccuracyORM": 0.34166667610406876, + "step": 15, + "train_speed(iter/s)": 0.027725 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.9916717529297, + "epoch": 0.00808080808080808, + "grad_norm": 1.9440298564534324, + "kl": 0.00028104782104492186, + "learning_rate": 3.225806451612903e-08, + "loss": 0.006416285037994384, + "memory(GiB)": 87.09, + "response_clip_ratio": 0.0, + "reward": 0.2833333373069763, + "reward_std": 0.2916341096162796, + "rewards/MultiModalAccuracyORM": 0.2833333373069763, + "step": 20, + "train_speed(iter/s)": 0.031051 + }, + { + "clip_ratio": 0.0, + "completion_length": 378.5500061035156, + "epoch": 0.010101010101010102, + "grad_norm": 1.6907685802618988, + "kl": 0.0002666950225830078, + "learning_rate": 4.032258064516129e-08, + "loss": -0.018301564455032348, + "memory(GiB)": 87.09, + "response_clip_ratio": 0.0, + "reward": 0.30833334624767306, + "reward_std": 0.3720185041427612, + "rewards/MultiModalAccuracyORM": 0.30833334624767306, + "step": 25, + "train_speed(iter/s)": 0.032339 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.2333450317383, + "epoch": 0.012121212121212121, + "grad_norm": 1.5722363224769262, + "kl": 0.0002593994140625, + "learning_rate": 4.8387096774193546e-08, + "loss": -0.027563482522964478, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000596046446, + "reward_std": 0.3226982891559601, + "rewards/MultiModalAccuracyORM": 0.25000000596046446, + "step": 30, + "train_speed(iter/s)": 0.032649 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.5916778564453, + "epoch": 0.014141414141414142, + "grad_norm": 2.304234213678912, + "kl": 0.00022954940795898436, + "learning_rate": 5.645161290322581e-08, + "loss": 0.048061671853065493, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.1416666716337204, + "reward_std": 0.3226627051830292, + "rewards/MultiModalAccuracyORM": 0.1416666716337204, + "step": 35, + "train_speed(iter/s)": 0.033014 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.97500972747804, + "epoch": 0.01616161616161616, + "grad_norm": 1.6894032790709004, + "kl": 0.0002648591995239258, + "learning_rate": 6.451612903225806e-08, + "loss": 0.012092837691307068, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666753590107, + "reward_std": 0.222271066904068, + "rewards/MultiModalAccuracyORM": 0.2666666753590107, + "step": 40, + "train_speed(iter/s)": 0.034411 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.9333435058594, + "epoch": 0.01818181818181818, + "grad_norm": 1.9171038477045215, + "kl": 0.00023059844970703126, + "learning_rate": 7.258064516129032e-08, + "loss": -0.0132610023021698, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333879709244, + "reward_std": 0.2489179015159607, + "rewards/MultiModalAccuracyORM": 0.15833333879709244, + "step": 45, + "train_speed(iter/s)": 0.034702 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.20001525878905, + "epoch": 0.020202020202020204, + "grad_norm": 1.795783985834061, + "kl": 0.00021610260009765624, + "learning_rate": 8.064516129032257e-08, + "loss": 0.055432689189910886, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333730697633, + "reward_std": 0.320406436920166, + "rewards/MultiModalAccuracyORM": 0.13333333730697633, + "step": 50, + "train_speed(iter/s)": 0.034713 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.8500068664551, + "epoch": 0.022222222222222223, + "grad_norm": 1.570392013394559, + "kl": 0.00024003982543945311, + "learning_rate": 8.870967741935484e-08, + "loss": 0.0527652382850647, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000968575477, + "reward_std": 0.24862808585166932, + "rewards/MultiModalAccuracyORM": 0.17500000968575477, + "step": 55, + "train_speed(iter/s)": 0.035397 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.03333892822266, + "epoch": 0.024242424242424242, + "grad_norm": 1.7404447091659765, + "kl": 0.00024061203002929689, + "learning_rate": 9.677419354838709e-08, + "loss": -0.06867231130599975, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667982935904, + "reward_std": 0.33052347004413607, + "rewards/MultiModalAccuracyORM": 0.39166667982935904, + "step": 60, + "train_speed(iter/s)": 0.036121 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.5083480834961, + "epoch": 0.026262626262626262, + "grad_norm": 1.770871195621109, + "kl": 0.0002596855163574219, + "learning_rate": 1.0483870967741934e-07, + "loss": 0.019220371544361115, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.1416666701436043, + "reward_std": 0.27753120064735415, + "rewards/MultiModalAccuracyORM": 0.1416666701436043, + "step": 65, + "train_speed(iter/s)": 0.035829 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.05834197998047, + "epoch": 0.028282828282828285, + "grad_norm": 1.1236406922162803, + "kl": 0.00025534629821777344, + "learning_rate": 1.1290322580645162e-07, + "loss": 0.006563323736190796, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833334252238274, + "reward_std": 0.18108985424041749, + "rewards/MultiModalAccuracyORM": 0.15833334252238274, + "step": 70, + "train_speed(iter/s)": 0.036273 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.05833969116213, + "epoch": 0.030303030303030304, + "grad_norm": 2.2244576725130276, + "kl": 0.00026721954345703124, + "learning_rate": 1.2096774193548387e-07, + "loss": 0.021188412606716157, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333333805203437, + "reward_std": 0.3494287371635437, + "rewards/MultiModalAccuracyORM": 0.28333333805203437, + "step": 75, + "train_speed(iter/s)": 0.036577 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.70000381469725, + "epoch": 0.03232323232323232, + "grad_norm": 2.238393674944575, + "kl": 0.00026388168334960936, + "learning_rate": 1.2903225806451611e-07, + "loss": 0.029351598024368285, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.22500000521540642, + "reward_std": 0.279270276427269, + "rewards/MultiModalAccuracyORM": 0.22500000521540642, + "step": 80, + "train_speed(iter/s)": 0.036263 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.05000381469728, + "epoch": 0.03434343434343434, + "grad_norm": 1.5092959560425367, + "kl": 0.00028471946716308595, + "learning_rate": 1.3709677419354838e-07, + "loss": -0.036607831716537476, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333334177732467, + "reward_std": 0.39707074165344236, + "rewards/MultiModalAccuracyORM": 0.28333334177732467, + "step": 85, + "train_speed(iter/s)": 0.035112 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.3000152587891, + "epoch": 0.03636363636363636, + "grad_norm": 1.983727747725694, + "kl": 0.0002570152282714844, + "learning_rate": 1.4516129032258064e-07, + "loss": 0.02973529100418091, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000447034836, + "reward_std": 0.27928483188152314, + "rewards/MultiModalAccuracyORM": 0.17500000447034836, + "step": 90, + "train_speed(iter/s)": 0.035019 + }, + { + "clip_ratio": 0.0, + "completion_length": 420.7333511352539, + "epoch": 0.03838383838383838, + "grad_norm": 1.6243054678942601, + "kl": 0.00022783279418945313, + "learning_rate": 1.5322580645161288e-07, + "loss": -0.030441620945930482, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.23333333805203438, + "reward_std": 0.35868159830570223, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 95, + "train_speed(iter/s)": 0.035038 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.6583419799805, + "epoch": 0.04040404040404041, + "grad_norm": 1.5278965004190905, + "kl": 0.00023970603942871093, + "learning_rate": 1.6129032258064515e-07, + "loss": 0.014825087785720826, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3833333417773247, + "reward_std": 0.24560283720493317, + "rewards/MultiModalAccuracyORM": 0.3833333417773247, + "step": 100, + "train_speed(iter/s)": 0.035336 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.6000091552734, + "epoch": 0.04242424242424243, + "grad_norm": 2.275003739183734, + "kl": 0.0002989768981933594, + "learning_rate": 1.6935483870967741e-07, + "loss": 0.021370184421539307, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333410322666, + "reward_std": 0.31520852744579314, + "rewards/MultiModalAccuracyORM": 0.3083333410322666, + "step": 105, + "train_speed(iter/s)": 0.035535 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.37500915527346, + "epoch": 0.044444444444444446, + "grad_norm": 1.3264840189361857, + "kl": 0.00028629302978515624, + "learning_rate": 1.7741935483870968e-07, + "loss": 0.013422733545303345, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1416666701436043, + "reward_std": 0.24885829985141755, + "rewards/MultiModalAccuracyORM": 0.1416666701436043, + "step": 110, + "train_speed(iter/s)": 0.035867 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.7583488464355, + "epoch": 0.046464646464646465, + "grad_norm": 0.0068729642108505875, + "kl": 0.00022754669189453124, + "learning_rate": 1.8548387096774192e-07, + "loss": 0.007101482152938843, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.24166667386889457, + "reward_std": 0.23854664266109465, + "rewards/MultiModalAccuracyORM": 0.24166667386889457, + "step": 115, + "train_speed(iter/s)": 0.035529 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.0500122070313, + "epoch": 0.048484848484848485, + "grad_norm": 1.666888807483155, + "kl": 0.00029277801513671875, + "learning_rate": 1.9354838709677418e-07, + "loss": -0.013055479526519776, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833334028720857, + "reward_std": 0.24041947722434998, + "rewards/MultiModalAccuracyORM": 0.20833334028720857, + "step": 120, + "train_speed(iter/s)": 0.035843 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0916717529297, + "epoch": 0.050505050505050504, + "grad_norm": 3.6057797063570765, + "kl": 0.00020406246185302734, + "learning_rate": 2e-07, + "loss": 0.029223644733428956, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2750000052154064, + "reward_std": 0.27371591329574585, + "rewards/MultiModalAccuracyORM": 0.2750000052154064, + "step": 125, + "train_speed(iter/s)": 0.036026 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.7583526611328, + "epoch": 0.052525252525252523, + "grad_norm": 1.7900187922950372, + "kl": 0.00025043487548828127, + "learning_rate": 2e-07, + "loss": 0.0551780104637146, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333484828472, + "reward_std": 0.3641817569732666, + "rewards/MultiModalAccuracyORM": 0.2833333484828472, + "step": 130, + "train_speed(iter/s)": 0.036075 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.5000072479248, + "epoch": 0.05454545454545454, + "grad_norm": 2.529917707084592, + "kl": 0.0002529144287109375, + "learning_rate": 2e-07, + "loss": 0.02438216805458069, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.3416666708886623, + "reward_std": 0.279270276427269, + "rewards/MultiModalAccuracyORM": 0.3416666708886623, + "step": 135, + "train_speed(iter/s)": 0.036092 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.6333465576172, + "epoch": 0.05656565656565657, + "grad_norm": 1.3049814649570146, + "kl": 0.0002875804901123047, + "learning_rate": 2e-07, + "loss": -0.022501662373542786, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3833333469927311, + "reward_std": 0.34958777129650115, + "rewards/MultiModalAccuracyORM": 0.3833333469927311, + "step": 140, + "train_speed(iter/s)": 0.036095 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.41668395996095, + "epoch": 0.05858585858585859, + "grad_norm": 1.8437868971897566, + "kl": 0.00023627281188964844, + "learning_rate": 2e-07, + "loss": 0.06273630857467652, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.3666666768491268, + "reward_std": 0.3914994150400162, + "rewards/MultiModalAccuracyORM": 0.3666666768491268, + "step": 145, + "train_speed(iter/s)": 0.036226 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.51667098999025, + "epoch": 0.06060606060606061, + "grad_norm": 1.0785517011291799, + "kl": 0.00021938085556030273, + "learning_rate": 2e-07, + "loss": 0.02771698534488678, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.400000012665987, + "reward_std": 0.3516494154930115, + "rewards/MultiModalAccuracyORM": 0.400000012665987, + "step": 150, + "train_speed(iter/s)": 0.036427 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.3500152587891, + "epoch": 0.06262626262626263, + "grad_norm": 12.619972342482905, + "kl": 0.00030460357666015623, + "learning_rate": 2e-07, + "loss": -0.06058757305145264, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2250000074505806, + "reward_std": 0.37600439190864565, + "rewards/MultiModalAccuracyORM": 0.2250000074505806, + "step": 155, + "train_speed(iter/s)": 0.036609 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.7416732788086, + "epoch": 0.06464646464646465, + "grad_norm": 1.306377595968382, + "kl": 0.00027475357055664065, + "learning_rate": 2e-07, + "loss": -0.00979010909795761, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.15833333432674407, + "reward_std": 0.28456337153911593, + "rewards/MultiModalAccuracyORM": 0.15833333432674407, + "step": 160, + "train_speed(iter/s)": 0.036431 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.9916763305664, + "epoch": 0.06666666666666667, + "grad_norm": 0.9830762972924579, + "kl": 0.00030498504638671876, + "learning_rate": 2e-07, + "loss": -0.008201467990875243, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.10000000149011612, + "reward_std": 0.2260383188724518, + "rewards/MultiModalAccuracyORM": 0.10000000149011612, + "step": 165, + "train_speed(iter/s)": 0.036665 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.37500610351563, + "epoch": 0.06868686868686869, + "grad_norm": 2.1917101699979287, + "kl": 0.00025620460510253904, + "learning_rate": 2e-07, + "loss": 0.016992685198783875, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000037252903, + "reward_std": 0.330559054017067, + "rewards/MultiModalAccuracyORM": 0.3500000037252903, + "step": 170, + "train_speed(iter/s)": 0.036951 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.87500762939453, + "epoch": 0.0707070707070707, + "grad_norm": 1.0748542635448965, + "kl": 0.0002711296081542969, + "learning_rate": 2e-07, + "loss": 0.010954010486602783, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833333656191827, + "reward_std": 0.22400068640708923, + "rewards/MultiModalAccuracyORM": 0.20833333656191827, + "step": 175, + "train_speed(iter/s)": 0.037203 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.62500762939453, + "epoch": 0.07272727272727272, + "grad_norm": 2.2725379948331543, + "kl": 0.00025653839111328125, + "learning_rate": 2e-07, + "loss": 0.03469780087471008, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.16666667237877847, + "reward_std": 0.3332285821437836, + "rewards/MultiModalAccuracyORM": 0.16666667237877847, + "step": 180, + "train_speed(iter/s)": 0.037355 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.96667327880857, + "epoch": 0.07474747474747474, + "grad_norm": 1.4486054691502512, + "kl": 0.0002918243408203125, + "learning_rate": 2e-07, + "loss": -0.009673595428466797, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333373069763, + "reward_std": 0.102961727976799, + "rewards/MultiModalAccuracyORM": 0.3083333373069763, + "step": 185, + "train_speed(iter/s)": 0.037564 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.25834197998046, + "epoch": 0.07676767676767676, + "grad_norm": 3.170971594101629, + "kl": 0.00025038719177246095, + "learning_rate": 2e-07, + "loss": 0.0012440800666809082, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.24166667014360427, + "reward_std": 0.30789810717105864, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 190, + "train_speed(iter/s)": 0.037415 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.62500991821287, + "epoch": 0.07878787878787878, + "grad_norm": 1.98318367969525, + "kl": 0.00029687881469726564, + "learning_rate": 2e-07, + "loss": 0.008435648679733277, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.33333334028720857, + "reward_std": 0.24741607010364533, + "rewards/MultiModalAccuracyORM": 0.33333334028720857, + "step": 195, + "train_speed(iter/s)": 0.037342 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.6333465576172, + "epoch": 0.08080808080808081, + "grad_norm": 1.503273341785427, + "kl": 0.000333404541015625, + "learning_rate": 2e-07, + "loss": 0.005708768963813782, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666667237877845, + "reward_std": 0.3603756338357925, + "rewards/MultiModalAccuracyORM": 0.26666667237877845, + "step": 200, + "train_speed(iter/s)": 0.037521 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.68334407806395, + "epoch": 0.08282828282828283, + "grad_norm": 0.5199716532978094, + "kl": 0.0004832744598388672, + "learning_rate": 2e-07, + "loss": -0.014856468141078948, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.33937130570411683, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 205, + "train_speed(iter/s)": 0.037585 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.3916732788086, + "epoch": 0.08484848484848485, + "grad_norm": 2.1287930828371358, + "kl": 0.000292205810546875, + "learning_rate": 2e-07, + "loss": 0.001297689974308014, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667386889457, + "reward_std": 0.32771685123443606, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 210, + "train_speed(iter/s)": 0.037751 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.49167213439944, + "epoch": 0.08686868686868687, + "grad_norm": 1.7796242872827708, + "kl": 0.00042543411254882815, + "learning_rate": 2e-07, + "loss": -0.006988461315631867, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333333656191826, + "reward_std": 0.2692273885011673, + "rewards/MultiModalAccuracyORM": 0.23333333656191826, + "step": 215, + "train_speed(iter/s)": 0.037754 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.81667442321776, + "epoch": 0.08888888888888889, + "grad_norm": 1.7638027896241226, + "kl": 0.0006679534912109375, + "learning_rate": 2e-07, + "loss": 0.006352822482585907, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333507180214, + "reward_std": 0.25008893609046934, + "rewards/MultiModalAccuracyORM": 0.15833333507180214, + "step": 220, + "train_speed(iter/s)": 0.03785 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.2750076293945, + "epoch": 0.09090909090909091, + "grad_norm": 0.012708836578688367, + "kl": 0.00029745101928710935, + "learning_rate": 2e-07, + "loss": 0.0504034161567688, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30000000149011613, + "reward_std": 0.3164917230606079, + "rewards/MultiModalAccuracyORM": 0.30000000149011613, + "step": 225, + "train_speed(iter/s)": 0.038015 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.6750061035156, + "epoch": 0.09292929292929293, + "grad_norm": 2.064611776487197, + "kl": 0.000385284423828125, + "learning_rate": 2e-07, + "loss": 0.07023286819458008, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15000000298023225, + "reward_std": 0.2650228708982468, + "rewards/MultiModalAccuracyORM": 0.15000000298023225, + "step": 230, + "train_speed(iter/s)": 0.03818 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.5416793823242, + "epoch": 0.09494949494949495, + "grad_norm": 1.949431436305181, + "kl": 0.0002506256103515625, + "learning_rate": 2e-07, + "loss": 0.01011454164981842, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3333333469927311, + "reward_std": 0.3637147039175034, + "rewards/MultiModalAccuracyORM": 0.3333333469927311, + "step": 235, + "train_speed(iter/s)": 0.03819 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.533341217041, + "epoch": 0.09696969696969697, + "grad_norm": 0.5471178347466235, + "kl": 0.0010341405868530273, + "learning_rate": 2e-07, + "loss": -0.0015352100133895874, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.01666666716337204, + "reward": 0.28333333805203437, + "reward_std": 0.3511823683977127, + "rewards/MultiModalAccuracyORM": 0.28333333805203437, + "step": 240, + "train_speed(iter/s)": 0.037977 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.3000129699707, + "epoch": 0.09898989898989899, + "grad_norm": 2.3165413137247333, + "kl": 0.00027217864990234373, + "learning_rate": 2e-07, + "loss": 0.0210051491856575, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.00833333358168602, + "reward": 0.32500000968575476, + "reward_std": 0.38450039029121397, + "rewards/MultiModalAccuracyORM": 0.32500000968575476, + "step": 245, + "train_speed(iter/s)": 0.037993 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 2.645674704495033, + "learning_rate": 2e-07, + "loss": -0.03384391665458679, + "memory(GiB)": 87.45, + "step": 250, + "train_speed(iter/s)": 0.038032 + }, + { + "epoch": 0.10101010101010101, + "eval_clip_ratio": 0.0, + "eval_completion_length": 334.34500762939456, + "eval_kl": 0.0004983329772949218, + "eval_loss": 0.023834386840462685, + "eval_response_clip_ratio": 0.003333333432674408, + "eval_reward": 0.24666667267680167, + "eval_reward_std": 0.30061395645141603, + "eval_rewards/MultiModalAccuracyORM": 0.24666667267680167, + "eval_runtime": 585.2435, + "eval_samples_per_second": 0.085, + "eval_steps_per_second": 0.009, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.79583778381345, + "epoch": 0.10303030303030303, + "grad_norm": 1.5910045148895993, + "kl": 0.0006116151809692383, + "learning_rate": 2e-07, + "loss": -0.05511324405670166, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667647659776, + "reward_std": 0.3701108664274216, + "rewards/MultiModalAccuracyORM": 0.34166667647659776, + "step": 255, + "train_speed(iter/s)": 0.03329 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.85, + "epoch": 0.10505050505050505, + "grad_norm": 1.8789057522234565, + "kl": 0.0006687164306640625, + "learning_rate": 2e-07, + "loss": 0.08147464394569397, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000037252903, + "reward_std": 0.3494287371635437, + "rewards/MultiModalAccuracyORM": 0.3500000037252903, + "step": 260, + "train_speed(iter/s)": 0.033421 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.0, + "epoch": 0.10707070707070707, + "grad_norm": 1.685788699755795, + "kl": 0.00030879974365234376, + "learning_rate": 2e-07, + "loss": 0.0021983295679092406, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333358168602, + "reward_std": 0.3010816007852554, + "rewards/MultiModalAccuracyORM": 0.2083333358168602, + "step": 265, + "train_speed(iter/s)": 0.033374 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.5, + "epoch": 0.10909090909090909, + "grad_norm": 2.9700739773322695, + "kl": 0.00040111541748046877, + "learning_rate": 2e-07, + "loss": -0.004064649343490601, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333730697632, + "reward_std": 0.33526621460914613, + "rewards/MultiModalAccuracyORM": 0.15833333730697632, + "step": 270, + "train_speed(iter/s)": 0.033364 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.25, + "epoch": 0.1111111111111111, + "grad_norm": 1.5939506920216808, + "kl": 0.00045032501220703124, + "learning_rate": 2e-07, + "loss": 0.026332959532737732, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2583333395421505, + "reward_std": 0.2526735752820969, + "rewards/MultiModalAccuracyORM": 0.2583333395421505, + "step": 275, + "train_speed(iter/s)": 0.033468 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.5, + "epoch": 0.11313131313131314, + "grad_norm": 1.3058289755881347, + "kl": 0.000375831127166748, + "learning_rate": 2e-07, + "loss": 0.027166426181793213, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.32500001341104506, + "reward_std": 0.37195890247821806, + "rewards/MultiModalAccuracyORM": 0.32500001341104506, + "step": 280, + "train_speed(iter/s)": 0.033383 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.05, + "epoch": 0.11515151515151516, + "grad_norm": 0.5211592745612927, + "kl": 0.0004334449768066406, + "learning_rate": 2e-07, + "loss": -0.001045474410057068, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000149011612, + "reward_std": 0.1808116167783737, + "rewards/MultiModalAccuracyORM": 0.22500000149011612, + "step": 285, + "train_speed(iter/s)": 0.03333 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.25, + "epoch": 0.11717171717171718, + "grad_norm": 1.9995357461573446, + "kl": 0.0005333900451660156, + "learning_rate": 2e-07, + "loss": -0.00281745046377182, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.3385071337223053, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 290, + "train_speed(iter/s)": 0.033413 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.1, + "epoch": 0.1191919191919192, + "grad_norm": 3.694756818436622, + "kl": 0.0010143280029296874, + "learning_rate": 2e-07, + "loss": -0.003062787652015686, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333879709244, + "reward_std": 0.314164274930954, + "rewards/MultiModalAccuracyORM": 0.15833333879709244, + "step": 295, + "train_speed(iter/s)": 0.03345 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.25, + "epoch": 0.12121212121212122, + "grad_norm": 1.5577866137872902, + "kl": 0.00044269561767578124, + "learning_rate": 2e-07, + "loss": -0.022827643156051635, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2583333447575569, + "reward_std": 0.3393001317977905, + "rewards/MultiModalAccuracyORM": 0.2583333447575569, + "step": 300, + "train_speed(iter/s)": 0.033327 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.25, + "epoch": 0.12323232323232323, + "grad_norm": 0.8793802822161716, + "kl": 0.00045299530029296875, + "learning_rate": 2e-07, + "loss": 0.039026769995689395, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2333333395421505, + "reward_std": 0.33277973234653474, + "rewards/MultiModalAccuracyORM": 0.2333333395421505, + "step": 305, + "train_speed(iter/s)": 0.032887 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.3, + "epoch": 0.12525252525252525, + "grad_norm": 1.9841151826732792, + "kl": 0.0006313323974609375, + "learning_rate": 2e-07, + "loss": -0.006224775314331054, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334252238275, + "reward_std": 0.31441850066184995, + "rewards/MultiModalAccuracyORM": 0.23333334252238275, + "step": 310, + "train_speed(iter/s)": 0.032913 + }, + { + "clip_ratio": 0.0, + "completion_length": 537.7, + "epoch": 0.12727272727272726, + "grad_norm": 1.2729907719968943, + "kl": 0.0007027626037597656, + "learning_rate": 2e-07, + "loss": 0.014832744002342224, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.11666666939854622, + "reward_std": 0.25891573131084444, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 315, + "train_speed(iter/s)": 0.032886 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.8, + "epoch": 0.1292929292929293, + "grad_norm": 0.9148877498687834, + "kl": 0.000760650634765625, + "learning_rate": 2e-07, + "loss": 0.06303757429122925, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.2323044866323471, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 320, + "train_speed(iter/s)": 0.032974 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.8, + "epoch": 0.13131313131313133, + "grad_norm": 2.00474803214382, + "kl": 0.0007790565490722656, + "learning_rate": 2e-07, + "loss": 0.02660681903362274, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333656191826, + "reward_std": 0.2486636757850647, + "rewards/MultiModalAccuracyORM": 0.13333333656191826, + "step": 325, + "train_speed(iter/s)": 0.033068 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.8, + "epoch": 0.13333333333333333, + "grad_norm": 1.6448765146368245, + "kl": 0.0005625724792480469, + "learning_rate": 2e-07, + "loss": 0.024477413296699523, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.19166667237877846, + "reward_std": 0.2629852324724197, + "rewards/MultiModalAccuracyORM": 0.19166667237877846, + "step": 330, + "train_speed(iter/s)": 0.0332 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.25, + "epoch": 0.13535353535353536, + "grad_norm": 2.2001765187520776, + "kl": 0.0006697654724121093, + "learning_rate": 2e-07, + "loss": 0.07480921745300292, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500001043081285, + "reward_std": 0.37195890247821806, + "rewards/MultiModalAccuracyORM": 0.27500001043081285, + "step": 335, + "train_speed(iter/s)": 0.033276 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.1, + "epoch": 0.13737373737373737, + "grad_norm": 0.6836764259374134, + "kl": 0.0006744384765625, + "learning_rate": 2e-07, + "loss": 0.050872421264648436, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667386889457, + "reward_std": 0.25897533297538755, + "rewards/MultiModalAccuracyORM": 0.36666667386889457, + "step": 340, + "train_speed(iter/s)": 0.033397 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.0, + "epoch": 0.1393939393939394, + "grad_norm": 0.02974363962833146, + "kl": 0.0007775306701660156, + "learning_rate": 2e-07, + "loss": -0.00942653715610504, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1500000037252903, + "reward_std": 0.1933199405670166, + "rewards/MultiModalAccuracyORM": 0.1500000037252903, + "step": 345, + "train_speed(iter/s)": 0.033409 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.15, + "epoch": 0.1414141414141414, + "grad_norm": 2.153809687333121, + "kl": 0.00106048583984375, + "learning_rate": 2e-07, + "loss": -0.04788823127746582, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3833333425223827, + "reward_std": 0.3908641755580902, + "rewards/MultiModalAccuracyORM": 0.3833333425223827, + "step": 350, + "train_speed(iter/s)": 0.033484 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.95, + "epoch": 0.14343434343434344, + "grad_norm": 2.9003800421035084, + "kl": 0.001187896728515625, + "learning_rate": 2e-07, + "loss": -0.025590839982032775, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1500000014901161, + "reward_std": 0.24484840035438538, + "rewards/MultiModalAccuracyORM": 0.1500000014901161, + "step": 355, + "train_speed(iter/s)": 0.033613 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.15, + "epoch": 0.14545454545454545, + "grad_norm": 1.3041121484800926, + "kl": 0.001438140869140625, + "learning_rate": 2e-07, + "loss": 0.10738253593444824, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.3196970522403717, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 360, + "train_speed(iter/s)": 0.033727 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.15, + "epoch": 0.14747474747474748, + "grad_norm": 0.8360441109730193, + "kl": 0.00127105712890625, + "learning_rate": 2e-07, + "loss": -0.003975853323936462, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.05000000149011612, + "reward_std": 0.13558491468429565, + "rewards/MultiModalAccuracyORM": 0.05000000149011612, + "step": 365, + "train_speed(iter/s)": 0.033745 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.6, + "epoch": 0.1494949494949495, + "grad_norm": 2.3979328705343153, + "kl": 0.001323699951171875, + "learning_rate": 2e-07, + "loss": -0.048431962728500366, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000521540644, + "reward_std": 0.35312480926513673, + "rewards/MultiModalAccuracyORM": 0.25000000521540644, + "step": 370, + "train_speed(iter/s)": 0.033877 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.2, + "epoch": 0.15151515151515152, + "grad_norm": 1.5241819642025198, + "kl": 0.0015224456787109376, + "learning_rate": 2e-07, + "loss": 0.08156558275222778, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666768491268, + "reward_std": 0.30183603167533873, + "rewards/MultiModalAccuracyORM": 0.3916666768491268, + "step": 375, + "train_speed(iter/s)": 0.033941 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7, + "epoch": 0.15353535353535352, + "grad_norm": 1.4091270455051919, + "kl": 0.0014804840087890626, + "learning_rate": 2e-07, + "loss": -0.005422207713127136, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833333656191826, + "reward_std": 0.29863070249557494, + "rewards/MultiModalAccuracyORM": 0.25833333656191826, + "step": 380, + "train_speed(iter/s)": 0.03401 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.4, + "epoch": 0.15555555555555556, + "grad_norm": 1.7741695775671322, + "kl": 0.0017261505126953125, + "learning_rate": 2e-07, + "loss": 0.013069793581962585, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35833334028720853, + "reward_std": 0.41791602969169617, + "rewards/MultiModalAccuracyORM": 0.35833334028720853, + "step": 385, + "train_speed(iter/s)": 0.034132 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.15, + "epoch": 0.15757575757575756, + "grad_norm": 2.1621073881433954, + "kl": 0.001946258544921875, + "learning_rate": 2e-07, + "loss": 0.018825350701808928, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2000000074505806, + "reward_std": 0.329024064540863, + "rewards/MultiModalAccuracyORM": 0.2000000074505806, + "step": 390, + "train_speed(iter/s)": 0.034186 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.7, + "epoch": 0.1595959595959596, + "grad_norm": 1.8573956206789706, + "kl": 0.0013622283935546876, + "learning_rate": 2e-07, + "loss": 0.01834181547164917, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2583333380520344, + "reward_std": 0.33226497769355773, + "rewards/MultiModalAccuracyORM": 0.2583333380520344, + "step": 395, + "train_speed(iter/s)": 0.034168 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.05, + "epoch": 0.16161616161616163, + "grad_norm": 2.2110728171395646, + "kl": 0.0019084930419921875, + "learning_rate": 2e-07, + "loss": 0.019550779461860658, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000521540642, + "reward_std": 0.3008869707584381, + "rewards/MultiModalAccuracyORM": 0.20000000521540642, + "step": 400, + "train_speed(iter/s)": 0.034255 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5, + "epoch": 0.16363636363636364, + "grad_norm": 2.2884019112467, + "kl": 0.00233917236328125, + "learning_rate": 2e-07, + "loss": 0.00730045884847641, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30000000819563866, + "reward_std": 0.32297652661800386, + "rewards/MultiModalAccuracyORM": 0.30000000819563866, + "step": 405, + "train_speed(iter/s)": 0.034354 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.95, + "epoch": 0.16565656565656567, + "grad_norm": 3.384921120442682, + "kl": 0.001834869384765625, + "learning_rate": 2e-07, + "loss": 0.02867870032787323, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30833334401249884, + "reward_std": 0.3604020655155182, + "rewards/MultiModalAccuracyORM": 0.30833334401249884, + "step": 410, + "train_speed(iter/s)": 0.034303 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.85, + "epoch": 0.16767676767676767, + "grad_norm": 2.578682884841481, + "kl": 0.0019824981689453127, + "learning_rate": 2e-07, + "loss": 0.007520823180675507, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833333656191827, + "reward_std": 0.24105713665485382, + "rewards/MultiModalAccuracyORM": 0.20833333656191827, + "step": 415, + "train_speed(iter/s)": 0.034306 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.9, + "epoch": 0.1696969696969697, + "grad_norm": 2.841135168153006, + "kl": 0.003629302978515625, + "learning_rate": 2e-07, + "loss": 0.008403807878494263, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4166666746139526, + "reward_std": 0.31846399009227755, + "rewards/MultiModalAccuracyORM": 0.4166666746139526, + "step": 420, + "train_speed(iter/s)": 0.034394 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.0, + "epoch": 0.1717171717171717, + "grad_norm": 1.3952154788825455, + "kl": 0.0026947021484375, + "learning_rate": 2e-07, + "loss": 0.016321972012519836, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3750000104308128, + "reward_std": 0.3541358977556229, + "rewards/MultiModalAccuracyORM": 0.3750000104308128, + "step": 425, + "train_speed(iter/s)": 0.034427 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.85, + "epoch": 0.17373737373737375, + "grad_norm": 2.642228792263709, + "kl": 0.0035511016845703124, + "learning_rate": 2e-07, + "loss": 0.04757256805896759, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000059604645, + "reward_std": 0.27122943103313446, + "rewards/MultiModalAccuracyORM": 0.3000000059604645, + "step": 430, + "train_speed(iter/s)": 0.034492 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.9, + "epoch": 0.17575757575757575, + "grad_norm": 2.3061110590781433, + "kl": 0.0025909423828125, + "learning_rate": 2e-07, + "loss": -0.02955559492111206, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.42218015491962435, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 435, + "train_speed(iter/s)": 0.034539 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.9, + "epoch": 0.17777777777777778, + "grad_norm": 0.03487250614691778, + "kl": 0.00295562744140625, + "learning_rate": 2e-07, + "loss": 0.03084596395492554, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333432674407, + "reward_std": 0.2657532900571823, + "rewards/MultiModalAccuracyORM": 0.15833333432674407, + "step": 440, + "train_speed(iter/s)": 0.034613 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.5, + "epoch": 0.1797979797979798, + "grad_norm": 1.8186333166660678, + "kl": 0.0029296875, + "learning_rate": 2e-07, + "loss": -0.008677978813648225, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333358168602, + "reward_std": 0.23004821836948394, + "rewards/MultiModalAccuracyORM": 0.3083333358168602, + "step": 445, + "train_speed(iter/s)": 0.034594 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.7, + "epoch": 0.18181818181818182, + "grad_norm": 1.5483724144717876, + "kl": 0.003802490234375, + "learning_rate": 2e-07, + "loss": -0.010931169986724854, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667461395264, + "reward_std": 0.36794900298118594, + "rewards/MultiModalAccuracyORM": 0.21666667461395264, + "step": 450, + "train_speed(iter/s)": 0.034617 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.1, + "epoch": 0.18383838383838383, + "grad_norm": 0.8802169915779423, + "kl": 0.00302734375, + "learning_rate": 2e-07, + "loss": -0.04651644229888916, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15000000596046448, + "reward_std": 0.2963056802749634, + "rewards/MultiModalAccuracyORM": 0.15000000596046448, + "step": 455, + "train_speed(iter/s)": 0.034674 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5, + "epoch": 0.18585858585858586, + "grad_norm": 1.6049021687383316, + "kl": 0.00660247802734375, + "learning_rate": 2e-07, + "loss": 0.008616116642951966, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833333656191826, + "reward_std": 0.25741389989852903, + "rewards/MultiModalAccuracyORM": 0.25833333656191826, + "step": 460, + "train_speed(iter/s)": 0.034754 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.5, + "epoch": 0.18787878787878787, + "grad_norm": 2.893110887441056, + "kl": 0.002629852294921875, + "learning_rate": 2e-07, + "loss": 0.0028022266924381256, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000001415610315, + "reward_std": 0.4707459330558777, + "rewards/MultiModalAccuracyORM": 0.40000001415610315, + "step": 465, + "train_speed(iter/s)": 0.034821 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5, + "epoch": 0.1898989898989899, + "grad_norm": 2.1102869760511584, + "kl": 0.0035003662109375, + "learning_rate": 2e-07, + "loss": 0.0047733023762702945, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000670552254, + "reward_std": 0.3082119345664978, + "rewards/MultiModalAccuracyORM": 0.20000000670552254, + "step": 470, + "train_speed(iter/s)": 0.034862 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.1, + "epoch": 0.1919191919191919, + "grad_norm": 2.403767582762209, + "kl": 0.00347442626953125, + "learning_rate": 2e-07, + "loss": 0.0637534499168396, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5083333484828472, + "reward_std": 0.34557787179946897, + "rewards/MultiModalAccuracyORM": 0.5083333484828472, + "step": 475, + "train_speed(iter/s)": 0.03495 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.65, + "epoch": 0.19393939393939394, + "grad_norm": 0.6979791277265925, + "kl": 0.00365142822265625, + "learning_rate": 2e-07, + "loss": -0.04180996119976044, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666667088866236, + "reward_std": 0.32826719582080843, + "rewards/MultiModalAccuracyORM": 0.26666667088866236, + "step": 480, + "train_speed(iter/s)": 0.034951 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.85, + "epoch": 0.19595959595959597, + "grad_norm": 0.0525932465492366, + "kl": 0.00377197265625, + "learning_rate": 2e-07, + "loss": -0.014869007468223571, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4000000052154064, + "reward_std": 0.20967912971973418, + "rewards/MultiModalAccuracyORM": 0.4000000052154064, + "step": 485, + "train_speed(iter/s)": 0.035021 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.1, + "epoch": 0.19797979797979798, + "grad_norm": 1.6281647114218305, + "kl": 0.004177093505859375, + "learning_rate": 2e-07, + "loss": 0.015925824642181396, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.3227578908205032, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 490, + "train_speed(iter/s)": 0.035088 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.75, + "epoch": 0.2, + "grad_norm": 1.984961473458151, + "kl": 0.00326995849609375, + "learning_rate": 2e-07, + "loss": -0.0037449508905410766, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.2855509877204895, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 495, + "train_speed(iter/s)": 0.035113 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 0.6734714455829673, + "learning_rate": 2e-07, + "loss": -0.013085539638996124, + "memory(GiB)": 87.45, + "step": 500, + "train_speed(iter/s)": 0.035182 + }, + { + "epoch": 0.20202020202020202, + "eval_clip_ratio": 0.0, + "eval_completion_length": 363.1450085449219, + "eval_kl": 0.003147125244140625, + "eval_loss": 0.024374496191740036, + "eval_response_clip_ratio": 0.003333333432674408, + "eval_reward": 0.26666667237877845, + "eval_reward_std": 0.28797652542591096, + "eval_rewards/MultiModalAccuracyORM": 0.26666667237877845, + "eval_runtime": 597.4581, + "eval_samples_per_second": 0.084, + "eval_steps_per_second": 0.008, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.4, + "epoch": 0.20404040404040405, + "grad_norm": 2.0097245676314053, + "kl": 0.002962684631347656, + "learning_rate": 2e-07, + "loss": 0.008341678977012634, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22916666902601718, + "reward_std": 0.28844616413116453, + "rewards/MultiModalAccuracyORM": 0.22916666902601718, + "step": 505, + "train_speed(iter/s)": 0.033026 + }, + { + "clip_ratio": 0.0, + "completion_length": 478.15, + "epoch": 0.20606060606060606, + "grad_norm": 0.04671524557136776, + "kl": 0.004395294189453125, + "learning_rate": 2e-07, + "loss": 0.019101715087890624, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833333656191827, + "reward_std": 0.22704698145389557, + "rewards/MultiModalAccuracyORM": 0.20833333656191827, + "step": 510, + "train_speed(iter/s)": 0.033029 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.65, + "epoch": 0.2080808080808081, + "grad_norm": 1.7656462373703843, + "kl": 0.003029632568359375, + "learning_rate": 2e-07, + "loss": 0.04230659604072571, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000521540642, + "reward_std": 0.248858305811882, + "rewards/MultiModalAccuracyORM": 0.22500000521540642, + "step": 515, + "train_speed(iter/s)": 0.032925 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8, + "epoch": 0.2101010101010101, + "grad_norm": 1.2593604182587, + "kl": 0.0040802001953125, + "learning_rate": 2e-07, + "loss": -0.0020169973373413085, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4583333432674408, + "reward_std": 0.4390155434608459, + "rewards/MultiModalAccuracyORM": 0.4583333432674408, + "step": 520, + "train_speed(iter/s)": 0.032885 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.7, + "epoch": 0.21212121212121213, + "grad_norm": 10.635733115288671, + "kl": 0.006873321533203125, + "learning_rate": 2e-07, + "loss": 0.013639546930789948, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000447034837, + "reward_std": 0.29108133912086487, + "rewards/MultiModalAccuracyORM": 0.25000000447034837, + "step": 525, + "train_speed(iter/s)": 0.032731 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.65, + "epoch": 0.21414141414141413, + "grad_norm": 2.2605304578434664, + "kl": 0.00481109619140625, + "learning_rate": 2e-07, + "loss": 0.029361778497695924, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667610406877, + "reward_std": 0.3948740750551224, + "rewards/MultiModalAccuracyORM": 0.29166667610406877, + "step": 530, + "train_speed(iter/s)": 0.032671 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.95, + "epoch": 0.21616161616161617, + "grad_norm": 3.233553935601456, + "kl": 0.005239105224609375, + "learning_rate": 2e-07, + "loss": -0.02358839809894562, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.32500000670552254, + "reward_std": 0.39305841624736787, + "rewards/MultiModalAccuracyORM": 0.32500000670552254, + "step": 535, + "train_speed(iter/s)": 0.032679 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.15, + "epoch": 0.21818181818181817, + "grad_norm": 1.4435208932830024, + "kl": 0.0038543701171875, + "learning_rate": 2e-07, + "loss": 0.012015002965927123, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.14166666939854622, + "reward_std": 0.2184889554977417, + "rewards/MultiModalAccuracyORM": 0.14166666939854622, + "step": 540, + "train_speed(iter/s)": 0.032663 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.7, + "epoch": 0.2202020202020202, + "grad_norm": 2.124111886424564, + "kl": 0.00633544921875, + "learning_rate": 2e-07, + "loss": 0.016453295946121216, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.43333334773778914, + "reward_std": 0.40082641541957853, + "rewards/MultiModalAccuracyORM": 0.43333334773778914, + "step": 545, + "train_speed(iter/s)": 0.032702 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.4, + "epoch": 0.2222222222222222, + "grad_norm": 2.528384017814939, + "kl": 0.004555511474609375, + "learning_rate": 2e-07, + "loss": -0.013006833195686341, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2333333410322666, + "reward_std": 0.3478317141532898, + "rewards/MultiModalAccuracyORM": 0.2333333410322666, + "step": 550, + "train_speed(iter/s)": 0.032503 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.4, + "epoch": 0.22424242424242424, + "grad_norm": 2.5915001907307977, + "kl": 0.00524749755859375, + "learning_rate": 2e-07, + "loss": 0.02111098766326904, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2916666753590107, + "reward_std": 0.3644451290369034, + "rewards/MultiModalAccuracyORM": 0.2916666753590107, + "step": 555, + "train_speed(iter/s)": 0.032469 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.65, + "epoch": 0.22626262626262628, + "grad_norm": 1.5712795723400375, + "kl": 0.004864501953125, + "learning_rate": 2e-07, + "loss": 0.06747217178344726, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667386889457, + "reward_std": 0.30639870166778566, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 560, + "train_speed(iter/s)": 0.032374 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.25, + "epoch": 0.22828282828282828, + "grad_norm": 2.1872516406963483, + "kl": 0.0059844970703125, + "learning_rate": 2e-07, + "loss": -0.01907222718000412, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.27402731478214265, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 565, + "train_speed(iter/s)": 0.03236 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.8, + "epoch": 0.23030303030303031, + "grad_norm": 1.9388301349526922, + "kl": 0.00828857421875, + "learning_rate": 2e-07, + "loss": 0.0710361123085022, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333333507180214, + "reward_std": 0.26670235097408296, + "rewards/MultiModalAccuracyORM": 0.28333333507180214, + "step": 570, + "train_speed(iter/s)": 0.032388 + }, + { + "clip_ratio": 0.0, + "completion_length": 475.4, + "epoch": 0.23232323232323232, + "grad_norm": 2.0643763651689424, + "kl": 0.0043544769287109375, + "learning_rate": 2e-07, + "loss": 0.038624811172485354, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667386889455, + "reward_std": 0.38400964736938475, + "rewards/MultiModalAccuracyORM": 0.29166667386889455, + "step": 575, + "train_speed(iter/s)": 0.032305 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.45, + "epoch": 0.23434343434343435, + "grad_norm": 2.5185952971698566, + "kl": 0.00495452880859375, + "learning_rate": 2e-07, + "loss": 0.02923307418823242, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667461395264, + "reward_std": 0.35012357234954833, + "rewards/MultiModalAccuracyORM": 0.36666667461395264, + "step": 580, + "train_speed(iter/s)": 0.032206 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.9, + "epoch": 0.23636363636363636, + "grad_norm": 1.8128917450324007, + "kl": 0.0055450439453125, + "learning_rate": 2e-07, + "loss": 0.013245610892772675, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333333879709244, + "reward_std": 0.23710441291332246, + "rewards/MultiModalAccuracyORM": 0.28333333879709244, + "step": 585, + "train_speed(iter/s)": 0.032276 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.4, + "epoch": 0.2383838383838384, + "grad_norm": 4.329439973170006, + "kl": 0.00757293701171875, + "learning_rate": 2e-07, + "loss": -0.0028860807418823243, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000819563867, + "reward_std": 0.30661733746528624, + "rewards/MultiModalAccuracyORM": 0.25000000819563867, + "step": 590, + "train_speed(iter/s)": 0.032341 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.8, + "epoch": 0.2404040404040404, + "grad_norm": 1.8156019329792383, + "kl": 0.005291748046875, + "learning_rate": 2e-07, + "loss": -0.004809608310461044, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4166666753590107, + "reward_std": 0.40967183113098143, + "rewards/MultiModalAccuracyORM": 0.4166666753590107, + "step": 595, + "train_speed(iter/s)": 0.032425 + }, + { + "clip_ratio": 0.0, + "completion_length": 518.65, + "epoch": 0.24242424242424243, + "grad_norm": 1.6812944635615767, + "kl": 0.0045440673828125, + "learning_rate": 2e-07, + "loss": 0.0016623079776763917, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667312383653, + "reward_std": 0.35134140253067014, + "rewards/MultiModalAccuracyORM": 0.21666667312383653, + "step": 600, + "train_speed(iter/s)": 0.032411 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.4, + "epoch": 0.24444444444444444, + "grad_norm": 2.089820121690527, + "kl": 0.00710906982421875, + "learning_rate": 2e-07, + "loss": -0.028999322652816774, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666738688946, + "reward_std": 0.33552044034004214, + "rewards/MultiModalAccuracyORM": 0.2666666738688946, + "step": 605, + "train_speed(iter/s)": 0.032494 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.75, + "epoch": 0.24646464646464647, + "grad_norm": 2.728310100204588, + "kl": 0.00543060302734375, + "learning_rate": 2e-07, + "loss": 0.03924176394939423, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1916666716337204, + "reward_std": 0.27078639566898344, + "rewards/MultiModalAccuracyORM": 0.1916666716337204, + "step": 610, + "train_speed(iter/s)": 0.032569 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.2, + "epoch": 0.24848484848484848, + "grad_norm": 1.3175052417192106, + "kl": 0.00468902587890625, + "learning_rate": 2e-07, + "loss": 0.038245481252670285, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000067055225, + "reward_std": 0.4048719048500061, + "rewards/MultiModalAccuracyORM": 0.2500000067055225, + "step": 615, + "train_speed(iter/s)": 0.032501 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.7, + "epoch": 0.2505050505050505, + "grad_norm": 1.9529912685373527, + "kl": 0.00550537109375, + "learning_rate": 2e-07, + "loss": 0.011770330369472504, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666753590107, + "reward_std": 0.2895964771509171, + "rewards/MultiModalAccuracyORM": 0.3916666753590107, + "step": 620, + "train_speed(iter/s)": 0.032477 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.75, + "epoch": 0.25252525252525254, + "grad_norm": 0.05113023046556139, + "kl": 0.00566864013671875, + "learning_rate": 2e-07, + "loss": 0.01361861228942871, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2333333358168602, + "reward_std": 0.275274920463562, + "rewards/MultiModalAccuracyORM": 0.2333333358168602, + "step": 625, + "train_speed(iter/s)": 0.032463 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.15, + "epoch": 0.2545454545454545, + "grad_norm": 2.556743977258531, + "kl": 0.005108642578125, + "learning_rate": 2e-07, + "loss": 0.014950770139694213, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.34713688492774963, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 630, + "train_speed(iter/s)": 0.032484 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.15, + "epoch": 0.25656565656565655, + "grad_norm": 2.2423462644187624, + "kl": 0.004283905029296875, + "learning_rate": 2e-07, + "loss": 0.008650130033493042, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1833333395421505, + "reward_std": 0.28752902448177337, + "rewards/MultiModalAccuracyORM": 0.1833333395421505, + "step": 635, + "train_speed(iter/s)": 0.032416 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.15, + "epoch": 0.2585858585858586, + "grad_norm": 2.7318256637713327, + "kl": 0.0051483154296875, + "learning_rate": 2e-07, + "loss": 0.021026265621185303, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000111758709, + "reward_std": 0.29385479390621183, + "rewards/MultiModalAccuracyORM": 0.2500000111758709, + "step": 640, + "train_speed(iter/s)": 0.032398 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.9, + "epoch": 0.2606060606060606, + "grad_norm": 0.04170508484645814, + "kl": 0.00531463623046875, + "learning_rate": 2e-07, + "loss": -0.04355872869491577, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000819563867, + "reward_std": 0.3089067697525024, + "rewards/MultiModalAccuracyORM": 0.25000000819563867, + "step": 645, + "train_speed(iter/s)": 0.032375 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.55, + "epoch": 0.26262626262626265, + "grad_norm": 1.2451580073322923, + "kl": 0.003839111328125, + "learning_rate": 2e-07, + "loss": 0.00021180734038352966, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666667461395266, + "reward_std": 0.2676923930644989, + "rewards/MultiModalAccuracyORM": 0.26666667461395266, + "step": 650, + "train_speed(iter/s)": 0.032327 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.75, + "epoch": 0.26464646464646463, + "grad_norm": 1.9808716749773743, + "kl": 0.00391082763671875, + "learning_rate": 2e-07, + "loss": 0.026480630040168762, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000670552253, + "reward_std": 0.2817953139543533, + "rewards/MultiModalAccuracyORM": 0.22500000670552253, + "step": 655, + "train_speed(iter/s)": 0.032322 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.85, + "epoch": 0.26666666666666666, + "grad_norm": 1.1399233339835215, + "kl": 0.004100799560546875, + "learning_rate": 2e-07, + "loss": -0.02441052794456482, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1833333380520344, + "reward_std": 0.25897533297538755, + "rewards/MultiModalAccuracyORM": 0.1833333380520344, + "step": 660, + "train_speed(iter/s)": 0.032385 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.3, + "epoch": 0.2686868686868687, + "grad_norm": 2.4222117834215964, + "kl": 0.0057952880859375, + "learning_rate": 2e-07, + "loss": 0.01856023073196411, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2166666716337204, + "reward_std": 0.3348231792449951, + "rewards/MultiModalAccuracyORM": 0.2166666716337204, + "step": 665, + "train_speed(iter/s)": 0.032448 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.55, + "epoch": 0.27070707070707073, + "grad_norm": 2.596880019981878, + "kl": 0.0034820556640625, + "learning_rate": 2e-07, + "loss": -0.004870015382766724, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000001192092893, + "reward_std": 0.3786772578954697, + "rewards/MultiModalAccuracyORM": 0.40000001192092893, + "step": 670, + "train_speed(iter/s)": 0.032507 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.2, + "epoch": 0.2727272727272727, + "grad_norm": 1.261892143617939, + "kl": 0.003546142578125, + "learning_rate": 2e-07, + "loss": 0.018378911912441252, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667088866234, + "reward_std": 0.25365822613239286, + "rewards/MultiModalAccuracyORM": 0.24166667088866234, + "step": 675, + "train_speed(iter/s)": 0.032509 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.15, + "epoch": 0.27474747474747474, + "grad_norm": 1.5125590979703638, + "kl": 0.00487823486328125, + "learning_rate": 2e-07, + "loss": -0.004463189840316772, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000596046446, + "reward_std": 0.2488823115825653, + "rewards/MultiModalAccuracyORM": 0.25000000596046446, + "step": 680, + "train_speed(iter/s)": 0.03256 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.15, + "epoch": 0.2767676767676768, + "grad_norm": 0.0206379809755319, + "kl": 0.00426177978515625, + "learning_rate": 2e-07, + "loss": 0.021875476837158202, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.11666666865348815, + "reward_std": 0.26496326327323916, + "rewards/MultiModalAccuracyORM": 0.11666666865348815, + "step": 685, + "train_speed(iter/s)": 0.032514 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.5, + "epoch": 0.2787878787878788, + "grad_norm": 2.5475669372401737, + "kl": 0.00420379638671875, + "learning_rate": 2e-07, + "loss": 0.004043090343475342, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15000000447034836, + "reward_std": 0.30210480093955994, + "rewards/MultiModalAccuracyORM": 0.15000000447034836, + "step": 690, + "train_speed(iter/s)": 0.032521 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.7, + "epoch": 0.2808080808080808, + "grad_norm": 1.5500150159182102, + "kl": 0.00518798828125, + "learning_rate": 2e-07, + "loss": -0.023865307867527007, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1833333343267441, + "reward_std": 0.1683032989501953, + "rewards/MultiModalAccuracyORM": 0.1833333343267441, + "step": 695, + "train_speed(iter/s)": 0.032416 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.35, + "epoch": 0.2828282828282828, + "grad_norm": 1.9962407432487237, + "kl": 0.005457305908203125, + "learning_rate": 2e-07, + "loss": -0.028327393531799316, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.2250000037252903, + "reward_std": 0.2099333554506302, + "rewards/MultiModalAccuracyORM": 0.2250000037252903, + "step": 700, + "train_speed(iter/s)": 0.032361 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.4, + "epoch": 0.28484848484848485, + "grad_norm": 1.6074003724487615, + "kl": 0.00528717041015625, + "learning_rate": 2e-07, + "loss": 0.014926820993423462, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333410322666, + "reward_std": 0.27223809361457824, + "rewards/MultiModalAccuracyORM": 0.3083333410322666, + "step": 705, + "train_speed(iter/s)": 0.032343 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5, + "epoch": 0.2868686868686869, + "grad_norm": 1.6995014935336248, + "kl": 0.0051483154296875, + "learning_rate": 2e-07, + "loss": -0.019916635751724244, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.24885829985141755, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 710, + "train_speed(iter/s)": 0.032338 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.3, + "epoch": 0.28888888888888886, + "grad_norm": 2.5308810289000134, + "kl": 0.00496978759765625, + "learning_rate": 2e-07, + "loss": 0.01712719202041626, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000000447034834, + "reward_std": 0.3906455457210541, + "rewards/MultiModalAccuracyORM": 0.40000000447034834, + "step": 715, + "train_speed(iter/s)": 0.03227 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.2, + "epoch": 0.2909090909090909, + "grad_norm": 3.1179537828506865, + "kl": 0.00511016845703125, + "learning_rate": 2e-07, + "loss": -0.0032517150044441222, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500001043081283, + "reward_std": 0.3038526177406311, + "rewards/MultiModalAccuracyORM": 0.22500001043081283, + "step": 720, + "train_speed(iter/s)": 0.032189 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.9, + "epoch": 0.29292929292929293, + "grad_norm": 1.3264200657485663, + "kl": 0.0060546875, + "learning_rate": 2e-07, + "loss": 0.005654716491699218, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2250000059604645, + "reward_std": 0.2988493382930756, + "rewards/MultiModalAccuracyORM": 0.2250000059604645, + "step": 725, + "train_speed(iter/s)": 0.032186 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.65, + "epoch": 0.29494949494949496, + "grad_norm": 0.5240042260688945, + "kl": 0.005621719360351563, + "learning_rate": 2e-07, + "loss": 0.010572614520788193, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1750000037252903, + "reward_std": 0.2945852130651474, + "rewards/MultiModalAccuracyORM": 0.1750000037252903, + "step": 730, + "train_speed(iter/s)": 0.032129 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.2, + "epoch": 0.296969696969697, + "grad_norm": 2.049661779713074, + "kl": 0.00519866943359375, + "learning_rate": 2e-07, + "loss": 0.022058649361133574, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333410322666, + "reward_std": 0.38726511001586916, + "rewards/MultiModalAccuracyORM": 0.2833333410322666, + "step": 735, + "train_speed(iter/s)": 0.032089 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.25, + "epoch": 0.298989898989899, + "grad_norm": 0.962602559613357, + "kl": 0.0046844482421875, + "learning_rate": 2e-07, + "loss": -0.0028517723083496095, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.23328913748264313, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 740, + "train_speed(iter/s)": 0.032072 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.5, + "epoch": 0.301010101010101, + "grad_norm": 2.0529334461639337, + "kl": 0.00500030517578125, + "learning_rate": 2e-07, + "loss": 0.0314439594745636, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334177732468, + "reward_std": 0.3212204694747925, + "rewards/MultiModalAccuracyORM": 0.23333334177732468, + "step": 745, + "train_speed(iter/s)": 0.032037 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.3580773911974338, + "learning_rate": 2e-07, + "loss": -0.007335931062698364, + "memory(GiB)": 87.45, + "step": 750, + "train_speed(iter/s)": 0.032014 + }, + { + "epoch": 0.30303030303030304, + "eval_clip_ratio": 0.0, + "eval_completion_length": 352.49667709350587, + "eval_kl": 0.00640625, + "eval_loss": 0.002320815809071064, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.2716666729748249, + "eval_reward_std": 0.33371097803115846, + "eval_rewards/MultiModalAccuracyORM": 0.2716666729748249, + "eval_runtime": 876.1057, + "eval_samples_per_second": 0.057, + "eval_steps_per_second": 0.006, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.55, + "epoch": 0.30505050505050507, + "grad_norm": 2.1426610619194815, + "kl": 0.00631256103515625, + "learning_rate": 2e-07, + "loss": -0.040098315477371214, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333656191826, + "reward_std": 0.22312387079000473, + "rewards/MultiModalAccuracyORM": 0.13333333656191826, + "step": 755, + "train_speed(iter/s)": 0.029206 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.4, + "epoch": 0.30707070707070705, + "grad_norm": 0.8717248302301553, + "kl": 0.00636749267578125, + "learning_rate": 2e-07, + "loss": 0.015009742975234986, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333358168602, + "reward_std": 0.2940850019454956, + "rewards/MultiModalAccuracyORM": 0.2833333358168602, + "step": 760, + "train_speed(iter/s)": 0.029162 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.8, + "epoch": 0.3090909090909091, + "grad_norm": 2.4403464428155925, + "kl": 0.0062957763671875, + "learning_rate": 2e-07, + "loss": 0.019652032852172853, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.45000001043081284, + "reward_std": 0.3222792655229568, + "rewards/MultiModalAccuracyORM": 0.45000001043081284, + "step": 765, + "train_speed(iter/s)": 0.029211 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.35, + "epoch": 0.3111111111111111, + "grad_norm": 1.6980769345505524, + "kl": 0.0074066162109375, + "learning_rate": 2e-07, + "loss": 0.018609333038330077, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667833924295, + "reward_std": 0.4026396483182907, + "rewards/MultiModalAccuracyORM": 0.31666667833924295, + "step": 770, + "train_speed(iter/s)": 0.029164 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.35, + "epoch": 0.31313131313131315, + "grad_norm": 1.4345330108808567, + "kl": 0.00540924072265625, + "learning_rate": 2e-07, + "loss": 0.034766983985900876, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.26666667088866236, + "reward_std": 0.3167103588581085, + "rewards/MultiModalAccuracyORM": 0.26666667088866236, + "step": 775, + "train_speed(iter/s)": 0.029127 + }, + { + "clip_ratio": 0.0, + "completion_length": 441.5, + "epoch": 0.3151515151515151, + "grad_norm": 1.0920815430357467, + "kl": 0.0054931640625, + "learning_rate": 2e-07, + "loss": -7.512569427490235e-05, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.10833333656191826, + "reward_std": 0.22400068640708923, + "rewards/MultiModalAccuracyORM": 0.10833333656191826, + "step": 780, + "train_speed(iter/s)": 0.029106 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.7, + "epoch": 0.31717171717171716, + "grad_norm": 1.3732918705207908, + "kl": 0.00477752685546875, + "learning_rate": 2e-07, + "loss": 0.015651023387908934, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2416666679084301, + "reward_std": 0.23479096889495848, + "rewards/MultiModalAccuracyORM": 0.2416666679084301, + "step": 785, + "train_speed(iter/s)": 0.029052 + }, + { + "clip_ratio": 0.0, + "completion_length": 453.35, + "epoch": 0.3191919191919192, + "grad_norm": 2.1057593122144005, + "kl": 0.00804595947265625, + "learning_rate": 2e-07, + "loss": -0.0006304442882537842, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.3619014710187912, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 790, + "train_speed(iter/s)": 0.029057 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.2, + "epoch": 0.3212121212121212, + "grad_norm": 2.3354800713445654, + "kl": 0.0078216552734375, + "learning_rate": 2e-07, + "loss": 0.0310418963432312, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667759418486, + "reward_std": 0.40556674003601073, + "rewards/MultiModalAccuracyORM": 0.36666667759418486, + "step": 795, + "train_speed(iter/s)": 0.029005 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.45, + "epoch": 0.32323232323232326, + "grad_norm": 2.4825567652901444, + "kl": 0.0077880859375, + "learning_rate": 2e-07, + "loss": 0.021943604946136473, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3666666761040688, + "reward_std": 0.3470627337694168, + "rewards/MultiModalAccuracyORM": 0.3666666761040688, + "step": 800, + "train_speed(iter/s)": 0.028978 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.9, + "epoch": 0.32525252525252524, + "grad_norm": 3.589672291824819, + "kl": 0.00778350830078125, + "learning_rate": 2e-07, + "loss": 0.008873769640922546, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.44790194034576414, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 805, + "train_speed(iter/s)": 0.028982 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.2, + "epoch": 0.32727272727272727, + "grad_norm": 2.1262920297539925, + "kl": 0.0073883056640625, + "learning_rate": 2e-07, + "loss": -0.04254024624824524, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333447575569, + "reward_std": 0.3589002341032028, + "rewards/MultiModalAccuracyORM": 0.2833333447575569, + "step": 810, + "train_speed(iter/s)": 0.029003 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.9, + "epoch": 0.3292929292929293, + "grad_norm": 2.6338345195445965, + "kl": 0.0073974609375, + "learning_rate": 2e-07, + "loss": 0.008789122104644775, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35833333879709245, + "reward_std": 0.3563657283782959, + "rewards/MultiModalAccuracyORM": 0.35833333879709245, + "step": 815, + "train_speed(iter/s)": 0.028993 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.85, + "epoch": 0.33131313131313134, + "grad_norm": 2.540831778543349, + "kl": 0.0085357666015625, + "learning_rate": 2e-07, + "loss": 0.0017573148012161254, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2583333395421505, + "reward_std": 0.33755565285682676, + "rewards/MultiModalAccuracyORM": 0.2583333395421505, + "step": 820, + "train_speed(iter/s)": 0.02904 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.3, + "epoch": 0.3333333333333333, + "grad_norm": 2.280326105508933, + "kl": 0.011834716796875, + "learning_rate": 2e-07, + "loss": -0.016002975404262543, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000596046447, + "reward_std": 0.24662604331970214, + "rewards/MultiModalAccuracyORM": 0.17500000596046447, + "step": 825, + "train_speed(iter/s)": 0.02907 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.05, + "epoch": 0.33535353535353535, + "grad_norm": 1.64256260222623, + "kl": 0.00889892578125, + "learning_rate": 2e-07, + "loss": -0.008859094977378846, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000149011612, + "reward_std": 0.3164917230606079, + "rewards/MultiModalAccuracyORM": 0.20000000149011612, + "step": 830, + "train_speed(iter/s)": 0.029109 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.15, + "epoch": 0.3373737373737374, + "grad_norm": 0.09646041600368084, + "kl": 0.0067840576171875, + "learning_rate": 2e-07, + "loss": 0.02341327965259552, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2750000096857548, + "reward_std": 0.3184880018234253, + "rewards/MultiModalAccuracyORM": 0.2750000096857548, + "step": 835, + "train_speed(iter/s)": 0.028908 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.35, + "epoch": 0.3393939393939394, + "grad_norm": 0.886588445568382, + "kl": 0.0066741943359375, + "learning_rate": 2e-07, + "loss": 0.011455638706684113, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667312383653, + "reward_std": 0.3142238825559616, + "rewards/MultiModalAccuracyORM": 0.34166667312383653, + "step": 840, + "train_speed(iter/s)": 0.02878 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.75, + "epoch": 0.3414141414141414, + "grad_norm": 0.0732846157739433, + "kl": 0.00753173828125, + "learning_rate": 2e-07, + "loss": 0.010994693636894226, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833333507180213, + "reward_std": 0.19786564111709595, + "rewards/MultiModalAccuracyORM": 0.20833333507180213, + "step": 845, + "train_speed(iter/s)": 0.028759 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.0, + "epoch": 0.3434343434343434, + "grad_norm": 2.016101823545884, + "kl": 0.00940399169921875, + "learning_rate": 2e-07, + "loss": 0.0015551522374153137, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667386889457, + "reward_std": 0.37221312820911406, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 850, + "train_speed(iter/s)": 0.028759 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.75, + "epoch": 0.34545454545454546, + "grad_norm": 1.4804689213107514, + "kl": 0.0074249267578125, + "learning_rate": 2e-07, + "loss": 0.008444362878799438, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.37500000521540644, + "reward_std": 0.33937130570411683, + "rewards/MultiModalAccuracyORM": 0.37500000521540644, + "step": 855, + "train_speed(iter/s)": 0.02876 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.1, + "epoch": 0.3474747474747475, + "grad_norm": 2.368905519842238, + "kl": 0.008038330078125, + "learning_rate": 2e-07, + "loss": 0.026756054162979125, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000111758709, + "reward_std": 0.44455128610134126, + "rewards/MultiModalAccuracyORM": 0.3500000111758709, + "step": 860, + "train_speed(iter/s)": 0.028787 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.85, + "epoch": 0.34949494949494947, + "grad_norm": 2.3043935598394203, + "kl": 0.0070343017578125, + "learning_rate": 2e-07, + "loss": 0.059600555896759035, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333432674408, + "reward_std": 0.3885723173618317, + "rewards/MultiModalAccuracyORM": 0.2833333432674408, + "step": 865, + "train_speed(iter/s)": 0.028814 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.95, + "epoch": 0.3515151515151515, + "grad_norm": 1.9471040249213727, + "kl": 0.0069305419921875, + "learning_rate": 2e-07, + "loss": 0.028457581996917725, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000596046447, + "reward_std": 0.2940494120121002, + "rewards/MultiModalAccuracyORM": 0.17500000596046447, + "step": 870, + "train_speed(iter/s)": 0.028708 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.5, + "epoch": 0.35353535353535354, + "grad_norm": 2.196604109706096, + "kl": 0.0058319091796875, + "learning_rate": 2e-07, + "loss": 0.04532061517238617, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666775941849, + "reward_std": 0.42524099349975586, + "rewards/MultiModalAccuracyORM": 0.3916666775941849, + "step": 875, + "train_speed(iter/s)": 0.028627 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.05, + "epoch": 0.35555555555555557, + "grad_norm": 1.9101064459839039, + "kl": 0.0102691650390625, + "learning_rate": 2e-07, + "loss": 0.04224415421485901, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30000000521540643, + "reward_std": 0.3391170799732208, + "rewards/MultiModalAccuracyORM": 0.30000000521540643, + "step": 880, + "train_speed(iter/s)": 0.028551 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.75, + "epoch": 0.3575757575757576, + "grad_norm": 1.7650856984522036, + "kl": 0.0097930908203125, + "learning_rate": 2e-07, + "loss": 0.031351178884506226, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.27555315792560575, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 885, + "train_speed(iter/s)": 0.028585 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.05, + "epoch": 0.3595959595959596, + "grad_norm": 2.4394117877960615, + "kl": 0.0123748779296875, + "learning_rate": 2e-07, + "loss": 0.01872892677783966, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000521540642, + "reward_std": 0.30489687621593475, + "rewards/MultiModalAccuracyORM": 0.22500000521540642, + "step": 890, + "train_speed(iter/s)": 0.028637 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.05, + "epoch": 0.3616161616161616, + "grad_norm": 2.3682785721081854, + "kl": 0.00737762451171875, + "learning_rate": 2e-07, + "loss": 0.02124558687210083, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000089406967, + "reward_std": 0.41817026138305663, + "rewards/MultiModalAccuracyORM": 0.3500000089406967, + "step": 895, + "train_speed(iter/s)": 0.028688 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.85, + "epoch": 0.36363636363636365, + "grad_norm": 1.3234500775547358, + "kl": 0.007550048828125, + "learning_rate": 2e-07, + "loss": 0.025475236773490905, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.2260383188724518, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 900, + "train_speed(iter/s)": 0.028712 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.75, + "epoch": 0.3656565656565657, + "grad_norm": 3.0802331121314785, + "kl": 0.0105621337890625, + "learning_rate": 2e-07, + "loss": 0.06260026693344116, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166666865348815, + "reward_std": 0.3003867596387863, + "rewards/MultiModalAccuracyORM": 0.24166666865348815, + "step": 905, + "train_speed(iter/s)": 0.028645 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.9, + "epoch": 0.36767676767676766, + "grad_norm": 3.596137864021678, + "kl": 0.01011199951171875, + "learning_rate": 2e-07, + "loss": 0.007353886961936951, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4833333432674408, + "reward_std": 0.38523324131965636, + "rewards/MultiModalAccuracyORM": 0.4833333432674408, + "step": 910, + "train_speed(iter/s)": 0.028662 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.65, + "epoch": 0.3696969696969697, + "grad_norm": 1.4417889638729746, + "kl": 0.01177978515625, + "learning_rate": 2e-07, + "loss": -0.006625932455062866, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3416666783392429, + "reward_std": 0.37195890843868257, + "rewards/MultiModalAccuracyORM": 0.3416666783392429, + "step": 915, + "train_speed(iter/s)": 0.028677 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.05, + "epoch": 0.3717171717171717, + "grad_norm": 2.8875811253312333, + "kl": 0.01148529052734375, + "learning_rate": 2e-07, + "loss": -3.943443298339844e-05, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667237877847, + "reward_std": 0.3604352355003357, + "rewards/MultiModalAccuracyORM": 0.41666667237877847, + "step": 920, + "train_speed(iter/s)": 0.028643 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.3, + "epoch": 0.37373737373737376, + "grad_norm": 1.8636332228250176, + "kl": 0.0091461181640625, + "learning_rate": 2e-07, + "loss": 0.004881632328033447, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333402872086, + "reward_std": 0.3440760403871536, + "rewards/MultiModalAccuracyORM": 0.2833333402872086, + "step": 925, + "train_speed(iter/s)": 0.028644 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.4, + "epoch": 0.37575757575757573, + "grad_norm": 2.1407505535783242, + "kl": 0.00869598388671875, + "learning_rate": 2e-07, + "loss": 0.05731675624847412, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833334103226663, + "reward_std": 0.41186849772930145, + "rewards/MultiModalAccuracyORM": 0.25833334103226663, + "step": 930, + "train_speed(iter/s)": 0.028644 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.9, + "epoch": 0.37777777777777777, + "grad_norm": 3.79021329286614, + "kl": 0.009942626953125, + "learning_rate": 2e-07, + "loss": 0.0477484941482544, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666716337204, + "reward_std": 0.2674737572669983, + "rewards/MultiModalAccuracyORM": 0.3166666716337204, + "step": 935, + "train_speed(iter/s)": 0.028648 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.45, + "epoch": 0.3797979797979798, + "grad_norm": 2.2451102482111724, + "kl": 0.012542724609375, + "learning_rate": 2e-07, + "loss": -1.335442066192627e-05, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000052154064, + "reward_std": 0.25591449439525604, + "rewards/MultiModalAccuracyORM": 0.3500000052154064, + "step": 940, + "train_speed(iter/s)": 0.028624 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.15, + "epoch": 0.38181818181818183, + "grad_norm": 1.4018775780145751, + "kl": 0.0094390869140625, + "learning_rate": 2e-07, + "loss": -0.003527042269706726, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667088866234, + "reward_std": 0.31088480055332185, + "rewards/MultiModalAccuracyORM": 0.24166667088866234, + "step": 945, + "train_speed(iter/s)": 0.028624 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.7, + "epoch": 0.3838383838383838, + "grad_norm": 3.8112599620979117, + "kl": 0.01011962890625, + "learning_rate": 2e-07, + "loss": 0.01941452920436859, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667610406876, + "reward_std": 0.34228681921958926, + "rewards/MultiModalAccuracyORM": 0.34166667610406876, + "step": 950, + "train_speed(iter/s)": 0.028616 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.35, + "epoch": 0.38585858585858585, + "grad_norm": 1.8716114512263384, + "kl": 0.0135040283203125, + "learning_rate": 2e-07, + "loss": 0.01583598256111145, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666701436043, + "reward_std": 0.26291108727455137, + "rewards/MultiModalAccuracyORM": 0.3916666701436043, + "step": 955, + "train_speed(iter/s)": 0.028656 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.0, + "epoch": 0.3878787878787879, + "grad_norm": 2.6882447296010508, + "kl": 0.0098663330078125, + "learning_rate": 2e-07, + "loss": 0.008884111046791076, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333410322666, + "reward_std": 0.3393357157707214, + "rewards/MultiModalAccuracyORM": 0.2833333410322666, + "step": 960, + "train_speed(iter/s)": 0.02864 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.3, + "epoch": 0.3898989898989899, + "grad_norm": 2.477942143166408, + "kl": 0.013421630859375, + "learning_rate": 2e-07, + "loss": 0.013846510648727417, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667088866236, + "reward_std": 0.3041278898715973, + "rewards/MultiModalAccuracyORM": 0.39166667088866236, + "step": 965, + "train_speed(iter/s)": 0.028659 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.35, + "epoch": 0.39191919191919194, + "grad_norm": 1.7487986972843892, + "kl": 0.008868408203125, + "learning_rate": 2e-07, + "loss": 0.041995507478713986, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000037252903, + "reward_std": 0.3277524411678314, + "rewards/MultiModalAccuracyORM": 0.3250000037252903, + "step": 970, + "train_speed(iter/s)": 0.028675 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.9, + "epoch": 0.3939393939393939, + "grad_norm": 1.040945452450775, + "kl": 0.00943603515625, + "learning_rate": 2e-07, + "loss": 0.004313239455223083, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333656191825, + "reward_std": 0.2722140818834305, + "rewards/MultiModalAccuracyORM": 0.18333333656191825, + "step": 975, + "train_speed(iter/s)": 0.028693 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.5, + "epoch": 0.39595959595959596, + "grad_norm": 1.987178230745996, + "kl": 0.0092681884765625, + "learning_rate": 2e-07, + "loss": 0.01756379157304764, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1833333358168602, + "reward_std": 0.3274982154369354, + "rewards/MultiModalAccuracyORM": 0.1833333358168602, + "step": 980, + "train_speed(iter/s)": 0.028714 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.4, + "epoch": 0.397979797979798, + "grad_norm": 1.9999919818314047, + "kl": 0.012908935546875, + "learning_rate": 2e-07, + "loss": 0.04084535539150238, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.32500001341104506, + "reward_std": 0.2752989321947098, + "rewards/MultiModalAccuracyORM": 0.32500001341104506, + "step": 985, + "train_speed(iter/s)": 0.028744 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.15, + "epoch": 0.4, + "grad_norm": 0.038170370656060805, + "kl": 0.010888671875, + "learning_rate": 2e-07, + "loss": 0.07128549218177796, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000819563867, + "reward_std": 0.30416645109653473, + "rewards/MultiModalAccuracyORM": 0.25000000819563867, + "step": 990, + "train_speed(iter/s)": 0.028708 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.45, + "epoch": 0.402020202020202, + "grad_norm": 2.632502419980814, + "kl": 0.0100616455078125, + "learning_rate": 2e-07, + "loss": 0.016613197326660157, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667610406875, + "reward_std": 0.37174026668071747, + "rewards/MultiModalAccuracyORM": 0.39166667610406875, + "step": 995, + "train_speed(iter/s)": 0.028687 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.07099216395354724, + "learning_rate": 2e-07, + "loss": 0.02232474982738495, + "memory(GiB)": 87.45, + "step": 1000, + "train_speed(iter/s)": 0.028672 + }, + { + "epoch": 0.40404040404040403, + "eval_clip_ratio": 0.0, + "eval_completion_length": 346.9533413696289, + "eval_kl": 0.013145751953125, + "eval_loss": -0.00028896695584990084, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.281666671782732, + "eval_reward_std": 0.3010890519618988, + "eval_rewards/MultiModalAccuracyORM": 0.281666671782732, + "eval_runtime": 1406.863, + "eval_samples_per_second": 0.036, + "eval_steps_per_second": 0.004, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.175, + "epoch": 0.40606060606060607, + "grad_norm": 1.905945440484278, + "kl": 0.009429931640625, + "learning_rate": 2e-07, + "loss": -0.0033631980419158935, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2708333387970924, + "reward_std": 0.24963780641555786, + "rewards/MultiModalAccuracyORM": 0.2708333387970924, + "step": 1005, + "train_speed(iter/s)": 0.027262 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.5, + "epoch": 0.4080808080808081, + "grad_norm": 1.6755020591769207, + "kl": 0.0134246826171875, + "learning_rate": 2e-07, + "loss": 0.05349223613739014, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000037252903, + "reward_std": 0.3494287371635437, + "rewards/MultiModalAccuracyORM": 0.2500000037252903, + "step": 1010, + "train_speed(iter/s)": 0.027279 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.25, + "epoch": 0.4101010101010101, + "grad_norm": 2.8913380726136872, + "kl": 0.0107147216796875, + "learning_rate": 2e-07, + "loss": -0.02667723298072815, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.38333334103226663, + "reward_std": 0.4211569488048553, + "rewards/MultiModalAccuracyORM": 0.38333334103226663, + "step": 1015, + "train_speed(iter/s)": 0.027304 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.65, + "epoch": 0.4121212121212121, + "grad_norm": 4.180952848080379, + "kl": 0.0100433349609375, + "learning_rate": 2e-07, + "loss": 0.00991852581501007, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333447575569, + "reward_std": 0.3088736057281494, + "rewards/MultiModalAccuracyORM": 0.2083333447575569, + "step": 1020, + "train_speed(iter/s)": 0.027315 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.55, + "epoch": 0.41414141414141414, + "grad_norm": 1.9667254904423306, + "kl": 0.0121246337890625, + "learning_rate": 2e-07, + "loss": 0.01899299621582031, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333507180214, + "reward_std": 0.3071291267871857, + "rewards/MultiModalAccuracyORM": 0.15833333507180214, + "step": 1025, + "train_speed(iter/s)": 0.027329 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.4, + "epoch": 0.4161616161616162, + "grad_norm": 1.7062594547415575, + "kl": 0.0100616455078125, + "learning_rate": 2e-07, + "loss": 0.004674983024597168, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500001043081283, + "reward_std": 0.3679134130477905, + "rewards/MultiModalAccuracyORM": 0.22500001043081283, + "step": 1030, + "train_speed(iter/s)": 0.027298 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.0, + "epoch": 0.41818181818181815, + "grad_norm": 72.23734764401382, + "kl": 0.011712646484375, + "learning_rate": 2e-07, + "loss": 0.05118045210838318, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666775941849, + "reward_std": 0.34735551476478577, + "rewards/MultiModalAccuracyORM": 0.2666666775941849, + "step": 1035, + "train_speed(iter/s)": 0.027303 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.15, + "epoch": 0.4202020202020202, + "grad_norm": 1.6715902563969363, + "kl": 0.0135772705078125, + "learning_rate": 2e-07, + "loss": 0.045872822403907776, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2416666716337204, + "reward_std": 0.287842845916748, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 1040, + "train_speed(iter/s)": 0.027298 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.15, + "epoch": 0.4222222222222222, + "grad_norm": 2.734745023688755, + "kl": 0.012158203125, + "learning_rate": 2e-07, + "loss": 0.05562522411346436, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.4314686059951782, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 1045, + "train_speed(iter/s)": 0.027328 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.3, + "epoch": 0.42424242424242425, + "grad_norm": 0.07598134741536419, + "kl": 0.009765625, + "learning_rate": 2e-07, + "loss": 0.008748695254325867, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.18326250910758973, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 1050, + "train_speed(iter/s)": 0.027308 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.2, + "epoch": 0.4262626262626263, + "grad_norm": 9.627726509942965, + "kl": 0.0136199951171875, + "learning_rate": 2e-07, + "loss": 0.03634963035583496, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666768491268, + "reward_std": 0.36670139729976653, + "rewards/MultiModalAccuracyORM": 0.3166666768491268, + "step": 1055, + "train_speed(iter/s)": 0.027311 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7, + "epoch": 0.42828282828282827, + "grad_norm": 1.2371668114044378, + "kl": 0.0134979248046875, + "learning_rate": 2e-07, + "loss": 0.04366698265075684, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333373069763, + "reward_std": 0.3498693466186523, + "rewards/MultiModalAccuracyORM": 0.2083333373069763, + "step": 1060, + "train_speed(iter/s)": 0.027334 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.05, + "epoch": 0.4303030303030303, + "grad_norm": 2.52858518092475, + "kl": 0.0135711669921875, + "learning_rate": 2e-07, + "loss": 0.065219247341156, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000111758709, + "reward_std": 0.37853889763355253, + "rewards/MultiModalAccuracyORM": 0.3000000111758709, + "step": 1065, + "train_speed(iter/s)": 0.027352 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5, + "epoch": 0.43232323232323233, + "grad_norm": 2.3424705728855995, + "kl": 0.0116546630859375, + "learning_rate": 2e-07, + "loss": 0.03819225430488586, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.3227578908205032, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 1070, + "train_speed(iter/s)": 0.027305 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.55, + "epoch": 0.43434343434343436, + "grad_norm": 2.798437729299758, + "kl": 0.014569091796875, + "learning_rate": 2e-07, + "loss": 0.004848736524581909, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000819563863, + "reward_std": 0.31416428089141846, + "rewards/MultiModalAccuracyORM": 0.27500000819563863, + "step": 1075, + "train_speed(iter/s)": 0.027334 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.8, + "epoch": 0.43636363636363634, + "grad_norm": 1.7741031757506147, + "kl": 0.0157135009765625, + "learning_rate": 2e-07, + "loss": 0.00888105109333992, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000447034834, + "reward_std": 0.312698033452034, + "rewards/MultiModalAccuracyORM": 0.27500000447034834, + "step": 1080, + "train_speed(iter/s)": 0.027339 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.6, + "epoch": 0.4383838383838384, + "grad_norm": 2.06880703867489, + "kl": 0.0158050537109375, + "learning_rate": 2e-07, + "loss": -0.05194641947746277, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666716337204, + "reward_std": 0.22603832483291625, + "rewards/MultiModalAccuracyORM": 0.2666666716337204, + "step": 1085, + "train_speed(iter/s)": 0.027329 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.4, + "epoch": 0.4404040404040404, + "grad_norm": 2.4630209071132656, + "kl": 0.015411376953125, + "learning_rate": 2e-07, + "loss": -0.018011474609375, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20833333507180213, + "reward_std": 0.3071291267871857, + "rewards/MultiModalAccuracyORM": 0.20833333507180213, + "step": 1090, + "train_speed(iter/s)": 0.027372 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.2, + "epoch": 0.44242424242424244, + "grad_norm": 2.265643619288025, + "kl": 0.01461181640625, + "learning_rate": 2e-07, + "loss": 0.04221695959568024, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666701436043, + "reward_std": 0.3329358011484146, + "rewards/MultiModalAccuracyORM": 0.2666666701436043, + "step": 1095, + "train_speed(iter/s)": 0.027407 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.95, + "epoch": 0.4444444444444444, + "grad_norm": 2.894324596003934, + "kl": 0.009808349609375, + "learning_rate": 2e-07, + "loss": 0.02248055934906006, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4166666828095913, + "reward_std": 0.44607712924480436, + "rewards/MultiModalAccuracyORM": 0.4166666828095913, + "step": 1100, + "train_speed(iter/s)": 0.027442 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.95, + "epoch": 0.44646464646464645, + "grad_norm": 0.9507289625656876, + "kl": 0.0140777587890625, + "learning_rate": 2e-07, + "loss": -0.0001364484429359436, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.46666667237877846, + "reward_std": 0.24261614382267, + "rewards/MultiModalAccuracyORM": 0.46666667237877846, + "step": 1105, + "train_speed(iter/s)": 0.027471 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.75, + "epoch": 0.4484848484848485, + "grad_norm": 4.493560880958603, + "kl": 0.01422119140625, + "learning_rate": 2e-07, + "loss": 0.00024300813674926758, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4500000149011612, + "reward_std": 0.345323646068573, + "rewards/MultiModalAccuracyORM": 0.4500000149011612, + "step": 1110, + "train_speed(iter/s)": 0.027312 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.15, + "epoch": 0.4505050505050505, + "grad_norm": 1.866809698039603, + "kl": 0.0131317138671875, + "learning_rate": 2e-07, + "loss": -0.007444334030151367, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000298023224, + "reward_std": 0.3281930506229401, + "rewards/MultiModalAccuracyORM": 0.20000000298023224, + "step": 1115, + "train_speed(iter/s)": 0.027296 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.45, + "epoch": 0.45252525252525255, + "grad_norm": 0.04083454065583723, + "kl": 0.0086578369140625, + "learning_rate": 2e-07, + "loss": 0.009036242961883545, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000447034835, + "reward_std": 0.24710224866867064, + "rewards/MultiModalAccuracyORM": 0.22500000447034835, + "step": 1120, + "train_speed(iter/s)": 0.027256 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.75, + "epoch": 0.45454545454545453, + "grad_norm": 2.1257862237671588, + "kl": 0.01603851318359375, + "learning_rate": 2e-07, + "loss": -0.014222325384616851, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4333333432674408, + "reward_std": 0.4078585982322693, + "rewards/MultiModalAccuracyORM": 0.4333333432674408, + "step": 1125, + "train_speed(iter/s)": 0.027299 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.15, + "epoch": 0.45656565656565656, + "grad_norm": 48.10712707725128, + "kl": 0.0124542236328125, + "learning_rate": 2e-07, + "loss": 0.009453803300857544, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30833333879709246, + "reward_std": 0.32858102321624755, + "rewards/MultiModalAccuracyORM": 0.30833333879709246, + "step": 1130, + "train_speed(iter/s)": 0.027339 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.6, + "epoch": 0.4585858585858586, + "grad_norm": 0.8869001794016839, + "kl": 0.01041259765625, + "learning_rate": 2e-07, + "loss": -0.002349555492401123, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333730697633, + "reward_std": 0.29003951847553255, + "rewards/MultiModalAccuracyORM": 0.13333333730697633, + "step": 1135, + "train_speed(iter/s)": 0.027364 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0, + "epoch": 0.46060606060606063, + "grad_norm": 2.2315283680448346, + "kl": 0.0132476806640625, + "learning_rate": 2e-07, + "loss": -0.010060985386371613, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000745058059, + "reward_std": 0.3043610692024231, + "rewards/MultiModalAccuracyORM": 0.17500000745058059, + "step": 1140, + "train_speed(iter/s)": 0.027393 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.2, + "epoch": 0.4626262626262626, + "grad_norm": 0.04850876090724914, + "kl": 0.0081451416015625, + "learning_rate": 2e-07, + "loss": -0.022587394714355467, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.10833333656191826, + "reward_std": 0.20343697369098662, + "rewards/MultiModalAccuracyORM": 0.10833333656191826, + "step": 1145, + "train_speed(iter/s)": 0.027421 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.05, + "epoch": 0.46464646464646464, + "grad_norm": 2.2096178690715, + "kl": 0.0104400634765625, + "learning_rate": 2e-07, + "loss": 0.01734369993209839, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000074505806, + "reward_std": 0.33700530230998993, + "rewards/MultiModalAccuracyORM": 0.3250000074505806, + "step": 1150, + "train_speed(iter/s)": 0.027419 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.65, + "epoch": 0.4666666666666667, + "grad_norm": 1.3995623416059861, + "kl": 0.020782470703125, + "learning_rate": 2e-07, + "loss": 0.004217700660228729, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000447034834, + "reward_std": 0.20594746768474578, + "rewards/MultiModalAccuracyORM": 0.27500000447034834, + "step": 1155, + "train_speed(iter/s)": 0.027419 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.45, + "epoch": 0.4686868686868687, + "grad_norm": 7.604841869136694, + "kl": 0.017425537109375, + "learning_rate": 2e-07, + "loss": 0.04910666048526764, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000029802322, + "reward_std": 0.3408561676740646, + "rewards/MultiModalAccuracyORM": 0.3000000029802322, + "step": 1160, + "train_speed(iter/s)": 0.02739 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1, + "epoch": 0.4707070707070707, + "grad_norm": 1.7338556861412973, + "kl": 0.009881591796875, + "learning_rate": 2e-07, + "loss": -0.02307046055793762, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000149011613, + "reward_std": 0.18081162869930267, + "rewards/MultiModalAccuracyORM": 0.17500000149011613, + "step": 1165, + "train_speed(iter/s)": 0.027388 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.15, + "epoch": 0.4727272727272727, + "grad_norm": 1.2587552234540058, + "kl": 0.0092010498046875, + "learning_rate": 2e-07, + "loss": -0.05895323753356933, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666668429970743, + "reward_std": 0.40890581607818605, + "rewards/MultiModalAccuracyORM": 0.41666668429970743, + "step": 1170, + "train_speed(iter/s)": 0.027373 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.2, + "epoch": 0.47474747474747475, + "grad_norm": 0.06683334066144007, + "kl": 0.01002349853515625, + "learning_rate": 2e-07, + "loss": 0.02935360074043274, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2416666716337204, + "reward_std": 0.27523933053016664, + "rewards/MultiModalAccuracyORM": 0.2416666716337204, + "step": 1175, + "train_speed(iter/s)": 0.027312 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.15, + "epoch": 0.4767676767676768, + "grad_norm": 27.070556493942583, + "kl": 0.00930938720703125, + "learning_rate": 2e-07, + "loss": 0.0851466953754425, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000223517416, + "reward_std": 0.3342405825853348, + "rewards/MultiModalAccuracyORM": 0.25000000223517416, + "step": 1180, + "train_speed(iter/s)": 0.027331 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.55, + "epoch": 0.47878787878787876, + "grad_norm": 1.5534177345271625, + "kl": 0.0102996826171875, + "learning_rate": 2e-07, + "loss": 0.028819066286087037, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.266666679084301, + "reward_std": 0.3129431068897247, + "rewards/MultiModalAccuracyORM": 0.266666679084301, + "step": 1185, + "train_speed(iter/s)": 0.027335 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.65, + "epoch": 0.4808080808080808, + "grad_norm": 2.8838868478156816, + "kl": 0.02685546875, + "learning_rate": 2e-07, + "loss": 0.006991004943847657, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000447034836, + "reward_std": 0.2323400765657425, + "rewards/MultiModalAccuracyORM": 0.17500000447034836, + "step": 1190, + "train_speed(iter/s)": 0.027082 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.2, + "epoch": 0.48282828282828283, + "grad_norm": 2.6317167816627993, + "kl": 0.014031982421875, + "learning_rate": 2e-07, + "loss": -0.003238886594772339, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.20000000596046447, + "reward_std": 0.30388820767402647, + "rewards/MultiModalAccuracyORM": 0.20000000596046447, + "step": 1195, + "train_speed(iter/s)": 0.026955 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.35, + "epoch": 0.48484848484848486, + "grad_norm": 53.95756362621299, + "kl": 0.0124114990234375, + "learning_rate": 2e-07, + "loss": -0.00888831913471222, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000029802322, + "reward_std": 0.29782613217830656, + "rewards/MultiModalAccuracyORM": 0.3000000029802322, + "step": 1200, + "train_speed(iter/s)": 0.026995 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0, + "epoch": 0.4868686868686869, + "grad_norm": 1.8840812265683782, + "kl": 0.016448974609375, + "learning_rate": 2e-07, + "loss": 0.024408812820911407, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667610406876, + "reward_std": 0.4253006011247635, + "rewards/MultiModalAccuracyORM": 0.34166667610406876, + "step": 1205, + "train_speed(iter/s)": 0.02702 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.2, + "epoch": 0.4888888888888889, + "grad_norm": 2.267475237086073, + "kl": 0.01165771484375, + "learning_rate": 2e-07, + "loss": -0.02959960699081421, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4416666813194752, + "reward_std": 0.3111630380153656, + "rewards/MultiModalAccuracyORM": 0.4416666813194752, + "step": 1210, + "train_speed(iter/s)": 0.027058 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.55, + "epoch": 0.4909090909090909, + "grad_norm": 1.53249738300366, + "kl": 0.01207275390625, + "learning_rate": 2e-07, + "loss": 0.01664416640996933, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000089406967, + "reward_std": 0.39155901670455934, + "rewards/MultiModalAccuracyORM": 0.3500000089406967, + "step": 1215, + "train_speed(iter/s)": 0.027075 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.35, + "epoch": 0.49292929292929294, + "grad_norm": 2.838473944184638, + "kl": 0.0138153076171875, + "learning_rate": 2e-07, + "loss": 0.011857110261917114, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.16666667237877847, + "reward_std": 0.32422170639038084, + "rewards/MultiModalAccuracyORM": 0.16666667237877847, + "step": 1220, + "train_speed(iter/s)": 0.027075 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.2, + "epoch": 0.494949494949495, + "grad_norm": 2.239419757076915, + "kl": 0.0130462646484375, + "learning_rate": 2e-07, + "loss": 0.03971967101097107, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667088866234, + "reward_std": 0.23224489092826844, + "rewards/MultiModalAccuracyORM": 0.36666667088866234, + "step": 1225, + "train_speed(iter/s)": 0.027083 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.8, + "epoch": 0.49696969696969695, + "grad_norm": 2.1763944900135637, + "kl": 0.0342437744140625, + "learning_rate": 2e-07, + "loss": -0.010297659039497375, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3583333410322666, + "reward_std": 0.3408351272344589, + "rewards/MultiModalAccuracyORM": 0.3583333410322666, + "step": 1230, + "train_speed(iter/s)": 0.027096 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.45, + "epoch": 0.498989898989899, + "grad_norm": 6.002103596814289, + "kl": 0.020233154296875, + "learning_rate": 2e-07, + "loss": 0.08779069185256957, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000000670552255, + "reward_std": 0.35311026573181153, + "rewards/MultiModalAccuracyORM": 0.40000000670552255, + "step": 1235, + "train_speed(iter/s)": 0.027106 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.45, + "epoch": 0.501010101010101, + "grad_norm": 1.7067044601090864, + "kl": 0.00786285400390625, + "learning_rate": 2e-07, + "loss": 0.05108952522277832, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000089406967, + "reward_std": 0.3033378630876541, + "rewards/MultiModalAccuracyORM": 0.3500000089406967, + "step": 1240, + "train_speed(iter/s)": 0.027091 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.4, + "epoch": 0.503030303030303, + "grad_norm": 0.8938521798548926, + "kl": 0.009466552734375, + "learning_rate": 2e-07, + "loss": -0.01685338616371155, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333656191826, + "reward_std": 0.2292436480522156, + "rewards/MultiModalAccuracyORM": 0.13333333656191826, + "step": 1245, + "train_speed(iter/s)": 0.02707 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 3.702370322108623, + "learning_rate": 2e-07, + "loss": 0.036279809474945066, + "memory(GiB)": 87.45, + "step": 1250, + "train_speed(iter/s)": 0.027086 + }, + { + "epoch": 0.5050505050505051, + "eval_clip_ratio": 0.0, + "eval_completion_length": 321.4716763305664, + "eval_kl": 0.015718994140625, + "eval_loss": 0.013520264066755772, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.3033333399891853, + "eval_reward_std": 0.3383384072780609, + "eval_rewards/MultiModalAccuracyORM": 0.3033333399891853, + "eval_runtime": 765.5729, + "eval_samples_per_second": 0.065, + "eval_steps_per_second": 0.007, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.475, + "epoch": 0.5070707070707071, + "grad_norm": 1.4811421198816048, + "kl": 0.01293487548828125, + "learning_rate": 2e-07, + "loss": 0.03056705594062805, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35833334140479567, + "reward_std": 0.38048321902751925, + "rewards/MultiModalAccuracyORM": 0.35833334140479567, + "step": 1255, + "train_speed(iter/s)": 0.026435 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.15, + "epoch": 0.509090909090909, + "grad_norm": 2.0552411044504764, + "kl": 0.0252899169921875, + "learning_rate": 2e-07, + "loss": 0.028329643607139587, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334177732465, + "reward_std": 0.281466943025589, + "rewards/MultiModalAccuracyORM": 0.33333334177732465, + "step": 1260, + "train_speed(iter/s)": 0.026464 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.4, + "epoch": 0.5111111111111111, + "grad_norm": 2.615766039038286, + "kl": 0.01002197265625, + "learning_rate": 2e-07, + "loss": 0.002955615520477295, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000521540644, + "reward_std": 0.2292436480522156, + "rewards/MultiModalAccuracyORM": 0.25000000521540644, + "step": 1265, + "train_speed(iter/s)": 0.02646 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.85, + "epoch": 0.5131313131313131, + "grad_norm": 1.9893529067484352, + "kl": 0.011163330078125, + "learning_rate": 2e-07, + "loss": 0.018701747059822083, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.300000012665987, + "reward_std": 0.3127244770526886, + "rewards/MultiModalAccuracyORM": 0.300000012665987, + "step": 1270, + "train_speed(iter/s)": 0.026466 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.65, + "epoch": 0.5151515151515151, + "grad_norm": 1.6843559930041148, + "kl": 0.0115509033203125, + "learning_rate": 2e-07, + "loss": 0.012320590019226075, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5000000111758709, + "reward_std": 0.345323646068573, + "rewards/MultiModalAccuracyORM": 0.5000000111758709, + "step": 1275, + "train_speed(iter/s)": 0.026464 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.25, + "epoch": 0.5171717171717172, + "grad_norm": 3.0894548096911407, + "kl": 0.010302734375, + "learning_rate": 2e-07, + "loss": -0.02475722283124924, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000074505806, + "reward_std": 0.21999078691005708, + "rewards/MultiModalAccuracyORM": 0.2500000074505806, + "step": 1280, + "train_speed(iter/s)": 0.026449 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.45, + "epoch": 0.5191919191919192, + "grad_norm": 0.056162470903676515, + "kl": 0.010772705078125, + "learning_rate": 2e-07, + "loss": -0.0004087850451469421, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667386889457, + "reward_std": 0.23105688095092775, + "rewards/MultiModalAccuracyORM": 0.21666667386889457, + "step": 1285, + "train_speed(iter/s)": 0.026422 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.55, + "epoch": 0.5212121212121212, + "grad_norm": 1.7176303706462466, + "kl": 0.011578369140625, + "learning_rate": 2e-07, + "loss": 0.023639577627182006, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666693985462, + "reward_std": 0.28077210783958434, + "rewards/MultiModalAccuracyORM": 0.2666666693985462, + "step": 1290, + "train_speed(iter/s)": 0.026422 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.65, + "epoch": 0.5232323232323233, + "grad_norm": 1.244445708488179, + "kl": 0.0103790283203125, + "learning_rate": 2e-07, + "loss": -0.017145507037639618, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3833333395421505, + "reward_std": 0.4086130350828171, + "rewards/MultiModalAccuracyORM": 0.3833333395421505, + "step": 1295, + "train_speed(iter/s)": 0.026442 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.2, + "epoch": 0.5252525252525253, + "grad_norm": 1.7914388567184454, + "kl": 0.0092559814453125, + "learning_rate": 2e-07, + "loss": 0.054825717210769655, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333417773247, + "reward_std": 0.3536572724580765, + "rewards/MultiModalAccuracyORM": 0.3083333417773247, + "step": 1300, + "train_speed(iter/s)": 0.026415 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.9, + "epoch": 0.5272727272727272, + "grad_norm": 2.6174114359405976, + "kl": 0.010308837890625, + "learning_rate": 2e-07, + "loss": -0.019986753165721894, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4500000074505806, + "reward_std": 0.3099655658006668, + "rewards/MultiModalAccuracyORM": 0.4500000074505806, + "step": 1305, + "train_speed(iter/s)": 0.026387 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.9, + "epoch": 0.5292929292929293, + "grad_norm": 32.625329420627345, + "kl": 0.00882568359375, + "learning_rate": 2e-07, + "loss": 0.008027985692024231, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667312383654, + "reward_std": 0.40485736131668093, + "rewards/MultiModalAccuracyORM": 0.41666667312383654, + "step": 1310, + "train_speed(iter/s)": 0.026366 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.3, + "epoch": 0.5313131313131313, + "grad_norm": 1.6706902692989012, + "kl": 0.0086761474609375, + "learning_rate": 2e-07, + "loss": 0.028931498527526855, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333334624767303, + "reward_std": 0.3558539390563965, + "rewards/MultiModalAccuracyORM": 0.28333334624767303, + "step": 1315, + "train_speed(iter/s)": 0.026338 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.25, + "epoch": 0.5333333333333333, + "grad_norm": 1.8800912459209826, + "kl": 0.0249176025390625, + "learning_rate": 2e-07, + "loss": 0.048329290747642514, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.25897533297538755, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 1320, + "train_speed(iter/s)": 0.026308 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.3, + "epoch": 0.5353535353535354, + "grad_norm": 3.1086990293234904, + "kl": 0.01292724609375, + "learning_rate": 2e-07, + "loss": 0.006182897090911865, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667312383653, + "reward_std": 0.3867922484874725, + "rewards/MultiModalAccuracyORM": 0.34166667312383653, + "step": 1325, + "train_speed(iter/s)": 0.026274 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.3, + "epoch": 0.5373737373737374, + "grad_norm": 0.08070215671871471, + "kl": 0.0099578857421875, + "learning_rate": 2e-07, + "loss": 0.062343114614486696, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000059604645, + "reward_std": 0.22625695466995238, + "rewards/MultiModalAccuracyORM": 0.3000000059604645, + "step": 1330, + "train_speed(iter/s)": 0.026241 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.75, + "epoch": 0.5393939393939394, + "grad_norm": 3.4146119265895893, + "kl": 0.0290008544921875, + "learning_rate": 2e-07, + "loss": -0.02337663769721985, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000000819563863, + "reward_std": 0.31852359175682066, + "rewards/MultiModalAccuracyORM": 0.40000000819563863, + "step": 1335, + "train_speed(iter/s)": 0.026231 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.85, + "epoch": 0.5414141414141415, + "grad_norm": 1.014030648475331, + "kl": 0.0152801513671875, + "learning_rate": 2e-07, + "loss": 0.03424631953239441, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.22807018756866454, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 1340, + "train_speed(iter/s)": 0.026218 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.5, + "epoch": 0.5434343434343434, + "grad_norm": 2.579076344272663, + "kl": 0.0294189453125, + "learning_rate": 2e-07, + "loss": -0.004431784152984619, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.425000012665987, + "reward_std": 0.3433456152677536, + "rewards/MultiModalAccuracyORM": 0.425000012665987, + "step": 1345, + "train_speed(iter/s)": 0.026212 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.2, + "epoch": 0.5454545454545454, + "grad_norm": 0.09604007460689165, + "kl": 0.0132415771484375, + "learning_rate": 2e-07, + "loss": 0.011541323363780975, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333730697632, + "reward_std": 0.24961273670196532, + "rewards/MultiModalAccuracyORM": 0.15833333730697632, + "step": 1350, + "train_speed(iter/s)": 0.026199 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2, + "epoch": 0.5474747474747474, + "grad_norm": 2.8630066616840306, + "kl": 0.0131500244140625, + "learning_rate": 2e-07, + "loss": 0.0038095355033874513, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333484828472, + "reward_std": 0.371958914399147, + "rewards/MultiModalAccuracyORM": 0.4083333484828472, + "step": 1355, + "train_speed(iter/s)": 0.026195 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.35, + "epoch": 0.5494949494949495, + "grad_norm": 2.8462264230542202, + "kl": 0.0113922119140625, + "learning_rate": 2e-07, + "loss": -0.013850301504135132, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.13333333879709244, + "reward_std": 0.23857065439224243, + "rewards/MultiModalAccuracyORM": 0.13333333879709244, + "step": 1360, + "train_speed(iter/s)": 0.026178 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.8, + "epoch": 0.5515151515151515, + "grad_norm": 1.9037157526983224, + "kl": 0.0115966796875, + "learning_rate": 2e-07, + "loss": 0.061475354433059695, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333334252238274, + "reward_std": 0.37644500732421876, + "rewards/MultiModalAccuracyORM": 0.28333334252238274, + "step": 1365, + "train_speed(iter/s)": 0.026169 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.25, + "epoch": 0.5535353535353535, + "grad_norm": 1.5230914677267515, + "kl": 0.012347412109375, + "learning_rate": 2e-07, + "loss": 0.02505878210067749, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667386889457, + "reward_std": 0.26496326327323916, + "rewards/MultiModalAccuracyORM": 0.36666667386889457, + "step": 1370, + "train_speed(iter/s)": 0.026162 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.7, + "epoch": 0.5555555555555556, + "grad_norm": 1.9879722073308892, + "kl": 0.0135223388671875, + "learning_rate": 2e-07, + "loss": 0.010433109104633331, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000447034834, + "reward_std": 0.18332211077213287, + "rewards/MultiModalAccuracyORM": 0.27500000447034834, + "step": 1375, + "train_speed(iter/s)": 0.026157 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.9, + "epoch": 0.5575757575757576, + "grad_norm": 2.649637312336083, + "kl": 0.012469482421875, + "learning_rate": 2e-07, + "loss": 0.009650683403015137, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333333805203438, + "reward_std": 0.3890485167503357, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 1380, + "train_speed(iter/s)": 0.02615 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.05, + "epoch": 0.5595959595959596, + "grad_norm": 0.05006149717815439, + "kl": 0.016656494140625, + "learning_rate": 2e-07, + "loss": -0.007993972301483155, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667386889455, + "reward_std": 0.3541334718465805, + "rewards/MultiModalAccuracyORM": 0.29166667386889455, + "step": 1385, + "train_speed(iter/s)": 0.026129 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.65, + "epoch": 0.5616161616161616, + "grad_norm": 0.08079407011077554, + "kl": 0.01981201171875, + "learning_rate": 2e-07, + "loss": 0.03422499895095825, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833334103226663, + "reward_std": 0.3597048044204712, + "rewards/MultiModalAccuracyORM": 0.25833334103226663, + "step": 1390, + "train_speed(iter/s)": 0.026124 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.3, + "epoch": 0.5636363636363636, + "grad_norm": 2.595093461800728, + "kl": 0.016748046875, + "learning_rate": 2e-07, + "loss": 0.0661674439907074, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667610406877, + "reward_std": 0.41412476599216463, + "rewards/MultiModalAccuracyORM": 0.41666667610406877, + "step": 1395, + "train_speed(iter/s)": 0.02611 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.3, + "epoch": 0.5656565656565656, + "grad_norm": 1.8524460034780388, + "kl": 0.0219970703125, + "learning_rate": 2e-07, + "loss": 0.0748141050338745, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333410322666, + "reward_std": 0.3222051203250885, + "rewards/MultiModalAccuracyORM": 0.2833333410322666, + "step": 1400, + "train_speed(iter/s)": 0.026104 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.7, + "epoch": 0.5676767676767677, + "grad_norm": 1.8645433263018287, + "kl": 0.020556640625, + "learning_rate": 2e-07, + "loss": -0.019703832268714905, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333402872086, + "reward_std": 0.23230449855327606, + "rewards/MultiModalAccuracyORM": 0.2833333402872086, + "step": 1405, + "train_speed(iter/s)": 0.026096 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.15, + "epoch": 0.5696969696969697, + "grad_norm": 2.007508731899272, + "kl": 0.014324951171875, + "learning_rate": 2e-07, + "loss": 0.026613450050354003, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000000819563863, + "reward_std": 0.26928699016571045, + "rewards/MultiModalAccuracyORM": 0.40000000819563863, + "step": 1410, + "train_speed(iter/s)": 0.026082 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.6, + "epoch": 0.5717171717171717, + "grad_norm": 1.3049808616717113, + "kl": 0.0161651611328125, + "learning_rate": 2e-07, + "loss": -0.019157709181308748, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833334103226663, + "reward_std": 0.3352662205696106, + "rewards/MultiModalAccuracyORM": 0.25833334103226663, + "step": 1415, + "train_speed(iter/s)": 0.026066 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.4, + "epoch": 0.5737373737373738, + "grad_norm": 1.7990652267186868, + "kl": 0.021240234375, + "learning_rate": 2e-07, + "loss": 0.043132427334785464, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000298023224, + "reward_std": 0.2159808874130249, + "rewards/MultiModalAccuracyORM": 0.17500000298023224, + "step": 1420, + "train_speed(iter/s)": 0.026059 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.85, + "epoch": 0.5757575757575758, + "grad_norm": 1.3873829792776142, + "kl": 0.017431640625, + "learning_rate": 2e-07, + "loss": 0.010021258890628815, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.45833334475755694, + "reward_std": 0.2770525634288788, + "rewards/MultiModalAccuracyORM": 0.45833334475755694, + "step": 1425, + "train_speed(iter/s)": 0.026059 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.05, + "epoch": 0.5777777777777777, + "grad_norm": 1.6565432442769377, + "kl": 0.0139190673828125, + "learning_rate": 2e-07, + "loss": 0.016829773783683777, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667312383654, + "reward_std": 0.40086200535297395, + "rewards/MultiModalAccuracyORM": 0.29166667312383654, + "step": 1430, + "train_speed(iter/s)": 0.026063 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.7, + "epoch": 0.5797979797979798, + "grad_norm": 1.2410328295318487, + "kl": 0.015863037109375, + "learning_rate": 2e-07, + "loss": -0.04091094434261322, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666731238365, + "reward_std": 0.3603756338357925, + "rewards/MultiModalAccuracyORM": 0.3166666731238365, + "step": 1435, + "train_speed(iter/s)": 0.026053 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.7, + "epoch": 0.5818181818181818, + "grad_norm": 2.659138324217993, + "kl": 0.01724853515625, + "learning_rate": 2e-07, + "loss": 0.08770001530647278, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3333333425223827, + "reward_std": 0.4456100821495056, + "rewards/MultiModalAccuracyORM": 0.3333333425223827, + "step": 1440, + "train_speed(iter/s)": 0.026045 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.15, + "epoch": 0.5838383838383838, + "grad_norm": 2.6855533659279462, + "kl": 0.015350341796875, + "learning_rate": 2e-07, + "loss": -0.03101794719696045, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.2644129186868668, + "rewards/MultiModalAccuracyORM": 0.1666666716337204, + "step": 1445, + "train_speed(iter/s)": 0.026039 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.85, + "epoch": 0.5858585858585859, + "grad_norm": 0.8787033948980154, + "kl": 0.018048095703125, + "learning_rate": 2e-07, + "loss": 0.021743962168693544, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1500000014901161, + "reward_std": 0.2496483266353607, + "rewards/MultiModalAccuracyORM": 0.1500000014901161, + "step": 1450, + "train_speed(iter/s)": 0.026027 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.65, + "epoch": 0.5878787878787879, + "grad_norm": 2.6089377973235917, + "kl": 0.0154541015625, + "learning_rate": 2e-07, + "loss": -0.0126606285572052, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833333730697633, + "reward_std": 0.287842845916748, + "rewards/MultiModalAccuracyORM": 0.25833333730697633, + "step": 1455, + "train_speed(iter/s)": 0.026012 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.55, + "epoch": 0.5898989898989899, + "grad_norm": 3.1599228273908895, + "kl": 0.017535400390625, + "learning_rate": 2e-07, + "loss": 0.03227808475494385, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000089406967, + "reward_std": 0.2754935622215271, + "rewards/MultiModalAccuracyORM": 0.3500000089406967, + "step": 1460, + "train_speed(iter/s)": 0.025992 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.35, + "epoch": 0.591919191919192, + "grad_norm": 3.772779516485284, + "kl": 0.016162109375, + "learning_rate": 2e-07, + "loss": -0.006427288055419922, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667088866234, + "reward_std": 0.3214506834745407, + "rewards/MultiModalAccuracyORM": 0.21666667088866234, + "step": 1465, + "train_speed(iter/s)": 0.02598 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.7, + "epoch": 0.593939393939394, + "grad_norm": 1.9048234622524929, + "kl": 0.019964599609375, + "learning_rate": 2e-07, + "loss": 0.02089463174343109, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334550261495, + "reward_std": 0.39707074165344236, + "rewards/MultiModalAccuracyORM": 0.33333334550261495, + "step": 1470, + "train_speed(iter/s)": 0.025963 + }, + { + "clip_ratio": 0.0, + "completion_length": 372.2, + "epoch": 0.5959595959595959, + "grad_norm": 1.7167051608215667, + "kl": 0.0126953125, + "learning_rate": 2e-07, + "loss": 0.0002398371696472168, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.3485645651817322, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 1475, + "train_speed(iter/s)": 0.025929 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.9, + "epoch": 0.597979797979798, + "grad_norm": 2.018355689891589, + "kl": 0.014324951171875, + "learning_rate": 2e-07, + "loss": 0.025476664304733276, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5250000104308128, + "reward_std": 0.3463323086500168, + "rewards/MultiModalAccuracyORM": 0.5250000104308128, + "step": 1480, + "train_speed(iter/s)": 0.025899 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.35, + "epoch": 0.6, + "grad_norm": 1.9564498539046626, + "kl": 0.013104248046875, + "learning_rate": 2e-07, + "loss": -0.0017219483852386475, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666753590107, + "reward_std": 0.3392761141061783, + "rewards/MultiModalAccuracyORM": 0.2666666753590107, + "step": 1485, + "train_speed(iter/s)": 0.025884 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.55, + "epoch": 0.602020202020202, + "grad_norm": 3.3586873596373836, + "kl": 0.0190948486328125, + "learning_rate": 2e-07, + "loss": -0.015026980638504028, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334550261495, + "reward_std": 0.43529842495918275, + "rewards/MultiModalAccuracyORM": 0.33333334550261495, + "step": 1490, + "train_speed(iter/s)": 0.02585 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.1, + "epoch": 0.604040404040404, + "grad_norm": 1.5566031738878978, + "kl": 0.0152313232421875, + "learning_rate": 2e-07, + "loss": 0.05221402645111084, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4666666716337204, + "reward_std": 0.3853524446487427, + "rewards/MultiModalAccuracyORM": 0.4666666716337204, + "step": 1495, + "train_speed(iter/s)": 0.025826 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.092725055214899, + "learning_rate": 2e-07, + "loss": 0.044440290331840514, + "memory(GiB)": 87.45, + "step": 1500, + "train_speed(iter/s)": 0.025794 + }, + { + "epoch": 0.6060606060606061, + "eval_clip_ratio": 0.0, + "eval_completion_length": 332.07667766571046, + "eval_kl": 0.03210205078125, + "eval_loss": 0.03433879837393761, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.32333334147930143, + "eval_reward_std": 0.34949765503406527, + "eval_rewards/MultiModalAccuracyORM": 0.32333334147930143, + "eval_runtime": 946.9078, + "eval_samples_per_second": 0.053, + "eval_steps_per_second": 0.005, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.3, + "epoch": 0.6080808080808081, + "grad_norm": 1.594406200527781, + "kl": 0.01402130126953125, + "learning_rate": 2e-07, + "loss": 0.011821150779724121, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30000000447034836, + "reward_std": 0.29021300822496415, + "rewards/MultiModalAccuracyORM": 0.30000000447034836, + "step": 1505, + "train_speed(iter/s)": 0.02519 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.35, + "epoch": 0.6101010101010101, + "grad_norm": 1.872354266566466, + "kl": 0.01295166015625, + "learning_rate": 2e-07, + "loss": 0.040472963452339174, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667237877845, + "reward_std": 0.24481281042098998, + "rewards/MultiModalAccuracyORM": 0.39166667237877845, + "step": 1510, + "train_speed(iter/s)": 0.025138 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.25, + "epoch": 0.6121212121212121, + "grad_norm": 2.2298458448624032, + "kl": 0.017498779296875, + "learning_rate": 2e-07, + "loss": -0.003679761290550232, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666753590107, + "reward_std": 0.33752005696296694, + "rewards/MultiModalAccuracyORM": 0.2666666753590107, + "step": 1515, + "train_speed(iter/s)": 0.02512 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.3, + "epoch": 0.6141414141414141, + "grad_norm": 2.1057358539094637, + "kl": 0.013360595703125, + "learning_rate": 2e-07, + "loss": -0.040804427862167356, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.11666667088866234, + "reward_std": 0.22625695466995238, + "rewards/MultiModalAccuracyORM": 0.11666667088866234, + "step": 1520, + "train_speed(iter/s)": 0.025038 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.45, + "epoch": 0.6161616161616161, + "grad_norm": 1.7271901034384924, + "kl": 0.01778564453125, + "learning_rate": 2e-07, + "loss": 0.04612007737159729, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667759418485, + "reward_std": 0.385197651386261, + "rewards/MultiModalAccuracyORM": 0.29166667759418485, + "step": 1525, + "train_speed(iter/s)": 0.025 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.4, + "epoch": 0.6181818181818182, + "grad_norm": 2.251271699623951, + "kl": 0.015338134765625, + "learning_rate": 2e-07, + "loss": 0.07724932432174683, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40833334177732467, + "reward_std": 0.39786076843738555, + "rewards/MultiModalAccuracyORM": 0.40833334177732467, + "step": 1530, + "train_speed(iter/s)": 0.024971 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.35, + "epoch": 0.6202020202020202, + "grad_norm": 3.517799255266591, + "kl": 0.021319580078125, + "learning_rate": 2e-07, + "loss": -0.042039293050765994, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2000000074505806, + "reward_std": 0.27122943103313446, + "rewards/MultiModalAccuracyORM": 0.2000000074505806, + "step": 1535, + "train_speed(iter/s)": 0.02492 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.3, + "epoch": 0.6222222222222222, + "grad_norm": 2.5032184616862736, + "kl": 0.023309326171875, + "learning_rate": 2e-07, + "loss": 0.004111546277999878, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667386889457, + "reward_std": 0.36037562787532806, + "rewards/MultiModalAccuracyORM": 0.36666667386889457, + "step": 1540, + "train_speed(iter/s)": 0.024886 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.65, + "epoch": 0.6242424242424243, + "grad_norm": 1.3788944987112297, + "kl": 0.018865966796875, + "learning_rate": 2e-07, + "loss": 0.03875549137592316, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.14166666939854622, + "reward_std": 0.275529146194458, + "rewards/MultiModalAccuracyORM": 0.14166666939854622, + "step": 1545, + "train_speed(iter/s)": 0.02484 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.9, + "epoch": 0.6262626262626263, + "grad_norm": 1.8495513561932837, + "kl": 0.02667236328125, + "learning_rate": 2e-07, + "loss": 0.006523740291595459, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333656191825, + "reward_std": 0.3144540905952454, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 1550, + "train_speed(iter/s)": 0.024776 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.2, + "epoch": 0.6282828282828283, + "grad_norm": 1.753463603338966, + "kl": 0.030621337890625, + "learning_rate": 2e-07, + "loss": -0.08293852806091309, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000149011614, + "reward_std": 0.23083824515342713, + "rewards/MultiModalAccuracyORM": 0.25000000149011614, + "step": 1555, + "train_speed(iter/s)": 0.02475 + }, + { + "clip_ratio": 0.0, + "completion_length": 382.7, + "epoch": 0.6303030303030303, + "grad_norm": 2.663595112199716, + "kl": 0.0219970703125, + "learning_rate": 2e-07, + "loss": -0.002608485519886017, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35000000447034835, + "reward_std": 0.27756678462028506, + "rewards/MultiModalAccuracyORM": 0.35000000447034835, + "step": 1560, + "train_speed(iter/s)": 0.024719 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.55, + "epoch": 0.6323232323232323, + "grad_norm": 1.803682568463378, + "kl": 0.02052001953125, + "learning_rate": 2e-07, + "loss": -0.031521540880203244, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333440124989, + "reward_std": 0.24935851097106934, + "rewards/MultiModalAccuracyORM": 0.2833333440124989, + "step": 1565, + "train_speed(iter/s)": 0.024694 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.5, + "epoch": 0.6343434343434343, + "grad_norm": 1.9551331787297712, + "kl": 0.012725830078125, + "learning_rate": 2e-07, + "loss": 0.016904991865158082, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2250000059604645, + "reward_std": 0.3863160490989685, + "rewards/MultiModalAccuracyORM": 0.2250000059604645, + "step": 1570, + "train_speed(iter/s)": 0.024625 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.85, + "epoch": 0.6363636363636364, + "grad_norm": 2.19696821448914, + "kl": 0.016156005859375, + "learning_rate": 2e-07, + "loss": 0.00793578326702118, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.38333333656191826, + "reward_std": 0.2260383188724518, + "rewards/MultiModalAccuracyORM": 0.38333333656191826, + "step": 1575, + "train_speed(iter/s)": 0.024598 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.35, + "epoch": 0.6383838383838384, + "grad_norm": 0.10124868688137513, + "kl": 0.016912841796875, + "learning_rate": 2e-07, + "loss": -0.007649339735507965, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666666865348815, + "reward_std": 0.23634997606277466, + "rewards/MultiModalAccuracyORM": 0.26666666865348815, + "step": 1580, + "train_speed(iter/s)": 0.024579 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.0, + "epoch": 0.6404040404040404, + "grad_norm": 1.6301877045933517, + "kl": 0.012744140625, + "learning_rate": 2e-07, + "loss": 0.013163220882415772, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000052154064, + "reward_std": 0.3906099498271942, + "rewards/MultiModalAccuracyORM": 0.3250000052154064, + "step": 1585, + "train_speed(iter/s)": 0.024565 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.05, + "epoch": 0.6424242424242425, + "grad_norm": 2.155746940066879, + "kl": 0.016387939453125, + "learning_rate": 2e-07, + "loss": -0.006454774737358093, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2333333373069763, + "reward_std": 0.2855865776538849, + "rewards/MultiModalAccuracyORM": 0.2333333373069763, + "step": 1590, + "train_speed(iter/s)": 0.024526 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.0, + "epoch": 0.6444444444444445, + "grad_norm": 2.831254989761031, + "kl": 0.0135467529296875, + "learning_rate": 2e-07, + "loss": 0.04445863664150238, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000149011612, + "reward_std": 0.28787843585014344, + "rewards/MultiModalAccuracyORM": 0.20000000149011612, + "step": 1595, + "train_speed(iter/s)": 0.024495 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.45, + "epoch": 0.6464646464646465, + "grad_norm": 1.4752518445027274, + "kl": 0.017083740234375, + "learning_rate": 2e-07, + "loss": 0.03578461408615112, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666668131947516, + "reward_std": 0.33704385757446287, + "rewards/MultiModalAccuracyORM": 0.36666668131947516, + "step": 1600, + "train_speed(iter/s)": 0.024476 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.65, + "epoch": 0.6484848484848484, + "grad_norm": 0.9187218241799472, + "kl": 0.016937255859375, + "learning_rate": 2e-07, + "loss": 0.02192138433456421, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000111758709, + "reward_std": 0.3222196638584137, + "rewards/MultiModalAccuracyORM": 0.3500000111758709, + "step": 1605, + "train_speed(iter/s)": 0.024426 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.35, + "epoch": 0.6505050505050505, + "grad_norm": 1.7973159194566164, + "kl": 0.0144775390625, + "learning_rate": 2e-07, + "loss": 0.01784837543964386, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.12500000298023223, + "reward_std": 0.2689731627702713, + "rewards/MultiModalAccuracyORM": 0.12500000298023223, + "step": 1610, + "train_speed(iter/s)": 0.024388 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.55, + "epoch": 0.6525252525252525, + "grad_norm": 2.0318711993448617, + "kl": 0.018182373046875, + "learning_rate": 2e-07, + "loss": -0.02051687240600586, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4333333380520344, + "reward_std": 0.261207589507103, + "rewards/MultiModalAccuracyORM": 0.4333333380520344, + "step": 1615, + "train_speed(iter/s)": 0.024346 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.3, + "epoch": 0.6545454545454545, + "grad_norm": 1.9030819605130962, + "kl": 0.0175079345703125, + "learning_rate": 2e-07, + "loss": 0.06623161435127259, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.37500000596046446, + "reward_std": 0.24885829985141755, + "rewards/MultiModalAccuracyORM": 0.37500000596046446, + "step": 1620, + "train_speed(iter/s)": 0.024315 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.4, + "epoch": 0.6565656565656566, + "grad_norm": 2.08045815446475, + "kl": 0.0169708251953125, + "learning_rate": 2e-07, + "loss": -0.013642898201942444, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000029802322, + "reward_std": 0.3378098726272583, + "rewards/MultiModalAccuracyORM": 0.2500000029802322, + "step": 1625, + "train_speed(iter/s)": 0.024289 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.75, + "epoch": 0.6585858585858586, + "grad_norm": 1.436661872799103, + "kl": 0.0193359375, + "learning_rate": 2e-07, + "loss": 0.02239292562007904, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333805203437, + "reward_std": 0.2629852324724197, + "rewards/MultiModalAccuracyORM": 0.15833333805203437, + "step": 1630, + "train_speed(iter/s)": 0.024248 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.4, + "epoch": 0.6606060606060606, + "grad_norm": 2.5008411774286494, + "kl": 0.020758056640625, + "learning_rate": 2e-07, + "loss": 0.02127687931060791, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333432674408, + "reward_std": 0.3023863762617111, + "rewards/MultiModalAccuracyORM": 0.4083333432674408, + "step": 1635, + "train_speed(iter/s)": 0.024227 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.8, + "epoch": 0.6626262626262627, + "grad_norm": 2.6410537415459125, + "kl": 0.02030029296875, + "learning_rate": 2e-07, + "loss": 0.05219934582710266, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667684912684, + "reward_std": 0.35006397068500517, + "rewards/MultiModalAccuracyORM": 0.41666667684912684, + "step": 1640, + "train_speed(iter/s)": 0.024199 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.0, + "epoch": 0.6646464646464646, + "grad_norm": 2.4569826375450914, + "kl": 0.01795654296875, + "learning_rate": 2e-07, + "loss": 0.013086378574371338, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.45000001341104506, + "reward_std": 0.3337643891572952, + "rewards/MultiModalAccuracyORM": 0.45000001341104506, + "step": 1645, + "train_speed(iter/s)": 0.024168 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.8, + "epoch": 0.6666666666666666, + "grad_norm": 1.9280627341583514, + "kl": 0.015191650390625, + "learning_rate": 2e-07, + "loss": 0.01907120943069458, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15000000447034836, + "reward_std": 0.30035116970539094, + "rewards/MultiModalAccuracyORM": 0.15000000447034836, + "step": 1650, + "train_speed(iter/s)": 0.024141 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.0, + "epoch": 0.6686868686868687, + "grad_norm": 2.6312715310589687, + "kl": 0.015863037109375, + "learning_rate": 2e-07, + "loss": -0.04063203632831573, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333656191825, + "reward_std": 0.25741389989852903, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 1655, + "train_speed(iter/s)": 0.024076 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.05, + "epoch": 0.6707070707070707, + "grad_norm": 0.9566291807644657, + "kl": 0.015057373046875, + "learning_rate": 2e-07, + "loss": 0.018163633346557618, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666667610406875, + "reward_std": 0.28446817994117735, + "rewards/MultiModalAccuracyORM": 0.26666667610406875, + "step": 1660, + "train_speed(iter/s)": 0.024043 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.85, + "epoch": 0.6727272727272727, + "grad_norm": 1.9521868347750622, + "kl": 0.019769287109375, + "learning_rate": 2e-07, + "loss": -5.202591419219971e-05, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667535901069, + "reward_std": 0.23481498062610626, + "rewards/MultiModalAccuracyORM": 0.21666667535901069, + "step": 1665, + "train_speed(iter/s)": 0.024026 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.4, + "epoch": 0.6747474747474748, + "grad_norm": 2.1472683375029757, + "kl": 0.01842041015625, + "learning_rate": 2e-07, + "loss": 0.08016844987869262, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667461395264, + "reward_std": 0.29655990600585935, + "rewards/MultiModalAccuracyORM": 0.24166667461395264, + "step": 1670, + "train_speed(iter/s)": 0.024002 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.15, + "epoch": 0.6767676767676768, + "grad_norm": 2.136669782149022, + "kl": 0.012603759765625, + "learning_rate": 2e-07, + "loss": 0.03559441566467285, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333334103226662, + "reward_std": 0.31266487538814547, + "rewards/MultiModalAccuracyORM": 0.18333334103226662, + "step": 1675, + "train_speed(iter/s)": 0.023983 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.7, + "epoch": 0.6787878787878788, + "grad_norm": 2.5120224393696056, + "kl": 0.033984375, + "learning_rate": 2e-07, + "loss": -0.02109343409538269, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333410322666, + "reward_std": 0.2629852324724197, + "rewards/MultiModalAccuracyORM": 0.2083333410322666, + "step": 1680, + "train_speed(iter/s)": 0.02395 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.85, + "epoch": 0.6808080808080809, + "grad_norm": 2.7291188101039268, + "kl": 0.0185638427734375, + "learning_rate": 2e-07, + "loss": 0.06400806307792664, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.441666679084301, + "reward_std": 0.3586460083723068, + "rewards/MultiModalAccuracyORM": 0.441666679084301, + "step": 1685, + "train_speed(iter/s)": 0.023931 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.25, + "epoch": 0.6828282828282828, + "grad_norm": 2.473418035792826, + "kl": 0.03394775390625, + "learning_rate": 2e-07, + "loss": 0.042749062180519104, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000044703484, + "reward_std": 0.30718872845172884, + "rewards/MultiModalAccuracyORM": 0.3250000044703484, + "step": 1690, + "train_speed(iter/s)": 0.023921 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.45, + "epoch": 0.6848484848484848, + "grad_norm": 1.4363881715878042, + "kl": 0.023870849609375, + "learning_rate": 2e-07, + "loss": 0.007241478562355042, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000022351742, + "reward_std": 0.3244373768568039, + "rewards/MultiModalAccuracyORM": 0.3500000022351742, + "step": 1695, + "train_speed(iter/s)": 0.023909 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0, + "epoch": 0.6868686868686869, + "grad_norm": 2.953319073134284, + "kl": 0.023052978515625, + "learning_rate": 2e-07, + "loss": -0.010269761085510254, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.32500000596046447, + "reward_std": 0.3144540905952454, + "rewards/MultiModalAccuracyORM": 0.32500000596046447, + "step": 1700, + "train_speed(iter/s)": 0.023894 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.35, + "epoch": 0.6888888888888889, + "grad_norm": 2.565868939994401, + "kl": 0.02255859375, + "learning_rate": 2e-07, + "loss": 0.018953490257263183, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3583333417773247, + "reward_std": 0.30840655863285066, + "rewards/MultiModalAccuracyORM": 0.3583333417773247, + "step": 1705, + "train_speed(iter/s)": 0.023885 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.9, + "epoch": 0.6909090909090909, + "grad_norm": 0.6694533298035624, + "kl": 0.01932373046875, + "learning_rate": 2e-07, + "loss": 0.008337923884391784, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667535901069, + "reward_std": 0.2652415007352829, + "rewards/MultiModalAccuracyORM": 0.21666667535901069, + "step": 1710, + "train_speed(iter/s)": 0.02387 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.95, + "epoch": 0.692929292929293, + "grad_norm": 1.567189433294113, + "kl": 0.030279541015625, + "learning_rate": 2e-07, + "loss": 0.03896563053131104, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667759418485, + "reward_std": 0.35563530325889586, + "rewards/MultiModalAccuracyORM": 0.41666667759418485, + "step": 1715, + "train_speed(iter/s)": 0.023857 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.45, + "epoch": 0.694949494949495, + "grad_norm": 1.8167696383064045, + "kl": 0.0214141845703125, + "learning_rate": 2e-07, + "loss": 0.020650827884674074, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4333333484828472, + "reward_std": 0.39936017990112305, + "rewards/MultiModalAccuracyORM": 0.4333333484828472, + "step": 1720, + "train_speed(iter/s)": 0.023843 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.55, + "epoch": 0.696969696969697, + "grad_norm": 2.213186558232037, + "kl": 0.0275634765625, + "learning_rate": 2e-07, + "loss": -0.008746334910392761, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.341666679084301, + "reward_std": 0.35490245223045347, + "rewards/MultiModalAccuracyORM": 0.341666679084301, + "step": 1725, + "train_speed(iter/s)": 0.023835 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.9, + "epoch": 0.6989898989898989, + "grad_norm": 2.601045176615316, + "kl": 0.021826171875, + "learning_rate": 2e-07, + "loss": -0.03737230598926544, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667461395265, + "reward_std": 0.38179769814014436, + "rewards/MultiModalAccuracyORM": 0.31666667461395265, + "step": 1730, + "train_speed(iter/s)": 0.023822 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.3, + "epoch": 0.701010101010101, + "grad_norm": 0.9407841462948962, + "kl": 0.0240234375, + "learning_rate": 2e-07, + "loss": -0.0031855762004852294, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2750000089406967, + "reward_std": 0.2511385798454285, + "rewards/MultiModalAccuracyORM": 0.2750000089406967, + "step": 1735, + "train_speed(iter/s)": 0.023807 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.95, + "epoch": 0.703030303030303, + "grad_norm": 2.6759259468484413, + "kl": 0.0215087890625, + "learning_rate": 2e-07, + "loss": 0.025629484653472902, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.34166667312383653, + "reward_std": 0.3390218883752823, + "rewards/MultiModalAccuracyORM": 0.34166667312383653, + "step": 1740, + "train_speed(iter/s)": 0.023819 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.85, + "epoch": 0.705050505050505, + "grad_norm": 1.6215662631256935, + "kl": 0.043084716796875, + "learning_rate": 2e-07, + "loss": 0.01873619556427002, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333395421505, + "reward_std": 0.23631438612937927, + "rewards/MultiModalAccuracyORM": 0.3083333395421505, + "step": 1745, + "train_speed(iter/s)": 0.023814 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 3.313730122510265, + "learning_rate": 2e-07, + "loss": -0.041856271028518674, + "memory(GiB)": 87.45, + "step": 1750, + "train_speed(iter/s)": 0.023773 + }, + { + "epoch": 0.7070707070707071, + "eval_clip_ratio": 0.0, + "eval_completion_length": 318.58167419433596, + "eval_kl": 0.0221929931640625, + "eval_loss": 0.0349855050444603, + "eval_response_clip_ratio": 0.001666666716337204, + "eval_reward": 0.2950000064074993, + "eval_reward_std": 0.3137217426300049, + "eval_rewards/MultiModalAccuracyORM": 0.2950000064074993, + "eval_runtime": 782.5117, + "eval_samples_per_second": 0.064, + "eval_steps_per_second": 0.006, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.775, + "epoch": 0.7090909090909091, + "grad_norm": 1.446054468361364, + "kl": 0.0215576171875, + "learning_rate": 2e-07, + "loss": 0.013345304131507873, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333333805203438, + "reward_std": 0.3380433991551399, + "rewards/MultiModalAccuracyORM": 0.23333333805203438, + "step": 1755, + "train_speed(iter/s)": 0.023419 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.45, + "epoch": 0.7111111111111111, + "grad_norm": 1.3947630704883345, + "kl": 0.018634033203125, + "learning_rate": 2e-07, + "loss": 0.010007500648498535, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000521540643, + "reward_std": 0.27148365676403047, + "rewards/MultiModalAccuracyORM": 0.17500000521540643, + "step": 1760, + "train_speed(iter/s)": 0.023454 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5, + "epoch": 0.7131313131313132, + "grad_norm": 2.218781010019711, + "kl": 0.021832275390625, + "learning_rate": 2e-07, + "loss": -0.013157431781291962, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333656191825, + "reward_std": 0.2652770906686783, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 1765, + "train_speed(iter/s)": 0.023491 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.75, + "epoch": 0.7151515151515152, + "grad_norm": 1.7430710535513718, + "kl": 0.01793212890625, + "learning_rate": 2e-07, + "loss": 0.021530145406723024, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.26666667610406875, + "reward_std": 0.3066769391298294, + "rewards/MultiModalAccuracyORM": 0.26666667610406875, + "step": 1770, + "train_speed(iter/s)": 0.023528 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.35, + "epoch": 0.7171717171717171, + "grad_norm": 1.7339756470338048, + "kl": 0.014569091796875, + "learning_rate": 2e-07, + "loss": -0.058446085453033446, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.2820172876119614, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 1775, + "train_speed(iter/s)": 0.023563 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.0, + "epoch": 0.7191919191919192, + "grad_norm": 1.6384172396752068, + "kl": 0.0145233154296875, + "learning_rate": 2e-07, + "loss": -0.00234740674495697, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.33333334177732465, + "reward_std": 0.3890485167503357, + "rewards/MultiModalAccuracyORM": 0.33333334177732465, + "step": 1780, + "train_speed(iter/s)": 0.02359 + }, + { + "clip_ratio": 0.0, + "completion_length": 391.6, + "epoch": 0.7212121212121212, + "grad_norm": 2.6878660022854333, + "kl": 0.016748046875, + "learning_rate": 2e-07, + "loss": 0.03554516434669495, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333395421505, + "reward_std": 0.35974039435386657, + "rewards/MultiModalAccuracyORM": 0.2833333395421505, + "step": 1785, + "train_speed(iter/s)": 0.023622 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.25, + "epoch": 0.7232323232323232, + "grad_norm": 2.4324428426946834, + "kl": 0.0128204345703125, + "learning_rate": 2e-07, + "loss": -0.047456872463226316, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833333730697633, + "reward_std": 0.2970361053943634, + "rewards/MultiModalAccuracyORM": 0.25833333730697633, + "step": 1790, + "train_speed(iter/s)": 0.023655 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.65, + "epoch": 0.7252525252525253, + "grad_norm": 1.8618904482502028, + "kl": 0.0149169921875, + "learning_rate": 2e-07, + "loss": 0.009033694863319397, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2583333387970924, + "reward_std": 0.21750431060791015, + "rewards/MultiModalAccuracyORM": 0.2583333387970924, + "step": 1795, + "train_speed(iter/s)": 0.023686 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.05, + "epoch": 0.7272727272727273, + "grad_norm": 3.36471551001556, + "kl": 0.02044677734375, + "learning_rate": 2e-07, + "loss": 0.010516098141670227, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666731238365, + "reward_std": 0.21218962371349334, + "rewards/MultiModalAccuracyORM": 0.3166666731238365, + "step": 1800, + "train_speed(iter/s)": 0.023721 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.05, + "epoch": 0.7292929292929293, + "grad_norm": 3.723751882855137, + "kl": 0.023046875, + "learning_rate": 2e-07, + "loss": -0.02001919746398926, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333447575569, + "reward_std": 0.28128686249256135, + "rewards/MultiModalAccuracyORM": 0.4083333447575569, + "step": 1805, + "train_speed(iter/s)": 0.023755 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.35, + "epoch": 0.7313131313131314, + "grad_norm": 54.701999328620005, + "kl": 0.02723388671875, + "learning_rate": 2e-07, + "loss": 0.03721327781677246, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2416666753590107, + "reward_std": 0.2910481750965118, + "rewards/MultiModalAccuracyORM": 0.2416666753590107, + "step": 1810, + "train_speed(iter/s)": 0.02379 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.45, + "epoch": 0.7333333333333333, + "grad_norm": 3.0855092667576733, + "kl": 0.015704345703125, + "learning_rate": 2e-07, + "loss": -0.037659955024719236, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667759418486, + "reward_std": 0.36648276150226594, + "rewards/MultiModalAccuracyORM": 0.36666667759418486, + "step": 1815, + "train_speed(iter/s)": 0.023829 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3, + "epoch": 0.7353535353535353, + "grad_norm": 2.1896027058768217, + "kl": 0.01336669921875, + "learning_rate": 2e-07, + "loss": 0.02186403125524521, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4416666753590107, + "reward_std": 0.2956440091133118, + "rewards/MultiModalAccuracyORM": 0.4416666753590107, + "step": 1820, + "train_speed(iter/s)": 0.023865 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.95, + "epoch": 0.7373737373737373, + "grad_norm": 1.540468825830471, + "kl": 0.010992431640625, + "learning_rate": 2e-07, + "loss": 0.03888830542564392, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1583333395421505, + "reward_std": 0.21368902921676636, + "rewards/MultiModalAccuracyORM": 0.1583333395421505, + "step": 1825, + "train_speed(iter/s)": 0.023899 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.15, + "epoch": 0.7393939393939394, + "grad_norm": 49.26742721312377, + "kl": 0.0157135009765625, + "learning_rate": 2e-07, + "loss": -0.0031795650720596313, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1333333395421505, + "reward_std": 0.2736803233623505, + "rewards/MultiModalAccuracyORM": 0.1333333395421505, + "step": 1830, + "train_speed(iter/s)": 0.023929 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.0, + "epoch": 0.7414141414141414, + "grad_norm": 1.2425141205561836, + "kl": 0.0211181640625, + "learning_rate": 2e-07, + "loss": -0.01690070778131485, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.33905747830867766, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 1835, + "train_speed(iter/s)": 0.023961 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.15, + "epoch": 0.7434343434343434, + "grad_norm": 2.8910783603707144, + "kl": 0.0198638916015625, + "learning_rate": 2e-07, + "loss": 0.06207960844039917, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.40000001043081285, + "reward_std": 0.38306058645248414, + "rewards/MultiModalAccuracyORM": 0.40000001043081285, + "step": 1840, + "train_speed(iter/s)": 0.023987 + }, + { + "clip_ratio": 0.0, + "completion_length": 398.35, + "epoch": 0.7454545454545455, + "grad_norm": 14.235626745032626, + "kl": 0.019775390625, + "learning_rate": 2e-07, + "loss": 0.037658247351646426, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.16666666865348817, + "reward_std": 0.12708649039268494, + "rewards/MultiModalAccuracyORM": 0.16666666865348817, + "step": 1845, + "train_speed(iter/s)": 0.024018 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.4, + "epoch": 0.7474747474747475, + "grad_norm": 1.833635434555557, + "kl": 0.018505859375, + "learning_rate": 2e-07, + "loss": -0.026553609967231752, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3750000111758709, + "reward_std": 0.34710129499435427, + "rewards/MultiModalAccuracyORM": 0.3750000111758709, + "step": 1850, + "train_speed(iter/s)": 0.024051 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.6, + "epoch": 0.7494949494949495, + "grad_norm": 1.825594490175896, + "kl": 0.02091064453125, + "learning_rate": 2e-07, + "loss": 0.02868058383464813, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334550261495, + "reward_std": 0.3127244710922241, + "rewards/MultiModalAccuracyORM": 0.33333334550261495, + "step": 1855, + "train_speed(iter/s)": 0.024084 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.45, + "epoch": 0.7515151515151515, + "grad_norm": 1.3722283938123239, + "kl": 0.023919677734375, + "learning_rate": 2e-07, + "loss": 0.017566892504692077, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.14166666865348815, + "reward_std": 0.32900004684925077, + "rewards/MultiModalAccuracyORM": 0.14166666865348815, + "step": 1860, + "train_speed(iter/s)": 0.024119 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.25, + "epoch": 0.7535353535353535, + "grad_norm": 3.3603602877653964, + "kl": 0.023779296875, + "learning_rate": 2e-07, + "loss": 0.051629495620727536, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334177732465, + "reward_std": 0.4036242991685867, + "rewards/MultiModalAccuracyORM": 0.33333334177732465, + "step": 1865, + "train_speed(iter/s)": 0.02415 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.4, + "epoch": 0.7555555555555555, + "grad_norm": 4.690429815238561, + "kl": 0.0260162353515625, + "learning_rate": 2e-07, + "loss": -0.004315692186355591, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000029802322, + "reward_std": 0.2940108567476273, + "rewards/MultiModalAccuracyORM": 0.2500000029802322, + "step": 1870, + "train_speed(iter/s)": 0.024182 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.45, + "epoch": 0.7575757575757576, + "grad_norm": 2.7051519330762646, + "kl": 0.0303466796875, + "learning_rate": 2e-07, + "loss": -0.008211909234523774, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25833334103226663, + "reward_std": 0.3237069517374039, + "rewards/MultiModalAccuracyORM": 0.25833334103226663, + "step": 1875, + "train_speed(iter/s)": 0.024217 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.5, + "epoch": 0.7595959595959596, + "grad_norm": 2.8417211154013895, + "kl": 0.02593994140625, + "learning_rate": 2e-07, + "loss": 0.061132901906967164, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.27500000670552255, + "reward_std": 0.40261563658714294, + "rewards/MultiModalAccuracyORM": 0.27500000670552255, + "step": 1880, + "train_speed(iter/s)": 0.024247 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.45, + "epoch": 0.7616161616161616, + "grad_norm": 2.730755662335053, + "kl": 0.026220703125, + "learning_rate": 2e-07, + "loss": 0.036236304044723514, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1666666731238365, + "reward_std": 0.3101543754339218, + "rewards/MultiModalAccuracyORM": 0.1666666731238365, + "step": 1885, + "train_speed(iter/s)": 0.024279 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.0, + "epoch": 0.7636363636363637, + "grad_norm": 1.777471986992103, + "kl": 0.025811767578125, + "learning_rate": 2e-07, + "loss": 0.010323920845985412, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666701436043, + "reward_std": 0.2506715327501297, + "rewards/MultiModalAccuracyORM": 0.3916666701436043, + "step": 1890, + "train_speed(iter/s)": 0.024315 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.5, + "epoch": 0.7656565656565657, + "grad_norm": 0.13037300867268706, + "kl": 0.030450439453125, + "learning_rate": 2e-07, + "loss": 0.0042250391095876695, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35000001043081286, + "reward_std": 0.3182337760925293, + "rewards/MultiModalAccuracyORM": 0.35000001043081286, + "step": 1895, + "train_speed(iter/s)": 0.024345 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.7, + "epoch": 0.7676767676767676, + "grad_norm": 1.7511437916198835, + "kl": 0.016363525390625, + "learning_rate": 2e-07, + "loss": 0.006176537275314331, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.44166667461395265, + "reward_std": 0.2988493382930756, + "rewards/MultiModalAccuracyORM": 0.44166667461395265, + "step": 1900, + "train_speed(iter/s)": 0.024374 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.25, + "epoch": 0.7696969696969697, + "grad_norm": 2.6784748457723992, + "kl": 0.026043701171875, + "learning_rate": 2e-07, + "loss": -0.0650195300579071, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4416666753590107, + "reward_std": 0.4098664551973343, + "rewards/MultiModalAccuracyORM": 0.4416666753590107, + "step": 1905, + "train_speed(iter/s)": 0.024412 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.75, + "epoch": 0.7717171717171717, + "grad_norm": 2.0646305839430648, + "kl": 0.027471923828125, + "learning_rate": 2e-07, + "loss": 0.023633481562137605, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30833334401249884, + "reward_std": 0.375223833322525, + "rewards/MultiModalAccuracyORM": 0.30833334401249884, + "step": 1910, + "train_speed(iter/s)": 0.024446 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.0, + "epoch": 0.7737373737373737, + "grad_norm": 1.9430903927913294, + "kl": 0.018585205078125, + "learning_rate": 2e-07, + "loss": -0.023164969682693482, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.3330695480108261, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 1915, + "train_speed(iter/s)": 0.024483 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.55, + "epoch": 0.7757575757575758, + "grad_norm": 1.2487710271189274, + "kl": 0.0145263671875, + "learning_rate": 2e-07, + "loss": 0.014984607696533203, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.38333334028720856, + "reward_std": 0.2784802496433258, + "rewards/MultiModalAccuracyORM": 0.38333334028720856, + "step": 1920, + "train_speed(iter/s)": 0.024514 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.65, + "epoch": 0.7777777777777778, + "grad_norm": 3.397172729657377, + "kl": 0.025823974609375, + "learning_rate": 2e-07, + "loss": 0.010728538036346436, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.45000001341104506, + "reward_std": 0.36237767040729524, + "rewards/MultiModalAccuracyORM": 0.45000001341104506, + "step": 1925, + "train_speed(iter/s)": 0.024547 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.9, + "epoch": 0.7797979797979798, + "grad_norm": 2.445242624274772, + "kl": 0.02085418701171875, + "learning_rate": 2e-07, + "loss": 0.0506191611289978, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.366666679084301, + "reward_std": 0.3425410449504852, + "rewards/MultiModalAccuracyORM": 0.366666679084301, + "step": 1930, + "train_speed(iter/s)": 0.024581 + }, + { + "clip_ratio": 0.0, + "completion_length": 414.95, + "epoch": 0.7818181818181819, + "grad_norm": 2.2267041732312953, + "kl": 0.0191650390625, + "learning_rate": 2e-07, + "loss": 0.07460187673568726, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1750000037252903, + "reward_std": 0.27998208105564115, + "rewards/MultiModalAccuracyORM": 0.1750000037252903, + "step": 1935, + "train_speed(iter/s)": 0.024609 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.15, + "epoch": 0.7838383838383839, + "grad_norm": 0.08307319969608204, + "kl": 0.01834716796875, + "learning_rate": 2e-07, + "loss": 0.01801389306783676, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.2292436480522156, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 1940, + "train_speed(iter/s)": 0.024633 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.25, + "epoch": 0.7858585858585858, + "grad_norm": 2.4956737169852876, + "kl": 0.0243896484375, + "learning_rate": 2e-07, + "loss": 0.02604297399520874, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4416666768491268, + "reward_std": 0.23860624432563782, + "rewards/MultiModalAccuracyORM": 0.4416666768491268, + "step": 1945, + "train_speed(iter/s)": 0.024666 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.6, + "epoch": 0.7878787878787878, + "grad_norm": 1.412421381873315, + "kl": 0.03074951171875, + "learning_rate": 2e-07, + "loss": -0.008066686987876891, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.3619014710187912, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 1950, + "train_speed(iter/s)": 0.0247 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.4, + "epoch": 0.7898989898989899, + "grad_norm": 2.581974028906461, + "kl": 0.0264404296875, + "learning_rate": 2e-07, + "loss": 0.0021781913936138155, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.6333333432674408, + "reward_std": 0.34636789858341216, + "rewards/MultiModalAccuracyORM": 0.6333333432674408, + "step": 1955, + "train_speed(iter/s)": 0.024735 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.8, + "epoch": 0.7919191919191919, + "grad_norm": 2.7977079078012546, + "kl": 0.030804443359375, + "learning_rate": 2e-07, + "loss": 0.028843042254447938, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333333879709245, + "reward_std": 0.18488111793994905, + "rewards/MultiModalAccuracyORM": 0.23333333879709245, + "step": 1960, + "train_speed(iter/s)": 0.02477 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.4, + "epoch": 0.793939393939394, + "grad_norm": 2.3766146998216606, + "kl": 0.029986572265625, + "learning_rate": 2e-07, + "loss": 0.01644158363342285, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5250000111758709, + "reward_std": 0.3782962501049042, + "rewards/MultiModalAccuracyORM": 0.5250000111758709, + "step": 1965, + "train_speed(iter/s)": 0.024803 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.2, + "epoch": 0.795959595959596, + "grad_norm": 1.6454459000922825, + "kl": 0.03331298828125, + "learning_rate": 2e-07, + "loss": 0.04098441600799561, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666805744171, + "reward_std": 0.31651573479175565, + "rewards/MultiModalAccuracyORM": 0.3916666805744171, + "step": 1970, + "train_speed(iter/s)": 0.024833 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5, + "epoch": 0.797979797979798, + "grad_norm": 2.676941712540541, + "kl": 0.033160400390625, + "learning_rate": 2e-07, + "loss": -0.06822603344917297, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333395421505, + "reward_std": 0.26591232419013977, + "rewards/MultiModalAccuracyORM": 0.4083333395421505, + "step": 1975, + "train_speed(iter/s)": 0.024862 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.05, + "epoch": 0.8, + "grad_norm": 2.6654647292288565, + "kl": 0.03338623046875, + "learning_rate": 2e-07, + "loss": 0.018979550898075105, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333410322666, + "reward_std": 0.2988493382930756, + "rewards/MultiModalAccuracyORM": 0.3083333410322666, + "step": 1980, + "train_speed(iter/s)": 0.024892 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.9, + "epoch": 0.802020202020202, + "grad_norm": 1.2773941729876779, + "kl": 0.02757568359375, + "learning_rate": 2e-07, + "loss": 0.0032975614070892335, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3666666716337204, + "reward_std": 0.21999078691005708, + "rewards/MultiModalAccuracyORM": 0.3666666716337204, + "step": 1985, + "train_speed(iter/s)": 0.024918 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.45, + "epoch": 0.804040404040404, + "grad_norm": 3.249804741680811, + "kl": 0.0233734130859375, + "learning_rate": 2e-07, + "loss": -0.0009274959564208984, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.25000000447034837, + "reward_std": 0.3111390322446823, + "rewards/MultiModalAccuracyORM": 0.25000000447034837, + "step": 1990, + "train_speed(iter/s)": 0.024952 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.3, + "epoch": 0.806060606060606, + "grad_norm": 1.6358353140611315, + "kl": 0.02435302734375, + "learning_rate": 2e-07, + "loss": 0.01845797598361969, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3083333380520344, + "reward_std": 0.31345489621162415, + "rewards/MultiModalAccuracyORM": 0.3083333380520344, + "step": 1995, + "train_speed(iter/s)": 0.024983 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 2.5769756858186366, + "learning_rate": 2e-07, + "loss": -0.03718583881855011, + "memory(GiB)": 87.45, + "step": 2000, + "train_speed(iter/s)": 0.025016 + }, + { + "epoch": 0.8080808080808081, + "eval_clip_ratio": 0.0, + "eval_completion_length": 323.9533418273926, + "eval_kl": 0.0281341552734375, + "eval_loss": 0.006039996165782213, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.318333340883255, + "eval_reward_std": 0.32694393634796143, + "eval_rewards/MultiModalAccuracyORM": 0.318333340883255, + "eval_runtime": 462.0456, + "eval_samples_per_second": 0.108, + "eval_steps_per_second": 0.011, + "step": 2000 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.125, + "epoch": 0.8101010101010101, + "grad_norm": 1.7033276169087128, + "kl": 0.02674102783203125, + "learning_rate": 2e-07, + "loss": 0.03609513640403748, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23750001043081284, + "reward_std": 0.24687736183404924, + "rewards/MultiModalAccuracyORM": 0.23750001043081284, + "step": 2005, + "train_speed(iter/s)": 0.024793 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.55, + "epoch": 0.8121212121212121, + "grad_norm": 1.77522203951707, + "kl": 0.0292724609375, + "learning_rate": 2e-07, + "loss": 0.01515505015850067, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000059604645, + "reward_std": 0.38405978083610537, + "rewards/MultiModalAccuracyORM": 0.3500000059604645, + "step": 2010, + "train_speed(iter/s)": 0.024823 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.5, + "epoch": 0.8141414141414142, + "grad_norm": 2.047124696336966, + "kl": 0.02886962890625, + "learning_rate": 2e-07, + "loss": -0.056891226768493654, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.350000012665987, + "reward_std": 0.3127244710922241, + "rewards/MultiModalAccuracyORM": 0.350000012665987, + "step": 2015, + "train_speed(iter/s)": 0.024857 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.0, + "epoch": 0.8161616161616162, + "grad_norm": 2.933718360724764, + "kl": 0.0226837158203125, + "learning_rate": 2e-07, + "loss": 0.04815356135368347, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666753590107, + "reward_std": 0.3597048044204712, + "rewards/MultiModalAccuracyORM": 0.3916666753590107, + "step": 2020, + "train_speed(iter/s)": 0.024891 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.85, + "epoch": 0.8181818181818182, + "grad_norm": 2.3099689560601595, + "kl": 0.015521240234375, + "learning_rate": 2e-07, + "loss": 0.00659940093755722, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4250000067055225, + "reward_std": 0.2574163258075714, + "rewards/MultiModalAccuracyORM": 0.4250000067055225, + "step": 2025, + "train_speed(iter/s)": 0.024922 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.5, + "epoch": 0.8202020202020202, + "grad_norm": 2.5439305675732165, + "kl": 0.019085693359375, + "learning_rate": 2e-07, + "loss": 0.0326183021068573, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3750000074505806, + "reward_std": 0.31040860116481783, + "rewards/MultiModalAccuracyORM": 0.3750000074505806, + "step": 2030, + "train_speed(iter/s)": 0.024949 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.5, + "epoch": 0.8222222222222222, + "grad_norm": 3.2829060035742557, + "kl": 0.023626708984375, + "learning_rate": 2e-07, + "loss": 0.015071746706962586, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666693985462, + "reward_std": 0.205923455953598, + "rewards/MultiModalAccuracyORM": 0.2666666693985462, + "step": 2035, + "train_speed(iter/s)": 0.024973 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.55, + "epoch": 0.8242424242424242, + "grad_norm": 2.658698100364113, + "kl": 0.0230712890625, + "learning_rate": 2e-07, + "loss": 0.013616405427455902, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333333879709244, + "reward_std": 0.34936913549900056, + "rewards/MultiModalAccuracyORM": 0.28333333879709244, + "step": 2040, + "train_speed(iter/s)": 0.024998 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.15, + "epoch": 0.8262626262626263, + "grad_norm": 2.342715529046246, + "kl": 0.029901123046875, + "learning_rate": 2e-07, + "loss": 0.037117105722427365, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3166666753590107, + "reward_std": 0.36670139729976653, + "rewards/MultiModalAccuracyORM": 0.3166666753590107, + "step": 2045, + "train_speed(iter/s)": 0.025033 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.45, + "epoch": 0.8282828282828283, + "grad_norm": 0.9452733042514408, + "kl": 0.025860595703125, + "learning_rate": 2e-07, + "loss": 0.03209388256072998, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000074505805, + "reward_std": 0.26750934720039365, + "rewards/MultiModalAccuracyORM": 0.22500000074505805, + "step": 2050, + "train_speed(iter/s)": 0.025068 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.15, + "epoch": 0.8303030303030303, + "grad_norm": 2.136815117405037, + "kl": 0.0298553466796875, + "learning_rate": 2e-07, + "loss": 0.04463410079479217, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3416666753590107, + "reward_std": 0.4307381808757782, + "rewards/MultiModalAccuracyORM": 0.3416666753590107, + "step": 2055, + "train_speed(iter/s)": 0.025094 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.85, + "epoch": 0.8323232323232324, + "grad_norm": 1.7941689466428354, + "kl": 0.018414306640625, + "learning_rate": 2e-07, + "loss": -0.013085222244262696, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333333432674406, + "reward_std": 0.27756677865982055, + "rewards/MultiModalAccuracyORM": 0.33333333432674406, + "step": 2060, + "train_speed(iter/s)": 0.025121 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.6, + "epoch": 0.8343434343434344, + "grad_norm": 2.741809894885581, + "kl": 0.0217803955078125, + "learning_rate": 2e-07, + "loss": 0.032400667667388916, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333395421505, + "reward_std": 0.3207202583551407, + "rewards/MultiModalAccuracyORM": 0.2083333395421505, + "step": 2065, + "train_speed(iter/s)": 0.025146 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.7, + "epoch": 0.8363636363636363, + "grad_norm": 1.5317649365927353, + "kl": 0.02591552734375, + "learning_rate": 2e-07, + "loss": 0.026116135716438293, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2916666746139526, + "reward_std": 0.3315081149339676, + "rewards/MultiModalAccuracyORM": 0.2916666746139526, + "step": 2070, + "train_speed(iter/s)": 0.02517 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.3, + "epoch": 0.8383838383838383, + "grad_norm": 2.2493040161672164, + "kl": 0.023297119140625, + "learning_rate": 2e-07, + "loss": 0.011263298988342284, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3666666723787785, + "reward_std": 0.27122943103313446, + "rewards/MultiModalAccuracyORM": 0.3666666723787785, + "step": 2075, + "train_speed(iter/s)": 0.025195 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.35, + "epoch": 0.8404040404040404, + "grad_norm": 1.6803752878651963, + "kl": 0.05001220703125, + "learning_rate": 2e-07, + "loss": 0.021441753208637237, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3833333387970924, + "reward_std": 0.3531844109296799, + "rewards/MultiModalAccuracyORM": 0.3833333387970924, + "step": 2080, + "train_speed(iter/s)": 0.025225 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.15, + "epoch": 0.8424242424242424, + "grad_norm": 1.980173450589181, + "kl": 0.0163818359375, + "learning_rate": 2e-07, + "loss": 0.013161852955818176, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667088866234, + "reward_std": 0.22625695466995238, + "rewards/MultiModalAccuracyORM": 0.21666667088866234, + "step": 2085, + "train_speed(iter/s)": 0.025254 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.1, + "epoch": 0.8444444444444444, + "grad_norm": 1.0010632093343366, + "kl": 0.017938232421875, + "learning_rate": 2e-07, + "loss": -0.0012541890144348144, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15000000223517418, + "reward_std": 0.2916341096162796, + "rewards/MultiModalAccuracyORM": 0.15000000223517418, + "step": 2090, + "train_speed(iter/s)": 0.025273 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0, + "epoch": 0.8464646464646465, + "grad_norm": 1.8276205217385537, + "kl": 0.0211029052734375, + "learning_rate": 2e-07, + "loss": 0.018240103125572206, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2916666708886623, + "reward_std": 0.35748412609100344, + "rewards/MultiModalAccuracyORM": 0.2916666708886623, + "step": 2095, + "train_speed(iter/s)": 0.0253 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.3, + "epoch": 0.8484848484848485, + "grad_norm": 2.25183174936328, + "kl": 0.0171142578125, + "learning_rate": 2e-07, + "loss": -0.0015764832496643066, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666775941849, + "reward_std": 0.2782260239124298, + "rewards/MultiModalAccuracyORM": 0.3916666775941849, + "step": 2100, + "train_speed(iter/s)": 0.02533 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.65, + "epoch": 0.8505050505050505, + "grad_norm": 2.301476369720727, + "kl": 0.02381591796875, + "learning_rate": 2e-07, + "loss": 0.02723083198070526, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.2500000037252903, + "reward_std": 0.3780420243740082, + "rewards/MultiModalAccuracyORM": 0.2500000037252903, + "step": 2105, + "train_speed(iter/s)": 0.025351 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.2, + "epoch": 0.8525252525252526, + "grad_norm": 2.2465362796243915, + "kl": 0.031561279296875, + "learning_rate": 2e-07, + "loss": -0.006004461646080017, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4500000111758709, + "reward_std": 0.386061829328537, + "rewards/MultiModalAccuracyORM": 0.4500000111758709, + "step": 2110, + "train_speed(iter/s)": 0.025381 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.45, + "epoch": 0.8545454545454545, + "grad_norm": 0.034882262330713364, + "kl": 0.01632537841796875, + "learning_rate": 2e-07, + "loss": 0.07573002576828003, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2083333358168602, + "reward_std": 0.3058815211057663, + "rewards/MultiModalAccuracyORM": 0.2083333358168602, + "step": 2115, + "train_speed(iter/s)": 0.025404 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.3, + "epoch": 0.8565656565656565, + "grad_norm": 1.8179385747560524, + "kl": 0.01519775390625, + "learning_rate": 2e-07, + "loss": 0.046589908003807065, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000596046447, + "reward_std": 0.28446818590164186, + "rewards/MultiModalAccuracyORM": 0.20000000596046447, + "step": 2120, + "train_speed(iter/s)": 0.025437 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.8, + "epoch": 0.8585858585858586, + "grad_norm": 1.842386637827148, + "kl": 0.023931884765625, + "learning_rate": 2e-07, + "loss": 0.0047568708658218386, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2500000141561031, + "reward_std": 0.32924269437789916, + "rewards/MultiModalAccuracyORM": 0.2500000141561031, + "step": 2125, + "train_speed(iter/s)": 0.025467 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.4, + "epoch": 0.8606060606060606, + "grad_norm": 3.12980971819249, + "kl": 0.0230224609375, + "learning_rate": 2e-07, + "loss": 0.012965646386146546, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.3833333387970924, + "reward_std": 0.3985911935567856, + "rewards/MultiModalAccuracyORM": 0.3833333387970924, + "step": 2130, + "train_speed(iter/s)": 0.025486 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.65, + "epoch": 0.8626262626262626, + "grad_norm": 0.9262722343921138, + "kl": 0.018023681640625, + "learning_rate": 2e-07, + "loss": 0.0012422390282154083, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000149011613, + "reward_std": 0.1808116167783737, + "rewards/MultiModalAccuracyORM": 0.17500000149011613, + "step": 2135, + "train_speed(iter/s)": 0.025513 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.7, + "epoch": 0.8646464646464647, + "grad_norm": 1.0357905764180717, + "kl": 0.01571044921875, + "learning_rate": 2e-07, + "loss": 0.0018387317657470703, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667014360426, + "reward_std": 0.25490583181381227, + "rewards/MultiModalAccuracyORM": 0.29166667014360426, + "step": 2140, + "train_speed(iter/s)": 0.025545 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.1, + "epoch": 0.8666666666666667, + "grad_norm": 2.379354282182724, + "kl": 0.019244384765625, + "learning_rate": 2e-07, + "loss": 0.028354501724243163, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000111758709, + "reward_std": 0.2963056802749634, + "rewards/MultiModalAccuracyORM": 0.3000000111758709, + "step": 2145, + "train_speed(iter/s)": 0.025579 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.1, + "epoch": 0.8686868686868687, + "grad_norm": 1.257926920186221, + "kl": 0.0236419677734375, + "learning_rate": 2e-07, + "loss": 0.05731485486030578, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4500000074505806, + "reward_std": 0.24860407412052155, + "rewards/MultiModalAccuracyORM": 0.4500000074505806, + "step": 2150, + "train_speed(iter/s)": 0.025607 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.65, + "epoch": 0.8707070707070707, + "grad_norm": 0.4145211028011141, + "kl": 0.035430908203125, + "learning_rate": 2e-07, + "loss": -0.008838014304637909, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.0416666679084301, + "reward_std": 0.12552748322486879, + "rewards/MultiModalAccuracyORM": 0.0416666679084301, + "step": 2155, + "train_speed(iter/s)": 0.025637 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.6, + "epoch": 0.8727272727272727, + "grad_norm": 3.5679392309928852, + "kl": 0.020635986328125, + "learning_rate": 2e-07, + "loss": -0.04596620798110962, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.21666667237877846, + "reward_std": 0.3494287371635437, + "rewards/MultiModalAccuracyORM": 0.21666667237877846, + "step": 2160, + "train_speed(iter/s)": 0.02566 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.8, + "epoch": 0.8747474747474747, + "grad_norm": 2.915431806582569, + "kl": 0.03173828125, + "learning_rate": 2e-07, + "loss": 0.03424719870090485, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.45000000670552254, + "reward_std": 0.3579271614551544, + "rewards/MultiModalAccuracyORM": 0.45000000670552254, + "step": 2165, + "train_speed(iter/s)": 0.025692 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.9, + "epoch": 0.8767676767676768, + "grad_norm": 1.2438809288674397, + "kl": 0.02581787109375, + "learning_rate": 2e-07, + "loss": 0.022351789474487304, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2250000022351742, + "reward_std": 0.2556006669998169, + "rewards/MultiModalAccuracyORM": 0.2250000022351742, + "step": 2170, + "train_speed(iter/s)": 0.025718 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.35, + "epoch": 0.8787878787878788, + "grad_norm": 0.08213166464110444, + "kl": 0.0291015625, + "learning_rate": 2e-07, + "loss": -0.04905802011489868, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.3343147337436676, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 2175, + "train_speed(iter/s)": 0.025744 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.8, + "epoch": 0.8808080808080808, + "grad_norm": 1.2558474815848573, + "kl": 0.0392333984375, + "learning_rate": 2e-07, + "loss": 0.03639570772647858, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000052154064, + "reward_std": 0.40410049855709074, + "rewards/MultiModalAccuracyORM": 0.3500000052154064, + "step": 2180, + "train_speed(iter/s)": 0.025763 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.25, + "epoch": 0.8828282828282829, + "grad_norm": 2.2083604873690255, + "kl": 0.02174072265625, + "learning_rate": 2e-07, + "loss": 0.04861523509025574, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.36666667386889457, + "reward_std": 0.4242177873849869, + "rewards/MultiModalAccuracyORM": 0.36666667386889457, + "step": 2185, + "train_speed(iter/s)": 0.025791 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.4, + "epoch": 0.8848484848484849, + "grad_norm": 1.9173115593509535, + "kl": 0.02357177734375, + "learning_rate": 2e-07, + "loss": 0.013380092382431031, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000074505806, + "reward_std": 0.311967608332634, + "rewards/MultiModalAccuracyORM": 0.3000000074505806, + "step": 2190, + "train_speed(iter/s)": 0.025813 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.5, + "epoch": 0.8868686868686869, + "grad_norm": 1.3588226440942046, + "kl": 0.025439453125, + "learning_rate": 2e-07, + "loss": 0.011188817024230958, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.18326250910758973, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 2195, + "train_speed(iter/s)": 0.025832 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.1, + "epoch": 0.8888888888888888, + "grad_norm": 1.8037621160022852, + "kl": 0.034747314453125, + "learning_rate": 2e-07, + "loss": 0.04917380511760712, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333333879709245, + "reward_std": 0.3719944924116135, + "rewards/MultiModalAccuracyORM": 0.23333333879709245, + "step": 2200, + "train_speed(iter/s)": 0.025862 + }, + { + "clip_ratio": 0.0, + "completion_length": 482.15, + "epoch": 0.8909090909090909, + "grad_norm": 2.141711868124079, + "kl": 0.0226776123046875, + "learning_rate": 2e-07, + "loss": -0.018071025609970093, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.11666666939854622, + "reward_std": 0.23030244410037995, + "rewards/MultiModalAccuracyORM": 0.11666666939854622, + "step": 2205, + "train_speed(iter/s)": 0.025882 + }, + { + "clip_ratio": 0.0, + "completion_length": 501.4, + "epoch": 0.8929292929292929, + "grad_norm": 1.4394465065225663, + "kl": 0.03128662109375, + "learning_rate": 2e-07, + "loss": 0.019231194257736207, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1833333380520344, + "reward_std": 0.31740519404411316, + "rewards/MultiModalAccuracyORM": 0.1833333380520344, + "step": 2210, + "train_speed(iter/s)": 0.025901 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.9, + "epoch": 0.8949494949494949, + "grad_norm": 1.8778711843519251, + "kl": 0.03623046875, + "learning_rate": 2e-07, + "loss": 0.042392924427986145, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3000000104308128, + "reward_std": 0.24866368174552916, + "rewards/MultiModalAccuracyORM": 0.3000000104308128, + "step": 2215, + "train_speed(iter/s)": 0.025928 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.6, + "epoch": 0.896969696969697, + "grad_norm": 2.783501622971831, + "kl": 0.02158203125, + "learning_rate": 2e-07, + "loss": -0.009627214074134827, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000074505806, + "reward_std": 0.30665292739868166, + "rewards/MultiModalAccuracyORM": 0.3250000074505806, + "step": 2220, + "train_speed(iter/s)": 0.025959 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.35, + "epoch": 0.898989898989899, + "grad_norm": 64.84162647185127, + "kl": 0.042742919921875, + "learning_rate": 2e-07, + "loss": 0.027672123908996583, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1250000014901161, + "reward_std": 0.24265173375606536, + "rewards/MultiModalAccuracyORM": 0.1250000014901161, + "step": 2225, + "train_speed(iter/s)": 0.025989 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.25, + "epoch": 0.901010101010101, + "grad_norm": 2.756817795935333, + "kl": 0.027203369140625, + "learning_rate": 2e-07, + "loss": -0.0488799124956131, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.20000000149011612, + "reward_std": 0.2922547996044159, + "rewards/MultiModalAccuracyORM": 0.20000000149011612, + "step": 2230, + "train_speed(iter/s)": 0.026019 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.1, + "epoch": 0.9030303030303031, + "grad_norm": 3.484265646880912, + "kl": 0.0185455322265625, + "learning_rate": 2e-07, + "loss": -0.006375116109848022, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333447575569, + "reward_std": 0.2692514002323151, + "rewards/MultiModalAccuracyORM": 0.4083333447575569, + "step": 2235, + "train_speed(iter/s)": 0.026045 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.6, + "epoch": 0.9050505050505051, + "grad_norm": 0.08112989718996635, + "kl": 0.026385498046875, + "learning_rate": 2e-07, + "loss": 0.07493855953216552, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.22500000670552253, + "reward_std": 0.2915985196828842, + "rewards/MultiModalAccuracyORM": 0.22500000670552253, + "step": 2240, + "train_speed(iter/s)": 0.026071 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.5, + "epoch": 0.907070707070707, + "grad_norm": 2.1571772688182276, + "kl": 0.02109375, + "learning_rate": 2e-07, + "loss": -0.008470755815505982, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333656191825, + "reward_std": 0.1808116227388382, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 2245, + "train_speed(iter/s)": 0.026093 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 2.4521268907747036, + "learning_rate": 2e-07, + "loss": 0.02900133728981018, + "memory(GiB)": 87.45, + "step": 2250, + "train_speed(iter/s)": 0.026122 + }, + { + "epoch": 0.9090909090909091, + "eval_clip_ratio": 0.0, + "eval_completion_length": 326.39667755126953, + "eval_kl": 0.0267205810546875, + "eval_loss": 0.02248476631939411, + "eval_response_clip_ratio": 0.0, + "eval_reward": 0.3383333416283131, + "eval_reward_std": 0.30222029507160186, + "eval_rewards/MultiModalAccuracyORM": 0.3383333416283131, + "eval_runtime": 479.1069, + "eval_samples_per_second": 0.104, + "eval_steps_per_second": 0.01, + "step": 2250 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.825, + "epoch": 0.9111111111111111, + "grad_norm": 2.997368813220566, + "kl": 0.02721710205078125, + "learning_rate": 2e-07, + "loss": 0.003950953483581543, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4541666753590107, + "reward_std": 0.3525440260767937, + "rewards/MultiModalAccuracyORM": 0.4541666753590107, + "step": 2255, + "train_speed(iter/s)": 0.025886 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.3, + "epoch": 0.9131313131313131, + "grad_norm": 3.095107484502175, + "kl": 0.0549560546875, + "learning_rate": 2e-07, + "loss": 0.006377041339874268, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4750000052154064, + "reward_std": 0.30114119648933413, + "rewards/MultiModalAccuracyORM": 0.4750000052154064, + "step": 2260, + "train_speed(iter/s)": 0.025918 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.8, + "epoch": 0.9151515151515152, + "grad_norm": 2.764452940040707, + "kl": 0.025128173828125, + "learning_rate": 2e-07, + "loss": -0.060949933528900144, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2833333402872086, + "reward_std": 0.3563301384449005, + "rewards/MultiModalAccuracyORM": 0.2833333402872086, + "step": 2265, + "train_speed(iter/s)": 0.025947 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.85, + "epoch": 0.9171717171717172, + "grad_norm": 1.6613189303519411, + "kl": 0.0338897705078125, + "learning_rate": 2e-07, + "loss": 0.030397918820381165, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334177732468, + "reward_std": 0.21600489914417267, + "rewards/MultiModalAccuracyORM": 0.23333334177732468, + "step": 2270, + "train_speed(iter/s)": 0.025974 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.75, + "epoch": 0.9191919191919192, + "grad_norm": 2.4104355223612903, + "kl": 0.043646240234375, + "learning_rate": 2e-07, + "loss": 0.02471620440483093, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667088866236, + "reward_std": 0.22880061268806456, + "rewards/MultiModalAccuracyORM": 0.39166667088866236, + "step": 2275, + "train_speed(iter/s)": 0.026005 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.65, + "epoch": 0.9212121212121213, + "grad_norm": 0.9890862252945101, + "kl": 0.0232696533203125, + "learning_rate": 2e-07, + "loss": -0.019132834672927857, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.43333334773778914, + "reward_std": 0.28934226334095003, + "rewards/MultiModalAccuracyORM": 0.43333334773778914, + "step": 2280, + "train_speed(iter/s)": 0.026037 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.95, + "epoch": 0.9232323232323232, + "grad_norm": 2.8529813646862565, + "kl": 0.016925048828125, + "learning_rate": 2e-07, + "loss": 0.02090049088001251, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.33333334401249887, + "reward_std": 0.25286819934844973, + "rewards/MultiModalAccuracyORM": 0.33333334401249887, + "step": 2285, + "train_speed(iter/s)": 0.026068 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.05, + "epoch": 0.9252525252525252, + "grad_norm": 1.89117356154723, + "kl": 0.0194671630859375, + "learning_rate": 2e-07, + "loss": 0.006132407486438752, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.24166667014360427, + "reward_std": 0.33449481427669525, + "rewards/MultiModalAccuracyORM": 0.24166667014360427, + "step": 2290, + "train_speed(iter/s)": 0.026093 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.9, + "epoch": 0.9272727272727272, + "grad_norm": 1.5821722224404322, + "kl": 0.0285491943359375, + "learning_rate": 2e-07, + "loss": -0.055334615707397464, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3250000089406967, + "reward_std": 0.3450992465019226, + "rewards/MultiModalAccuracyORM": 0.3250000089406967, + "step": 2295, + "train_speed(iter/s)": 0.026121 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.45, + "epoch": 0.9292929292929293, + "grad_norm": 1.0631048809606616, + "kl": 0.0221282958984375, + "learning_rate": 2e-07, + "loss": 0.04601133763790131, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.17500000298023224, + "reward_std": 0.3211964577436447, + "rewards/MultiModalAccuracyORM": 0.17500000298023224, + "step": 2300, + "train_speed(iter/s)": 0.026144 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.7, + "epoch": 0.9313131313131313, + "grad_norm": 2.2872062972102016, + "kl": 0.013800048828125, + "learning_rate": 2e-07, + "loss": -0.06729268431663513, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.308333345502615, + "reward_std": 0.42669269144535066, + "rewards/MultiModalAccuracyORM": 0.308333345502615, + "step": 2305, + "train_speed(iter/s)": 0.026168 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.05, + "epoch": 0.9333333333333333, + "grad_norm": 1.5571796305098269, + "kl": 0.015960693359375, + "learning_rate": 2e-07, + "loss": 0.019453226029872893, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2750000111758709, + "reward_std": 0.2812868684530258, + "rewards/MultiModalAccuracyORM": 0.2750000111758709, + "step": 2310, + "train_speed(iter/s)": 0.02619 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.5, + "epoch": 0.9353535353535354, + "grad_norm": 1.2789781364913986, + "kl": 0.0262939453125, + "learning_rate": 2e-07, + "loss": -0.014371034502983094, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.39166667610406875, + "reward_std": 0.35789157152175904, + "rewards/MultiModalAccuracyORM": 0.39166667610406875, + "step": 2315, + "train_speed(iter/s)": 0.026212 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.75, + "epoch": 0.9373737373737374, + "grad_norm": 2.0043648431803742, + "kl": 0.0160247802734375, + "learning_rate": 2e-07, + "loss": 0.004941976815462113, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.2666666753590107, + "reward_std": 0.3945602476596832, + "rewards/MultiModalAccuracyORM": 0.2666666753590107, + "step": 2320, + "train_speed(iter/s)": 0.026239 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.45, + "epoch": 0.9393939393939394, + "grad_norm": 2.434275159571036, + "kl": 0.0218505859375, + "learning_rate": 2e-07, + "loss": 0.015783283114433288, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.30000001192092896, + "reward_std": 0.44407508671283724, + "rewards/MultiModalAccuracyORM": 0.30000001192092896, + "step": 2325, + "train_speed(iter/s)": 0.026266 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.45, + "epoch": 0.9414141414141414, + "grad_norm": 3.3518880188766262, + "kl": 0.0180023193359375, + "learning_rate": 2e-07, + "loss": 0.004853534698486328, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.37500001266598704, + "reward_std": 0.3925822228193283, + "rewards/MultiModalAccuracyORM": 0.37500001266598704, + "step": 2330, + "train_speed(iter/s)": 0.026293 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.3, + "epoch": 0.9434343434343434, + "grad_norm": 2.162505598086888, + "kl": 0.018927001953125, + "learning_rate": 2e-07, + "loss": 0.06589244604110718, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.05, + "reward": 0.3666666693985462, + "reward_std": 0.2581467509269714, + "rewards/MultiModalAccuracyORM": 0.3666666693985462, + "step": 2335, + "train_speed(iter/s)": 0.026311 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.1, + "epoch": 0.9454545454545454, + "grad_norm": 2.6990455984773494, + "kl": 0.0258880615234375, + "learning_rate": 2e-07, + "loss": 0.007903063297271728, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.15833333656191825, + "reward_std": 0.3127004593610764, + "rewards/MultiModalAccuracyORM": 0.15833333656191825, + "step": 2340, + "train_speed(iter/s)": 0.026336 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5, + "epoch": 0.9474747474747475, + "grad_norm": 31.778104916563368, + "kl": 0.046075439453125, + "learning_rate": 2e-07, + "loss": -0.046237149834632875, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.28333334624767303, + "reward_std": 0.3485885769128799, + "rewards/MultiModalAccuracyORM": 0.28333334624767303, + "step": 2345, + "train_speed(iter/s)": 0.026363 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.05, + "epoch": 0.9494949494949495, + "grad_norm": 1.8887972983852979, + "kl": 0.0284271240234375, + "learning_rate": 2e-07, + "loss": -0.044114714860916136, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.29166667610406877, + "reward_std": 0.3408351272344589, + "rewards/MultiModalAccuracyORM": 0.29166667610406877, + "step": 2350, + "train_speed(iter/s)": 0.026385 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.95, + "epoch": 0.9515151515151515, + "grad_norm": 2.719100446764501, + "kl": 0.0343994140625, + "learning_rate": 2e-07, + "loss": 0.030634421110153198, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.38333334028720856, + "reward_std": 0.379781112074852, + "rewards/MultiModalAccuracyORM": 0.38333334028720856, + "step": 2355, + "train_speed(iter/s)": 0.026406 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.7, + "epoch": 0.9535353535353536, + "grad_norm": 2.4658627482626816, + "kl": 0.033807373046875, + "learning_rate": 2e-07, + "loss": 0.026800933480262756, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4916666731238365, + "reward_std": 0.2393606811761856, + "rewards/MultiModalAccuracyORM": 0.4916666731238365, + "step": 2360, + "train_speed(iter/s)": 0.026436 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.25, + "epoch": 0.9555555555555556, + "grad_norm": 2.851734873550529, + "kl": 0.027685546875, + "learning_rate": 2e-07, + "loss": 0.013045597076416015, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3500000052154064, + "reward_std": 0.43759028911590575, + "rewards/MultiModalAccuracyORM": 0.3500000052154064, + "step": 2365, + "train_speed(iter/s)": 0.026463 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.05, + "epoch": 0.9575757575757575, + "grad_norm": 1.448742319519302, + "kl": 0.02066650390625, + "learning_rate": 2e-07, + "loss": -0.010880425572395325, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667237877847, + "reward_std": 0.3780420243740082, + "rewards/MultiModalAccuracyORM": 0.41666667237877847, + "step": 2370, + "train_speed(iter/s)": 0.026491 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.3, + "epoch": 0.9595959595959596, + "grad_norm": 1.7573565404253169, + "kl": 0.05279541015625, + "learning_rate": 2e-07, + "loss": -0.009101217985153199, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3916666805744171, + "reward_std": 0.32049004435539247, + "rewards/MultiModalAccuracyORM": 0.3916666805744171, + "step": 2375, + "train_speed(iter/s)": 0.026519 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.75, + "epoch": 0.9616161616161616, + "grad_norm": 1.3965100041612641, + "kl": 0.02640380859375, + "learning_rate": 2e-07, + "loss": 0.01602880358695984, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.41666667759418485, + "reward_std": 0.3471368789672852, + "rewards/MultiModalAccuracyORM": 0.41666667759418485, + "step": 2380, + "train_speed(iter/s)": 0.026544 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.25, + "epoch": 0.9636363636363636, + "grad_norm": 2.2883768350459732, + "kl": 0.0283721923828125, + "learning_rate": 2e-07, + "loss": -0.02478056252002716, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.18333333730697632, + "reward_std": 0.3252063632011414, + "rewards/MultiModalAccuracyORM": 0.18333333730697632, + "step": 2385, + "train_speed(iter/s)": 0.02657 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.4, + "epoch": 0.9656565656565657, + "grad_norm": 2.3698939133503027, + "kl": 0.027130126953125, + "learning_rate": 2e-07, + "loss": 0.0352479875087738, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.316666679084301, + "reward_std": 0.2815410941839218, + "rewards/MultiModalAccuracyORM": 0.316666679084301, + "step": 2390, + "train_speed(iter/s)": 0.0266 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.3, + "epoch": 0.9676767676767677, + "grad_norm": 2.6455515972771577, + "kl": 0.0282379150390625, + "learning_rate": 2e-07, + "loss": 0.02145477384328842, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.3416666738688946, + "reward_std": 0.43726191222667693, + "rewards/MultiModalAccuracyORM": 0.3416666738688946, + "step": 2395, + "train_speed(iter/s)": 0.026624 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.1, + "epoch": 0.9696969696969697, + "grad_norm": 1.3800009626988052, + "kl": 0.023291015625, + "learning_rate": 2e-07, + "loss": 0.009223046898841857, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.09166666939854622, + "reward_std": 0.1850757420063019, + "rewards/MultiModalAccuracyORM": 0.09166666939854622, + "step": 2400, + "train_speed(iter/s)": 0.02665 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.95, + "epoch": 0.9717171717171718, + "grad_norm": 2.707313244667536, + "kl": 0.0386138916015625, + "learning_rate": 2e-07, + "loss": -0.016336160898208617, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.38333333730697633, + "reward_std": 0.24860407412052155, + "rewards/MultiModalAccuracyORM": 0.38333333730697633, + "step": 2405, + "train_speed(iter/s)": 0.026681 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.65, + "epoch": 0.9737373737373738, + "grad_norm": 2.6298064760318223, + "kl": 0.031060791015625, + "learning_rate": 2e-07, + "loss": -0.026252752542495726, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.21666667014360427, + "reward_std": 0.3385047078132629, + "rewards/MultiModalAccuracyORM": 0.21666667014360427, + "step": 2410, + "train_speed(iter/s)": 0.0267 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.65, + "epoch": 0.9757575757575757, + "grad_norm": 2.0364458058384423, + "kl": 0.018072509765625, + "learning_rate": 2e-07, + "loss": -0.022683143615722656, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.483333345502615, + "reward_std": 0.2900991141796112, + "rewards/MultiModalAccuracyORM": 0.483333345502615, + "step": 2415, + "train_speed(iter/s)": 0.026729 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.65, + "epoch": 0.9777777777777777, + "grad_norm": 3.0539530097221843, + "kl": 0.0236175537109375, + "learning_rate": 2e-07, + "loss": -0.025200226902961732, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5166666835546494, + "reward_std": 0.3579155892133713, + "rewards/MultiModalAccuracyORM": 0.5166666835546494, + "step": 2420, + "train_speed(iter/s)": 0.026757 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.65, + "epoch": 0.9797979797979798, + "grad_norm": 2.837404902371068, + "kl": 0.0191802978515625, + "learning_rate": 2e-07, + "loss": -0.05283277034759522, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1916666679084301, + "reward_std": 0.24939410090446473, + "rewards/MultiModalAccuracyORM": 0.1916666679084301, + "step": 2425, + "train_speed(iter/s)": 0.026783 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.05, + "epoch": 0.9818181818181818, + "grad_norm": 1.2637214917941955, + "kl": 0.0302001953125, + "learning_rate": 2e-07, + "loss": 0.013781133294105529, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.37500001266598704, + "reward_std": 0.4204265236854553, + "rewards/MultiModalAccuracyORM": 0.37500001266598704, + "step": 2430, + "train_speed(iter/s)": 0.026802 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.8, + "epoch": 0.9838383838383838, + "grad_norm": 0.058208298350106734, + "kl": 0.0239227294921875, + "learning_rate": 2e-07, + "loss": 0.03573224246501923, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.32500000670552254, + "reward_std": 0.26597192585468293, + "rewards/MultiModalAccuracyORM": 0.32500000670552254, + "step": 2435, + "train_speed(iter/s)": 0.02683 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.85, + "epoch": 0.9858585858585859, + "grad_norm": 1.6302602474729853, + "kl": 0.0171844482421875, + "learning_rate": 2e-07, + "loss": -0.012005738914012909, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.1916666679084301, + "reward_std": 0.19717081785202026, + "rewards/MultiModalAccuracyORM": 0.1916666679084301, + "step": 2440, + "train_speed(iter/s)": 0.026853 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.0, + "epoch": 0.9878787878787879, + "grad_norm": 2.5433362450025765, + "kl": 0.0248321533203125, + "learning_rate": 2e-07, + "loss": -0.030718517303466798, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.35833333879709245, + "reward_std": 0.30894235968589784, + "rewards/MultiModalAccuracyORM": 0.35833333879709245, + "step": 2445, + "train_speed(iter/s)": 0.026875 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.35, + "epoch": 0.98989898989899, + "grad_norm": 1.0906797325242925, + "kl": 0.024761962890625, + "learning_rate": 2e-07, + "loss": -0.007297384738922119, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.23333334028720856, + "reward_std": 0.3393357157707214, + "rewards/MultiModalAccuracyORM": 0.23333334028720856, + "step": 2450, + "train_speed(iter/s)": 0.026893 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.7, + "epoch": 0.9919191919191919, + "grad_norm": 1.8168984918524227, + "kl": 0.0161956787109375, + "learning_rate": 2e-07, + "loss": 0.03163195252418518, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.450000011920929, + "reward_std": 0.37525942325592043, + "rewards/MultiModalAccuracyORM": 0.450000011920929, + "step": 2455, + "train_speed(iter/s)": 0.026915 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.05, + "epoch": 0.9939393939393939, + "grad_norm": 1.171315154121709, + "kl": 0.02081298828125, + "learning_rate": 2e-07, + "loss": 0.014726841449737548, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.4083333402872086, + "reward_std": 0.29634127020835876, + "rewards/MultiModalAccuracyORM": 0.4083333402872086, + "step": 2460, + "train_speed(iter/s)": 0.026936 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.6, + "epoch": 0.9959595959595959, + "grad_norm": 0.9872275853532635, + "kl": 0.01080322265625, + "learning_rate": 2e-07, + "loss": 0.01651265621185303, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.5666666753590107, + "reward_std": 0.2488823115825653, + "rewards/MultiModalAccuracyORM": 0.5666666753590107, + "step": 2465, + "train_speed(iter/s)": 0.026959 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.75, + "epoch": 0.997979797979798, + "grad_norm": 1.8423007639906985, + "kl": 0.0185638427734375, + "learning_rate": 2e-07, + "loss": -0.006967762112617492, + "memory(GiB)": 87.45, + "response_clip_ratio": 0.0, + "reward": 0.31666667237877844, + "reward_std": 0.21999078691005708, + "rewards/MultiModalAccuracyORM": 0.31666667237877844, + "step": 2470, + "train_speed(iter/s)": 0.02698 + }, + { + "epoch": 1.0, + "grad_norm": 2.4251028884123285, + "learning_rate": 2e-07, + "loss": -0.04546417593955994, + "memory(GiB)": 87.45, + "step": 2475, + "train_speed(iter/s)": 0.026999 + }, + { + "epoch": 1.0, + "eval_clip_ratio": 0.0, + "eval_completion_length": 364.18834014892576, + "eval_kl": 0.0238104248046875, + "eval_loss": 0.01933932490646839, + "eval_response_clip_ratio": 0.00833333358168602, + "eval_reward": 0.34333334282040595, + "eval_reward_std": 0.295663959980011, + "eval_rewards/MultiModalAccuracyORM": 0.34333334282040595, + "eval_runtime": 580.8644, + "eval_samples_per_second": 0.086, + "eval_steps_per_second": 0.009, + "step": 2475 + } + ], + "logging_steps": 5, + "max_steps": 2475, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}