| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9861932938856017, | |
| "eval_steps": 500, | |
| "global_step": 252, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 129.85416984558105, | |
| "epoch": 0.007889546351084813, | |
| "grad_norm": 3.8763763904571533, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.9994155764579773, | |
| "reward_std": 0.7669541835784912, | |
| "rewards/format_reward": 0.5937500149011612, | |
| "rewards/segmentation_reward": 0.4056655764579773, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 136.31250381469727, | |
| "epoch": 0.015779092702169626, | |
| "grad_norm": 15.093283653259277, | |
| "kl": 0.0006837844848632812, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.158398061990738, | |
| "reward_std": 0.7105055451393127, | |
| "rewards/format_reward": 0.6979166865348816, | |
| "rewards/segmentation_reward": 0.4604813829064369, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 141.71875381469727, | |
| "epoch": 0.023668639053254437, | |
| "grad_norm": 6.756906032562256, | |
| "kl": 0.000885009765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 1.1017816215753555, | |
| "reward_std": 0.5777320563793182, | |
| "rewards/format_reward": 0.6770833507180214, | |
| "rewards/segmentation_reward": 0.4246982932090759, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 133.3958396911621, | |
| "epoch": 0.03155818540433925, | |
| "grad_norm": 10.368873596191406, | |
| "kl": 0.0015869140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.040566012263298, | |
| "reward_std": 0.7502522468566895, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "rewards/segmentation_reward": 0.43639934808015823, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 143.71875762939453, | |
| "epoch": 0.03944773175542406, | |
| "grad_norm": 4.0498151779174805, | |
| "kl": 0.00193023681640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "reward": 1.1338110864162445, | |
| "reward_std": 0.6782345250248909, | |
| "rewards/format_reward": 0.708333358168602, | |
| "rewards/segmentation_reward": 0.4254777356982231, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 124.4062557220459, | |
| "epoch": 0.047337278106508875, | |
| "grad_norm": 20.274686813354492, | |
| "kl": 0.0040283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.3649465143680573, | |
| "reward_std": 0.4273431599140167, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "rewards/segmentation_reward": 0.5732797980308533, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 131.6666717529297, | |
| "epoch": 0.055226824457593686, | |
| "grad_norm": 13.247719764709473, | |
| "kl": 0.00662994384765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.2582235038280487, | |
| "reward_std": 0.5580896884202957, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "rewards/segmentation_reward": 0.5082234740257263, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 131.52083778381348, | |
| "epoch": 0.0631163708086785, | |
| "grad_norm": 11.869627952575684, | |
| "kl": 0.0071258544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.4533603489398956, | |
| "reward_std": 0.5124156475067139, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "rewards/segmentation_reward": 0.5991936326026917, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 132.90625381469727, | |
| "epoch": 0.07100591715976332, | |
| "grad_norm": 3.0018997192382812, | |
| "kl": 0.00543212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "reward": 1.2137048244476318, | |
| "reward_std": 0.5586739107966423, | |
| "rewards/format_reward": 0.708333358168602, | |
| "rewards/segmentation_reward": 0.5053714141249657, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 129.46875381469727, | |
| "epoch": 0.07889546351084813, | |
| "grad_norm": 15.782857894897461, | |
| "kl": 0.0078582763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.3436061143875122, | |
| "reward_std": 0.46356433629989624, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "rewards/segmentation_reward": 0.5206894353032112, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 125.33333587646484, | |
| "epoch": 0.08678500986193294, | |
| "grad_norm": 5.636582851409912, | |
| "kl": 0.0071868896484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.5922125577926636, | |
| "reward_std": 0.32092406041920185, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "rewards/segmentation_reward": 0.6755458265542984, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 120.16666793823242, | |
| "epoch": 0.09467455621301775, | |
| "grad_norm": 17.472490310668945, | |
| "kl": 0.00865936279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "reward": 1.4907157123088837, | |
| "reward_std": 0.3035526014864445, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "rewards/segmentation_reward": 0.5844657346606255, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 125.35416984558105, | |
| "epoch": 0.10256410256410256, | |
| "grad_norm": 3.8183038234710693, | |
| "kl": 0.0095672607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.4928309321403503, | |
| "reward_std": 0.46883198618888855, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "rewards/segmentation_reward": 0.6178309172391891, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 117.92708587646484, | |
| "epoch": 0.11045364891518737, | |
| "grad_norm": 6.6655073165893555, | |
| "kl": 0.0110626220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "reward": 1.5008599758148193, | |
| "reward_std": 0.32115011289715767, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.5529432520270348, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 123.93750190734863, | |
| "epoch": 0.11834319526627218, | |
| "grad_norm": 5.591380596160889, | |
| "kl": 0.012359619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.5433619320392609, | |
| "reward_std": 0.24045727029442787, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.5954452455043793, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 119.31250190734863, | |
| "epoch": 0.126232741617357, | |
| "grad_norm": 4.104308605194092, | |
| "kl": 0.0131378173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "reward": 1.5899344384670258, | |
| "reward_std": 0.2260684370994568, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.6420177668333054, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 121.08333778381348, | |
| "epoch": 0.1341222879684418, | |
| "grad_norm": 5.432396411895752, | |
| "kl": 0.0205535888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.5686425566673279, | |
| "reward_std": 0.3436120003461838, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "rewards/segmentation_reward": 0.6623925417661667, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 113.20833587646484, | |
| "epoch": 0.14201183431952663, | |
| "grad_norm": 5.052847862243652, | |
| "kl": 0.0161285400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.618556797504425, | |
| "reward_std": 0.22547486051917076, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6602234989404678, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 119.66666793823242, | |
| "epoch": 0.14990138067061143, | |
| "grad_norm": 4.373110771179199, | |
| "kl": 0.0154266357421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "reward": 1.5658972263336182, | |
| "reward_std": 0.2384246401488781, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.617980569601059, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 114.64583778381348, | |
| "epoch": 0.15779092702169625, | |
| "grad_norm": 5.60256290435791, | |
| "kl": 0.022735595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6209804117679596, | |
| "reward_std": 0.16078345850110054, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.641813725233078, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 112.28125190734863, | |
| "epoch": 0.16568047337278108, | |
| "grad_norm": 7.399819850921631, | |
| "kl": 0.0194091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.6231780648231506, | |
| "reward_std": 0.16180265322327614, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.644011452794075, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 118.37500190734863, | |
| "epoch": 0.17357001972386588, | |
| "grad_norm": 5.146966934204102, | |
| "kl": 0.020172119140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.5712138712406158, | |
| "reward_std": 0.18034345097839832, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.602463886141777, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 109.30208587646484, | |
| "epoch": 0.1814595660749507, | |
| "grad_norm": 10.886876106262207, | |
| "kl": 0.0229644775390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6121678054332733, | |
| "reward_std": 0.24894177541136742, | |
| "rewards/format_reward": 0.927083358168602, | |
| "rewards/segmentation_reward": 0.6850844025611877, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 115.40625190734863, | |
| "epoch": 0.1893491124260355, | |
| "grad_norm": 7.059934139251709, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.603627324104309, | |
| "reward_std": 0.2590697519481182, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.6557105779647827, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 105.81250190734863, | |
| "epoch": 0.19723865877712032, | |
| "grad_norm": 6.3141889572143555, | |
| "kl": 0.02276611328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.609460562467575, | |
| "reward_std": 0.21640464290976524, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.651127278804779, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 106.47916793823242, | |
| "epoch": 0.20512820512820512, | |
| "grad_norm": 5.775827884674072, | |
| "kl": 0.0296630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.5251965522766113, | |
| "reward_std": 0.21586111560463905, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.566863164305687, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 107.38541984558105, | |
| "epoch": 0.21301775147928995, | |
| "grad_norm": 6.3134379386901855, | |
| "kl": 0.0269775390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7112098634243011, | |
| "reward_std": 0.08758194698020816, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.711209774017334, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 106.05208778381348, | |
| "epoch": 0.22090729783037474, | |
| "grad_norm": 8.249260902404785, | |
| "kl": 0.02569580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.737591177225113, | |
| "reward_std": 0.03949332144111395, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7375911623239517, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 107.68750190734863, | |
| "epoch": 0.22879684418145957, | |
| "grad_norm": 21.619091033935547, | |
| "kl": 0.025360107421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7143003344535828, | |
| "reward_std": 0.14945492893457413, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7143002450466156, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 113.18750381469727, | |
| "epoch": 0.23668639053254437, | |
| "grad_norm": 11.503852844238281, | |
| "kl": 0.027099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6730247139930725, | |
| "reward_std": 0.17038158606737852, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7042748034000397, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 107.75000190734863, | |
| "epoch": 0.2445759368836292, | |
| "grad_norm": 6.8054704666137695, | |
| "kl": 0.0262451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7297659516334534, | |
| "reward_std": 0.0851635541766882, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7401826083660126, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 109.43750190734863, | |
| "epoch": 0.252465483234714, | |
| "grad_norm": 7.1466593742370605, | |
| "kl": 0.03009033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6955690681934357, | |
| "reward_std": 0.13199757551774383, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7164023518562317, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 112.22916984558105, | |
| "epoch": 0.2603550295857988, | |
| "grad_norm": 4.616764068603516, | |
| "kl": 0.031646728515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6858111023902893, | |
| "reward_std": 0.15736100357025862, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7066444158554077, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 105.96875190734863, | |
| "epoch": 0.2682445759368836, | |
| "grad_norm": 4.323781490325928, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.692588210105896, | |
| "reward_std": 0.18604120332747698, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.7446715235710144, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 107.27083587646484, | |
| "epoch": 0.27613412228796846, | |
| "grad_norm": 6.824800968170166, | |
| "kl": 0.038543701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.6549388766288757, | |
| "reward_std": 0.17549242451786995, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.686188817024231, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 109.16666793823242, | |
| "epoch": 0.28402366863905326, | |
| "grad_norm": 7.346378803253174, | |
| "kl": 0.025604248046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.6843693554401398, | |
| "reward_std": 0.13274440541863441, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6947860270738602, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 113.21875190734863, | |
| "epoch": 0.29191321499013806, | |
| "grad_norm": 10.401052474975586, | |
| "kl": 0.03076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6937299370765686, | |
| "reward_std": 0.08907313086092472, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7041465640068054, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 115.80208587646484, | |
| "epoch": 0.29980276134122286, | |
| "grad_norm": 8.589919090270996, | |
| "kl": 0.0274658203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7341200113296509, | |
| "reward_std": 0.12931476812809706, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7549533247947693, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 118.71875190734863, | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 12.668815612792969, | |
| "kl": 0.02984619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6736123263835907, | |
| "reward_std": 0.09443093463778496, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6840289980173111, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 121.02083587646484, | |
| "epoch": 0.3155818540433925, | |
| "grad_norm": 6.178909778594971, | |
| "kl": 0.022613525390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7086512446403503, | |
| "reward_std": 0.13165022525936365, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7190678864717484, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 121.04167175292969, | |
| "epoch": 0.3234714003944773, | |
| "grad_norm": 5.155839443206787, | |
| "kl": 0.022979736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6464684903621674, | |
| "reward_std": 0.16541135124862194, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.6881351172924042, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 123.53125381469727, | |
| "epoch": 0.33136094674556216, | |
| "grad_norm": 6.814601898193359, | |
| "kl": 0.01806640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "reward": 1.7974173426628113, | |
| "reward_std": 0.06069173011928797, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7974172979593277, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 114.95833587646484, | |
| "epoch": 0.33925049309664695, | |
| "grad_norm": 5.867588520050049, | |
| "kl": 0.030914306640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6036024987697601, | |
| "reward_std": 0.14355931896716356, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6244357973337173, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 119.41666793823242, | |
| "epoch": 0.34714003944773175, | |
| "grad_norm": 23.10361099243164, | |
| "kl": 0.02508544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7223087847232819, | |
| "reward_std": 0.1400171834975481, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7431419938802719, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 120.78125190734863, | |
| "epoch": 0.35502958579881655, | |
| "grad_norm": 8.673208236694336, | |
| "kl": 0.030670166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.5970956683158875, | |
| "reward_std": 0.20673380978405476, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.6387622952461243, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 122.50000190734863, | |
| "epoch": 0.3629191321499014, | |
| "grad_norm": 7.40756893157959, | |
| "kl": 0.032196044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6095560789108276, | |
| "reward_std": 0.24808678403496742, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.6616393625736237, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 119.16666984558105, | |
| "epoch": 0.3708086785009862, | |
| "grad_norm": 7.48468017578125, | |
| "kl": 0.02667236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6872661709785461, | |
| "reward_std": 0.1560370922088623, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7289328575134277, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 121.70833587646484, | |
| "epoch": 0.378698224852071, | |
| "grad_norm": 6.0176005363464355, | |
| "kl": 0.02667236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.705275982618332, | |
| "reward_std": 0.12693702802062035, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7156926393508911, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 119.76041793823242, | |
| "epoch": 0.3865877712031558, | |
| "grad_norm": 19.220964431762695, | |
| "kl": 0.03289794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.67743381857872, | |
| "reward_std": 0.11258519534021616, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.6982671469449997, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 124.2812557220459, | |
| "epoch": 0.39447731755424065, | |
| "grad_norm": 5.315231800079346, | |
| "kl": 0.033966064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6300671994686127, | |
| "reward_std": 0.2049922477453947, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6717338263988495, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 120.82291984558105, | |
| "epoch": 0.40236686390532544, | |
| "grad_norm": 6.098005771636963, | |
| "kl": 0.028411865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7469660639762878, | |
| "reward_std": 0.137971930205822, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7573827505111694, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 125.18750381469727, | |
| "epoch": 0.41025641025641024, | |
| "grad_norm": 8.745720863342285, | |
| "kl": 0.02703857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6644043028354645, | |
| "reward_std": 0.13505547679960728, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6852375864982605, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 119.29166984558105, | |
| "epoch": 0.4181459566074951, | |
| "grad_norm": 7.582885265350342, | |
| "kl": 0.02655029296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7572149634361267, | |
| "reward_std": 0.05950829852372408, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7572149932384491, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 122.39583587646484, | |
| "epoch": 0.4260355029585799, | |
| "grad_norm": 5.690119743347168, | |
| "kl": 0.02685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7116829752922058, | |
| "reward_std": 0.1341047789901495, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7325162440538406, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 123.38541793823242, | |
| "epoch": 0.4339250493096647, | |
| "grad_norm": 4.931894779205322, | |
| "kl": 0.026763916015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7832928001880646, | |
| "reward_std": 0.14402389503084123, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.8041260987520218, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 123.33333587646484, | |
| "epoch": 0.4418145956607495, | |
| "grad_norm": 6.575313091278076, | |
| "kl": 0.026123046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7617658376693726, | |
| "reward_std": 0.08360383100807667, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7617658227682114, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 117.22916793823242, | |
| "epoch": 0.44970414201183434, | |
| "grad_norm": 5.90615701675415, | |
| "kl": 0.02032470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "reward": 1.733703076839447, | |
| "reward_std": 0.08622908592224121, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7441196739673615, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 120.58333587646484, | |
| "epoch": 0.45759368836291914, | |
| "grad_norm": 5.734189510345459, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.765779048204422, | |
| "reward_std": 0.05810722103342414, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7657789587974548, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 122.66666984558105, | |
| "epoch": 0.46548323471400394, | |
| "grad_norm": 4.0810956954956055, | |
| "kl": 0.02264404296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7729882895946503, | |
| "reward_std": 0.06873794272542, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7729882448911667, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 122.95833587646484, | |
| "epoch": 0.47337278106508873, | |
| "grad_norm": 8.775988578796387, | |
| "kl": 0.02838134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6857317984104156, | |
| "reward_std": 0.20080409944057465, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7169817835092545, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 122.13541984558105, | |
| "epoch": 0.4812623274161736, | |
| "grad_norm": 12.763365745544434, | |
| "kl": 0.024200439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7577170729637146, | |
| "reward_std": 0.1079479455947876, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7681336998939514, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 119.69791793823242, | |
| "epoch": 0.4891518737672584, | |
| "grad_norm": 9.827467918395996, | |
| "kl": 0.026092529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7310417592525482, | |
| "reward_std": 0.16975797526538372, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7518750727176666, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 123.47916984558105, | |
| "epoch": 0.4970414201183432, | |
| "grad_norm": 11.630967140197754, | |
| "kl": 0.03076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6405887305736542, | |
| "reward_std": 0.20058110356330872, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.6822553277015686, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 116.79166793823242, | |
| "epoch": 0.504930966469428, | |
| "grad_norm": 4.396234035491943, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7577708065509796, | |
| "reward_std": 0.056545618921518326, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.757770836353302, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 116.90625381469727, | |
| "epoch": 0.5128205128205128, | |
| "grad_norm": 8.891778945922852, | |
| "kl": 0.024566650390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.7180190980434418, | |
| "reward_std": 0.1297496184706688, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.738852322101593, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 115.18750190734863, | |
| "epoch": 0.5207100591715976, | |
| "grad_norm": 5.857807159423828, | |
| "kl": 0.026397705078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6948060393333435, | |
| "reward_std": 0.09921691659837961, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7052226364612579, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 121.09375381469727, | |
| "epoch": 0.5285996055226825, | |
| "grad_norm": 6.213242053985596, | |
| "kl": 0.022308349609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.7551944553852081, | |
| "reward_std": 0.15169572597369552, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7760278284549713, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 115.51041793823242, | |
| "epoch": 0.5364891518737672, | |
| "grad_norm": 8.733428001403809, | |
| "kl": 0.027435302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6302316784858704, | |
| "reward_std": 0.229142090305686, | |
| "rewards/format_reward": 0.9375, | |
| "rewards/segmentation_reward": 0.6927317082881927, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 127.31250381469727, | |
| "epoch": 0.5443786982248521, | |
| "grad_norm": 8.592183113098145, | |
| "kl": 0.026947021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.5972277224063873, | |
| "reward_std": 0.1787685491144657, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6284777224063873, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 121.78125190734863, | |
| "epoch": 0.5522682445759369, | |
| "grad_norm": 4.904063701629639, | |
| "kl": 0.032379150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6003088057041168, | |
| "reward_std": 0.14930523186922073, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6315587908029556, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 116.23958587646484, | |
| "epoch": 0.5601577909270217, | |
| "grad_norm": 7.675012588500977, | |
| "kl": 0.048187255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.6342654526233673, | |
| "reward_std": 0.24084181897342205, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "rewards/segmentation_reward": 0.6863488405942917, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 110.20833587646484, | |
| "epoch": 0.5680473372781065, | |
| "grad_norm": 6.364027500152588, | |
| "kl": 0.0234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.6845062971115112, | |
| "reward_std": 0.07788908947259188, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.705339640378952, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 114.71875190734863, | |
| "epoch": 0.5759368836291914, | |
| "grad_norm": 12.653519630432129, | |
| "kl": 0.03668212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.655396193265915, | |
| "reward_std": 0.16850140318274498, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6866461485624313, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 114.77083587646484, | |
| "epoch": 0.5838264299802761, | |
| "grad_norm": 7.866998195648193, | |
| "kl": 0.025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.6978517174720764, | |
| "reward_std": 0.14674336509779096, | |
| "rewards/format_reward": 0.96875, | |
| "rewards/segmentation_reward": 0.7291017025709152, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 113.13541984558105, | |
| "epoch": 0.591715976331361, | |
| "grad_norm": 9.050918579101562, | |
| "kl": 0.0279541015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6342374980449677, | |
| "reward_std": 0.09696032106876373, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6342374533414841, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 110.10417175292969, | |
| "epoch": 0.5996055226824457, | |
| "grad_norm": 11.760872840881348, | |
| "kl": 0.02703857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7879691123962402, | |
| "reward_std": 0.08564800582826138, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7983856648206711, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 116.18750381469727, | |
| "epoch": 0.6074950690335306, | |
| "grad_norm": 8.666062355041504, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.6772170960903168, | |
| "reward_std": 0.1316035082563758, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.6980504393577576, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 113.64583396911621, | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 10.047959327697754, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7389695942401886, | |
| "reward_std": 0.06903288420289755, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7389696538448334, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 115.52083587646484, | |
| "epoch": 0.6232741617357002, | |
| "grad_norm": 10.192525863647461, | |
| "kl": 0.0306396484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7115220725536346, | |
| "reward_std": 0.13409498427063227, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7323553562164307, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 110.38541984558105, | |
| "epoch": 0.631163708086785, | |
| "grad_norm": 6.273956298828125, | |
| "kl": 0.0413818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.5747724771499634, | |
| "reward_std": 0.12117741256952286, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.5747724398970604, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 109.43750381469727, | |
| "epoch": 0.6390532544378699, | |
| "grad_norm": 6.993807315826416, | |
| "kl": 0.030975341796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7259157598018646, | |
| "reward_std": 0.09234390081837773, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7363324463367462, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 110.15625190734863, | |
| "epoch": 0.6469428007889546, | |
| "grad_norm": 5.614496231079102, | |
| "kl": 0.030731201171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7176712453365326, | |
| "reward_std": 0.15571256913244724, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7385045886039734, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 109.75000190734863, | |
| "epoch": 0.6548323471400395, | |
| "grad_norm": 25.766254425048828, | |
| "kl": 0.0313720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.641405701637268, | |
| "reward_std": 0.14323315117508173, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.6726557016372681, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 115.44791793823242, | |
| "epoch": 0.6627218934911243, | |
| "grad_norm": 5.393517971038818, | |
| "kl": 0.028839111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7041931748390198, | |
| "reward_std": 0.07750691333785653, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.714609831571579, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 111.65625190734863, | |
| "epoch": 0.6706114398422091, | |
| "grad_norm": 4.121002674102783, | |
| "kl": 0.02972412109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7423414289951324, | |
| "reward_std": 0.10370620153844357, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7527580559253693, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 116.79166984558105, | |
| "epoch": 0.6785009861932939, | |
| "grad_norm": 18.222322463989258, | |
| "kl": 0.02703857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7806086838245392, | |
| "reward_std": 0.07675650157034397, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.791025385260582, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 115.37500381469727, | |
| "epoch": 0.6863905325443787, | |
| "grad_norm": 7.765015125274658, | |
| "kl": 0.02838134765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.693938434123993, | |
| "reward_std": 0.15568579966202378, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7251883894205093, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 115.29166984558105, | |
| "epoch": 0.6942800788954635, | |
| "grad_norm": 18.34151840209961, | |
| "kl": 0.026214599609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "reward": 1.691928893327713, | |
| "reward_std": 0.140016196295619, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7231789231300354, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 121.05208396911621, | |
| "epoch": 0.7021696252465484, | |
| "grad_norm": 10.368853569030762, | |
| "kl": 0.021881103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "reward": 1.8064870834350586, | |
| "reward_std": 0.09992434550076723, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.816903680562973, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 114.10416984558105, | |
| "epoch": 0.7100591715976331, | |
| "grad_norm": 4.952389240264893, | |
| "kl": 0.03314208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7042470872402191, | |
| "reward_std": 0.06299979891628027, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7042470574378967, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 117.68750190734863, | |
| "epoch": 0.717948717948718, | |
| "grad_norm": 9.378170013427734, | |
| "kl": 0.03192138671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7310494482517242, | |
| "reward_std": 0.09936824138276279, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7414660751819611, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 119.51041793823242, | |
| "epoch": 0.7258382642998028, | |
| "grad_norm": 14.939569473266602, | |
| "kl": 0.029388427734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8184643685817719, | |
| "reward_std": 0.07522716512903571, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.818464383482933, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 128.58333778381348, | |
| "epoch": 0.7337278106508875, | |
| "grad_norm": 8.10946273803711, | |
| "kl": 0.032073974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6870926320552826, | |
| "reward_std": 0.14518141373991966, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.707925945520401, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 116.75000190734863, | |
| "epoch": 0.7416173570019724, | |
| "grad_norm": 4.120285511016846, | |
| "kl": 0.02752685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.6995415091514587, | |
| "reward_std": 0.1466266419738531, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.720374807715416, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 123.09375190734863, | |
| "epoch": 0.7495069033530573, | |
| "grad_norm": 16.04999351501465, | |
| "kl": 0.032073974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7072420418262482, | |
| "reward_std": 0.12071913667023182, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7280752956867218, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 125.27083778381348, | |
| "epoch": 0.757396449704142, | |
| "grad_norm": 4.702588081359863, | |
| "kl": 0.029693603515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.629258632659912, | |
| "reward_std": 0.13004819490015507, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.6605086177587509, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 120.84375190734863, | |
| "epoch": 0.7652859960552268, | |
| "grad_norm": 4.365908622741699, | |
| "kl": 0.03076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6611265540122986, | |
| "reward_std": 0.12717730086296797, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6819598078727722, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 121.59375381469727, | |
| "epoch": 0.7731755424063116, | |
| "grad_norm": 9.223386764526367, | |
| "kl": 0.02752685546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "reward": 1.7275770008563995, | |
| "reward_std": 0.08318935800343752, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7275769412517548, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 126.01041793823242, | |
| "epoch": 0.7810650887573964, | |
| "grad_norm": 12.06326961517334, | |
| "kl": 0.032257080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7239615619182587, | |
| "reward_std": 0.11263703089207411, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7343782037496567, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 128.11458587646484, | |
| "epoch": 0.7889546351084813, | |
| "grad_norm": 16.42841148376465, | |
| "kl": 0.028778076171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8079151809215546, | |
| "reward_std": 0.06341414572671056, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.818331703543663, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 118.48958587646484, | |
| "epoch": 0.796844181459566, | |
| "grad_norm": 6.883551597595215, | |
| "kl": 0.0294189453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.7349311411380768, | |
| "reward_std": 0.12644702824763954, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7765978276729584, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 127.86458778381348, | |
| "epoch": 0.8047337278106509, | |
| "grad_norm": 9.619441032409668, | |
| "kl": 0.03240966796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.7535338401794434, | |
| "reward_std": 0.1224900484085083, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7639505118131638, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 124.44792175292969, | |
| "epoch": 0.8126232741617357, | |
| "grad_norm": 4.861848831176758, | |
| "kl": 0.03369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7802566289901733, | |
| "reward_std": 0.06228824611753225, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7906733006238937, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 118.57291793823242, | |
| "epoch": 0.8205128205128205, | |
| "grad_norm": 12.132479667663574, | |
| "kl": 0.03399658203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6861309111118317, | |
| "reward_std": 0.06774610374122858, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6861308366060257, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 122.41666793823242, | |
| "epoch": 0.8284023668639053, | |
| "grad_norm": 4.027884483337402, | |
| "kl": 0.03985595703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7455581724643707, | |
| "reward_std": 0.11986150033771992, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7663915157318115, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 121.88542175292969, | |
| "epoch": 0.8362919132149902, | |
| "grad_norm": 8.1727933883667, | |
| "kl": 0.03948974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7751370966434479, | |
| "reward_std": 0.05637869983911514, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7751370966434479, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 120.05208587646484, | |
| "epoch": 0.8441814595660749, | |
| "grad_norm": 30.948610305786133, | |
| "kl": 0.0394287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.6998119950294495, | |
| "reward_std": 0.13076962530612946, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7206452935934067, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 125.39583396911621, | |
| "epoch": 0.8520710059171598, | |
| "grad_norm": 10.170028686523438, | |
| "kl": 0.03857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7232375144958496, | |
| "reward_std": 0.051960008684545755, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7336541265249252, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 120.46875190734863, | |
| "epoch": 0.8599605522682445, | |
| "grad_norm": 35.49694061279297, | |
| "kl": 0.04034423828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7316677272319794, | |
| "reward_std": 0.07910315738990903, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7420843094587326, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 120.42708587646484, | |
| "epoch": 0.8678500986193294, | |
| "grad_norm": 12.206290245056152, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7922857403755188, | |
| "reward_std": 0.05224635393824428, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7922856956720352, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 123.88541793823242, | |
| "epoch": 0.8757396449704142, | |
| "grad_norm": 7.881491184234619, | |
| "kl": 0.038055419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.6793254017829895, | |
| "reward_std": 0.2003505825996399, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7209920138120651, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 117.27083778381348, | |
| "epoch": 0.883629191321499, | |
| "grad_norm": 7.963736057281494, | |
| "kl": 0.048980712890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.6975261867046356, | |
| "reward_std": 0.16515915468335152, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.728776216506958, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 115.54166793823242, | |
| "epoch": 0.8915187376725838, | |
| "grad_norm": 4.596273899078369, | |
| "kl": 0.0467529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7152385115623474, | |
| "reward_std": 0.09552549291402102, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7256551682949066, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 120.69791793823242, | |
| "epoch": 0.8994082840236687, | |
| "grad_norm": 26.24955177307129, | |
| "kl": 0.03668212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.673191875219345, | |
| "reward_std": 0.07764662057161331, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6731918603181839, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 117.08333587646484, | |
| "epoch": 0.9072978303747534, | |
| "grad_norm": 4.7493696212768555, | |
| "kl": 0.03692626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7461699545383453, | |
| "reward_std": 0.08548186905682087, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.756586566567421, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 115.64583587646484, | |
| "epoch": 0.9151873767258383, | |
| "grad_norm": 3.8763372898101807, | |
| "kl": 0.03564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7519335448741913, | |
| "reward_std": 0.07535870576975867, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7623501420021057, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 116.66666984558105, | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 7.987335681915283, | |
| "kl": 0.034820556640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.752055674791336, | |
| "reward_std": 0.04146988596767187, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7520556747913361, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 112.97916984558105, | |
| "epoch": 0.9309664694280079, | |
| "grad_norm": 7.9439802169799805, | |
| "kl": 0.033843994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7821557521820068, | |
| "reward_std": 0.04594178858678788, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.782155767083168, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 112.35416793823242, | |
| "epoch": 0.9388560157790927, | |
| "grad_norm": 14.11174488067627, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7454370260238647, | |
| "reward_std": 0.07132519292645156, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7558536529541016, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 116.09375190734863, | |
| "epoch": 0.9467455621301775, | |
| "grad_norm": 6.1710309982299805, | |
| "kl": 0.04046630859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.6930521726608276, | |
| "reward_std": 0.08492282312363386, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7034688144922256, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 110.62500190734863, | |
| "epoch": 0.9546351084812623, | |
| "grad_norm": 5.6831254959106445, | |
| "kl": 0.03900146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7358277440071106, | |
| "reward_std": 0.03763490542769432, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7358278036117554, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 110.30208396911621, | |
| "epoch": 0.9625246548323472, | |
| "grad_norm": 6.731239318847656, | |
| "kl": 0.0379638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7100496292114258, | |
| "reward_std": 0.11989523191004992, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7308829575777054, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 121.90625190734863, | |
| "epoch": 0.9704142011834319, | |
| "grad_norm": 6.267615795135498, | |
| "kl": 0.03106689453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.6641573905944824, | |
| "reward_std": 0.06016020243987441, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.674574002623558, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 115.28125190734863, | |
| "epoch": 0.9783037475345168, | |
| "grad_norm": 11.331254959106445, | |
| "kl": 0.03729248046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7366459667682648, | |
| "reward_std": 0.1369774853810668, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.76789590716362, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 115.06250190734863, | |
| "epoch": 0.9861932938856016, | |
| "grad_norm": 7.267125606536865, | |
| "kl": 0.034027099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.765406996011734, | |
| "reward_std": 0.06693344842642546, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7758236229419708, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 120.85416984558105, | |
| "epoch": 0.9940828402366864, | |
| "grad_norm": 24.98333168029785, | |
| "kl": 0.03643798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6338366866111755, | |
| "reward_std": 0.11646552104502916, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.654670000076294, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 117.5138931274414, | |
| "epoch": 1.0, | |
| "grad_norm": 24.98333168029785, | |
| "kl": 0.038167317708333336, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "reward": 1.8023079633712769, | |
| "reward_std": 0.0641996127863725, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8023078640302023, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 118.71875190734863, | |
| "epoch": 1.0078895463510849, | |
| "grad_norm": 3.1590914726257324, | |
| "kl": 0.036834716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.74771448969841, | |
| "reward_std": 0.12587013002485037, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7685477286577225, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 117.43750190734863, | |
| "epoch": 1.0157790927021697, | |
| "grad_norm": 3.237163543701172, | |
| "kl": 0.0372314453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7158082127571106, | |
| "reward_std": 0.08808104507625103, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7262248396873474, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 116.84375190734863, | |
| "epoch": 1.0236686390532543, | |
| "grad_norm": 2.3869004249572754, | |
| "kl": 0.045928955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.732060581445694, | |
| "reward_std": 0.05624359124340117, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7320606112480164, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 115.43750190734863, | |
| "epoch": 1.0315581854043392, | |
| "grad_norm": 2.4740047454833984, | |
| "kl": 0.032684326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 1.6838627457618713, | |
| "reward_std": 0.1397713664919138, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7151127308607101, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 121.77083778381348, | |
| "epoch": 1.039447731755424, | |
| "grad_norm": 4.081189155578613, | |
| "kl": 0.0347900390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.6779897511005402, | |
| "reward_std": 0.12438491266220808, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6988230645656586, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 124.55208778381348, | |
| "epoch": 1.047337278106509, | |
| "grad_norm": 2.9231972694396973, | |
| "kl": 0.036865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "reward": 1.7227727770805359, | |
| "reward_std": 0.09910841938108206, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7436061203479767, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 116.89583778381348, | |
| "epoch": 1.0552268244575937, | |
| "grad_norm": 1.8719556331634521, | |
| "kl": 0.03448486328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7760542333126068, | |
| "reward_std": 0.05843149218708277, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.776054173707962, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 125.28125190734863, | |
| "epoch": 1.0631163708086786, | |
| "grad_norm": 4.612652778625488, | |
| "kl": 0.03936767578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7310554683208466, | |
| "reward_std": 0.0841823904775083, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.741472139954567, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 125.41667175292969, | |
| "epoch": 1.0710059171597632, | |
| "grad_norm": 2.9326233863830566, | |
| "kl": 0.03546142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "reward": 1.7228147089481354, | |
| "reward_std": 0.09376791398972273, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7228147238492966, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 124.57291793823242, | |
| "epoch": 1.078895463510848, | |
| "grad_norm": 2.6695199012756348, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7047127783298492, | |
| "reward_std": 0.08051302284002304, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7151293754577637, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 118.53125381469727, | |
| "epoch": 1.086785009861933, | |
| "grad_norm": 5.622714996337891, | |
| "kl": 0.042724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.682199478149414, | |
| "reward_std": 0.05199452117085457, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.6821994781494141, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 127.53125762939453, | |
| "epoch": 1.0946745562130178, | |
| "grad_norm": 2.1606082916259766, | |
| "kl": 0.0406494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "reward": 1.7326411008834839, | |
| "reward_std": 0.18974016141146421, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7743077427148819, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 121.75000381469727, | |
| "epoch": 1.1025641025641026, | |
| "grad_norm": 2.8073818683624268, | |
| "kl": 0.041961669921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.7088265120983124, | |
| "reward_std": 0.0832697688601911, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7192431837320328, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 117.55208587646484, | |
| "epoch": 1.1104536489151873, | |
| "grad_norm": 1.8191330432891846, | |
| "kl": 0.041748046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.721179723739624, | |
| "reward_std": 0.05665353685617447, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7211797088384628, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 110.60416984558105, | |
| "epoch": 1.1183431952662721, | |
| "grad_norm": 1.681236743927002, | |
| "kl": 0.04168701171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "reward": 1.8110205233097076, | |
| "reward_std": 0.0791134424507618, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8214371502399445, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 112.11458587646484, | |
| "epoch": 1.126232741617357, | |
| "grad_norm": 1.9118095636367798, | |
| "kl": 0.0439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7839480638504028, | |
| "reward_std": 0.08729123277589679, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7943646758794785, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 114.46875381469727, | |
| "epoch": 1.1341222879684418, | |
| "grad_norm": 2.732917308807373, | |
| "kl": 0.044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "reward": 1.7890871465206146, | |
| "reward_std": 0.08393839001655579, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7995037585496902, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 113.70833587646484, | |
| "epoch": 1.1420118343195267, | |
| "grad_norm": 4.204154014587402, | |
| "kl": 0.05718994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.718978464603424, | |
| "reward_std": 0.11887417268007994, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7398117780685425, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 112.35416984558105, | |
| "epoch": 1.1499013806706113, | |
| "grad_norm": 7.688730239868164, | |
| "kl": 0.048583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7795838117599487, | |
| "reward_std": 0.06728760804980993, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7795837968587875, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 116.03125190734863, | |
| "epoch": 1.1577909270216962, | |
| "grad_norm": 1.9673783779144287, | |
| "kl": 0.05224609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.740858942270279, | |
| "reward_std": 0.08058743178844452, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7408589273691177, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 114.85416984558105, | |
| "epoch": 1.165680473372781, | |
| "grad_norm": 3.3340156078338623, | |
| "kl": 0.05194091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.72911536693573, | |
| "reward_std": 0.09798233769834042, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7395320534706116, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 119.91667175292969, | |
| "epoch": 1.1735700197238659, | |
| "grad_norm": 2.7571516036987305, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.6843095421791077, | |
| "reward_std": 0.172153084538877, | |
| "rewards/format_reward": 0.958333358168602, | |
| "rewards/segmentation_reward": 0.7259761840105057, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 111.11458587646484, | |
| "epoch": 1.1814595660749507, | |
| "grad_norm": 5.544634819030762, | |
| "kl": 0.0479736328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0019, | |
| "reward": 1.7362055480480194, | |
| "reward_std": 0.12884071236476302, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7570388317108154, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 114.02083587646484, | |
| "epoch": 1.1893491124260356, | |
| "grad_norm": 1.6093388795852661, | |
| "kl": 0.05059814453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "reward": 1.7716272175312042, | |
| "reward_std": 0.09435711428523064, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.782043844461441, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 106.68750190734863, | |
| "epoch": 1.1972386587771204, | |
| "grad_norm": 2.3275067806243896, | |
| "kl": 0.06378173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7265048921108246, | |
| "reward_std": 0.08734399639070034, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7369215190410614, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 114.94791984558105, | |
| "epoch": 1.205128205128205, | |
| "grad_norm": 3.726362705230713, | |
| "kl": 0.06097412109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7594355046749115, | |
| "reward_std": 0.12504899874329567, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7802688479423523, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 117.42708778381348, | |
| "epoch": 1.21301775147929, | |
| "grad_norm": 4.784036636352539, | |
| "kl": 0.052490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.795677661895752, | |
| "reward_std": 0.0523160370066762, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.795677661895752, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 116.28125190734863, | |
| "epoch": 1.2209072978303748, | |
| "grad_norm": 4.563464641571045, | |
| "kl": 0.05303955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7723517715930939, | |
| "reward_std": 0.0657765488140285, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7723517417907715, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 115.67708587646484, | |
| "epoch": 1.2287968441814596, | |
| "grad_norm": 5.471823215484619, | |
| "kl": 0.0528564453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7356767654418945, | |
| "reward_std": 0.12338399747386575, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7565100789070129, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 115.55208778381348, | |
| "epoch": 1.2366863905325443, | |
| "grad_norm": 1.571742057800293, | |
| "kl": 0.0523681640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7827491760253906, | |
| "reward_std": 0.06495401542633772, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7931658029556274, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 114.43750190734863, | |
| "epoch": 1.244575936883629, | |
| "grad_norm": 2.303530216217041, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7282366454601288, | |
| "reward_std": 0.04335784679278731, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7282366305589676, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 108.57291793823242, | |
| "epoch": 1.252465483234714, | |
| "grad_norm": 2.374271869659424, | |
| "kl": 0.06158447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7437051832675934, | |
| "reward_std": 0.07279739435762167, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.743705153465271, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 107.39583396911621, | |
| "epoch": 1.2603550295857988, | |
| "grad_norm": 6.32477331161499, | |
| "kl": 0.0670166015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.726712703704834, | |
| "reward_std": 0.1198442354798317, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.747546061873436, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 106.02083587646484, | |
| "epoch": 1.2682445759368837, | |
| "grad_norm": 22.797327041625977, | |
| "kl": 0.0625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7934371829032898, | |
| "reward_std": 0.06572614051401615, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8038537800312042, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 108.35416984558105, | |
| "epoch": 1.2761341222879685, | |
| "grad_norm": 2.8181421756744385, | |
| "kl": 0.1036376953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0042, | |
| "reward": 1.793130248785019, | |
| "reward_std": 0.06474862340837717, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8035468757152557, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 105.84375190734863, | |
| "epoch": 1.2840236686390534, | |
| "grad_norm": 2.300621509552002, | |
| "kl": 0.0782470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.7725840210914612, | |
| "reward_std": 0.09849729388952255, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7830007076263428, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 103.26041984558105, | |
| "epoch": 1.291913214990138, | |
| "grad_norm": 19.949954986572266, | |
| "kl": 0.072998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7890552282333374, | |
| "reward_std": 0.06625080667436123, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7994718849658966, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 103.05208587646484, | |
| "epoch": 1.2998027613412229, | |
| "grad_norm": 2.4026191234588623, | |
| "kl": 0.0733642578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7272422015666962, | |
| "reward_std": 0.07955199759453535, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7376587837934494, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 96.37500381469727, | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 3.9920060634613037, | |
| "kl": 0.0792236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7014544606208801, | |
| "reward_std": 0.10399023815989494, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7222877889871597, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 96.16666793823242, | |
| "epoch": 1.3155818540433926, | |
| "grad_norm": 3.3116116523742676, | |
| "kl": 0.08984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.795901119709015, | |
| "reward_std": 0.06413333816453815, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7959010004997253, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 95.82291984558105, | |
| "epoch": 1.3234714003944772, | |
| "grad_norm": 1.6592262983322144, | |
| "kl": 0.090576171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.6837320029735565, | |
| "reward_std": 0.14411024749279022, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "rewards/segmentation_reward": 0.7149820476770401, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 94.01041984558105, | |
| "epoch": 1.331360946745562, | |
| "grad_norm": 2.873749256134033, | |
| "kl": 0.080078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7172091603279114, | |
| "reward_std": 0.10507537145167589, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7276259064674377, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 92.86458587646484, | |
| "epoch": 1.339250493096647, | |
| "grad_norm": 8.259775161743164, | |
| "kl": 0.0858154296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7761318981647491, | |
| "reward_std": 0.041504258988425136, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.776131808757782, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 90.58333587646484, | |
| "epoch": 1.3471400394477318, | |
| "grad_norm": 2.346843719482422, | |
| "kl": 0.119384765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0048, | |
| "reward": 1.759489893913269, | |
| "reward_std": 0.0735306958667934, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7699065655469894, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 93.85416984558105, | |
| "epoch": 1.3550295857988166, | |
| "grad_norm": 1.9567958116531372, | |
| "kl": 0.11865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0047, | |
| "reward": 1.7193129658699036, | |
| "reward_std": 0.07020795345306396, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7193129658699036, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 92.85416984558105, | |
| "epoch": 1.3629191321499015, | |
| "grad_norm": 2.2362473011016846, | |
| "kl": 0.1275634765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0051, | |
| "reward": 1.7795703411102295, | |
| "reward_std": 0.07625795714557171, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7899868935346603, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 101.35416984558105, | |
| "epoch": 1.3708086785009863, | |
| "grad_norm": 3.968324899673462, | |
| "kl": 0.125732421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.005, | |
| "reward": 1.7042430341243744, | |
| "reward_std": 0.10214248253032565, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7250763475894928, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 95.37500190734863, | |
| "epoch": 1.378698224852071, | |
| "grad_norm": 3.955090045928955, | |
| "kl": 0.1297607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0052, | |
| "reward": 1.7905175387859344, | |
| "reward_std": 0.038271178025752306, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7905174940824509, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 95.66666984558105, | |
| "epoch": 1.3865877712031558, | |
| "grad_norm": 2.1160168647766113, | |
| "kl": 0.1488037109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.006, | |
| "reward": 1.7364437878131866, | |
| "reward_std": 0.06818734481930733, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7364437729120255, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 96.19791984558105, | |
| "epoch": 1.3944773175542406, | |
| "grad_norm": 2.239210367202759, | |
| "kl": 0.121826171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0049, | |
| "reward": 1.710487574338913, | |
| "reward_std": 0.12724610278382897, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7417375445365906, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 97.14583587646484, | |
| "epoch": 1.4023668639053255, | |
| "grad_norm": 1.8484389781951904, | |
| "kl": 0.1024169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.706442952156067, | |
| "reward_std": 0.13373982720077038, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7376929074525833, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 97.36458587646484, | |
| "epoch": 1.4102564102564101, | |
| "grad_norm": 4.035000801086426, | |
| "kl": 0.0792236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.675961673259735, | |
| "reward_std": 0.07547118235379457, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6863782703876495, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 95.48958396911621, | |
| "epoch": 1.418145956607495, | |
| "grad_norm": 2.940195322036743, | |
| "kl": 0.078369140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0031, | |
| "reward": 1.6918656527996063, | |
| "reward_std": 0.15424280427396297, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7335322201251984, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 98.84375381469727, | |
| "epoch": 1.4260355029585798, | |
| "grad_norm": 2.4502110481262207, | |
| "kl": 0.06298828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7267792224884033, | |
| "reward_std": 0.08856830094009638, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7371958494186401, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 99.81250190734863, | |
| "epoch": 1.4339250493096647, | |
| "grad_norm": 3.8718204498291016, | |
| "kl": 0.067626953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7854253649711609, | |
| "reward_std": 0.08994585368782282, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7958420068025589, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 97.03125190734863, | |
| "epoch": 1.4418145956607495, | |
| "grad_norm": 3.1483190059661865, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7442137897014618, | |
| "reward_std": 0.07469187397509813, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.754630520939827, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 97.07291984558105, | |
| "epoch": 1.4497041420118344, | |
| "grad_norm": 3.180529832839966, | |
| "kl": 0.0655517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7026579678058624, | |
| "reward_std": 0.20012306701391935, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "rewards/segmentation_reward": 0.7443246394395828, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 100.80208587646484, | |
| "epoch": 1.4575936883629192, | |
| "grad_norm": 3.37025785446167, | |
| "kl": 0.06414794921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7813129425048828, | |
| "reward_std": 0.06885035336017609, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7917295545339584, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 97.64583778381348, | |
| "epoch": 1.4654832347140039, | |
| "grad_norm": 3.9030656814575195, | |
| "kl": 0.06085205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.715784877538681, | |
| "reward_std": 0.08244315255433321, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7262014895677567, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 101.83333396911621, | |
| "epoch": 1.4733727810650887, | |
| "grad_norm": 4.519697666168213, | |
| "kl": 0.06439208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.6724973022937775, | |
| "reward_std": 0.14969274401664734, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6933306306600571, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 99.75, | |
| "epoch": 1.4812623274161736, | |
| "grad_norm": 2.1684651374816895, | |
| "kl": 0.0518798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "reward": 1.7607315480709076, | |
| "reward_std": 0.028905052226036787, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7607315182685852, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 97.66666984558105, | |
| "epoch": 1.4891518737672584, | |
| "grad_norm": 2.300941228866577, | |
| "kl": 0.06951904296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.655644565820694, | |
| "reward_std": 0.12620826810598373, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.6764779090881348, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 99.77083587646484, | |
| "epoch": 1.497041420118343, | |
| "grad_norm": 2.674388885498047, | |
| "kl": 0.06103515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7588639855384827, | |
| "reward_std": 0.11651718989014626, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7796972990036011, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 98.09375190734863, | |
| "epoch": 1.504930966469428, | |
| "grad_norm": 11.82987117767334, | |
| "kl": 0.0665283203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7967063784599304, | |
| "reward_std": 0.030512763187289238, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7967063188552856, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 97.13541793823242, | |
| "epoch": 1.5128205128205128, | |
| "grad_norm": 1.5385335683822632, | |
| "kl": 0.0543212890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7657526135444641, | |
| "reward_std": 0.08876698836684227, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7761692702770233, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 96.39583587646484, | |
| "epoch": 1.5207100591715976, | |
| "grad_norm": 1.428580641746521, | |
| "kl": 0.06072998046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7331555485725403, | |
| "reward_std": 0.08605837309733033, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7435721755027771, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 96.07291984558105, | |
| "epoch": 1.5285996055226825, | |
| "grad_norm": 2.108811616897583, | |
| "kl": 0.071044921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.757724791765213, | |
| "reward_std": 0.07616918394342065, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7681414484977722, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 94.59375190734863, | |
| "epoch": 1.5364891518737673, | |
| "grad_norm": 4.254746913909912, | |
| "kl": 0.111572265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0045, | |
| "reward": 1.7280253171920776, | |
| "reward_std": 0.0810533631592989, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7384419143199921, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 99.73958587646484, | |
| "epoch": 1.5443786982248522, | |
| "grad_norm": 2.411263942718506, | |
| "kl": 0.072509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7581177949905396, | |
| "reward_std": 0.04941168939694762, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7581177949905396, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 97.29166793823242, | |
| "epoch": 1.552268244575937, | |
| "grad_norm": 1.9083694219589233, | |
| "kl": 0.0650634765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7459802627563477, | |
| "reward_std": 0.09188527404330671, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.766813650727272, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 93.73958587646484, | |
| "epoch": 1.5601577909270217, | |
| "grad_norm": 1.550249457359314, | |
| "kl": 0.0621337890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7677803039550781, | |
| "reward_std": 0.04235605616122484, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7677803039550781, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 93.08333587646484, | |
| "epoch": 1.5680473372781065, | |
| "grad_norm": 9.880169868469238, | |
| "kl": 0.0809326171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7984677851200104, | |
| "reward_std": 0.024913502275012434, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7984676957130432, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 94.59375381469727, | |
| "epoch": 1.5759368836291914, | |
| "grad_norm": 1.7292006015777588, | |
| "kl": 0.0706787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.6630052626132965, | |
| "reward_std": 0.10535286273807287, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6734219044446945, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 89.83333587646484, | |
| "epoch": 1.583826429980276, | |
| "grad_norm": 3.4253365993499756, | |
| "kl": 0.0667724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7671616673469543, | |
| "reward_std": 0.04377884138375521, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7671616673469543, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 89.06250190734863, | |
| "epoch": 1.5917159763313609, | |
| "grad_norm": 3.8319993019104004, | |
| "kl": 0.0673828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7373066842556, | |
| "reward_std": 0.05676834611222148, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.747723326086998, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 90.77083587646484, | |
| "epoch": 1.5996055226824457, | |
| "grad_norm": 2.4027304649353027, | |
| "kl": 0.0836181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7197685837745667, | |
| "reward_std": 0.048274436965584755, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7197685688734055, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 91.70833587646484, | |
| "epoch": 1.6074950690335306, | |
| "grad_norm": 2.952907085418701, | |
| "kl": 0.079345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7424035966396332, | |
| "reward_std": 0.08384070452302694, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.75282022356987, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 84.82292175292969, | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 3.295057773590088, | |
| "kl": 0.0828857421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "reward": 1.7684763371944427, | |
| "reward_std": 0.042786918580532074, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7684763222932816, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 86.11458587646484, | |
| "epoch": 1.6232741617357003, | |
| "grad_norm": 5.877788543701172, | |
| "kl": 0.08740234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "reward": 1.7584342658519745, | |
| "reward_std": 0.04845387488603592, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7584342360496521, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 87.69791984558105, | |
| "epoch": 1.6311637080867851, | |
| "grad_norm": 11.630145072937012, | |
| "kl": 0.074462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.7452757060527802, | |
| "reward_std": 0.048048963537439704, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7452756613492966, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 87.88541984558105, | |
| "epoch": 1.63905325443787, | |
| "grad_norm": 2.092057228088379, | |
| "kl": 0.102294921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0041, | |
| "reward": 1.7092933058738708, | |
| "reward_std": 0.12242361903190613, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7301265597343445, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 88.29166984558105, | |
| "epoch": 1.6469428007889546, | |
| "grad_norm": 1.6916812658309937, | |
| "kl": 0.0902099609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0036, | |
| "reward": 1.7132295370101929, | |
| "reward_std": 0.17004388198256493, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.7653128653764725, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 86.42708396911621, | |
| "epoch": 1.6548323471400395, | |
| "grad_norm": 2.2493884563446045, | |
| "kl": 0.08447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7234868705272675, | |
| "reward_std": 0.06308982148766518, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7339034825563431, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 92.68750190734863, | |
| "epoch": 1.6627218934911243, | |
| "grad_norm": 2.1522436141967773, | |
| "kl": 0.07208251953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.753316581249237, | |
| "reward_std": 0.127250699326396, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.774149939417839, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 90.13541984558105, | |
| "epoch": 1.670611439842209, | |
| "grad_norm": 1.7701236009597778, | |
| "kl": 0.06756591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7777323722839355, | |
| "reward_std": 0.04612369416281581, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7777323573827744, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 89.05208587646484, | |
| "epoch": 1.6785009861932938, | |
| "grad_norm": 9.215665817260742, | |
| "kl": 0.06768798828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.8172322511672974, | |
| "reward_std": 0.058106679469347, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.827648863196373, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 92.09375190734863, | |
| "epoch": 1.6863905325443787, | |
| "grad_norm": 1.429556131362915, | |
| "kl": 0.0614013671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7814797163009644, | |
| "reward_std": 0.05074140336364508, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7814796715974808, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 95.375, | |
| "epoch": 1.6942800788954635, | |
| "grad_norm": 1.654097080230713, | |
| "kl": 0.06536865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.798690527677536, | |
| "reward_std": 0.054276286624372005, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.79869045317173, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 91.86458587646484, | |
| "epoch": 1.7021696252465484, | |
| "grad_norm": 2.162991523742676, | |
| "kl": 0.063720703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.816997766494751, | |
| "reward_std": 0.045502190943807364, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.816997766494751, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 102.71875381469727, | |
| "epoch": 1.7100591715976332, | |
| "grad_norm": 2.4174466133117676, | |
| "kl": 0.0667724609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7929243445396423, | |
| "reward_std": 0.057674878276884556, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7929242998361588, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 100.30208778381348, | |
| "epoch": 1.717948717948718, | |
| "grad_norm": 2.792175531387329, | |
| "kl": 0.08599853515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "reward": 1.7294350564479828, | |
| "reward_std": 0.12824528431519866, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7502683401107788, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 101.83333778381348, | |
| "epoch": 1.725838264299803, | |
| "grad_norm": 2.035576820373535, | |
| "kl": 0.0584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.765143632888794, | |
| "reward_std": 0.12963381642475724, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7859769463539124, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 101.50000190734863, | |
| "epoch": 1.7337278106508875, | |
| "grad_norm": 2.4070231914520264, | |
| "kl": 0.07916259765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7128539979457855, | |
| "reward_std": 0.11889935936778784, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7336873859167099, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 101.09375190734863, | |
| "epoch": 1.7416173570019724, | |
| "grad_norm": 2.5801444053649902, | |
| "kl": 0.06781005859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7691087424755096, | |
| "reward_std": 0.07278524897992611, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7691086679697037, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 104.84375381469727, | |
| "epoch": 1.7495069033530573, | |
| "grad_norm": 3.1142055988311768, | |
| "kl": 0.06622314453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.726320058107376, | |
| "reward_std": 0.09467954037245363, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7367367446422577, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 101.51041984558105, | |
| "epoch": 1.7573964497041419, | |
| "grad_norm": 2.2835278511047363, | |
| "kl": 0.0704345703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.697280466556549, | |
| "reward_std": 0.08086889609694481, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7076971083879471, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 105.22916984558105, | |
| "epoch": 1.7652859960552267, | |
| "grad_norm": 1.3429104089736938, | |
| "kl": 0.07025146484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.6932333409786224, | |
| "reward_std": 0.058003371581435204, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7036499530076981, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 104.63541793823242, | |
| "epoch": 1.7731755424063116, | |
| "grad_norm": 2.0300707817077637, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.758153110742569, | |
| "reward_std": 0.049405214842408895, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.758153110742569, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 109.54166984558105, | |
| "epoch": 1.7810650887573964, | |
| "grad_norm": 2.2276878356933594, | |
| "kl": 0.05450439453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.7898038029670715, | |
| "reward_std": 0.07037154771387577, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.8002204447984695, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 110.67708587646484, | |
| "epoch": 1.7889546351084813, | |
| "grad_norm": 2.3131182193756104, | |
| "kl": 0.05841064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.810005784034729, | |
| "reward_std": 0.02927766414359212, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8100057393312454, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 109.03125190734863, | |
| "epoch": 1.7968441814595661, | |
| "grad_norm": 5.591848373413086, | |
| "kl": 0.063232421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7375752925872803, | |
| "reward_std": 0.06387870013713837, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7375752925872803, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 115.06250381469727, | |
| "epoch": 1.804733727810651, | |
| "grad_norm": 2.8504865169525146, | |
| "kl": 0.0628662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.6701695919036865, | |
| "reward_std": 0.09486013744026423, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6805862784385681, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 114.32291793823242, | |
| "epoch": 1.8126232741617359, | |
| "grad_norm": 2.7374796867370605, | |
| "kl": 0.06536865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.7931168377399445, | |
| "reward_std": 0.06577130127698183, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7931168079376221, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 116.29167175292969, | |
| "epoch": 1.8205128205128205, | |
| "grad_norm": 3.255202054977417, | |
| "kl": 0.06494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.728683203458786, | |
| "reward_std": 0.09193603880703449, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7390998601913452, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 121.97916793823242, | |
| "epoch": 1.8284023668639053, | |
| "grad_norm": 3.000345468521118, | |
| "kl": 0.0560302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "reward": 1.741953283548355, | |
| "reward_std": 0.10554883629083633, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.7627865821123123, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 118.30208587646484, | |
| "epoch": 1.8362919132149902, | |
| "grad_norm": 2.914006471633911, | |
| "kl": 0.07244873046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7272584438323975, | |
| "reward_std": 0.1506546987220645, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "rewards/segmentation_reward": 0.7585083991289139, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 121.15625381469727, | |
| "epoch": 1.8441814595660748, | |
| "grad_norm": 2.076815128326416, | |
| "kl": 0.069091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.6870795786380768, | |
| "reward_std": 0.10884091723710299, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7079129219055176, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 115.11458587646484, | |
| "epoch": 1.8520710059171597, | |
| "grad_norm": 2.715985059738159, | |
| "kl": 0.0792236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7684183418750763, | |
| "reward_std": 0.09033020678907633, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7788349986076355, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 119.62500381469727, | |
| "epoch": 1.8599605522682445, | |
| "grad_norm": 3.4143686294555664, | |
| "kl": 0.0587158203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.8009620606899261, | |
| "reward_std": 0.046604559291154146, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8009620159864426, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 126.77083778381348, | |
| "epoch": 1.8678500986193294, | |
| "grad_norm": 2.546739339828491, | |
| "kl": 0.062255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.761908084154129, | |
| "reward_std": 0.07723418436944485, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7723246961832047, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 122.57291793823242, | |
| "epoch": 1.8757396449704142, | |
| "grad_norm": 11.713898658752441, | |
| "kl": 0.06842041015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7244771420955658, | |
| "reward_std": 0.04781572869978845, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7244771271944046, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 120.54166793823242, | |
| "epoch": 1.883629191321499, | |
| "grad_norm": 4.24526834487915, | |
| "kl": 0.0626220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "reward": 1.7059939503669739, | |
| "reward_std": 0.13842293061316013, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7268272489309311, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 117.65625381469727, | |
| "epoch": 1.891518737672584, | |
| "grad_norm": 4.1007561683654785, | |
| "kl": 0.06787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7405757904052734, | |
| "reward_std": 0.04020647844299674, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.740575760602951, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 117.26041984558105, | |
| "epoch": 1.8994082840236688, | |
| "grad_norm": 2.303384780883789, | |
| "kl": 0.07501220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.003, | |
| "reward": 1.6991797089576721, | |
| "reward_std": 0.19727707654237747, | |
| "rewards/format_reward": 0.9479167014360428, | |
| "rewards/segmentation_reward": 0.7512629479169846, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 121.5000057220459, | |
| "epoch": 1.9072978303747534, | |
| "grad_norm": 1.9164984226226807, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.7220526039600372, | |
| "reward_std": 0.06677288189530373, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.722052663564682, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 118.84375190734863, | |
| "epoch": 1.9151873767258383, | |
| "grad_norm": 4.73820161819458, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "reward": 1.7459237277507782, | |
| "reward_std": 0.041601293021813035, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7459237277507782, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 113.8125, | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 3.414625406265259, | |
| "kl": 0.0687255859375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.750750720500946, | |
| "reward_std": 0.10410196101292968, | |
| "rewards/format_reward": 0.9791666865348816, | |
| "rewards/segmentation_reward": 0.7715839594602585, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 114.04166793823242, | |
| "epoch": 1.9309664694280078, | |
| "grad_norm": 3.6483190059661865, | |
| "kl": 0.05694580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0023, | |
| "reward": 1.850210815668106, | |
| "reward_std": 0.030932937748730183, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.8502108156681061, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 111.08333587646484, | |
| "epoch": 1.9388560157790926, | |
| "grad_norm": 2.375276803970337, | |
| "kl": 0.06561279296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.741035521030426, | |
| "reward_std": 0.10635390551760793, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "rewards/segmentation_reward": 0.761868804693222, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 112.56250381469727, | |
| "epoch": 1.9467455621301775, | |
| "grad_norm": 3.1703550815582275, | |
| "kl": 0.06463623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "reward": 1.743516057729721, | |
| "reward_std": 0.054085819981992245, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7435160130262375, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 112.88541984558105, | |
| "epoch": 1.9546351084812623, | |
| "grad_norm": 19.166675567626953, | |
| "kl": 0.069091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "reward": 1.6917358338832855, | |
| "reward_std": 0.08896347414702177, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7021525055170059, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 113.57291984558105, | |
| "epoch": 1.9625246548323472, | |
| "grad_norm": 1.4388498067855835, | |
| "kl": 0.0714111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0029, | |
| "reward": 1.7879726886749268, | |
| "reward_std": 0.04161839000880718, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.787972629070282, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 113.20833587646484, | |
| "epoch": 1.970414201183432, | |
| "grad_norm": 4.911606311798096, | |
| "kl": 0.068603515625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.675138384103775, | |
| "reward_std": 0.07887607486918569, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.6855549961328506, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 107.64583396911621, | |
| "epoch": 1.9783037475345169, | |
| "grad_norm": 2.6165802478790283, | |
| "kl": 0.06707763671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "reward": 1.7642135918140411, | |
| "reward_std": 0.05668263230472803, | |
| "rewards/format_reward": 1.0, | |
| "rewards/segmentation_reward": 0.7642135620117188, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 112.51041984558105, | |
| "epoch": 1.9861932938856017, | |
| "grad_norm": 2.5721254348754883, | |
| "kl": 0.0810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0032, | |
| "reward": 1.7676794826984406, | |
| "reward_std": 0.05829272768460214, | |
| "rewards/format_reward": 0.9895833432674408, | |
| "rewards/segmentation_reward": 0.7780960947275162, | |
| "step": 252 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 252, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |