{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990600241708071, "eval_steps": 400, "global_step": 465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010742580905062441, "grad_norm": 97.31650637788948, "learning_rate": 8.51063829787234e-08, "logits/chosen": -10.386978149414062, "logits/rejected": -10.301819801330566, "logps/chosen": -0.9842015504837036, "logps/rejected": -0.9797419309616089, "loss": 6.1427, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -9.84201431274414, "rewards/margins": -0.044595133513212204, "rewards/rejected": -9.797419548034668, "step": 5 }, { "epoch": 0.021485161810124883, "grad_norm": 120.32296756999746, "learning_rate": 1.702127659574468e-07, "logits/chosen": -10.528864860534668, "logits/rejected": -10.550542831420898, "logps/chosen": -1.1053581237792969, "logps/rejected": -1.0148571729660034, "loss": 5.8489, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -11.053580284118652, "rewards/margins": -0.9050087928771973, "rewards/rejected": -10.148571968078613, "step": 10 }, { "epoch": 0.03222774271518732, "grad_norm": 63.03552892163084, "learning_rate": 2.553191489361702e-07, "logits/chosen": -10.048526763916016, "logits/rejected": -9.960186004638672, "logps/chosen": -1.2764005661010742, "logps/rejected": -1.1620838642120361, "loss": 6.2427, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -12.764005661010742, "rewards/margins": -1.1431667804718018, "rewards/rejected": -11.62083911895752, "step": 15 }, { "epoch": 0.042970323620249766, "grad_norm": 115.65630638049038, "learning_rate": 3.404255319148936e-07, "logits/chosen": -9.938735961914062, "logits/rejected": -10.032970428466797, "logps/chosen": -0.886467456817627, "logps/rejected": -0.9640073776245117, "loss": 5.5509, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.86467456817627, "rewards/margins": 0.775399386882782, "rewards/rejected": -9.640073776245117, "step": 20 }, { "epoch": 0.05371290452531221, "grad_norm": 96.51643381412, "learning_rate": 4.25531914893617e-07, "logits/chosen": -9.94281005859375, "logits/rejected": -9.81358814239502, "logps/chosen": -0.8847745060920715, "logps/rejected": -0.9943971633911133, "loss": 5.2621, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.84774398803711, "rewards/margins": 1.0962274074554443, "rewards/rejected": -9.943971633911133, "step": 25 }, { "epoch": 0.06445548543037465, "grad_norm": 79.98872357640575, "learning_rate": 5.106382978723404e-07, "logits/chosen": -9.344173431396484, "logits/rejected": -9.224719047546387, "logps/chosen": -0.6412209272384644, "logps/rejected": -0.6745115518569946, "loss": 5.0537, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.412209510803223, "rewards/margins": 0.3329065442085266, "rewards/rejected": -6.745115756988525, "step": 30 }, { "epoch": 0.07519806633543709, "grad_norm": 90.98736128158838, "learning_rate": 5.957446808510638e-07, "logits/chosen": -8.797516822814941, "logits/rejected": -8.775238037109375, "logps/chosen": -0.7022872567176819, "logps/rejected": -0.6628175973892212, "loss": 5.2242, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -7.0228729248046875, "rewards/margins": -0.3946966230869293, "rewards/rejected": -6.628176212310791, "step": 35 }, { "epoch": 0.08594064724049953, "grad_norm": 85.27765487288175, "learning_rate": 6.808510638297872e-07, "logits/chosen": -8.165349006652832, "logits/rejected": -7.939520835876465, "logps/chosen": -0.6976481676101685, "logps/rejected": -0.7118233442306519, "loss": 4.8954, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -6.976480960845947, "rewards/margins": 0.14175233244895935, "rewards/rejected": -7.118234157562256, "step": 40 }, { "epoch": 0.09668322814556198, "grad_norm": 60.0810198108563, "learning_rate": 7.659574468085107e-07, "logits/chosen": -8.638801574707031, "logits/rejected": -8.301128387451172, "logps/chosen": -0.5912588834762573, "logps/rejected": -0.643363356590271, "loss": 4.5924, "rewards/accuracies": 0.625, "rewards/chosen": -5.912589073181152, "rewards/margins": 0.5210448503494263, "rewards/rejected": -6.433633327484131, "step": 45 }, { "epoch": 0.10742580905062442, "grad_norm": 56.311443110110645, "learning_rate": 7.998983280184396e-07, "logits/chosen": -8.66537857055664, "logits/rejected": -8.455618858337402, "logps/chosen": -0.532910943031311, "logps/rejected": -0.5861265063285828, "loss": 4.6603, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.3291096687316895, "rewards/margins": 0.5321555733680725, "rewards/rejected": -5.861265659332275, "step": 50 }, { "epoch": 0.11816838995568685, "grad_norm": 48.22964437945852, "learning_rate": 7.992771864078597e-07, "logits/chosen": -8.251328468322754, "logits/rejected": -8.19440746307373, "logps/chosen": -0.6145357489585876, "logps/rejected": -0.7057743072509766, "loss": 4.6648, "rewards/accuracies": 0.5625, "rewards/chosen": -6.145357131958008, "rewards/margins": 0.9123857617378235, "rewards/rejected": -7.057743072509766, "step": 55 }, { "epoch": 0.1289109708607493, "grad_norm": 58.650484172589515, "learning_rate": 7.980922636120897e-07, "logits/chosen": -8.88660717010498, "logits/rejected": -8.566266059875488, "logps/chosen": -0.5707553625106812, "logps/rejected": -0.6651050448417664, "loss": 4.4872, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -5.707553863525391, "rewards/margins": 0.9434973001480103, "rewards/rejected": -6.6510515213012695, "step": 60 }, { "epoch": 0.13965355176581173, "grad_norm": 50.701519232478496, "learning_rate": 7.963452327474534e-07, "logits/chosen": -8.946069717407227, "logits/rejected": -8.80902099609375, "logps/chosen": -0.6769601702690125, "logps/rejected": -0.7762205600738525, "loss": 4.6697, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -6.769601345062256, "rewards/margins": 0.9926047325134277, "rewards/rejected": -7.762206077575684, "step": 65 }, { "epoch": 0.15039613267087418, "grad_norm": 55.73800060229348, "learning_rate": 7.940385606293987e-07, "logits/chosen": -8.857608795166016, "logits/rejected": -8.605030059814453, "logps/chosen": -0.6864951252937317, "logps/rejected": -0.7849037647247314, "loss": 4.579, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -6.864950656890869, "rewards/margins": 0.9840868711471558, "rewards/rejected": -7.849038124084473, "step": 70 }, { "epoch": 0.16113871357593662, "grad_norm": 61.45336329452104, "learning_rate": 7.911755042893434e-07, "logits/chosen": -9.510955810546875, "logits/rejected": -9.195249557495117, "logps/chosen": -0.6112648248672485, "logps/rejected": -0.7157038450241089, "loss": 4.4206, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -6.112648010253906, "rewards/margins": 1.044390320777893, "rewards/rejected": -7.157038688659668, "step": 75 }, { "epoch": 0.17188129448099906, "grad_norm": 63.05991232357819, "learning_rate": 7.877601063757321e-07, "logits/chosen": -8.919134140014648, "logits/rejected": -8.694350242614746, "logps/chosen": -0.6809953451156616, "logps/rejected": -0.8292111158370972, "loss": 4.3782, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.809953212738037, "rewards/margins": 1.4821574687957764, "rewards/rejected": -8.29211139678955, "step": 80 }, { "epoch": 0.1826238753860615, "grad_norm": 80.14192606614782, "learning_rate": 7.837971894457989e-07, "logits/chosen": -9.203702926635742, "logits/rejected": -8.941718101501465, "logps/chosen": -0.6751728653907776, "logps/rejected": -0.8182951807975769, "loss": 4.6537, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -6.751728057861328, "rewards/margins": 1.4312243461608887, "rewards/rejected": -8.182951927185059, "step": 85 }, { "epoch": 0.19336645629112395, "grad_norm": 57.0890392656848, "learning_rate": 7.792923491560942e-07, "logits/chosen": -8.762784004211426, "logits/rejected": -8.619606018066406, "logps/chosen": -0.6792052388191223, "logps/rejected": -0.782240092754364, "loss": 4.4488, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -6.792051792144775, "rewards/margins": 1.0303497314453125, "rewards/rejected": -7.822402000427246, "step": 90 }, { "epoch": 0.2041090371961864, "grad_norm": 62.86804177211711, "learning_rate": 7.742519463613926e-07, "logits/chosen": -9.201288223266602, "logits/rejected": -9.108887672424316, "logps/chosen": -0.6846643090248108, "logps/rejected": -0.8157285451889038, "loss": 4.2734, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -6.846642971038818, "rewards/margins": 1.3106427192687988, "rewards/rejected": -8.157285690307617, "step": 95 }, { "epoch": 0.21485161810124884, "grad_norm": 156.13974754079638, "learning_rate": 7.68683098133138e-07, "logits/chosen": -8.922677040100098, "logits/rejected": -8.721606254577637, "logps/chosen": -0.7626782655715942, "logps/rejected": -0.8957880139350891, "loss": 4.4059, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -7.626782417297363, "rewards/margins": 1.3310966491699219, "rewards/rejected": -8.957880020141602, "step": 100 }, { "epoch": 0.22559419900631128, "grad_norm": 105.18853752160656, "learning_rate": 7.625936677101051e-07, "logits/chosen": -9.266253471374512, "logits/rejected": -9.21663761138916, "logps/chosen": -0.7982575297355652, "logps/rejected": -0.824684739112854, "loss": 4.4496, "rewards/accuracies": 0.5625, "rewards/chosen": -7.9825758934021, "rewards/margins": 0.2642715573310852, "rewards/rejected": -8.246847152709961, "step": 105 }, { "epoch": 0.2363367799113737, "grad_norm": 55.591763777771575, "learning_rate": 7.559922533954731e-07, "logits/chosen": -9.533427238464355, "logits/rejected": -9.415986061096191, "logps/chosen": -0.760522723197937, "logps/rejected": -0.880321204662323, "loss": 4.2742, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.605226039886475, "rewards/margins": 1.1979866027832031, "rewards/rejected": -8.803213119506836, "step": 110 }, { "epoch": 0.24707936081643614, "grad_norm": 68.68376533729582, "learning_rate": 7.488881764159808e-07, "logits/chosen": -10.321605682373047, "logits/rejected": -10.032452583312988, "logps/chosen": -0.7515496015548706, "logps/rejected": -0.8424334526062012, "loss": 4.146, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -7.515496730804443, "rewards/margins": 0.9088379740715027, "rewards/rejected": -8.424333572387695, "step": 115 }, { "epoch": 0.2578219417214986, "grad_norm": 84.12336331826019, "learning_rate": 7.412914677603135e-07, "logits/chosen": -9.94493293762207, "logits/rejected": -9.746753692626953, "logps/chosen": -0.8637507557868958, "logps/rejected": -0.9913690686225891, "loss": 4.2196, "rewards/accuracies": 0.59375, "rewards/chosen": -8.637508392333984, "rewards/margins": 1.2761822938919067, "rewards/rejected": -9.913690567016602, "step": 120 }, { "epoch": 0.26856452262656105, "grad_norm": 65.56256589725982, "learning_rate": 7.332128540153017e-07, "logits/chosen": -11.014134407043457, "logits/rejected": -10.658555030822754, "logps/chosen": -0.7951505184173584, "logps/rejected": -0.9381753206253052, "loss": 4.0983, "rewards/accuracies": 0.65625, "rewards/chosen": -7.951504707336426, "rewards/margins": 1.430248498916626, "rewards/rejected": -9.381753921508789, "step": 125 }, { "epoch": 0.27930710353162347, "grad_norm": 65.20658081437091, "learning_rate": 7.246637422199322e-07, "logits/chosen": -10.825661659240723, "logits/rejected": -10.685798645019531, "logps/chosen": -0.867350697517395, "logps/rejected": -1.0283457040786743, "loss": 4.0614, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -8.673507690429688, "rewards/margins": 1.6099493503570557, "rewards/rejected": -10.283456802368164, "step": 130 }, { "epoch": 0.29004968443668594, "grad_norm": 71.60093594488256, "learning_rate": 7.156562037585574e-07, "logits/chosen": -11.534868240356445, "logits/rejected": -11.279884338378906, "logps/chosen": -0.8583852648735046, "logps/rejected": -1.0596177577972412, "loss": 3.9342, "rewards/accuracies": 0.71875, "rewards/chosen": -8.583852767944336, "rewards/margins": 2.012324810028076, "rewards/rejected": -10.59617805480957, "step": 135 }, { "epoch": 0.30079226534174835, "grad_norm": 90.84373325771121, "learning_rate": 7.062029573160467e-07, "logits/chosen": -12.217193603515625, "logits/rejected": -11.932929039001465, "logps/chosen": -0.9700864553451538, "logps/rejected": -1.183528184890747, "loss": 3.69, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -9.700864791870117, "rewards/margins": 2.1344172954559326, "rewards/rejected": -11.835283279418945, "step": 140 }, { "epoch": 0.31153484624681077, "grad_norm": 108.66029102385438, "learning_rate": 6.963173509189455e-07, "logits/chosen": -13.757959365844727, "logits/rejected": -13.603772163391113, "logps/chosen": -1.109348177909851, "logps/rejected": -1.3448617458343506, "loss": 3.808, "rewards/accuracies": 0.59375, "rewards/chosen": -11.093481063842773, "rewards/margins": 2.355137348175049, "rewards/rejected": -13.448617935180664, "step": 145 }, { "epoch": 0.32227742715187324, "grad_norm": 120.43718591165803, "learning_rate": 6.860133430880024e-07, "logits/chosen": -14.138008117675781, "logits/rejected": -13.760258674621582, "logps/chosen": -1.2876828908920288, "logps/rejected": -1.5249125957489014, "loss": 3.7404, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -12.876829147338867, "rewards/margins": 2.3722963333129883, "rewards/rejected": -15.249125480651855, "step": 150 }, { "epoch": 0.33302000805693566, "grad_norm": 110.47332053227805, "learning_rate": 6.753054831286747e-07, "logits/chosen": -14.456746101379395, "logits/rejected": -14.32475471496582, "logps/chosen": -1.3068922758102417, "logps/rejected": -1.516629934310913, "loss": 3.8826, "rewards/accuracies": 0.6875, "rewards/chosen": -13.06892204284668, "rewards/margins": 2.097377300262451, "rewards/rejected": -15.166299819946289, "step": 155 }, { "epoch": 0.34376258896199813, "grad_norm": 280.21180497264857, "learning_rate": 6.642088905874433e-07, "logits/chosen": -14.917425155639648, "logits/rejected": -14.61968994140625, "logps/chosen": -1.3311243057250977, "logps/rejected": -1.5419546365737915, "loss": 3.7542, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -13.311243057250977, "rewards/margins": 2.108302593231201, "rewards/rejected": -15.41954517364502, "step": 160 }, { "epoch": 0.35450516986706054, "grad_norm": 97.70985018206724, "learning_rate": 6.527392339029455e-07, "logits/chosen": -14.492767333984375, "logits/rejected": -14.438491821289062, "logps/chosen": -1.2500733137130737, "logps/rejected": -1.525674819946289, "loss": 3.4458, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -12.500734329223633, "rewards/margins": 2.756016254425049, "rewards/rejected": -15.256749153137207, "step": 165 }, { "epoch": 0.365247750772123, "grad_norm": 115.13192096245145, "learning_rate": 6.409127082820689e-07, "logits/chosen": -14.65284252166748, "logits/rejected": -14.798192977905273, "logps/chosen": -1.573756217956543, "logps/rejected": -1.85759699344635, "loss": 3.6649, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -15.73755931854248, "rewards/margins": 2.838409900665283, "rewards/rejected": -18.575969696044922, "step": 170 }, { "epoch": 0.37599033167718543, "grad_norm": 120.51325866275113, "learning_rate": 6.287460128322457e-07, "logits/chosen": -15.47950553894043, "logits/rejected": -15.095005989074707, "logps/chosen": -1.6204099655151367, "logps/rejected": -1.9557273387908936, "loss": 3.4241, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -16.2041015625, "rewards/margins": 3.3531746864318848, "rewards/rejected": -19.557273864746094, "step": 175 }, { "epoch": 0.3867329125822479, "grad_norm": 177.63227168183283, "learning_rate": 6.16256326982239e-07, "logits/chosen": -15.96112060546875, "logits/rejected": -15.974037170410156, "logps/chosen": -1.5117028951644897, "logps/rejected": -1.9151155948638916, "loss": 3.4091, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -15.117030143737793, "rewards/margins": 4.034125328063965, "rewards/rejected": -19.151155471801758, "step": 180 }, { "epoch": 0.3974754934873103, "grad_norm": 119.1370485384488, "learning_rate": 6.034612862247114e-07, "logits/chosen": -15.26576042175293, "logits/rejected": -15.074376106262207, "logps/chosen": -1.525244951248169, "logps/rejected": -1.833696722984314, "loss": 3.105, "rewards/accuracies": 0.75, "rewards/chosen": -15.252447128295898, "rewards/margins": 3.0845184326171875, "rewards/rejected": -18.336965560913086, "step": 185 }, { "epoch": 0.4082180743923728, "grad_norm": 115.37836093687133, "learning_rate": 5.903789572148295e-07, "logits/chosen": -14.956746101379395, "logits/rejected": -14.613385200500488, "logps/chosen": -1.6238969564437866, "logps/rejected": -2.0761537551879883, "loss": 3.3782, "rewards/accuracies": 0.71875, "rewards/chosen": -16.238969802856445, "rewards/margins": 4.522566795349121, "rewards/rejected": -20.76153564453125, "step": 190 }, { "epoch": 0.4189606552974352, "grad_norm": 129.58691495445163, "learning_rate": 5.770278122600662e-07, "logits/chosen": -15.031437873840332, "logits/rejected": -14.86229133605957, "logps/chosen": -1.686156988143921, "logps/rejected": -2.002657890319824, "loss": 3.5752, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -16.861570358276367, "rewards/margins": 3.165009021759033, "rewards/rejected": -20.026578903198242, "step": 195 }, { "epoch": 0.4297032362024977, "grad_norm": 103.32755261822129, "learning_rate": 5.634267032372192e-07, "logits/chosen": -15.254351615905762, "logits/rejected": -15.037300109863281, "logps/chosen": -1.6978883743286133, "logps/rejected": -2.0401151180267334, "loss": 3.3862, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -16.978879928588867, "rewards/margins": 3.4222705364227295, "rewards/rejected": -20.401153564453125, "step": 200 }, { "epoch": 0.4404458171075601, "grad_norm": 128.218280047368, "learning_rate": 5.495948349734758e-07, "logits/chosen": -14.857444763183594, "logits/rejected": -14.940084457397461, "logps/chosen": -1.7693378925323486, "logps/rejected": -2.061185836791992, "loss": 3.1236, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -17.693378448486328, "rewards/margins": 2.91848087310791, "rewards/rejected": -20.611858367919922, "step": 205 }, { "epoch": 0.45118839801262256, "grad_norm": 107.84849265892646, "learning_rate": 5.355517381291105e-07, "logits/chosen": -16.1163330078125, "logits/rejected": -15.79603099822998, "logps/chosen": -1.7783616781234741, "logps/rejected": -2.141852378845215, "loss": 3.1842, "rewards/accuracies": 0.71875, "rewards/chosen": -17.783615112304688, "rewards/margins": 3.6349079608917236, "rewards/rejected": -21.41852378845215, "step": 210 }, { "epoch": 0.461930978917685, "grad_norm": 108.45122155813243, "learning_rate": 5.21317241620105e-07, "logits/chosen": -16.612058639526367, "logits/rejected": -16.40911293029785, "logps/chosen": -1.7623794078826904, "logps/rejected": -2.1722538471221924, "loss": 3.4158, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -17.623790740966797, "rewards/margins": 4.0987443923950195, "rewards/rejected": -21.722537994384766, "step": 215 }, { "epoch": 0.4726735598227474, "grad_norm": 115.68303627040952, "learning_rate": 5.069114446196291e-07, "logits/chosen": -14.818249702453613, "logits/rejected": -14.463000297546387, "logps/chosen": -1.832088828086853, "logps/rejected": -2.3377938270568848, "loss": 3.1208, "rewards/accuracies": 0.8125, "rewards/chosen": -18.32088851928711, "rewards/margins": 5.057046890258789, "rewards/rejected": -23.37793731689453, "step": 220 }, { "epoch": 0.48341614072780986, "grad_norm": 115.29167221000085, "learning_rate": 4.923546881779183e-07, "logits/chosen": -15.626416206359863, "logits/rejected": -15.386268615722656, "logps/chosen": -1.7103182077407837, "logps/rejected": -2.0725884437561035, "loss": 3.0943, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -17.103179931640625, "rewards/margins": 3.622704267501831, "rewards/rejected": -20.72588539123535, "step": 225 }, { "epoch": 0.4941587216328723, "grad_norm": 110.55749043958713, "learning_rate": 4.776675265006186e-07, "logits/chosen": -15.542366027832031, "logits/rejected": -15.470932006835938, "logps/chosen": -1.7697646617889404, "logps/rejected": -2.1862690448760986, "loss": 3.2388, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -17.69764518737793, "rewards/margins": 4.165043830871582, "rewards/rejected": -21.862689971923828, "step": 230 }, { "epoch": 0.5049013025379347, "grad_norm": 194.28560949567938, "learning_rate": 4.62870697926156e-07, "logits/chosen": -14.893994331359863, "logits/rejected": -15.048649787902832, "logps/chosen": -1.764272928237915, "logps/rejected": -2.2773966789245605, "loss": 3.2608, "rewards/accuracies": 0.71875, "rewards/chosen": -17.642728805541992, "rewards/margins": 5.13123893737793, "rewards/rejected": -22.773967742919922, "step": 235 }, { "epoch": 0.5156438834429972, "grad_norm": 118.02806259112292, "learning_rate": 4.479850956431092e-07, "logits/chosen": -15.161096572875977, "logits/rejected": -15.109652519226074, "logps/chosen": -1.7125499248504639, "logps/rejected": -2.1439616680145264, "loss": 3.0925, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -17.125499725341797, "rewards/margins": 4.31411600112915, "rewards/rejected": -21.43961524963379, "step": 240 }, { "epoch": 0.5263864643480596, "grad_norm": 94.41744321870145, "learning_rate": 4.33031738188933e-07, "logits/chosen": -15.63347053527832, "logits/rejected": -15.267684936523438, "logps/chosen": -1.6985079050064087, "logps/rejected": -2.109764575958252, "loss": 3.2248, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -16.98508071899414, "rewards/margins": 4.1125664710998535, "rewards/rejected": -21.097644805908203, "step": 245 }, { "epoch": 0.5371290452531221, "grad_norm": 132.41327050594964, "learning_rate": 4.180317397716889e-07, "logits/chosen": -15.816108703613281, "logits/rejected": -15.996536254882812, "logps/chosen": -1.7325900793075562, "logps/rejected": -2.2646021842956543, "loss": 3.1082, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -17.32590103149414, "rewards/margins": 5.320119857788086, "rewards/rejected": -22.646020889282227, "step": 250 }, { "epoch": 0.5478716261581845, "grad_norm": 123.8588013852755, "learning_rate": 4.030062804566888e-07, "logits/chosen": -15.66382122039795, "logits/rejected": -15.521852493286133, "logps/chosen": -1.703418493270874, "logps/rejected": -2.1070733070373535, "loss": 3.0582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -17.03418731689453, "rewards/margins": 4.036545753479004, "rewards/rejected": -21.07073211669922, "step": 255 }, { "epoch": 0.5586142070632469, "grad_norm": 100.71973130430227, "learning_rate": 3.8797657626014614e-07, "logits/chosen": -15.945584297180176, "logits/rejected": -15.955207824707031, "logps/chosen": -1.7251708507537842, "logps/rejected": -2.1078107357025146, "loss": 3.1785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -17.251708984375, "rewards/margins": 3.826399326324463, "rewards/rejected": -21.078109741210938, "step": 260 }, { "epoch": 0.5693567879683094, "grad_norm": 118.85717581252126, "learning_rate": 3.729638491920669e-07, "logits/chosen": -15.352853775024414, "logits/rejected": -15.46752643585205, "logps/chosen": -1.7334463596343994, "logps/rejected": -2.135951519012451, "loss": 3.1444, "rewards/accuracies": 0.75, "rewards/chosen": -17.334461212158203, "rewards/margins": 4.025053977966309, "rewards/rejected": -21.359514236450195, "step": 265 }, { "epoch": 0.5800993688733719, "grad_norm": 85.82493905943672, "learning_rate": 3.5798929729067464e-07, "logits/chosen": -16.411659240722656, "logits/rejected": -16.199129104614258, "logps/chosen": -1.7729793787002563, "logps/rejected": -2.220877170562744, "loss": 2.7332, "rewards/accuracies": 0.8125, "rewards/chosen": -17.729793548583984, "rewards/margins": 4.478978157043457, "rewards/rejected": -22.208770751953125, "step": 270 }, { "epoch": 0.5908419497784343, "grad_norm": 106.03649240710887, "learning_rate": 3.4307406469068595e-07, "logits/chosen": -14.978113174438477, "logits/rejected": -15.032865524291992, "logps/chosen": -1.809618592262268, "logps/rejected": -2.3406805992126465, "loss": 2.9025, "rewards/accuracies": 0.78125, "rewards/chosen": -18.096187591552734, "rewards/margins": 5.3106207847595215, "rewards/rejected": -23.40680694580078, "step": 275 }, { "epoch": 0.6015845306834967, "grad_norm": 111.84746493906809, "learning_rate": 3.282392117676968e-07, "logits/chosen": -15.695220947265625, "logits/rejected": -15.893925666809082, "logps/chosen": -1.9052400588989258, "logps/rejected": -2.4178624153137207, "loss": 2.8972, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -19.052398681640625, "rewards/margins": 5.126224040985107, "rewards/rejected": -24.178625106811523, "step": 280 }, { "epoch": 0.6123271115885591, "grad_norm": 103.57050178439866, "learning_rate": 3.135056854008371e-07, "logits/chosen": -15.808944702148438, "logits/rejected": -15.778024673461914, "logps/chosen": -1.7815015316009521, "logps/rejected": -2.271728515625, "loss": 2.8431, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -17.81501579284668, "rewards/margins": 4.9022698402404785, "rewards/rejected": -22.717287063598633, "step": 285 }, { "epoch": 0.6230696924936215, "grad_norm": 108.56282250963646, "learning_rate": 2.988942893956833e-07, "logits/chosen": -15.920690536499023, "logits/rejected": -15.81785774230957, "logps/chosen": -1.8119595050811768, "logps/rejected": -2.2470898628234863, "loss": 3.0512, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -18.11959457397461, "rewards/margins": 4.351304054260254, "rewards/rejected": -22.470901489257812, "step": 290 }, { "epoch": 0.6338122733986841, "grad_norm": 116.17030678566253, "learning_rate": 2.844256551091911e-07, "logits/chosen": -16.46514320373535, "logits/rejected": -16.6345272064209, "logps/chosen": -1.7750489711761475, "logps/rejected": -2.3242697715759277, "loss": 2.8258, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -17.75048828125, "rewards/margins": 5.492205619812012, "rewards/rejected": -23.242694854736328, "step": 295 }, { "epoch": 0.6445548543037465, "grad_norm": 147.68751040852177, "learning_rate": 2.7012021231812664e-07, "logits/chosen": -16.046567916870117, "logits/rejected": -15.983491897583008, "logps/chosen": -1.897653579711914, "logps/rejected": -2.281114339828491, "loss": 3.2387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -18.97653579711914, "rewards/margins": 3.834609270095825, "rewards/rejected": -22.811145782470703, "step": 300 }, { "epoch": 0.6552974352088089, "grad_norm": 124.9827211839036, "learning_rate": 2.5599816037212954e-07, "logits/chosen": -15.513954162597656, "logits/rejected": -15.608304023742676, "logps/chosen": -1.8266518115997314, "logps/rejected": -2.34920072555542, "loss": 2.8842, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -18.266515731811523, "rewards/margins": 5.225489616394043, "rewards/rejected": -23.492008209228516, "step": 305 }, { "epoch": 0.6660400161138713, "grad_norm": 141.73868750375132, "learning_rate": 2.4207943967214064e-07, "logits/chosen": -16.29332160949707, "logits/rejected": -16.148832321166992, "logps/chosen": -1.9530408382415771, "logps/rejected": -2.4292497634887695, "loss": 3.0684, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -19.530406951904297, "rewards/margins": 4.762091636657715, "rewards/rejected": -24.292499542236328, "step": 310 }, { "epoch": 0.6767825970189338, "grad_norm": 172.98236669812187, "learning_rate": 2.2838370351446547e-07, "logits/chosen": -15.80296802520752, "logits/rejected": -15.732707023620605, "logps/chosen": -1.7977100610733032, "logps/rejected": -2.3419437408447266, "loss": 2.7361, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -17.977100372314453, "rewards/margins": 5.4423370361328125, "rewards/rejected": -23.419437408447266, "step": 315 }, { "epoch": 0.6875251779239963, "grad_norm": 112.41993171792654, "learning_rate": 2.1493029034023188e-07, "logits/chosen": -15.622350692749023, "logits/rejected": -15.62098503112793, "logps/chosen": -1.784433126449585, "logps/rejected": -2.267364025115967, "loss": 2.8725, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -17.844331741333008, "rewards/margins": 4.829309940338135, "rewards/rejected": -22.673641204833984, "step": 320 }, { "epoch": 0.6982677588290587, "grad_norm": 116.72973302505498, "learning_rate": 2.0173819642942376e-07, "logits/chosen": -15.983564376831055, "logits/rejected": -15.785995483398438, "logps/chosen": -1.9481559991836548, "logps/rejected": -2.5938241481781006, "loss": 2.9776, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -19.48155975341797, "rewards/margins": 6.456681251525879, "rewards/rejected": -25.938241958618164, "step": 325 }, { "epoch": 0.7090103397341211, "grad_norm": 137.081227030888, "learning_rate": 1.888260490780485e-07, "logits/chosen": -15.733128547668457, "logits/rejected": -15.935976028442383, "logps/chosen": -1.8248291015625, "logps/rejected": -2.2822577953338623, "loss": 3.0336, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -18.248287200927734, "rewards/margins": 4.574288368225098, "rewards/rejected": -22.82257652282715, "step": 330 }, { "epoch": 0.7197529206391836, "grad_norm": 117.59237011545949, "learning_rate": 1.7621208029631078e-07, "logits/chosen": -15.677896499633789, "logits/rejected": -15.52332878112793, "logps/chosen": -1.8683006763458252, "logps/rejected": -2.448673725128174, "loss": 2.8842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -18.68300437927246, "rewards/margins": 5.803731918334961, "rewards/rejected": -24.486736297607422, "step": 335 }, { "epoch": 0.730495501544246, "grad_norm": 122.52464571458833, "learning_rate": 1.6391410106493227e-07, "logits/chosen": -15.322489738464355, "logits/rejected": -15.156359672546387, "logps/chosen": -1.9856983423233032, "logps/rejected": -2.5366251468658447, "loss": 2.8321, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -19.856985092163086, "rewards/margins": 5.509265422821045, "rewards/rejected": -25.36625099182129, "step": 340 }, { "epoch": 0.7412380824493084, "grad_norm": 98.13439288099542, "learning_rate": 1.5194947618596673e-07, "logits/chosen": -16.58279037475586, "logits/rejected": -16.51984405517578, "logps/chosen": -1.9558055400848389, "logps/rejected": -2.502556324005127, "loss": 3.0276, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -19.558055877685547, "rewards/margins": 5.467510223388672, "rewards/rejected": -25.025564193725586, "step": 345 }, { "epoch": 0.7519806633543709, "grad_norm": 104.49522716837973, "learning_rate": 1.4033509976362083e-07, "logits/chosen": -15.616808891296387, "logits/rejected": -15.737129211425781, "logps/chosen": -1.9377790689468384, "logps/rejected": -2.3541884422302246, "loss": 2.9578, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -19.377790451049805, "rewards/margins": 4.164095401763916, "rewards/rejected": -23.541885375976562, "step": 350 }, { "epoch": 0.7627232442594333, "grad_norm": 107.6884489363759, "learning_rate": 1.2908737134970363e-07, "logits/chosen": -14.991783142089844, "logits/rejected": -15.030191421508789, "logps/chosen": -1.8485435247421265, "logps/rejected": -2.38105845451355, "loss": 3.0742, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -18.485435485839844, "rewards/margins": 5.325149059295654, "rewards/rejected": -23.810585021972656, "step": 355 }, { "epoch": 0.7734658251644958, "grad_norm": 105.82348414404971, "learning_rate": 1.1822217278738515e-07, "logits/chosen": -15.954935073852539, "logits/rejected": -16.074337005615234, "logps/chosen": -1.9491569995880127, "logps/rejected": -2.3941268920898438, "loss": 3.0422, "rewards/accuracies": 0.75, "rewards/chosen": -19.4915714263916, "rewards/margins": 4.449699878692627, "rewards/rejected": -23.941268920898438, "step": 360 }, { "epoch": 0.7842084060695582, "grad_norm": 208.94228225878328, "learning_rate": 1.0775484578596241e-07, "logits/chosen": -16.27437973022461, "logits/rejected": -16.11089324951172, "logps/chosen": -1.8912540674209595, "logps/rejected": -2.4439680576324463, "loss": 2.7384, "rewards/accuracies": 0.78125, "rewards/chosen": -18.912538528442383, "rewards/margins": 5.5271430015563965, "rewards/rejected": -24.439682006835938, "step": 365 }, { "epoch": 0.7949509869746206, "grad_norm": 126.14591373204779, "learning_rate": 9.770017025829673e-08, "logits/chosen": -16.250118255615234, "logits/rejected": -16.38758087158203, "logps/chosen": -2.065514087677002, "logps/rejected": -2.72831654548645, "loss": 2.6227, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -20.655139923095703, "rewards/margins": 6.628026008605957, "rewards/rejected": -27.283166885375977, "step": 370 }, { "epoch": 0.805693567879683, "grad_norm": 121.32471246816698, "learning_rate": 8.807234345151027e-08, "logits/chosen": -16.10807991027832, "logits/rejected": -16.029443740844727, "logps/chosen": -1.9544318914413452, "logps/rejected": -2.565009355545044, "loss": 2.899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.544321060180664, "rewards/margins": 6.105769634246826, "rewards/rejected": -25.65009117126465, "step": 375 }, { "epoch": 0.8164361487847456, "grad_norm": 201.08985911319357, "learning_rate": 7.888495990040924e-08, "logits/chosen": -14.932947158813477, "logits/rejected": -14.924430847167969, "logps/chosen": -1.965070128440857, "logps/rejected": -2.5968446731567383, "loss": 2.9375, "rewards/accuracies": 0.78125, "rewards/chosen": -19.650699615478516, "rewards/margins": 6.317748069763184, "rewards/rejected": -25.968448638916016, "step": 380 }, { "epoch": 0.827178729689808, "grad_norm": 132.40169960863122, "learning_rate": 7.015099223193943e-08, "logits/chosen": -16.522253036499023, "logits/rejected": -16.4668025970459, "logps/chosen": -2.0363919734954834, "logps/rejected": -2.5673530101776123, "loss": 2.8305, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -20.36391830444336, "rewards/margins": 5.309610366821289, "rewards/rejected": -25.67352867126465, "step": 385 }, { "epoch": 0.8379213105948704, "grad_norm": 113.91880111182708, "learning_rate": 6.188277284777857e-08, "logits/chosen": -15.97686767578125, "logits/rejected": -15.647547721862793, "logps/chosen": -2.0422708988189697, "logps/rejected": -2.549582004547119, "loss": 2.7972, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -20.42270851135254, "rewards/margins": 5.073108673095703, "rewards/rejected": -25.49581527709961, "step": 390 }, { "epoch": 0.8486638914999328, "grad_norm": 126.36653904434415, "learning_rate": 5.409197651092965e-08, "logits/chosen": -15.692411422729492, "logits/rejected": -15.747018814086914, "logps/chosen": -2.10833477973938, "logps/rejected": -2.676734685897827, "loss": 2.6959, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -21.083349227905273, "rewards/margins": 5.683998107910156, "rewards/rejected": -26.767345428466797, "step": 395 }, { "epoch": 0.8594064724049953, "grad_norm": 115.47783241869803, "learning_rate": 4.678960386090298e-08, "logits/chosen": -15.30811595916748, "logits/rejected": -15.540433883666992, "logps/chosen": -1.9201295375823975, "logps/rejected": -2.523301601409912, "loss": 2.7196, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -19.2012939453125, "rewards/margins": 6.03171968460083, "rewards/rejected": -25.233013153076172, "step": 400 }, { "epoch": 0.8594064724049953, "eval_logits/chosen": -14.502357482910156, "eval_logits/rejected": -14.432116508483887, "eval_logps/chosen": -1.8952556848526, "eval_logps/rejected": -2.3938686847686768, "eval_loss": 2.7580387592315674, "eval_rewards/accuracies": 0.7704917788505554, "eval_rewards/chosen": -18.952556610107422, "eval_rewards/margins": 4.986130237579346, "eval_rewards/rejected": -23.938688278198242, "eval_runtime": 86.7509, "eval_samples_per_second": 22.374, "eval_steps_per_second": 1.406, "step": 400 }, { "epoch": 0.8701490533100578, "grad_norm": 112.75479415537629, "learning_rate": 3.998596588076366e-08, "logits/chosen": -15.262601852416992, "logits/rejected": -14.926340103149414, "logps/chosen": -1.9843738079071045, "logps/rejected": -2.531123638153076, "loss": 3.0297, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -19.843738555908203, "rewards/margins": 5.467496871948242, "rewards/rejected": -25.311237335205078, "step": 405 }, { "epoch": 0.8808916342151202, "grad_norm": 124.72604723824772, "learning_rate": 3.3690669337976996e-08, "logits/chosen": -15.930806159973145, "logits/rejected": -15.931562423706055, "logps/chosen": -1.9564170837402344, "logps/rejected": -2.517925500869751, "loss": 2.6997, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -19.564170837402344, "rewards/margins": 5.615079402923584, "rewards/rejected": -25.17925262451172, "step": 410 }, { "epoch": 0.8916342151201826, "grad_norm": 124.6339007044252, "learning_rate": 2.7912603219609798e-08, "logits/chosen": -16.048858642578125, "logits/rejected": -15.944051742553711, "logps/chosen": -2.1005568504333496, "logps/rejected": -2.568063735961914, "loss": 2.6664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -21.005565643310547, "rewards/margins": 4.675069332122803, "rewards/rejected": -25.680639266967773, "step": 415 }, { "epoch": 0.9023767960252451, "grad_norm": 121.69678463559275, "learning_rate": 2.265992618104029e-08, "logits/chosen": -16.162189483642578, "logits/rejected": -16.180706024169922, "logps/chosen": -2.018214702606201, "logps/rejected": -2.648829460144043, "loss": 2.7859, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -20.182147979736328, "rewards/margins": 6.306146621704102, "rewards/rejected": -26.488290786743164, "step": 420 }, { "epoch": 0.9131193769303075, "grad_norm": 267.04077532902875, "learning_rate": 1.7940055025900304e-08, "logits/chosen": -14.952715873718262, "logits/rejected": -14.8345308303833, "logps/chosen": -1.9776952266693115, "logps/rejected": -2.4361090660095215, "loss": 3.0349, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -19.77695083618164, "rewards/margins": 4.584140777587891, "rewards/rejected": -24.361093521118164, "step": 425 }, { "epoch": 0.92386195783537, "grad_norm": 115.027170497356, "learning_rate": 1.3759654233514817e-08, "logits/chosen": -15.242410659790039, "logits/rejected": -15.459531784057617, "logps/chosen": -1.8886226415634155, "logps/rejected": -2.344308853149414, "loss": 2.7946, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -18.8862247467041, "rewards/margins": 4.556865692138672, "rewards/rejected": -23.443092346191406, "step": 430 }, { "epoch": 0.9346045387404324, "grad_norm": 116.35861873175186, "learning_rate": 1.0124626548627402e-08, "logits/chosen": -15.868557929992676, "logits/rejected": -15.74053955078125, "logps/chosen": -1.9277899265289307, "logps/rejected": -2.5743651390075684, "loss": 2.7446, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -19.27789878845215, "rewards/margins": 6.46575403213501, "rewards/rejected": -25.743656158447266, "step": 435 }, { "epoch": 0.9453471196454948, "grad_norm": 101.71134576562835, "learning_rate": 7.040104646698042e-09, "logits/chosen": -15.439016342163086, "logits/rejected": -15.44897747039795, "logps/chosen": -1.959753394126892, "logps/rejected": -2.5274107456207275, "loss": 2.6695, "rewards/accuracies": 0.78125, "rewards/chosen": -19.597536087036133, "rewards/margins": 5.676573276519775, "rewards/rejected": -25.274110794067383, "step": 440 }, { "epoch": 0.9560897005505573, "grad_norm": 110.29407715801415, "learning_rate": 4.510443886542114e-09, "logits/chosen": -15.287280082702637, "logits/rejected": -15.34148120880127, "logps/chosen": -2.031892776489258, "logps/rejected": -2.656055450439453, "loss": 2.7797, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -20.318927764892578, "rewards/margins": 6.2416229248046875, "rewards/rejected": -26.560550689697266, "step": 445 }, { "epoch": 0.9668322814556197, "grad_norm": 122.67920649488762, "learning_rate": 2.539216160544333e-09, "logits/chosen": -15.329913139343262, "logits/rejected": -15.057653427124023, "logps/chosen": -2.1148476600646973, "logps/rejected": -2.682511806488037, "loss": 2.8577, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -21.14847755432129, "rewards/margins": 5.676642894744873, "rewards/rejected": -26.825122833251953, "step": 450 }, { "epoch": 0.9775748623606821, "grad_norm": 127.18334450658871, "learning_rate": 1.1292048511303054e-09, "logits/chosen": -15.279156684875488, "logits/rejected": -15.283819198608398, "logps/chosen": -2.0526576042175293, "logps/rejected": -2.5599799156188965, "loss": 2.9411, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -20.52657699584961, "rewards/margins": 5.0732221603393555, "rewards/rejected": -25.599796295166016, "step": 455 }, { "epoch": 0.9883174432657446, "grad_norm": 115.1645535617376, "learning_rate": 2.82400900618418e-10, "logits/chosen": -15.98609447479248, "logits/rejected": -15.959650993347168, "logps/chosen": -1.9472544193267822, "logps/rejected": -2.6097803115844727, "loss": 2.5164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -19.47254180908203, "rewards/margins": 6.625257968902588, "rewards/rejected": -26.097803115844727, "step": 460 }, { "epoch": 0.9990600241708071, "grad_norm": 233.73237047558877, "learning_rate": 0.0, "logits/chosen": -15.412060737609863, "logits/rejected": -15.519399642944336, "logps/chosen": -1.9192397594451904, "logps/rejected": -2.4578652381896973, "loss": 2.5846, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -19.19239616394043, "rewards/margins": 5.386256217956543, "rewards/rejected": -24.57865333557129, "step": 465 }, { "epoch": 0.9990600241708071, "step": 465, "total_flos": 0.0, "train_loss": 3.541660060677477, "train_runtime": 9046.6245, "train_samples_per_second": 6.585, "train_steps_per_second": 0.051 } ], "logging_steps": 5, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }