{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032, "grad_norm": 56.213985443115234, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.6856637001037598, "logits/rejected": -0.9140785336494446, "logps/chosen": -359.3579406738281, "logps/rejected": -791.425537109375, "loss": 0.7081, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.004467130638659, "rewards/margins": 0.01198100671172142, "rewards/rejected": -0.007513875607401133, "step": 10 }, { "epoch": 0.064, "grad_norm": 45.755985260009766, "learning_rate": 6.249999999999999e-07, "logits/chosen": -1.7286349534988403, "logits/rejected": -0.9193947911262512, "logps/chosen": -329.63916015625, "logps/rejected": -799.1121826171875, "loss": 0.6069, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": 0.03486660122871399, "rewards/margins": 0.25346916913986206, "rewards/rejected": -0.21860253810882568, "step": 20 }, { "epoch": 0.096, "grad_norm": 6.911625385284424, "learning_rate": 9.374999999999999e-07, "logits/chosen": -1.7430554628372192, "logits/rejected": -1.0273202657699585, "logps/chosen": -335.06024169921875, "logps/rejected": -800.427001953125, "loss": 0.229, "rewards/accuracies": 0.9906249642372131, "rewards/chosen": -0.043616339564323425, "rewards/margins": 2.225285768508911, "rewards/rejected": -2.268902063369751, "step": 30 }, { "epoch": 0.128, "grad_norm": 0.6400136947631836, "learning_rate": 9.979871469976195e-07, "logits/chosen": -1.9680767059326172, "logits/rejected": -1.3984694480895996, "logps/chosen": -379.58111572265625, "logps/rejected": -961.974853515625, "loss": 0.2557, "rewards/accuracies": 0.984375, "rewards/chosen": -4.778716564178467, "rewards/margins": 13.423650741577148, "rewards/rejected": -18.202367782592773, "step": 40 }, { "epoch": 0.16, "grad_norm": 0.27598193287849426, "learning_rate": 9.898376992116177e-07, "logits/chosen": -2.057750940322876, "logits/rejected": -1.691310167312622, "logps/chosen": -479.58575439453125, "logps/rejected": -1219.13671875, "loss": 0.1768, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -12.780839920043945, "rewards/margins": 28.819786071777344, "rewards/rejected": -41.60062789916992, "step": 50 }, { "epoch": 0.192, "grad_norm": 0.5234229564666748, "learning_rate": 9.755282581475767e-07, "logits/chosen": -1.95061457157135, "logits/rejected": -1.602612018585205, "logps/chosen": -428.9530029296875, "logps/rejected": -1158.147705078125, "loss": 0.0436, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.114362716674805, "rewards/margins": 28.7945613861084, "rewards/rejected": -39.90892791748047, "step": 60 }, { "epoch": 0.224, "grad_norm": 0.14277251064777374, "learning_rate": 9.552387733294078e-07, "logits/chosen": -1.7910118103027344, "logits/rejected": -1.4031528234481812, "logps/chosen": -413.5960388183594, "logps/rejected": -1093.8875732421875, "loss": 0.0299, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -7.432526111602783, "rewards/margins": 23.624351501464844, "rewards/rejected": -31.05687713623047, "step": 70 }, { "epoch": 0.256, "grad_norm": 0.08953393995761871, "learning_rate": 9.29224396800933e-07, "logits/chosen": -1.6937999725341797, "logits/rejected": -1.2533811330795288, "logps/chosen": -393.97149658203125, "logps/rejected": -1056.011962890625, "loss": 0.0328, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.675661087036133, "rewards/margins": 20.344688415527344, "rewards/rejected": -25.020347595214844, "step": 80 }, { "epoch": 0.288, "grad_norm": 0.08168945461511612, "learning_rate": 8.978122744408905e-07, "logits/chosen": -1.6775288581848145, "logits/rejected": -1.1762558221817017, "logps/chosen": -354.72674560546875, "logps/rejected": -1026.7005615234375, "loss": 0.0888, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0572290420532227, "rewards/margins": 18.617963790893555, "rewards/rejected": -21.675193786621094, "step": 90 }, { "epoch": 0.32, "grad_norm": 0.09104285389184952, "learning_rate": 8.613974319136957e-07, "logits/chosen": -1.6571707725524902, "logits/rejected": -1.1443570852279663, "logps/chosen": -356.0404968261719, "logps/rejected": -1034.0635986328125, "loss": 0.0241, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.6314337253570557, "rewards/margins": 17.649852752685547, "rewards/rejected": -20.281286239624023, "step": 100 }, { "epoch": 0.352, "grad_norm": 0.20877273380756378, "learning_rate": 8.20437806992512e-07, "logits/chosen": -1.6720266342163086, "logits/rejected": -1.1429331302642822, "logps/chosen": -333.5172424316406, "logps/rejected": -1006.623779296875, "loss": 0.0307, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3207459449768066, "rewards/margins": 17.213682174682617, "rewards/rejected": -19.534427642822266, "step": 110 }, { "epoch": 0.384, "grad_norm": 0.10373561084270477, "learning_rate": 7.754484907260512e-07, "logits/chosen": -1.5757179260253906, "logits/rejected": -1.1068713665008545, "logps/chosen": -395.4210510253906, "logps/rejected": -995.869873046875, "loss": 0.0288, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5039141178131104, "rewards/margins": 15.861688613891602, "rewards/rejected": -18.365602493286133, "step": 120 }, { "epoch": 0.416, "grad_norm": 0.08577949553728104, "learning_rate": 7.269952498697734e-07, "logits/chosen": -1.6134161949157715, "logits/rejected": -1.1095223426818848, "logps/chosen": -338.8744812011719, "logps/rejected": -966.8275756835938, "loss": 0.0298, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.9737201929092407, "rewards/margins": 16.617738723754883, "rewards/rejected": -18.591461181640625, "step": 130 }, { "epoch": 0.448, "grad_norm": 0.070561982691288, "learning_rate": 6.756874120406714e-07, "logits/chosen": -1.61734938621521, "logits/rejected": -1.111788272857666, "logps/chosen": -346.34844970703125, "logps/rejected": -1010.5448608398438, "loss": 0.0824, "rewards/accuracies": 0.9812500476837158, "rewards/chosen": -2.2100601196289062, "rewards/margins": 17.178829193115234, "rewards/rejected": -19.38888931274414, "step": 140 }, { "epoch": 0.48, "grad_norm": 0.0759393498301506, "learning_rate": 6.22170203068947e-07, "logits/chosen": -1.6040197610855103, "logits/rejected": -1.1053553819656372, "logps/chosen": -333.19329833984375, "logps/rejected": -995.1849365234375, "loss": 0.0281, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0527515411376953, "rewards/margins": 17.604354858398438, "rewards/rejected": -19.657106399536133, "step": 150 }, { "epoch": 0.512, "grad_norm": 4.426650047302246, "learning_rate": 5.671166329088277e-07, "logits/chosen": -1.60532808303833, "logits/rejected": -1.108091950416565, "logps/chosen": -354.8777160644531, "logps/rejected": -984.4487915039062, "loss": 0.0589, "rewards/accuracies": 0.984375, "rewards/chosen": -2.1513655185699463, "rewards/margins": 16.895124435424805, "rewards/rejected": -19.046489715576172, "step": 160 }, { "epoch": 0.544, "grad_norm": 0.06832431256771088, "learning_rate": 5.112190321479025e-07, "logits/chosen": -1.64431631565094, "logits/rejected": -1.0916073322296143, "logps/chosen": -281.46929931640625, "logps/rejected": -976.3692016601562, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4421179294586182, "rewards/margins": 17.808563232421875, "rewards/rejected": -19.250680923461914, "step": 170 }, { "epoch": 0.576, "grad_norm": 0.08709734678268433, "learning_rate": 4.5518034554828327e-07, "logits/chosen": -1.620501160621643, "logits/rejected": -1.1170865297317505, "logps/chosen": -323.96331787109375, "logps/rejected": -971.4978637695312, "loss": 0.0302, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.913256287574768, "rewards/margins": 17.42499542236328, "rewards/rejected": -19.3382511138916, "step": 180 }, { "epoch": 0.608, "grad_norm": 0.0656159296631813, "learning_rate": 3.997052921083636e-07, "logits/chosen": -1.6022181510925293, "logits/rejected": -1.076964020729065, "logps/chosen": -321.6889953613281, "logps/rejected": -1016.191650390625, "loss": 0.0276, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7427327632904053, "rewards/margins": 17.825733184814453, "rewards/rejected": -19.568466186523438, "step": 190 }, { "epoch": 0.64, "grad_norm": 0.11039853096008301, "learning_rate": 3.454915028125263e-07, "logits/chosen": -1.5825995206832886, "logits/rejected": -1.0985522270202637, "logps/chosen": -346.71307373046875, "logps/rejected": -980.868896484375, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -1.8475337028503418, "rewards/margins": 17.101648330688477, "rewards/rejected": -18.949182510375977, "step": 200 }, { "epoch": 0.672, "grad_norm": 1.4628119468688965, "learning_rate": 2.9322074751673974e-07, "logits/chosen": -1.5580945014953613, "logits/rejected": -1.035766839981079, "logps/chosen": -365.5740966796875, "logps/rejected": -1023.2942504882812, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.307873249053955, "rewards/margins": 17.43593978881836, "rewards/rejected": -19.743812561035156, "step": 210 }, { "epoch": 0.704, "grad_norm": 0.0860789492726326, "learning_rate": 2.4355036129704696e-07, "logits/chosen": -1.5405123233795166, "logits/rejected": -1.0252822637557983, "logps/chosen": -340.4681091308594, "logps/rejected": -1036.653564453125, "loss": 0.0292, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7713720798492432, "rewards/margins": 18.092077255249023, "rewards/rejected": -19.863449096679688, "step": 220 }, { "epoch": 0.736, "grad_norm": 0.3460147976875305, "learning_rate": 1.971049780795901e-07, "logits/chosen": -1.592976450920105, "logits/rejected": -1.0598104000091553, "logps/chosen": -348.5957336425781, "logps/rejected": -1037.0172119140625, "loss": 0.0298, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1035141944885254, "rewards/margins": 18.46420669555664, "rewards/rejected": -20.56772232055664, "step": 230 }, { "epoch": 0.768, "grad_norm": 0.07527298480272293, "learning_rate": 1.5446867550656767e-07, "logits/chosen": -1.512205958366394, "logits/rejected": -1.0166820287704468, "logps/chosen": -361.0347595214844, "logps/rejected": -1025.22900390625, "loss": 0.0343, "rewards/accuracies": 0.9906249642372131, "rewards/chosen": -2.0099854469299316, "rewards/margins": 17.852081298828125, "rewards/rejected": -19.8620662689209, "step": 240 }, { "epoch": 0.8, "grad_norm": 0.0847870260477066, "learning_rate": 1.1617762982099444e-07, "logits/chosen": -1.559564471244812, "logits/rejected": -1.0617430210113525, "logps/chosen": -333.57275390625, "logps/rejected": -986.9796142578125, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8989250659942627, "rewards/margins": 17.682388305664062, "rewards/rejected": -19.58131217956543, "step": 250 }, { "epoch": 0.832, "grad_norm": 0.12463419884443283, "learning_rate": 8.271337313934867e-08, "logits/chosen": -1.5232856273651123, "logits/rejected": -1.0490562915802002, "logps/chosen": -328.0819091796875, "logps/rejected": -936.2249145507812, "loss": 0.0301, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7982633113861084, "rewards/margins": 16.8775691986084, "rewards/rejected": -18.675832748413086, "step": 260 }, { "epoch": 0.864, "grad_norm": 0.07065236568450928, "learning_rate": 5.44967379058161e-08, "logits/chosen": -1.5595731735229492, "logits/rejected": -1.0487172603607178, "logps/chosen": -332.07220458984375, "logps/rejected": -984.1925048828125, "loss": 0.0359, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.8496124744415283, "rewards/margins": 17.49714469909668, "rewards/rejected": -19.346757888793945, "step": 270 }, { "epoch": 0.896, "grad_norm": 0.07000024616718292, "learning_rate": 3.188256468013139e-08, "logits/chosen": -1.5517284870147705, "logits/rejected": -1.048112392425537, "logps/chosen": -337.4092102050781, "logps/rejected": -997.2933959960938, "loss": 0.0583, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8345385789871216, "rewards/margins": 17.588817596435547, "rewards/rejected": -19.423355102539062, "step": 280 }, { "epoch": 0.928, "grad_norm": 0.5683500170707703, "learning_rate": 1.5155239811656562e-08, "logits/chosen": -1.5175541639328003, "logits/rejected": -1.0427402257919312, "logps/chosen": -337.4837341308594, "logps/rejected": -965.8953247070312, "loss": 0.0368, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8142101764678955, "rewards/margins": 16.774681091308594, "rewards/rejected": -18.588891983032227, "step": 290 }, { "epoch": 0.96, "grad_norm": 8.109635353088379, "learning_rate": 4.5251191160326495e-09, "logits/chosen": -1.5444049835205078, "logits/rejected": -1.0272959470748901, "logps/chosen": -327.03204345703125, "logps/rejected": -989.8179321289062, "loss": 0.0285, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.781620979309082, "rewards/margins": 17.613508224487305, "rewards/rejected": -19.395130157470703, "step": 300 }, { "epoch": 0.992, "grad_norm": 0.0859638899564743, "learning_rate": 1.2588252874673466e-10, "logits/chosen": -1.5771433115005493, "logits/rejected": -1.041457176208496, "logps/chosen": -356.52764892578125, "logps/rejected": -1029.2864990234375, "loss": 0.0419, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9025259017944336, "rewards/margins": 17.435348510742188, "rewards/rejected": -19.337875366210938, "step": 310 }, { "epoch": 0.9984, "step": 312, "total_flos": 2.3132199484045394e+18, "train_loss": 0.09468334361624259, "train_runtime": 19437.4188, "train_samples_per_second": 0.514, "train_steps_per_second": 0.016 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3132199484045394e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }