NicholasCorrado's picture
Model save
ef144c4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9960291197882198,
"eval_steps": 1000,
"global_step": 754,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026472534745201853,
"grad_norm": 4.30932258831093,
"learning_rate": 6.578947368421052e-09,
"logits/chosen": -2.923454761505127,
"logits/rejected": -3.022336483001709,
"logps/chosen": -491.803955078125,
"logps/rejected": -509.828369140625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.026472534745201854,
"grad_norm": 4.3524814860184176,
"learning_rate": 6.578947368421052e-08,
"logits/chosen": -2.849837303161621,
"logits/rejected": -2.918842315673828,
"logps/chosen": -482.8021240234375,
"logps/rejected": -468.6262512207031,
"loss": 0.6931,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 1.7793978258850984e-05,
"rewards/margins": 0.00012061676534358412,
"rewards/rejected": -0.00010282275616191328,
"step": 10
},
{
"epoch": 0.05294506949040371,
"grad_norm": 4.058367941679181,
"learning_rate": 1.3157894736842104e-07,
"logits/chosen": -2.8850979804992676,
"logits/rejected": -2.9686431884765625,
"logps/chosen": -492.1334533691406,
"logps/rejected": -473.5765075683594,
"loss": 0.6929,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.00013926837709732354,
"rewards/margins": 0.0005674505373463035,
"rewards/rejected": -0.0004281821602489799,
"step": 20
},
{
"epoch": 0.07941760423560557,
"grad_norm": 4.100822256989,
"learning_rate": 1.9736842105263157e-07,
"logits/chosen": -2.868659496307373,
"logits/rejected": -2.9573891162872314,
"logps/chosen": -475.2897033691406,
"logps/rejected": -473.85064697265625,
"loss": 0.6906,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": 0.003203944070264697,
"rewards/margins": 0.006764715071767569,
"rewards/rejected": -0.0035607716999948025,
"step": 30
},
{
"epoch": 0.10589013898080742,
"grad_norm": 4.224597004906648,
"learning_rate": 2.631578947368421e-07,
"logits/chosen": -2.880610704421997,
"logits/rejected": -2.9531335830688477,
"logps/chosen": -477.7626953125,
"logps/rejected": -471.15203857421875,
"loss": 0.6838,
"rewards/accuracies": 0.7906249761581421,
"rewards/chosen": 0.0102998660877347,
"rewards/margins": 0.02004994824528694,
"rewards/rejected": -0.009750082157552242,
"step": 40
},
{
"epoch": 0.13236267372600927,
"grad_norm": 4.481790796785453,
"learning_rate": 3.2894736842105264e-07,
"logits/chosen": -2.910224199295044,
"logits/rejected": -2.966555118560791,
"logps/chosen": -496.03704833984375,
"logps/rejected": -483.69970703125,
"loss": 0.6705,
"rewards/accuracies": 0.8031250238418579,
"rewards/chosen": 0.02056797966361046,
"rewards/margins": 0.04747764393687248,
"rewards/rejected": -0.026909660547971725,
"step": 50
},
{
"epoch": 0.15883520847121113,
"grad_norm": 4.727420695183077,
"learning_rate": 3.9473684210526315e-07,
"logits/chosen": -2.908859968185425,
"logits/rejected": -2.9846339225769043,
"logps/chosen": -487.971923828125,
"logps/rejected": -487.698486328125,
"loss": 0.6321,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.047008998692035675,
"rewards/margins": 0.12849071621894836,
"rewards/rejected": -0.08148171752691269,
"step": 60
},
{
"epoch": 0.18530774321641297,
"grad_norm": 5.3651750883138645,
"learning_rate": 4.6052631578947365e-07,
"logits/chosen": -2.9276702404022217,
"logits/rejected": -2.958789348602295,
"logps/chosen": -488.4877014160156,
"logps/rejected": -489.53546142578125,
"loss": 0.5937,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.025530118495225906,
"rewards/margins": 0.22059020400047302,
"rewards/rejected": -0.19506008923053741,
"step": 70
},
{
"epoch": 0.21178027796161483,
"grad_norm": 3.6311630185736297,
"learning_rate": 4.999570604073014e-07,
"logits/chosen": -2.923982620239258,
"logits/rejected": -2.9833145141601562,
"logps/chosen": -503.56134033203125,
"logps/rejected": -552.4190673828125,
"loss": 0.4934,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -0.13993698358535767,
"rewards/margins": 0.5944468975067139,
"rewards/rejected": -0.7343839406967163,
"step": 80
},
{
"epoch": 0.23825281270681667,
"grad_norm": 4.662865437270245,
"learning_rate": 4.994741593733563e-07,
"logits/chosen": -2.9195408821105957,
"logits/rejected": -2.958824872970581,
"logps/chosen": -534.3409423828125,
"logps/rejected": -607.5826416015625,
"loss": 0.4331,
"rewards/accuracies": 0.871874988079071,
"rewards/chosen": -0.41773176193237305,
"rewards/margins": 0.9263063669204712,
"rewards/rejected": -1.3440382480621338,
"step": 90
},
{
"epoch": 0.26472534745201853,
"grad_norm": 4.763826147961603,
"learning_rate": 4.984557228946769e-07,
"logits/chosen": -2.815560817718506,
"logits/rejected": -2.8625988960266113,
"logps/chosen": -611.6197509765625,
"logps/rejected": -758.5867309570312,
"loss": 0.3704,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.2842621803283691,
"rewards/margins": 1.4864912033081055,
"rewards/rejected": -2.7707533836364746,
"step": 100
},
{
"epoch": 0.29119788219722037,
"grad_norm": 4.297222720122574,
"learning_rate": 4.969039372050355e-07,
"logits/chosen": -2.787930488586426,
"logits/rejected": -2.8286757469177246,
"logps/chosen": -622.7388305664062,
"logps/rejected": -840.2802734375,
"loss": 0.3234,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": -1.4110974073410034,
"rewards/margins": 2.2375450134277344,
"rewards/rejected": -3.648642063140869,
"step": 110
},
{
"epoch": 0.31767041694242226,
"grad_norm": 3.7535819941587065,
"learning_rate": 4.948221334560093e-07,
"logits/chosen": -2.831071615219116,
"logits/rejected": -2.843599796295166,
"logps/chosen": -592.7720947265625,
"logps/rejected": -807.1361083984375,
"loss": 0.3176,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": -1.023418664932251,
"rewards/margins": 2.2339682579040527,
"rewards/rejected": -3.2573866844177246,
"step": 120
},
{
"epoch": 0.3441429516876241,
"grad_norm": 4.4946406733821185,
"learning_rate": 4.922147805661402e-07,
"logits/chosen": -2.83577823638916,
"logits/rejected": -2.875858783721924,
"logps/chosen": -629.0977783203125,
"logps/rejected": -913.8206787109375,
"loss": 0.2875,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.3898423910140991,
"rewards/margins": 2.828009605407715,
"rewards/rejected": -4.2178521156311035,
"step": 130
},
{
"epoch": 0.37061548643282594,
"grad_norm": 4.541566634311875,
"learning_rate": 4.890874756276999e-07,
"logits/chosen": -2.7425622940063477,
"logits/rejected": -2.8039002418518066,
"logps/chosen": -634.9403686523438,
"logps/rejected": -911.8308715820312,
"loss": 0.2831,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5872424840927124,
"rewards/margins": 2.6655924320220947,
"rewards/rejected": -4.252835273742676,
"step": 140
},
{
"epoch": 0.3970880211780278,
"grad_norm": 3.3580041177607978,
"learning_rate": 4.854469318916532e-07,
"logits/chosen": -2.8008108139038086,
"logits/rejected": -2.8558998107910156,
"logps/chosen": -646.1107177734375,
"logps/rejected": -980.4015502929688,
"loss": 0.2668,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -1.5013834238052368,
"rewards/margins": 3.364828109741211,
"rewards/rejected": -4.866211414337158,
"step": 150
},
{
"epoch": 0.42356055592322966,
"grad_norm": 4.675914141471813,
"learning_rate": 4.8130096435661e-07,
"logits/chosen": -2.80204176902771,
"logits/rejected": -2.8504693508148193,
"logps/chosen": -631.5078125,
"logps/rejected": -925.1160888671875,
"loss": 0.2519,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -1.4621139764785767,
"rewards/margins": 2.9786953926086426,
"rewards/rejected": -4.440809726715088,
"step": 160
},
{
"epoch": 0.4500330906684315,
"grad_norm": 6.483714450702139,
"learning_rate": 4.766584729927049e-07,
"logits/chosen": -2.768064498901367,
"logits/rejected": -2.8086233139038086,
"logps/chosen": -676.531982421875,
"logps/rejected": -1029.0057373046875,
"loss": 0.2799,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8374897241592407,
"rewards/margins": 3.5798168182373047,
"rewards/rejected": -5.417306900024414,
"step": 170
},
{
"epoch": 0.47650562541363334,
"grad_norm": 5.396515481877322,
"learning_rate": 4.7152942363641345e-07,
"logits/chosen": -2.7385268211364746,
"logits/rejected": -2.760967493057251,
"logps/chosen": -642.5712890625,
"logps/rejected": -997.8201904296875,
"loss": 0.2558,
"rewards/accuracies": 0.8968750238418579,
"rewards/chosen": -1.627642273902893,
"rewards/margins": 3.5839648246765137,
"rewards/rejected": -5.211607456207275,
"step": 180
},
{
"epoch": 0.5029781601588352,
"grad_norm": 5.261191188089043,
"learning_rate": 4.6592482659732045e-07,
"logits/chosen": -2.7847423553466797,
"logits/rejected": -2.8355870246887207,
"logps/chosen": -674.9547729492188,
"logps/rejected": -972.1364135742188,
"loss": 0.2536,
"rewards/accuracies": 0.8968750238418579,
"rewards/chosen": -1.8109451532363892,
"rewards/margins": 3.033442258834839,
"rewards/rejected": -4.844388008117676,
"step": 190
},
{
"epoch": 0.5294506949040371,
"grad_norm": 4.9133890145951495,
"learning_rate": 4.5985671302276166e-07,
"logits/chosen": -2.7557764053344727,
"logits/rejected": -2.8086917400360107,
"logps/chosen": -681.2697143554688,
"logps/rejected": -989.4937744140625,
"loss": 0.2602,
"rewards/accuracies": 0.890625,
"rewards/chosen": -1.8896667957305908,
"rewards/margins": 3.2132556438446045,
"rewards/rejected": -5.102922439575195,
"step": 200
},
{
"epoch": 0.5559232296492389,
"grad_norm": 4.892431641268176,
"learning_rate": 4.533381090710776e-07,
"logits/chosen": -2.7014384269714355,
"logits/rejected": -2.7446448802948,
"logps/chosen": -670.3836669921875,
"logps/rejected": -988.2939453125,
"loss": 0.2323,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -1.8962217569351196,
"rewards/margins": 3.261091947555542,
"rewards/rejected": -5.157313823699951,
"step": 210
},
{
"epoch": 0.5823957643944407,
"grad_norm": 4.5229078992046166,
"learning_rate": 4.463830079489196e-07,
"logits/chosen": -2.7127437591552734,
"logits/rejected": -2.735060930252075,
"logps/chosen": -697.1866455078125,
"logps/rejected": -1054.2486572265625,
"loss": 0.2292,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -2.145174026489258,
"rewards/margins": 3.7120985984802246,
"rewards/rejected": -5.857272148132324,
"step": 220
},
{
"epoch": 0.6088682991396426,
"grad_norm": 5.024605464390731,
"learning_rate": 4.390063398726356e-07,
"logits/chosen": -2.6947622299194336,
"logits/rejected": -2.6960158348083496,
"logps/chosen": -719.8890380859375,
"logps/rejected": -1086.1702880859375,
"loss": 0.2353,
"rewards/accuracies": 0.921875,
"rewards/chosen": -2.32169771194458,
"rewards/margins": 3.803462266921997,
"rewards/rejected": -6.12515926361084,
"step": 230
},
{
"epoch": 0.6353408338848445,
"grad_norm": 4.497134908678504,
"learning_rate": 4.3122394001821657e-07,
"logits/chosen": -2.699171781539917,
"logits/rejected": -2.7273011207580566,
"logps/chosen": -715.7581176757812,
"logps/rejected": -1008.9625244140625,
"loss": 0.2481,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.174252986907959,
"rewards/margins": 3.0458567142486572,
"rewards/rejected": -5.220109462738037,
"step": 240
},
{
"epoch": 0.6618133686300464,
"grad_norm": 4.8761682045650625,
"learning_rate": 4.2305251452860566e-07,
"logits/chosen": -2.7068121433258057,
"logits/rejected": -2.7410686016082764,
"logps/chosen": -667.6207885742188,
"logps/rejected": -1025.73828125,
"loss": 0.2279,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.91938054561615,
"rewards/margins": 3.5805881023406982,
"rewards/rejected": -5.4999680519104,
"step": 250
},
{
"epoch": 0.6882859033752482,
"grad_norm": 6.933116753661563,
"learning_rate": 4.1450960465134024e-07,
"logits/chosen": -2.7003915309906006,
"logits/rejected": -2.7421202659606934,
"logps/chosen": -725.2587890625,
"logps/rejected": -1111.21533203125,
"loss": 0.2322,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.4127094745635986,
"rewards/margins": 3.868704319000244,
"rewards/rejected": -6.28141450881958,
"step": 260
},
{
"epoch": 0.71475843812045,
"grad_norm": 5.180639131738146,
"learning_rate": 4.0561354908350977e-07,
"logits/chosen": -2.7455577850341797,
"logits/rejected": -2.768336057662964,
"logps/chosen": -686.0287475585938,
"logps/rejected": -1041.7474365234375,
"loss": 0.2406,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.0675699710845947,
"rewards/margins": 3.5837669372558594,
"rewards/rejected": -5.651337146759033,
"step": 270
},
{
"epoch": 0.7412309728656519,
"grad_norm": 6.760828116437296,
"learning_rate": 3.963834446048644e-07,
"logits/chosen": -2.7979235649108887,
"logits/rejected": -2.795571804046631,
"logps/chosen": -757.2296142578125,
"logps/rejected": -1096.9935302734375,
"loss": 0.235,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": -2.414430618286133,
"rewards/margins": 3.682656764984131,
"rewards/rejected": -6.097087383270264,
"step": 280
},
{
"epoch": 0.7677035076108537,
"grad_norm": 5.924062579551431,
"learning_rate": 3.868391050835793e-07,
"logits/chosen": -2.707353353500366,
"logits/rejected": -2.708177089691162,
"logps/chosen": -735.2392578125,
"logps/rejected": -1096.3634033203125,
"loss": 0.2165,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.4990901947021484,
"rewards/margins": 3.737868547439575,
"rewards/rejected": -6.2369585037231445,
"step": 290
},
{
"epoch": 0.7941760423560555,
"grad_norm": 6.316712123280126,
"learning_rate": 3.770010189426761e-07,
"logits/chosen": -2.696866512298584,
"logits/rejected": -2.720412015914917,
"logps/chosen": -788.0701904296875,
"logps/rejected": -1174.900390625,
"loss": 0.2278,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": -2.9900288581848145,
"rewards/margins": 3.7810866832733154,
"rewards/rejected": -6.771115779876709,
"step": 300
},
{
"epoch": 0.8206485771012575,
"grad_norm": 5.395016037418297,
"learning_rate": 3.66890305178407e-07,
"logits/chosen": -2.716305732727051,
"logits/rejected": -2.716601848602295,
"logps/chosen": -742.38232421875,
"logps/rejected": -1122.4625244140625,
"loss": 0.2287,
"rewards/accuracies": 0.90625,
"rewards/chosen": -2.601231098175049,
"rewards/margins": 3.891951084136963,
"rewards/rejected": -6.4931817054748535,
"step": 310
},
{
"epoch": 0.8471211118464593,
"grad_norm": 6.246149943149367,
"learning_rate": 3.565286680250138e-07,
"logits/chosen": -2.748223066329956,
"logits/rejected": -2.7793920040130615,
"logps/chosen": -724.46484375,
"logps/rejected": -1080.5015869140625,
"loss": 0.2089,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -2.3395774364471436,
"rewards/margins": 3.757481813430786,
"rewards/rejected": -6.097059726715088,
"step": 320
},
{
"epoch": 0.8735936465916612,
"grad_norm": 6.876926943284203,
"learning_rate": 3.4593835036318225e-07,
"logits/chosen": -2.691286563873291,
"logits/rejected": -2.7156758308410645,
"logps/chosen": -759.7913208007812,
"logps/rejected": -1132.6741943359375,
"loss": 0.2212,
"rewards/accuracies": 0.921875,
"rewards/chosen": -2.764772415161133,
"rewards/margins": 3.8325703144073486,
"rewards/rejected": -6.597343444824219,
"step": 330
},
{
"epoch": 0.900066181336863,
"grad_norm": 7.266749393582284,
"learning_rate": 3.35142085972207e-07,
"logits/chosen": -2.6851227283477783,
"logits/rejected": -2.722938060760498,
"logps/chosen": -754.1365356445312,
"logps/rejected": -1132.1871337890625,
"loss": 0.218,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -2.7608418464660645,
"rewards/margins": 3.8383007049560547,
"rewards/rejected": -6.599142551422119,
"step": 340
},
{
"epoch": 0.9265387160820648,
"grad_norm": 5.8828162546054426,
"learning_rate": 3.2416305072836555e-07,
"logits/chosen": -2.703392267227173,
"logits/rejected": -2.717411756515503,
"logps/chosen": -759.862060546875,
"logps/rejected": -1137.0098876953125,
"loss": 0.2033,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -2.6198081970214844,
"rewards/margins": 3.8972651958465576,
"rewards/rejected": -6.517073154449463,
"step": 350
},
{
"epoch": 0.9530112508272667,
"grad_norm": 6.02673405904732,
"learning_rate": 3.1302481285426197e-07,
"logits/chosen": -2.7140469551086426,
"logits/rejected": -2.7299208641052246,
"logps/chosen": -727.9823608398438,
"logps/rejected": -1152.2041015625,
"loss": 0.2146,
"rewards/accuracies": 0.921875,
"rewards/chosen": -2.4466357231140137,
"rewards/margins": 4.253617763519287,
"rewards/rejected": -6.700253486633301,
"step": 360
},
{
"epoch": 0.9794837855724685,
"grad_norm": 5.891168724337083,
"learning_rate": 3.017512823259373e-07,
"logits/chosen": -2.7166543006896973,
"logits/rejected": -2.7297706604003906,
"logps/chosen": -730.9952392578125,
"logps/rejected": -1117.7025146484375,
"loss": 0.2011,
"rewards/accuracies": 0.9281250238418579,
"rewards/chosen": -2.5781686305999756,
"rewards/margins": 4.00323486328125,
"rewards/rejected": -6.5814032554626465,
"step": 370
},
{
"epoch": 1.0059563203176705,
"grad_norm": 5.4096108526897115,
"learning_rate": 2.9036665954635264e-07,
"logits/chosen": -2.7353827953338623,
"logits/rejected": -2.765237331390381,
"logps/chosen": -769.5081176757812,
"logps/rejected": -1209.970947265625,
"loss": 0.2135,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -2.7778820991516113,
"rewards/margins": 4.453129291534424,
"rewards/rejected": -7.231011390686035,
"step": 380
},
{
"epoch": 1.0324288550628722,
"grad_norm": 6.969996057859779,
"learning_rate": 2.7889538339542523e-07,
"logits/chosen": -2.715958833694458,
"logits/rejected": -2.741973638534546,
"logps/chosen": -760.3748779296875,
"logps/rejected": -1181.979248046875,
"loss": 0.1997,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -2.8078932762145996,
"rewards/margins": 4.318324089050293,
"rewards/rejected": -7.126217842102051,
"step": 390
},
{
"epoch": 1.0589013898080741,
"grad_norm": 6.27267698414733,
"learning_rate": 2.6736207876813643e-07,
"logits/chosen": -2.728332042694092,
"logits/rejected": -2.7608368396759033,
"logps/chosen": -768.7643432617188,
"logps/rejected": -1208.543701171875,
"loss": 0.1963,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -2.7519562244415283,
"rewards/margins": 4.398838996887207,
"rewards/rejected": -7.150795936584473,
"step": 400
},
{
"epoch": 1.0853739245532759,
"grad_norm": 6.331116230324871,
"learning_rate": 2.5579150371332953e-07,
"logits/chosen": -2.7396435737609863,
"logits/rejected": -2.7421391010284424,
"logps/chosen": -809.5894775390625,
"logps/rejected": -1248.327880859375,
"loss": 0.1821,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -3.0432112216949463,
"rewards/margins": 4.567535400390625,
"rewards/rejected": -7.61074686050415,
"step": 410
},
{
"epoch": 1.1118464592984778,
"grad_norm": 6.507894575900424,
"learning_rate": 2.4420849628667045e-07,
"logits/chosen": -2.7221198081970215,
"logits/rejected": -2.7356066703796387,
"logps/chosen": -795.4509887695312,
"logps/rejected": -1229.848876953125,
"loss": 0.205,
"rewards/accuracies": 0.940625011920929,
"rewards/chosen": -3.027529716491699,
"rewards/margins": 4.336198806762695,
"rewards/rejected": -7.3637285232543945,
"step": 420
},
{
"epoch": 1.1383189940436798,
"grad_norm": 6.866126936883296,
"learning_rate": 2.3263792123186352e-07,
"logits/chosen": -2.7347493171691895,
"logits/rejected": -2.7523412704467773,
"logps/chosen": -788.9684448242188,
"logps/rejected": -1187.0565185546875,
"loss": 0.2132,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -2.9675726890563965,
"rewards/margins": 4.024303913116455,
"rewards/rejected": -6.99187707901001,
"step": 430
},
{
"epoch": 1.1647915287888815,
"grad_norm": 5.664097996574406,
"learning_rate": 2.211046166045748e-07,
"logits/chosen": -2.699467420578003,
"logits/rejected": -2.715172529220581,
"logps/chosen": -787.1725463867188,
"logps/rejected": -1219.5947265625,
"loss": 0.1992,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -2.9378509521484375,
"rewards/margins": 4.4215216636657715,
"rewards/rejected": -7.359372138977051,
"step": 440
},
{
"epoch": 1.1912640635340834,
"grad_norm": 5.8438221376315,
"learning_rate": 2.096333404536474e-07,
"logits/chosen": -2.6697657108306885,
"logits/rejected": -2.6788392066955566,
"logps/chosen": -816.9085693359375,
"logps/rejected": -1204.7374267578125,
"loss": 0.1939,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.2424380779266357,
"rewards/margins": 4.097726821899414,
"rewards/rejected": -7.340165138244629,
"step": 450
},
{
"epoch": 1.2177365982792852,
"grad_norm": 6.534950899522826,
"learning_rate": 1.982487176740627e-07,
"logits/chosen": -2.6479620933532715,
"logits/rejected": -2.686981201171875,
"logps/chosen": -760.5189208984375,
"logps/rejected": -1172.6590576171875,
"loss": 0.2034,
"rewards/accuracies": 0.9281250238418579,
"rewards/chosen": -2.8477563858032227,
"rewards/margins": 4.02510929107666,
"rewards/rejected": -6.872865200042725,
"step": 460
},
{
"epoch": 1.244209133024487,
"grad_norm": 5.919455336024619,
"learning_rate": 1.8697518714573804e-07,
"logits/chosen": -2.7183327674865723,
"logits/rejected": -2.7170839309692383,
"logps/chosen": -790.9767456054688,
"logps/rejected": -1145.7747802734375,
"loss": 0.1854,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.942429304122925,
"rewards/margins": 3.9219982624053955,
"rewards/rejected": -6.864427089691162,
"step": 470
},
{
"epoch": 1.270681667769689,
"grad_norm": 8.510303191565324,
"learning_rate": 1.758369492716345e-07,
"logits/chosen": -2.7107906341552734,
"logits/rejected": -2.7333004474639893,
"logps/chosen": -831.3541870117188,
"logps/rejected": -1250.9085693359375,
"loss": 0.204,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -3.3519797325134277,
"rewards/margins": 4.182629585266113,
"rewards/rejected": -7.534609317779541,
"step": 480
},
{
"epoch": 1.2971542025148908,
"grad_norm": 6.158432969646956,
"learning_rate": 1.648579140277931e-07,
"logits/chosen": -2.7101292610168457,
"logits/rejected": -2.715567111968994,
"logps/chosen": -815.2239990234375,
"logps/rejected": -1204.00830078125,
"loss": 0.2016,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.184443712234497,
"rewards/margins": 4.096536159515381,
"rewards/rejected": -7.280980110168457,
"step": 490
},
{
"epoch": 1.3236267372600927,
"grad_norm": 6.688172890404941,
"learning_rate": 1.5406164963681773e-07,
"logits/chosen": -2.6899495124816895,
"logits/rejected": -2.697331666946411,
"logps/chosen": -788.6990356445312,
"logps/rejected": -1184.682861328125,
"loss": 0.2028,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -2.907625436782837,
"rewards/margins": 4.130288124084473,
"rewards/rejected": -7.037914276123047,
"step": 500
},
{
"epoch": 1.3500992720052944,
"grad_norm": 6.177665969794554,
"learning_rate": 1.4347133197498618e-07,
"logits/chosen": -2.720623016357422,
"logits/rejected": -2.713228702545166,
"logps/chosen": -813.38427734375,
"logps/rejected": -1158.912353515625,
"loss": 0.1947,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -3.203073024749756,
"rewards/margins": 3.7714266777038574,
"rewards/rejected": -6.974499702453613,
"step": 510
},
{
"epoch": 1.3765718067504964,
"grad_norm": 7.6140239843164785,
"learning_rate": 1.3310969482159296e-07,
"logits/chosen": -2.7323203086853027,
"logits/rejected": -2.720503330230713,
"logps/chosen": -822.5255737304688,
"logps/rejected": -1203.3837890625,
"loss": 0.206,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -3.2711524963378906,
"rewards/margins": 4.040999889373779,
"rewards/rejected": -7.312152862548828,
"step": 520
},
{
"epoch": 1.4030443414956983,
"grad_norm": 7.395646356776199,
"learning_rate": 1.2299898105732384e-07,
"logits/chosen": -2.664844036102295,
"logits/rejected": -2.6827051639556885,
"logps/chosen": -797.8216552734375,
"logps/rejected": -1175.84326171875,
"loss": 0.1975,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.197530746459961,
"rewards/margins": 3.875856876373291,
"rewards/rejected": -7.07338809967041,
"step": 530
},
{
"epoch": 1.4295168762409,
"grad_norm": 5.37023800702416,
"learning_rate": 1.1316089491642075e-07,
"logits/chosen": -2.6801838874816895,
"logits/rejected": -2.7150120735168457,
"logps/chosen": -782.3211669921875,
"logps/rejected": -1218.0201416015625,
"loss": 0.177,
"rewards/accuracies": 0.953125,
"rewards/chosen": -3.0446789264678955,
"rewards/margins": 4.468942642211914,
"rewards/rejected": -7.5136213302612305,
"step": 540
},
{
"epoch": 1.4559894109861018,
"grad_norm": 6.848159521074984,
"learning_rate": 1.0361655539513564e-07,
"logits/chosen": -2.7019715309143066,
"logits/rejected": -2.717634677886963,
"logps/chosen": -816.9815063476562,
"logps/rejected": -1239.1295166015625,
"loss": 0.1866,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.2783799171447754,
"rewards/margins": 4.237971305847168,
"rewards/rejected": -7.516351222991943,
"step": 550
},
{
"epoch": 1.4824619457313037,
"grad_norm": 6.1903290336235965,
"learning_rate": 9.438645091649028e-08,
"logits/chosen": -2.703291416168213,
"logits/rejected": -2.717991590499878,
"logps/chosen": -835.4637451171875,
"logps/rejected": -1262.481201171875,
"loss": 0.1941,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -3.5229759216308594,
"rewards/margins": 4.230559349060059,
"rewards/rejected": -7.753535270690918,
"step": 560
},
{
"epoch": 1.5089344804765057,
"grad_norm": 7.319396362423722,
"learning_rate": 8.549039534865979e-08,
"logits/chosen": -2.6856772899627686,
"logits/rejected": -2.7186999320983887,
"logps/chosen": -839.0086059570312,
"logps/rejected": -1260.779541015625,
"loss": 0.1939,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.4230124950408936,
"rewards/margins": 4.303016185760498,
"rewards/rejected": -7.7260284423828125,
"step": 570
},
{
"epoch": 1.5354070152217076,
"grad_norm": 6.830716936510353,
"learning_rate": 7.694748547139429e-08,
"logits/chosen": -2.690796136856079,
"logits/rejected": -2.6854541301727295,
"logps/chosen": -858.9547119140625,
"logps/rejected": -1268.125732421875,
"loss": 0.1822,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.606029510498047,
"rewards/margins": 4.35842752456665,
"rewards/rejected": -7.964457035064697,
"step": 580
},
{
"epoch": 1.5618795499669094,
"grad_norm": 7.344475965596424,
"learning_rate": 6.877605998178343e-08,
"logits/chosen": -2.6852376461029053,
"logits/rejected": -2.6851906776428223,
"logps/chosen": -825.2852783203125,
"logps/rejected": -1217.6024169921875,
"loss": 0.1937,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -3.3112761974334717,
"rewards/margins": 4.0954132080078125,
"rewards/rejected": -7.4066901206970215,
"step": 590
},
{
"epoch": 1.588352084712111,
"grad_norm": 6.774677658543177,
"learning_rate": 6.099366012736437e-08,
"logits/chosen": -2.6621475219726562,
"logits/rejected": -2.6802525520324707,
"logps/chosen": -823.2789306640625,
"logps/rejected": -1230.999755859375,
"loss": 0.1879,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.3858489990234375,
"rewards/margins": 4.153179168701172,
"rewards/rejected": -7.539028167724609,
"step": 600
},
{
"epoch": 1.614824619457313,
"grad_norm": 6.63238227242087,
"learning_rate": 5.3616992051080415e-08,
"logits/chosen": -2.677306890487671,
"logits/rejected": -2.712442636489868,
"logps/chosen": -788.0901489257812,
"logps/rejected": -1236.086181640625,
"loss": 0.1818,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.1388726234436035,
"rewards/margins": 4.399405002593994,
"rewards/rejected": -7.538276672363281,
"step": 610
},
{
"epoch": 1.641297154202515,
"grad_norm": 7.377420865927689,
"learning_rate": 4.666189092892245e-08,
"logits/chosen": -2.681398630142212,
"logits/rejected": -2.7074177265167236,
"logps/chosen": -808.8873291015625,
"logps/rejected": -1222.2945556640625,
"loss": 0.192,
"rewards/accuracies": 0.90625,
"rewards/chosen": -3.247420072555542,
"rewards/margins": 4.086552619934082,
"rewards/rejected": -7.3339738845825195,
"step": 620
},
{
"epoch": 1.6677696889477167,
"grad_norm": 8.778568268489295,
"learning_rate": 4.0143286977238345e-08,
"logits/chosen": -2.732050895690918,
"logits/rejected": -2.743983507156372,
"logps/chosen": -833.8406372070312,
"logps/rejected": -1255.4339599609375,
"loss": 0.2052,
"rewards/accuracies": 0.909375011920929,
"rewards/chosen": -3.412902355194092,
"rewards/margins": 4.319065570831299,
"rewards/rejected": -7.731966972351074,
"step": 630
},
{
"epoch": 1.6942422236929184,
"grad_norm": 9.757233734434815,
"learning_rate": 3.407517340267957e-08,
"logits/chosen": -2.704099655151367,
"logits/rejected": -2.747129440307617,
"logps/chosen": -825.1597900390625,
"logps/rejected": -1279.721923828125,
"loss": 0.1969,
"rewards/accuracies": 0.921875,
"rewards/chosen": -3.3202297687530518,
"rewards/margins": 4.6456804275512695,
"rewards/rejected": -7.965909481048584,
"step": 640
},
{
"epoch": 1.7207147584381204,
"grad_norm": 5.790336419749713,
"learning_rate": 2.847057636358663e-08,
"logits/chosen": -2.666538715362549,
"logits/rejected": -2.6833436489105225,
"logps/chosen": -791.0831298828125,
"logps/rejected": -1239.421875,
"loss": 0.2025,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.060492753982544,
"rewards/margins": 4.618899345397949,
"rewards/rejected": -7.679392337799072,
"step": 650
},
{
"epoch": 1.7471872931833223,
"grad_norm": 7.090658258583613,
"learning_rate": 2.3341527007295107e-08,
"logits/chosen": -2.724691867828369,
"logits/rejected": -2.7381081581115723,
"logps/chosen": -795.6064453125,
"logps/rejected": -1221.3892822265625,
"loss": 0.1934,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": -3.000559091567993,
"rewards/margins": 4.30582332611084,
"rewards/rejected": -7.306382656097412,
"step": 660
},
{
"epoch": 1.7736598279285243,
"grad_norm": 7.043130947329383,
"learning_rate": 1.8699035643389927e-08,
"logits/chosen": -2.6909706592559814,
"logits/rejected": -2.706798791885376,
"logps/chosen": -813.7172241210938,
"logps/rejected": -1210.0374755859375,
"loss": 0.1935,
"rewards/accuracies": 0.9375,
"rewards/chosen": -3.150881290435791,
"rewards/margins": 4.167717933654785,
"rewards/rejected": -7.318598747253418,
"step": 670
},
{
"epoch": 1.800132362673726,
"grad_norm": 8.93025020392657,
"learning_rate": 1.4553068108346778e-08,
"logits/chosen": -2.7232556343078613,
"logits/rejected": -2.758570671081543,
"logps/chosen": -816.0042724609375,
"logps/rejected": -1228.6907958984375,
"loss": 0.1969,
"rewards/accuracies": 0.9156249761581421,
"rewards/chosen": -3.222668409347534,
"rewards/margins": 4.130661487579346,
"rewards/rejected": -7.353329658508301,
"step": 680
},
{
"epoch": 1.8266048974189277,
"grad_norm": 8.105462657724475,
"learning_rate": 1.0912524372300031e-08,
"logits/chosen": -2.6958508491516113,
"logits/rejected": -2.7145791053771973,
"logps/chosen": -800.2642211914062,
"logps/rejected": -1204.597412109375,
"loss": 0.1979,
"rewards/accuracies": 0.9281250238418579,
"rewards/chosen": -3.0551960468292236,
"rewards/margins": 4.35771369934082,
"rewards/rejected": -7.412909507751465,
"step": 690
},
{
"epoch": 1.8530774321641297,
"grad_norm": 6.305134676239398,
"learning_rate": 7.785219433859846e-09,
"logits/chosen": -2.666156768798828,
"logits/rejected": -2.6787142753601074,
"logps/chosen": -795.4716186523438,
"logps/rejected": -1216.070068359375,
"loss": 0.1889,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": -3.1992480754852295,
"rewards/margins": 4.258774757385254,
"rewards/rejected": -7.4580230712890625,
"step": 700
},
{
"epoch": 1.8795499669093316,
"grad_norm": 8.35714287832233,
"learning_rate": 5.177866543990689e-09,
"logits/chosen": -2.6713449954986572,
"logits/rejected": -2.7099664211273193,
"logps/chosen": -809.7697143554688,
"logps/rejected": -1260.0615234375,
"loss": 0.1768,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.31097412109375,
"rewards/margins": 4.475188255310059,
"rewards/rejected": -7.786163330078125,
"step": 710
},
{
"epoch": 1.9060225016545336,
"grad_norm": 6.357842566297534,
"learning_rate": 3.0960627949644105e-09,
"logits/chosen": -2.661090850830078,
"logits/rejected": -2.6854565143585205,
"logps/chosen": -799.8544921875,
"logps/rejected": -1197.60595703125,
"loss": 0.195,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -3.179225206375122,
"rewards/margins": 4.040897846221924,
"rewards/rejected": -7.220122337341309,
"step": 720
},
{
"epoch": 1.9324950363997353,
"grad_norm": 6.897936781273812,
"learning_rate": 1.5442771053230663e-09,
"logits/chosen": -2.654930353164673,
"logits/rejected": -2.6900641918182373,
"logps/chosen": -794.2493286132812,
"logps/rejected": -1218.8597412109375,
"loss": 0.1752,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.182406187057495,
"rewards/margins": 4.386531352996826,
"rewards/rejected": -7.5689377784729,
"step": 730
},
{
"epoch": 1.958967571144937,
"grad_norm": 6.958740857504488,
"learning_rate": 5.25840626643681e-10,
"logits/chosen": -2.731703519821167,
"logits/rejected": -2.728083848953247,
"logps/chosen": -804.791259765625,
"logps/rejected": -1165.920654296875,
"loss": 0.1871,
"rewards/accuracies": 0.8968750238418579,
"rewards/chosen": -3.141540765762329,
"rewards/margins": 3.8939075469970703,
"rewards/rejected": -7.035449028015137,
"step": 740
},
{
"epoch": 1.985440105890139,
"grad_norm": 7.712958898241185,
"learning_rate": 4.293959269863201e-11,
"logits/chosen": -2.6934590339660645,
"logits/rejected": -2.702648639678955,
"logps/chosen": -818.2516479492188,
"logps/rejected": -1221.8408203125,
"loss": 0.2042,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -3.207921266555786,
"rewards/margins": 4.332727909088135,
"rewards/rejected": -7.5406494140625,
"step": 750
},
{
"epoch": 1.9960291197882198,
"step": 754,
"total_flos": 0.0,
"train_loss": 0.26563876598520053,
"train_runtime": 5267.2293,
"train_samples_per_second": 36.699,
"train_steps_per_second": 0.143
}
],
"logging_steps": 10,
"max_steps": 754,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}