{ "best_global_step": 150, "best_metric": 0.8634624481201172, "best_model_checkpoint": "lora_deception_model/checkpoint-100", "epoch": 5.0, "eval_steps": 50, "global_step": 745, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6688848555088043, "epoch": 0.06717044500419815, "grad_norm": 0.3304958939552307, "learning_rate": 7.2000000000000005e-06, "loss": 0.8658, "mean_token_accuracy": 0.8207968935370445, "num_tokens": 63133.0, "step": 10 }, { "entropy": 0.6755495790392161, "epoch": 0.1343408900083963, "grad_norm": 0.32398757338523865, "learning_rate": 1.5200000000000002e-05, "loss": 0.8662, "mean_token_accuracy": 0.8197331622242927, "num_tokens": 124489.0, "step": 20 }, { "entropy": 0.734241085499525, "epoch": 0.20151133501259447, "grad_norm": 0.31472697854042053, "learning_rate": 2.32e-05, "loss": 0.7134, "mean_token_accuracy": 0.8222098298370838, "num_tokens": 189081.0, "step": 30 }, { "entropy": 0.8794498354196548, "epoch": 0.2686817800167926, "grad_norm": 0.23867088556289673, "learning_rate": 3.1200000000000006e-05, "loss": 0.6577, "mean_token_accuracy": 0.820378091186285, "num_tokens": 270096.0, "step": 40 }, { "entropy": 0.9822202995419502, "epoch": 0.33585222502099077, "grad_norm": 0.23442600667476654, "learning_rate": 3.9200000000000004e-05, "loss": 0.5764, "mean_token_accuracy": 0.8329788409173489, "num_tokens": 343034.0, "step": 50 }, { "epoch": 0.33585222502099077, "eval_entropy": 1.35109115924154, "eval_loss": 0.9188798666000366, "eval_mean_token_accuracy": 0.7514589822718075, "eval_num_tokens": 343034.0, "eval_runtime": 5.1522, "eval_samples_per_second": 10.869, "eval_steps_per_second": 10.869, "step": 50 }, { "entropy": 0.8484722189605236, "epoch": 0.40302267002518893, "grad_norm": 0.27580252289772034, "learning_rate": 3.948201438848921e-05, "loss": 0.4939, "mean_token_accuracy": 0.8525501079857349, "num_tokens": 407346.0, "step": 60 }, { "entropy": 0.7665611129254103, "epoch": 0.47019311502938704, "grad_norm": 0.32506343722343445, "learning_rate": 3.8906474820143886e-05, "loss": 0.4095, "mean_token_accuracy": 0.8701820805668831, "num_tokens": 466070.0, "step": 70 }, { "entropy": 0.7331306278705597, "epoch": 0.5373635600335852, "grad_norm": 0.3568798899650574, "learning_rate": 3.833093525179856e-05, "loss": 0.448, "mean_token_accuracy": 0.863949628919363, "num_tokens": 536560.0, "step": 80 }, { "entropy": 0.7565343648195266, "epoch": 0.6045340050377834, "grad_norm": 0.3249973952770233, "learning_rate": 3.775539568345324e-05, "loss": 0.4065, "mean_token_accuracy": 0.8672126568853855, "num_tokens": 597642.0, "step": 90 }, { "entropy": 0.6388500925153494, "epoch": 0.6717044500419815, "grad_norm": 0.36163443326950073, "learning_rate": 3.7179856115107914e-05, "loss": 0.3693, "mean_token_accuracy": 0.8828025683760643, "num_tokens": 663210.0, "step": 100 }, { "epoch": 0.6717044500419815, "eval_entropy": 1.0956847156797136, "eval_loss": 0.8651557564735413, "eval_mean_token_accuracy": 0.7637390036668096, "eval_num_tokens": 663210.0, "eval_runtime": 4.9833, "eval_samples_per_second": 11.237, "eval_steps_per_second": 11.237, "step": 100 }, { "entropy": 0.6128827821463346, "epoch": 0.7388748950461796, "grad_norm": 0.29096513986587524, "learning_rate": 3.6604316546762595e-05, "loss": 0.3719, "mean_token_accuracy": 0.8851677082479, "num_tokens": 726617.0, "step": 110 }, { "entropy": 0.6564031228423118, "epoch": 0.8060453400503779, "grad_norm": 0.27515140175819397, "learning_rate": 3.602877697841727e-05, "loss": 0.3935, "mean_token_accuracy": 0.8752141006290912, "num_tokens": 800821.0, "step": 120 }, { "entropy": 0.6125180713832379, "epoch": 0.873215785054576, "grad_norm": 0.3126560151576996, "learning_rate": 3.545323741007194e-05, "loss": 0.3847, "mean_token_accuracy": 0.8818505816161633, "num_tokens": 868710.0, "step": 130 }, { "entropy": 0.6347699824720621, "epoch": 0.9403862300587741, "grad_norm": 0.23533445596694946, "learning_rate": 3.487769784172662e-05, "loss": 0.385, "mean_token_accuracy": 0.8835290059447288, "num_tokens": 935575.0, "step": 140 }, { "entropy": 0.579044715513157, "epoch": 1.0067170445004199, "grad_norm": 0.2643934488296509, "learning_rate": 3.43021582733813e-05, "loss": 0.3224, "mean_token_accuracy": 0.8946574262425869, "num_tokens": 996466.0, "step": 150 }, { "epoch": 1.0067170445004199, "eval_entropy": 0.9877308234572411, "eval_loss": 0.8634624481201172, "eval_mean_token_accuracy": 0.7611759411437171, "eval_num_tokens": 996466.0, "eval_runtime": 4.9854, "eval_samples_per_second": 11.233, "eval_steps_per_second": 11.233, "step": 150 }, { "entropy": 0.552555637806654, "epoch": 1.073887489504618, "grad_norm": 0.2633912265300751, "learning_rate": 3.372661870503598e-05, "loss": 0.3265, "mean_token_accuracy": 0.8968047671020031, "num_tokens": 1066789.0, "step": 160 }, { "entropy": 0.5594113569706678, "epoch": 1.141057934508816, "grad_norm": 0.2680589258670807, "learning_rate": 3.315107913669065e-05, "loss": 0.3519, "mean_token_accuracy": 0.8886663600802421, "num_tokens": 1138401.0, "step": 170 }, { "entropy": 0.5540055774152279, "epoch": 1.2082283795130142, "grad_norm": 0.27169251441955566, "learning_rate": 3.2575539568345325e-05, "loss": 0.3436, "mean_token_accuracy": 0.8913260444998741, "num_tokens": 1202735.0, "step": 180 }, { "entropy": 0.5612262919545173, "epoch": 1.2753988245172123, "grad_norm": 0.3151009976863861, "learning_rate": 3.2000000000000005e-05, "loss": 0.351, "mean_token_accuracy": 0.8909851491451264, "num_tokens": 1266476.0, "step": 190 }, { "entropy": 0.5354872807860375, "epoch": 1.3425692695214106, "grad_norm": 0.29847052693367004, "learning_rate": 3.142446043165468e-05, "loss": 0.3045, "mean_token_accuracy": 0.9023352213203907, "num_tokens": 1327389.0, "step": 200 }, { "epoch": 1.3425692695214106, "eval_entropy": 0.9606236421636173, "eval_loss": 0.8751164078712463, "eval_mean_token_accuracy": 0.7617330902389118, "eval_num_tokens": 1327389.0, "eval_runtime": 4.9901, "eval_samples_per_second": 11.222, "eval_steps_per_second": 11.222, "step": 200 }, { "entropy": 0.5093934688717127, "epoch": 1.4097397145256088, "grad_norm": 0.31276237964630127, "learning_rate": 3.084892086330935e-05, "loss": 0.2882, "mean_token_accuracy": 0.9080218151211739, "num_tokens": 1390647.0, "step": 210 }, { "entropy": 0.49913822263479235, "epoch": 1.4769101595298069, "grad_norm": 0.2638222575187683, "learning_rate": 3.027338129496403e-05, "loss": 0.32, "mean_token_accuracy": 0.9011362835764885, "num_tokens": 1458413.0, "step": 220 }, { "entropy": 0.5291724860668182, "epoch": 1.5440806045340052, "grad_norm": 0.7974303960800171, "learning_rate": 2.9697841726618707e-05, "loss": 0.3437, "mean_token_accuracy": 0.8957020588219166, "num_tokens": 1529069.0, "step": 230 }, { "entropy": 0.5148157492280007, "epoch": 1.611251049538203, "grad_norm": 0.3252256512641907, "learning_rate": 2.9122302158273384e-05, "loss": 0.3078, "mean_token_accuracy": 0.8965719811618328, "num_tokens": 1593891.0, "step": 240 }, { "entropy": 0.5439612463116645, "epoch": 1.6784214945424014, "grad_norm": 0.3167729079723358, "learning_rate": 2.854676258992806e-05, "loss": 0.3331, "mean_token_accuracy": 0.8926876865327358, "num_tokens": 1658563.0, "step": 250 }, { "epoch": 1.6784214945424014, "eval_entropy": 0.8733608674790178, "eval_loss": 0.8773566484451294, "eval_mean_token_accuracy": 0.7638165514383998, "eval_num_tokens": 1658563.0, "eval_runtime": 4.9846, "eval_samples_per_second": 11.235, "eval_steps_per_second": 11.235, "step": 250 }, { "entropy": 0.5023419760167599, "epoch": 1.7455919395465995, "grad_norm": 0.30670592188835144, "learning_rate": 2.7971223021582735e-05, "loss": 0.3503, "mean_token_accuracy": 0.8920910514891147, "num_tokens": 1735767.0, "step": 260 }, { "entropy": 0.5167030651122332, "epoch": 1.8127623845507976, "grad_norm": 0.25313618779182434, "learning_rate": 2.7395683453237412e-05, "loss": 0.3242, "mean_token_accuracy": 0.8931105189025402, "num_tokens": 1802895.0, "step": 270 }, { "entropy": 0.5199910126626491, "epoch": 1.879932829554996, "grad_norm": 0.3637166917324066, "learning_rate": 2.6820143884892086e-05, "loss": 0.3145, "mean_token_accuracy": 0.8975049994885922, "num_tokens": 1865468.0, "step": 280 }, { "entropy": 0.5213575050234794, "epoch": 1.9471032745591939, "grad_norm": 0.3701685965061188, "learning_rate": 2.6244604316546767e-05, "loss": 0.3398, "mean_token_accuracy": 0.896855977922678, "num_tokens": 1932031.0, "step": 290 }, { "entropy": 0.4997344066070605, "epoch": 2.0134340890008398, "grad_norm": 0.331307590007782, "learning_rate": 2.566906474820144e-05, "loss": 0.2637, "mean_token_accuracy": 0.9088972090165827, "num_tokens": 1992713.0, "step": 300 }, { "epoch": 2.0134340890008398, "eval_entropy": 0.9004479699901172, "eval_loss": 0.8707331418991089, "eval_mean_token_accuracy": 0.7641822940536908, "eval_num_tokens": 1992713.0, "eval_runtime": 4.9809, "eval_samples_per_second": 11.243, "eval_steps_per_second": 11.243, "step": 300 }, { "entropy": 0.47825686521828176, "epoch": 2.0806045340050376, "grad_norm": 0.3079531192779541, "learning_rate": 2.5093525179856118e-05, "loss": 0.2863, "mean_token_accuracy": 0.908924800157547, "num_tokens": 2061833.0, "step": 310 }, { "entropy": 0.48984590619802476, "epoch": 2.147774979009236, "grad_norm": 0.2569948434829712, "learning_rate": 2.451798561151079e-05, "loss": 0.3162, "mean_token_accuracy": 0.903419229388237, "num_tokens": 2132082.0, "step": 320 }, { "entropy": 0.46938193738460543, "epoch": 2.2149454240134343, "grad_norm": 0.29792118072509766, "learning_rate": 2.394244604316547e-05, "loss": 0.2943, "mean_token_accuracy": 0.9081865437328815, "num_tokens": 2202270.0, "step": 330 }, { "entropy": 0.4853254608809948, "epoch": 2.282115869017632, "grad_norm": 0.26585960388183594, "learning_rate": 2.336690647482015e-05, "loss": 0.2682, "mean_token_accuracy": 0.9112951382994652, "num_tokens": 2263789.0, "step": 340 }, { "entropy": 0.49189864136278627, "epoch": 2.3492863140218305, "grad_norm": 0.2741142213344574, "learning_rate": 2.2791366906474823e-05, "loss": 0.313, "mean_token_accuracy": 0.8986666277050972, "num_tokens": 2334071.0, "step": 350 }, { "epoch": 2.3492863140218305, "eval_entropy": 0.8618932158819267, "eval_loss": 0.8953577280044556, "eval_mean_token_accuracy": 0.7628704607486725, "eval_num_tokens": 2334071.0, "eval_runtime": 4.984, "eval_samples_per_second": 11.236, "eval_steps_per_second": 11.236, "step": 350 }, { "entropy": 0.49905899055302144, "epoch": 2.4164567590260284, "grad_norm": 0.3090650141239166, "learning_rate": 2.22158273381295e-05, "loss": 0.2886, "mean_token_accuracy": 0.9060709603130818, "num_tokens": 2396966.0, "step": 360 }, { "entropy": 0.49515552036464217, "epoch": 2.4836272040302267, "grad_norm": 0.3477407991886139, "learning_rate": 2.1640287769784174e-05, "loss": 0.2685, "mean_token_accuracy": 0.9104222223162651, "num_tokens": 2457974.0, "step": 370 }, { "entropy": 0.5188349276781082, "epoch": 2.5507976490344246, "grad_norm": 0.2762923538684845, "learning_rate": 2.1064748201438848e-05, "loss": 0.2911, "mean_token_accuracy": 0.9037038788199425, "num_tokens": 2526344.0, "step": 380 }, { "entropy": 0.5011733949184418, "epoch": 2.617968094038623, "grad_norm": 0.2646786570549011, "learning_rate": 2.0489208633093525e-05, "loss": 0.3158, "mean_token_accuracy": 0.9090628199279308, "num_tokens": 2596395.0, "step": 390 }, { "entropy": 0.49840320982038977, "epoch": 2.6851385390428213, "grad_norm": 0.4310755431652069, "learning_rate": 1.9913669064748202e-05, "loss": 0.2887, "mean_token_accuracy": 0.9083616696298122, "num_tokens": 2657920.0, "step": 400 }, { "epoch": 2.6851385390428213, "eval_entropy": 0.8837019130587578, "eval_loss": 0.88917475938797, "eval_mean_token_accuracy": 0.763011426797935, "eval_num_tokens": 2657920.0, "eval_runtime": 4.9877, "eval_samples_per_second": 11.228, "eval_steps_per_second": 11.228, "step": 400 }, { "entropy": 0.4771813187748194, "epoch": 2.752308984047019, "grad_norm": 0.32867103815078735, "learning_rate": 1.933812949640288e-05, "loss": 0.2668, "mean_token_accuracy": 0.9088380873203278, "num_tokens": 2725640.0, "step": 410 }, { "entropy": 0.47720473557710646, "epoch": 2.8194794290512175, "grad_norm": 0.27547284960746765, "learning_rate": 1.8762589928057556e-05, "loss": 0.2725, "mean_token_accuracy": 0.9101646140217781, "num_tokens": 2789853.0, "step": 420 }, { "entropy": 0.45747280344367025, "epoch": 2.886649874055416, "grad_norm": 0.3139909505844116, "learning_rate": 1.8187050359712234e-05, "loss": 0.267, "mean_token_accuracy": 0.9148379288613796, "num_tokens": 2857093.0, "step": 430 }, { "entropy": 0.46566248275339606, "epoch": 2.9538203190596137, "grad_norm": 0.3694741427898407, "learning_rate": 1.7611510791366907e-05, "loss": 0.2995, "mean_token_accuracy": 0.901902287453413, "num_tokens": 2928087.0, "step": 440 }, { "entropy": 0.47634567490106894, "epoch": 3.0201511335012596, "grad_norm": 0.26703643798828125, "learning_rate": 1.7035971223021584e-05, "loss": 0.3174, "mean_token_accuracy": 0.9040401434596581, "num_tokens": 2993485.0, "step": 450 }, { "epoch": 3.0201511335012596, "eval_entropy": 0.8348773148443017, "eval_loss": 0.8874495625495911, "eval_mean_token_accuracy": 0.7612802609801292, "eval_num_tokens": 2993485.0, "eval_runtime": 4.9913, "eval_samples_per_second": 11.22, "eval_steps_per_second": 11.22, "step": 450 }, { "entropy": 0.44407146945595743, "epoch": 3.0873215785054575, "grad_norm": 0.27298107743263245, "learning_rate": 1.646043165467626e-05, "loss": 0.2888, "mean_token_accuracy": 0.9176116026937962, "num_tokens": 3062721.0, "step": 460 }, { "entropy": 0.4276166781783104, "epoch": 3.154492023509656, "grad_norm": 0.2755906879901886, "learning_rate": 1.5884892086330935e-05, "loss": 0.2419, "mean_token_accuracy": 0.9195634700357914, "num_tokens": 3129709.0, "step": 470 }, { "entropy": 0.4346423916518688, "epoch": 3.2216624685138537, "grad_norm": 0.39296212792396545, "learning_rate": 1.5309352517985613e-05, "loss": 0.233, "mean_token_accuracy": 0.9278727151453495, "num_tokens": 3189161.0, "step": 480 }, { "entropy": 0.44294624738395216, "epoch": 3.288832913518052, "grad_norm": 0.39035606384277344, "learning_rate": 1.473381294964029e-05, "loss": 0.2781, "mean_token_accuracy": 0.9181225150823593, "num_tokens": 3255403.0, "step": 490 }, { "entropy": 0.4794738654047251, "epoch": 3.3560033585222504, "grad_norm": 0.3938174545764923, "learning_rate": 1.4158273381294965e-05, "loss": 0.2942, "mean_token_accuracy": 0.9075012236833573, "num_tokens": 3320554.0, "step": 500 }, { "epoch": 3.3560033585222504, "eval_entropy": 0.817679978374924, "eval_loss": 0.9140912294387817, "eval_mean_token_accuracy": 0.761023474591119, "eval_num_tokens": 3320554.0, "eval_runtime": 4.9833, "eval_samples_per_second": 11.238, "eval_steps_per_second": 11.238, "step": 500 }, { "entropy": 0.4701398782432079, "epoch": 3.4231738035264483, "grad_norm": 0.4412434697151184, "learning_rate": 1.3582733812949642e-05, "loss": 0.2707, "mean_token_accuracy": 0.9106249861419201, "num_tokens": 3384303.0, "step": 510 }, { "entropy": 0.45122345685958865, "epoch": 3.4903442485306466, "grad_norm": 0.26691338419914246, "learning_rate": 1.3007194244604318e-05, "loss": 0.258, "mean_token_accuracy": 0.9156571164727211, "num_tokens": 3451764.0, "step": 520 }, { "entropy": 0.4313320998102427, "epoch": 3.5575146935348445, "grad_norm": 0.34582847356796265, "learning_rate": 1.2431654676258993e-05, "loss": 0.2612, "mean_token_accuracy": 0.916472752392292, "num_tokens": 3522461.0, "step": 530 }, { "entropy": 0.43578826524317266, "epoch": 3.624685138539043, "grad_norm": 0.4498523771762848, "learning_rate": 1.185611510791367e-05, "loss": 0.2834, "mean_token_accuracy": 0.9140046447515487, "num_tokens": 3591502.0, "step": 540 }, { "entropy": 0.4625163245946169, "epoch": 3.6918555835432407, "grad_norm": 0.3399084806442261, "learning_rate": 1.1280575539568346e-05, "loss": 0.2906, "mean_token_accuracy": 0.9062492586672306, "num_tokens": 3659203.0, "step": 550 }, { "epoch": 3.6918555835432407, "eval_entropy": 0.8242657040911061, "eval_loss": 0.9106144905090332, "eval_mean_token_accuracy": 0.7601779720612934, "eval_num_tokens": 3659203.0, "eval_runtime": 4.9925, "eval_samples_per_second": 11.217, "eval_steps_per_second": 11.217, "step": 550 }, { "entropy": 0.44798229187726973, "epoch": 3.759026028547439, "grad_norm": 0.28008008003234863, "learning_rate": 1.0705035971223023e-05, "loss": 0.2947, "mean_token_accuracy": 0.9083551168441772, "num_tokens": 3735002.0, "step": 560 }, { "entropy": 0.438993413746357, "epoch": 3.8261964735516374, "grad_norm": 0.33622145652770996, "learning_rate": 1.0129496402877699e-05, "loss": 0.2536, "mean_token_accuracy": 0.9168887868523597, "num_tokens": 3798345.0, "step": 570 }, { "entropy": 0.43377807587385175, "epoch": 3.8933669185558353, "grad_norm": 0.41980335116386414, "learning_rate": 9.553956834532376e-06, "loss": 0.2414, "mean_token_accuracy": 0.925646186619997, "num_tokens": 3859139.0, "step": 580 }, { "entropy": 0.43494608998298645, "epoch": 3.9605373635600336, "grad_norm": 0.29723235964775085, "learning_rate": 8.978417266187051e-06, "loss": 0.2619, "mean_token_accuracy": 0.9199615843594074, "num_tokens": 3923018.0, "step": 590 }, { "entropy": 0.42559608741651606, "epoch": 4.0268681780016795, "grad_norm": 0.32702481746673584, "learning_rate": 8.402877697841727e-06, "loss": 0.2393, "mean_token_accuracy": 0.9227683498889585, "num_tokens": 3985896.0, "step": 600 }, { "epoch": 4.0268681780016795, "eval_entropy": 0.790077348372766, "eval_loss": 0.929413378238678, "eval_mean_token_accuracy": 0.7607843939747129, "eval_num_tokens": 3985896.0, "eval_runtime": 4.9875, "eval_samples_per_second": 11.228, "eval_steps_per_second": 11.228, "step": 600 }, { "entropy": 0.4118167482316494, "epoch": 4.094038623005877, "grad_norm": 0.3099454641342163, "learning_rate": 7.827338129496404e-06, "loss": 0.2197, "mean_token_accuracy": 0.9286121532320977, "num_tokens": 4046640.0, "step": 610 }, { "entropy": 0.42818755134940145, "epoch": 4.161209068010075, "grad_norm": 0.3183671236038208, "learning_rate": 7.25179856115108e-06, "loss": 0.2681, "mean_token_accuracy": 0.9148878164589405, "num_tokens": 4115433.0, "step": 620 }, { "entropy": 0.4235291346907616, "epoch": 4.228379513014274, "grad_norm": 0.3553420603275299, "learning_rate": 6.6762589928057564e-06, "loss": 0.2386, "mean_token_accuracy": 0.9219062335789203, "num_tokens": 4177541.0, "step": 630 }, { "entropy": 0.43433762453496455, "epoch": 4.295549958018472, "grad_norm": 0.47075527906417847, "learning_rate": 6.100719424460432e-06, "loss": 0.2696, "mean_token_accuracy": 0.9177930898964405, "num_tokens": 4248216.0, "step": 640 }, { "entropy": 0.43204528763890265, "epoch": 4.36272040302267, "grad_norm": 0.34699931740760803, "learning_rate": 5.525179856115108e-06, "loss": 0.2697, "mean_token_accuracy": 0.9196366496384144, "num_tokens": 4319252.0, "step": 650 }, { "epoch": 4.36272040302267, "eval_entropy": 0.7843316655073848, "eval_loss": 0.9468327164649963, "eval_mean_token_accuracy": 0.7589835811938558, "eval_num_tokens": 4319252.0, "eval_runtime": 4.9823, "eval_samples_per_second": 11.24, "eval_steps_per_second": 11.24, "step": 650 }, { "entropy": 0.4113185711205006, "epoch": 4.429890848026869, "grad_norm": 0.3835369050502777, "learning_rate": 4.9496402877697845e-06, "loss": 0.2489, "mean_token_accuracy": 0.9217935107648373, "num_tokens": 4389439.0, "step": 660 }, { "entropy": 0.4268360245972872, "epoch": 4.4970612930310665, "grad_norm": 0.32893145084381104, "learning_rate": 4.374100719424461e-06, "loss": 0.2546, "mean_token_accuracy": 0.9204200245440006, "num_tokens": 4454296.0, "step": 670 }, { "entropy": 0.43092771619558334, "epoch": 4.564231738035264, "grad_norm": 0.35402655601501465, "learning_rate": 3.798561151079137e-06, "loss": 0.2528, "mean_token_accuracy": 0.9206391289830208, "num_tokens": 4516174.0, "step": 680 }, { "entropy": 0.40939035564661025, "epoch": 4.631402183039462, "grad_norm": 0.32696768641471863, "learning_rate": 3.223021582733813e-06, "loss": 0.2567, "mean_token_accuracy": 0.9263360798358917, "num_tokens": 4586677.0, "step": 690 }, { "entropy": 0.42081440389156344, "epoch": 4.698572628043661, "grad_norm": 0.3911837637424469, "learning_rate": 2.6474820143884894e-06, "loss": 0.21, "mean_token_accuracy": 0.9308864399790764, "num_tokens": 4643628.0, "step": 700 }, { "epoch": 4.698572628043661, "eval_entropy": 0.7831281987684113, "eval_loss": 0.9461870789527893, "eval_mean_token_accuracy": 0.7588864458458764, "eval_num_tokens": 4643628.0, "eval_runtime": 4.9942, "eval_samples_per_second": 11.213, "eval_steps_per_second": 11.213, "step": 700 }, { "entropy": 0.43728532269597054, "epoch": 4.765743073047859, "grad_norm": 0.30957573652267456, "learning_rate": 2.0719424460431657e-06, "loss": 0.2575, "mean_token_accuracy": 0.9134377054870129, "num_tokens": 4712648.0, "step": 710 }, { "entropy": 0.4240421138703823, "epoch": 4.832913518052057, "grad_norm": 0.4323979318141937, "learning_rate": 1.4964028776978418e-06, "loss": 0.2681, "mean_token_accuracy": 0.9151543125510215, "num_tokens": 4786100.0, "step": 720 }, { "entropy": 0.42574540376663206, "epoch": 4.900083963056256, "grad_norm": 0.41679295897483826, "learning_rate": 9.208633093525181e-07, "loss": 0.2643, "mean_token_accuracy": 0.9235311761498451, "num_tokens": 4853429.0, "step": 730 }, { "entropy": 0.42149149887263776, "epoch": 4.9672544080604535, "grad_norm": 0.3193954825401306, "learning_rate": 3.4532374100719426e-07, "loss": 0.2576, "mean_token_accuracy": 0.9235921517014504, "num_tokens": 4921420.0, "step": 740 } ], "logging_steps": 10, "max_steps": 745, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.54582046359552e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }