| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.09758477677482313, | |
| "eval_steps": 10, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00024396194193705782, | |
| "grad_norm": 4.8585286140441895, | |
| "learning_rate": 2.4999420463141455e-07, | |
| "loss": 2.9081, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00024396194193705782, | |
| "eval_loss": 2.639136552810669, | |
| "eval_runtime": 157.6053, | |
| "eval_samples_per_second": 1.624, | |
| "eval_steps_per_second": 0.812, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00048792388387411563, | |
| "grad_norm": 3.586596965789795, | |
| "learning_rate": 2.4998840671678217e-07, | |
| "loss": 2.4085, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0007318858258111735, | |
| "grad_norm": 4.514856815338135, | |
| "learning_rate": 2.499826062544247e-07, | |
| "loss": 2.867, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0009758477677482313, | |
| "grad_norm": 3.343158483505249, | |
| "learning_rate": 2.4997680324266246e-07, | |
| "loss": 2.5093, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0012198097096852891, | |
| "grad_norm": 4.163078784942627, | |
| "learning_rate": 2.499709976798144e-07, | |
| "loss": 2.9917, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.001463771651622347, | |
| "grad_norm": 4.113401889801025, | |
| "learning_rate": 2.4996518956419777e-07, | |
| "loss": 2.8629, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0017077335935594047, | |
| "grad_norm": 2.110043525695801, | |
| "learning_rate": 2.499593788941286e-07, | |
| "loss": 2.3666, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0019516955354964625, | |
| "grad_norm": 3.960318088531494, | |
| "learning_rate": 2.499535656679212e-07, | |
| "loss": 2.6438, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0021956574774335204, | |
| "grad_norm": 3.959432601928711, | |
| "learning_rate": 2.499477498838886e-07, | |
| "loss": 2.6457, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0024396194193705783, | |
| "grad_norm": 2.219346523284912, | |
| "learning_rate": 2.4994193154034227e-07, | |
| "loss": 2.3086, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0024396194193705783, | |
| "eval_loss": 2.3810582160949707, | |
| "eval_runtime": 157.7847, | |
| "eval_samples_per_second": 1.622, | |
| "eval_steps_per_second": 0.811, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.002683581361307636, | |
| "grad_norm": 2.600377082824707, | |
| "learning_rate": 2.499361106355922e-07, | |
| "loss": 2.3537, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.002927543303244694, | |
| "grad_norm": 3.251347303390503, | |
| "learning_rate": 2.499302871679468e-07, | |
| "loss": 2.483, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0031715052451817514, | |
| "grad_norm": 2.1139895915985107, | |
| "learning_rate": 2.4992446113571303e-07, | |
| "loss": 2.288, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0034154671871188093, | |
| "grad_norm": 3.138744592666626, | |
| "learning_rate": 2.4991863253719657e-07, | |
| "loss": 2.4845, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.003659429129055867, | |
| "grad_norm": 2.0805656909942627, | |
| "learning_rate": 2.4991280137070126e-07, | |
| "loss": 2.2727, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.003903391070992925, | |
| "grad_norm": 2.57004714012146, | |
| "learning_rate": 2.499069676345297e-07, | |
| "loss": 2.3858, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.004147353012929983, | |
| "grad_norm": 1.8521772623062134, | |
| "learning_rate": 2.499011313269829e-07, | |
| "loss": 2.2256, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.004391314954867041, | |
| "grad_norm": 2.250250816345215, | |
| "learning_rate": 2.498952924463603e-07, | |
| "loss": 2.375, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.004635276896804099, | |
| "grad_norm": 2.7878353595733643, | |
| "learning_rate": 2.498894509909601e-07, | |
| "loss": 2.0609, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0048792388387411565, | |
| "grad_norm": 2.4599826335906982, | |
| "learning_rate": 2.4988360695907864e-07, | |
| "loss": 2.1944, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0048792388387411565, | |
| "eval_loss": 2.1548237800598145, | |
| "eval_runtime": 157.9, | |
| "eval_samples_per_second": 1.621, | |
| "eval_steps_per_second": 0.811, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.005123200780678214, | |
| "grad_norm": 2.118277072906494, | |
| "learning_rate": 2.49877760349011e-07, | |
| "loss": 2.1107, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.005367162722615272, | |
| "grad_norm": 1.5559130907058716, | |
| "learning_rate": 2.498719111590508e-07, | |
| "loss": 1.992, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.00561112466455233, | |
| "grad_norm": 2.998913049697876, | |
| "learning_rate": 2.498660593874899e-07, | |
| "loss": 2.2592, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.005855086606489388, | |
| "grad_norm": 1.370886206626892, | |
| "learning_rate": 2.4986020503261886e-07, | |
| "loss": 2.0988, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.006099048548426446, | |
| "grad_norm": 1.2692762613296509, | |
| "learning_rate": 2.498543480927266e-07, | |
| "loss": 2.1908, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.006343010490363503, | |
| "grad_norm": 1.6744440793991089, | |
| "learning_rate": 2.4984848856610065e-07, | |
| "loss": 2.2077, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.006586972432300561, | |
| "grad_norm": 1.3982892036437988, | |
| "learning_rate": 2.4984262645102706e-07, | |
| "loss": 2.2539, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.006830934374237619, | |
| "grad_norm": 1.3442888259887695, | |
| "learning_rate": 2.4983676174579014e-07, | |
| "loss": 2.2487, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0070748963161746765, | |
| "grad_norm": 1.1121150255203247, | |
| "learning_rate": 2.498308944486729e-07, | |
| "loss": 2.024, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.007318858258111734, | |
| "grad_norm": 1.4833574295043945, | |
| "learning_rate": 2.4982502455795676e-07, | |
| "loss": 2.107, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007318858258111734, | |
| "eval_loss": 2.051649570465088, | |
| "eval_runtime": 158.0175, | |
| "eval_samples_per_second": 1.62, | |
| "eval_steps_per_second": 0.81, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007562820200048792, | |
| "grad_norm": 1.5546934604644775, | |
| "learning_rate": 2.498191520719216e-07, | |
| "loss": 2.151, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.00780678214198585, | |
| "grad_norm": 1.101186752319336, | |
| "learning_rate": 2.4981327698884575e-07, | |
| "loss": 2.0822, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.008050744083922909, | |
| "grad_norm": 1.13623046875, | |
| "learning_rate": 2.498073993070061e-07, | |
| "loss": 2.0729, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.008294706025859966, | |
| "grad_norm": 1.3326915502548218, | |
| "learning_rate": 2.49801519024678e-07, | |
| "loss": 2.2334, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.008538667967797023, | |
| "grad_norm": 1.1969497203826904, | |
| "learning_rate": 2.497956361401352e-07, | |
| "loss": 2.1631, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.008782629909734082, | |
| "grad_norm": 1.0180652141571045, | |
| "learning_rate": 2.4978975065165004e-07, | |
| "loss": 2.0552, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.009026591851671139, | |
| "grad_norm": 1.7680776119232178, | |
| "learning_rate": 2.497838625574932e-07, | |
| "loss": 2.2854, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.009270553793608197, | |
| "grad_norm": 1.048871397972107, | |
| "learning_rate": 2.497779718559339e-07, | |
| "loss": 2.27, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.009514515735545254, | |
| "grad_norm": 1.0272551774978638, | |
| "learning_rate": 2.497720785452398e-07, | |
| "loss": 1.9276, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.009758477677482313, | |
| "grad_norm": 0.9949386119842529, | |
| "learning_rate": 2.497661826236771e-07, | |
| "loss": 2.1643, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.009758477677482313, | |
| "eval_loss": 1.9904688596725464, | |
| "eval_runtime": 157.9911, | |
| "eval_samples_per_second": 1.62, | |
| "eval_steps_per_second": 0.81, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01000243961941937, | |
| "grad_norm": 1.153521180152893, | |
| "learning_rate": 2.497602840895103e-07, | |
| "loss": 2.0555, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.010246401561356429, | |
| "grad_norm": 1.1365783214569092, | |
| "learning_rate": 2.4975438294100266e-07, | |
| "loss": 1.9699, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.010490363503293486, | |
| "grad_norm": 1.3392469882965088, | |
| "learning_rate": 2.497484791764155e-07, | |
| "loss": 2.1889, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.010734325445230545, | |
| "grad_norm": 1.1810263395309448, | |
| "learning_rate": 2.4974257279400897e-07, | |
| "loss": 1.9938, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.010978287387167602, | |
| "grad_norm": 0.8270505666732788, | |
| "learning_rate": 2.497366637920414e-07, | |
| "loss": 2.1701, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.01122224932910466, | |
| "grad_norm": 1.1721283197402954, | |
| "learning_rate": 2.497307521687697e-07, | |
| "loss": 2.0702, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.011466211271041717, | |
| "grad_norm": 0.8560613989830017, | |
| "learning_rate": 2.497248379224492e-07, | |
| "loss": 2.0357, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.011710173212978776, | |
| "grad_norm": 2.072547674179077, | |
| "learning_rate": 2.497189210513339e-07, | |
| "loss": 2.1774, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.011954135154915833, | |
| "grad_norm": 1.9676735401153564, | |
| "learning_rate": 2.497130015536758e-07, | |
| "loss": 2.1073, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.012198097096852892, | |
| "grad_norm": 0.868861198425293, | |
| "learning_rate": 2.497070794277257e-07, | |
| "loss": 2.0378, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012198097096852892, | |
| "eval_loss": 1.958860993385315, | |
| "eval_runtime": 157.3873, | |
| "eval_samples_per_second": 1.627, | |
| "eval_steps_per_second": 0.813, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012442059038789949, | |
| "grad_norm": 1.0588116645812988, | |
| "learning_rate": 2.497011546717327e-07, | |
| "loss": 2.1439, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.012686020980727006, | |
| "grad_norm": 0.9421451687812805, | |
| "learning_rate": 2.496952272839445e-07, | |
| "loss": 1.9826, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.012929982922664065, | |
| "grad_norm": 0.88938969373703, | |
| "learning_rate": 2.4968929726260705e-07, | |
| "loss": 1.9675, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.013173944864601122, | |
| "grad_norm": 0.8794369101524353, | |
| "learning_rate": 2.4968336460596485e-07, | |
| "loss": 1.9546, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.01341790680653818, | |
| "grad_norm": 0.7067832350730896, | |
| "learning_rate": 2.4967742931226075e-07, | |
| "loss": 1.8798, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.013661868748475237, | |
| "grad_norm": 1.4922388792037964, | |
| "learning_rate": 2.4967149137973625e-07, | |
| "loss": 1.9596, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.013905830690412296, | |
| "grad_norm": 0.8123573660850525, | |
| "learning_rate": 2.496655508066309e-07, | |
| "loss": 1.9043, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.014149792632349353, | |
| "grad_norm": 0.8600869178771973, | |
| "learning_rate": 2.4965960759118313e-07, | |
| "loss": 1.9608, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.014393754574286412, | |
| "grad_norm": 0.7148178219795227, | |
| "learning_rate": 2.4965366173162953e-07, | |
| "loss": 2.0545, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.014637716516223469, | |
| "grad_norm": 0.8177701234817505, | |
| "learning_rate": 2.4964771322620516e-07, | |
| "loss": 2.0236, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.014637716516223469, | |
| "eval_loss": 1.934555172920227, | |
| "eval_runtime": 157.5281, | |
| "eval_samples_per_second": 1.625, | |
| "eval_steps_per_second": 0.813, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.014881678458160527, | |
| "grad_norm": 0.6155992746353149, | |
| "learning_rate": 2.4964176207314356e-07, | |
| "loss": 2.066, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.015125640400097584, | |
| "grad_norm": 0.9341537356376648, | |
| "learning_rate": 2.496358082706767e-07, | |
| "loss": 1.9537, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.015369602342034643, | |
| "grad_norm": 1.3128167390823364, | |
| "learning_rate": 2.4962985181703483e-07, | |
| "loss": 2.0044, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.0156135642839717, | |
| "grad_norm": 1.2402898073196411, | |
| "learning_rate": 2.496238927104469e-07, | |
| "loss": 1.962, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.015857526225908757, | |
| "grad_norm": 0.8261551260948181, | |
| "learning_rate": 2.4961793094913995e-07, | |
| "loss": 2.1043, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.016101488167845818, | |
| "grad_norm": 1.3150850534439087, | |
| "learning_rate": 2.4961196653133975e-07, | |
| "loss": 2.1101, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.016345450109782875, | |
| "grad_norm": 0.5901480317115784, | |
| "learning_rate": 2.4960599945527027e-07, | |
| "loss": 1.7913, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.01658941205171993, | |
| "grad_norm": 1.4552851915359497, | |
| "learning_rate": 2.49600029719154e-07, | |
| "loss": 1.9979, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.01683337399365699, | |
| "grad_norm": 0.6188462376594543, | |
| "learning_rate": 2.495940573212118e-07, | |
| "loss": 1.759, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.017077335935594046, | |
| "grad_norm": 0.6212908029556274, | |
| "learning_rate": 2.4958808225966306e-07, | |
| "loss": 1.9251, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.017077335935594046, | |
| "eval_loss": 1.919191837310791, | |
| "eval_runtime": 157.4683, | |
| "eval_samples_per_second": 1.626, | |
| "eval_steps_per_second": 0.813, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.017321297877531106, | |
| "grad_norm": 0.6586403250694275, | |
| "learning_rate": 2.4958210453272533e-07, | |
| "loss": 2.0447, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.017565259819468163, | |
| "grad_norm": 0.6836444139480591, | |
| "learning_rate": 2.4957612413861483e-07, | |
| "loss": 2.0525, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.01780922176140522, | |
| "grad_norm": 0.7636261582374573, | |
| "learning_rate": 2.4957014107554603e-07, | |
| "loss": 2.0984, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.018053183703342277, | |
| "grad_norm": 0.5293551683425903, | |
| "learning_rate": 2.4956415534173195e-07, | |
| "loss": 1.8238, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.018297145645279338, | |
| "grad_norm": 0.5500568151473999, | |
| "learning_rate": 2.495581669353838e-07, | |
| "loss": 1.8841, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.018541107587216395, | |
| "grad_norm": 0.7883771061897278, | |
| "learning_rate": 2.4955217585471147e-07, | |
| "loss": 1.9951, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.01878506952915345, | |
| "grad_norm": 0.6567949056625366, | |
| "learning_rate": 2.495461820979229e-07, | |
| "loss": 2.0119, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.01902903147109051, | |
| "grad_norm": 0.8867214918136597, | |
| "learning_rate": 2.4954018566322477e-07, | |
| "loss": 1.8826, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.01927299341302757, | |
| "grad_norm": 0.8271172642707825, | |
| "learning_rate": 2.4953418654882195e-07, | |
| "loss": 1.9226, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.019516955354964626, | |
| "grad_norm": 0.5612655878067017, | |
| "learning_rate": 2.495281847529178e-07, | |
| "loss": 1.9987, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.019516955354964626, | |
| "eval_loss": 1.9070545434951782, | |
| "eval_runtime": 157.7755, | |
| "eval_samples_per_second": 1.623, | |
| "eval_steps_per_second": 0.811, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.019760917296901683, | |
| "grad_norm": 0.9746911525726318, | |
| "learning_rate": 2.4952218027371403e-07, | |
| "loss": 2.0771, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.02000487923883874, | |
| "grad_norm": 0.7961266040802002, | |
| "learning_rate": 2.495161731094107e-07, | |
| "loss": 1.9497, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.0202488411807758, | |
| "grad_norm": 0.5901756286621094, | |
| "learning_rate": 2.4951016325820637e-07, | |
| "loss": 1.9636, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.020492803122712858, | |
| "grad_norm": 0.572099506855011, | |
| "learning_rate": 2.4950415071829794e-07, | |
| "loss": 2.0077, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.020736765064649915, | |
| "grad_norm": 0.7444072961807251, | |
| "learning_rate": 2.4949813548788067e-07, | |
| "loss": 1.9713, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.02098072700658697, | |
| "grad_norm": 1.6917086839675903, | |
| "learning_rate": 2.4949211756514816e-07, | |
| "loss": 2.1275, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.02122468894852403, | |
| "grad_norm": 0.4941423535346985, | |
| "learning_rate": 2.494860969482926e-07, | |
| "loss": 2.0304, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.02146865089046109, | |
| "grad_norm": 0.7001515626907349, | |
| "learning_rate": 2.4948007363550424e-07, | |
| "loss": 2.0102, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.021712612832398146, | |
| "grad_norm": 0.6658152341842651, | |
| "learning_rate": 2.4947404762497197e-07, | |
| "loss": 1.6802, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.021956574774335203, | |
| "grad_norm": 0.7706289291381836, | |
| "learning_rate": 2.49468018914883e-07, | |
| "loss": 2.0452, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.021956574774335203, | |
| "eval_loss": 1.8989028930664062, | |
| "eval_runtime": 158.0707, | |
| "eval_samples_per_second": 1.62, | |
| "eval_steps_per_second": 0.81, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02220053671627226, | |
| "grad_norm": 0.4736054837703705, | |
| "learning_rate": 2.4946198750342283e-07, | |
| "loss": 1.9606, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.02244449865820932, | |
| "grad_norm": 0.6369607448577881, | |
| "learning_rate": 2.4945595338877547e-07, | |
| "loss": 1.9367, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.022688460600146378, | |
| "grad_norm": 0.780017614364624, | |
| "learning_rate": 2.494499165691231e-07, | |
| "loss": 1.8239, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.022932422542083435, | |
| "grad_norm": 1.0048651695251465, | |
| "learning_rate": 2.4944387704264644e-07, | |
| "loss": 1.851, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.02317638448402049, | |
| "grad_norm": 0.5539764165878296, | |
| "learning_rate": 2.494378348075246e-07, | |
| "loss": 1.7927, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.023420346425957552, | |
| "grad_norm": 0.5273501873016357, | |
| "learning_rate": 2.494317898619349e-07, | |
| "loss": 1.7911, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.02366430836789461, | |
| "grad_norm": 1.1313800811767578, | |
| "learning_rate": 2.4942574220405314e-07, | |
| "loss": 1.9152, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.023908270309831666, | |
| "grad_norm": 0.8607046604156494, | |
| "learning_rate": 2.4941969183205344e-07, | |
| "loss": 2.0688, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.024152232251768723, | |
| "grad_norm": 0.9859471321105957, | |
| "learning_rate": 2.494136387441083e-07, | |
| "loss": 2.0554, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.024396194193705784, | |
| "grad_norm": 0.5871405005455017, | |
| "learning_rate": 2.494075829383886e-07, | |
| "loss": 1.8362, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.024396194193705784, | |
| "eval_loss": 1.8896028995513916, | |
| "eval_runtime": 157.8345, | |
| "eval_samples_per_second": 1.622, | |
| "eval_steps_per_second": 0.811, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02464015613564284, | |
| "grad_norm": 0.5069964528083801, | |
| "learning_rate": 2.494015244130635e-07, | |
| "loss": 1.8013, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.024884118077579898, | |
| "grad_norm": 0.7139447927474976, | |
| "learning_rate": 2.493954631663007e-07, | |
| "loss": 1.8216, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.025128080019516955, | |
| "grad_norm": 0.48631080985069275, | |
| "learning_rate": 2.493893991962659e-07, | |
| "loss": 1.9325, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.02537204196145401, | |
| "grad_norm": 0.5576779842376709, | |
| "learning_rate": 2.493833325011235e-07, | |
| "loss": 2.0052, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.025616003903391072, | |
| "grad_norm": 0.6407865285873413, | |
| "learning_rate": 2.4937726307903606e-07, | |
| "loss": 1.9411, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.02585996584532813, | |
| "grad_norm": 0.7654765248298645, | |
| "learning_rate": 2.493711909281646e-07, | |
| "loss": 1.9438, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.026103927787265186, | |
| "grad_norm": 1.2607905864715576, | |
| "learning_rate": 2.493651160466685e-07, | |
| "loss": 2.0134, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.026347889729202243, | |
| "grad_norm": 0.8633036017417908, | |
| "learning_rate": 2.493590384327053e-07, | |
| "loss": 1.9775, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.026591851671139304, | |
| "grad_norm": 0.7568155527114868, | |
| "learning_rate": 2.49352958084431e-07, | |
| "loss": 1.9074, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.02683581361307636, | |
| "grad_norm": 0.5505961179733276, | |
| "learning_rate": 2.49346875e-07, | |
| "loss": 1.8467, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.02683581361307636, | |
| "eval_loss": 1.8828259706497192, | |
| "eval_runtime": 158.4116, | |
| "eval_samples_per_second": 1.616, | |
| "eval_steps_per_second": 0.808, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.027079775555013418, | |
| "grad_norm": 0.5095446109771729, | |
| "learning_rate": 2.49340789177565e-07, | |
| "loss": 1.9961, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.027323737496950475, | |
| "grad_norm": 1.7097959518432617, | |
| "learning_rate": 2.4933470061527687e-07, | |
| "loss": 1.9335, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.027567699438887535, | |
| "grad_norm": 1.0115768909454346, | |
| "learning_rate": 2.493286093112851e-07, | |
| "loss": 1.8118, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.027811661380824592, | |
| "grad_norm": 0.6412175297737122, | |
| "learning_rate": 2.493225152637374e-07, | |
| "loss": 1.9623, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.02805562332276165, | |
| "grad_norm": 0.5357053875923157, | |
| "learning_rate": 2.4931641847077963e-07, | |
| "loss": 1.8131, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.028299585264698706, | |
| "grad_norm": 0.6828150153160095, | |
| "learning_rate": 2.493103189305562e-07, | |
| "loss": 1.767, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.028543547206635766, | |
| "grad_norm": 0.5804136395454407, | |
| "learning_rate": 2.493042166412099e-07, | |
| "loss": 1.9831, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.028787509148572824, | |
| "grad_norm": 0.6375969052314758, | |
| "learning_rate": 2.492981116008816e-07, | |
| "loss": 1.9651, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.02903147109050988, | |
| "grad_norm": 0.6621755957603455, | |
| "learning_rate": 2.492920038077106e-07, | |
| "loss": 2.1064, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.029275433032446938, | |
| "grad_norm": 0.7436494827270508, | |
| "learning_rate": 2.492858932598346e-07, | |
| "loss": 1.8961, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.029275433032446938, | |
| "eval_loss": 1.8782259225845337, | |
| "eval_runtime": 158.1634, | |
| "eval_samples_per_second": 1.619, | |
| "eval_steps_per_second": 0.809, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.029519394974383995, | |
| "grad_norm": 0.5152058005332947, | |
| "learning_rate": 2.4927977995538954e-07, | |
| "loss": 1.875, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.029763356916321055, | |
| "grad_norm": 0.4640464782714844, | |
| "learning_rate": 2.4927366389250973e-07, | |
| "loss": 1.8429, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.030007318858258112, | |
| "grad_norm": 0.6126062273979187, | |
| "learning_rate": 2.4926754506932774e-07, | |
| "loss": 1.9581, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.03025128080019517, | |
| "grad_norm": 0.5338674187660217, | |
| "learning_rate": 2.4926142348397453e-07, | |
| "loss": 1.9682, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.030495242742132226, | |
| "grad_norm": 0.48220378160476685, | |
| "learning_rate": 2.492552991345792e-07, | |
| "loss": 1.9316, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.030739204684069286, | |
| "grad_norm": 1.0571016073226929, | |
| "learning_rate": 2.4924917201926936e-07, | |
| "loss": 1.9837, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.030983166626006343, | |
| "grad_norm": 0.5729621052742004, | |
| "learning_rate": 2.492430421361708e-07, | |
| "loss": 1.7242, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.0312271285679434, | |
| "grad_norm": 0.9092426896095276, | |
| "learning_rate": 2.4923690948340783e-07, | |
| "loss": 1.8327, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.03147109050988046, | |
| "grad_norm": 0.44636791944503784, | |
| "learning_rate": 2.4923077405910264e-07, | |
| "loss": 2.0464, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.031715052451817514, | |
| "grad_norm": 0.6733670830726624, | |
| "learning_rate": 2.4922463586137616e-07, | |
| "loss": 1.8564, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.031715052451817514, | |
| "eval_loss": 1.873685359954834, | |
| "eval_runtime": 158.2193, | |
| "eval_samples_per_second": 1.618, | |
| "eval_steps_per_second": 0.809, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03195901439375457, | |
| "grad_norm": 0.6245723366737366, | |
| "learning_rate": 2.4921849488834745e-07, | |
| "loss": 2.0072, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.032202976335691635, | |
| "grad_norm": 0.47369739413261414, | |
| "learning_rate": 2.4921235113813376e-07, | |
| "loss": 2.0033, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.03244693827762869, | |
| "grad_norm": 0.6961667537689209, | |
| "learning_rate": 2.492062046088508e-07, | |
| "loss": 1.8175, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.03269090021956575, | |
| "grad_norm": 0.7953224182128906, | |
| "learning_rate": 2.4920005529861254e-07, | |
| "loss": 1.8035, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.032934862161502806, | |
| "grad_norm": 0.516058087348938, | |
| "learning_rate": 2.491939032055311e-07, | |
| "loss": 1.8855, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.03317882410343986, | |
| "grad_norm": 0.6488027572631836, | |
| "learning_rate": 2.491877483277171e-07, | |
| "loss": 1.9622, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.03342278604537692, | |
| "grad_norm": 0.6827359199523926, | |
| "learning_rate": 2.4918159066327943e-07, | |
| "loss": 1.847, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.03366674798731398, | |
| "grad_norm": 0.4918162226676941, | |
| "learning_rate": 2.49175430210325e-07, | |
| "loss": 1.9214, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.033910709929251034, | |
| "grad_norm": 0.7824620008468628, | |
| "learning_rate": 2.491692669669594e-07, | |
| "loss": 1.8472, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.03415467187118809, | |
| "grad_norm": 0.7084971070289612, | |
| "learning_rate": 2.4916310093128616e-07, | |
| "loss": 1.8638, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03415467187118809, | |
| "eval_loss": 1.869973063468933, | |
| "eval_runtime": 157.6522, | |
| "eval_samples_per_second": 1.624, | |
| "eval_steps_per_second": 0.812, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.034398633813125155, | |
| "grad_norm": 0.4873005747795105, | |
| "learning_rate": 2.491569321014073e-07, | |
| "loss": 1.9326, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.03464259575506221, | |
| "grad_norm": 0.6483212113380432, | |
| "learning_rate": 2.49150760475423e-07, | |
| "loss": 1.9035, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.03488655769699927, | |
| "grad_norm": 0.46081703901290894, | |
| "learning_rate": 2.4914458605143187e-07, | |
| "loss": 1.9746, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.035130519638936326, | |
| "grad_norm": 0.683131754398346, | |
| "learning_rate": 2.491384088275306e-07, | |
| "loss": 1.8517, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.03537448158087338, | |
| "grad_norm": 0.4871167242527008, | |
| "learning_rate": 2.491322288018143e-07, | |
| "loss": 1.7198, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.03561844352281044, | |
| "grad_norm": 0.6227270364761353, | |
| "learning_rate": 2.4912604597237626e-07, | |
| "loss": 1.8555, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.0358624054647475, | |
| "grad_norm": 0.5372536182403564, | |
| "learning_rate": 2.4911986033730807e-07, | |
| "loss": 1.8245, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.036106367406684554, | |
| "grad_norm": 0.7428392171859741, | |
| "learning_rate": 2.491136718946997e-07, | |
| "loss": 2.0657, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.03635032934862162, | |
| "grad_norm": 0.9103279709815979, | |
| "learning_rate": 2.4910748064263914e-07, | |
| "loss": 1.9042, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.036594291290558675, | |
| "grad_norm": 1.1896861791610718, | |
| "learning_rate": 2.491012865792129e-07, | |
| "loss": 1.8883, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.036594291290558675, | |
| "eval_loss": 1.86661696434021, | |
| "eval_runtime": 158.3624, | |
| "eval_samples_per_second": 1.617, | |
| "eval_steps_per_second": 0.808, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.03683825323249573, | |
| "grad_norm": 0.7221816182136536, | |
| "learning_rate": 2.490950897025056e-07, | |
| "loss": 1.8696, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.03708221517443279, | |
| "grad_norm": 0.5009371042251587, | |
| "learning_rate": 2.4908889001060015e-07, | |
| "loss": 1.923, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.037326177116369846, | |
| "grad_norm": 0.6172135472297668, | |
| "learning_rate": 2.490826875015777e-07, | |
| "loss": 1.9862, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.0375701390583069, | |
| "grad_norm": 0.9549673199653625, | |
| "learning_rate": 2.490764821735178e-07, | |
| "loss": 1.9981, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.03781410100024396, | |
| "grad_norm": 0.5264533758163452, | |
| "learning_rate": 2.4907027402449803e-07, | |
| "loss": 1.8822, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.03805806294218102, | |
| "grad_norm": 0.4591792821884155, | |
| "learning_rate": 2.4906406305259434e-07, | |
| "loss": 1.9013, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.038302024884118074, | |
| "grad_norm": 0.4885839819908142, | |
| "learning_rate": 2.4905784925588094e-07, | |
| "loss": 1.918, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.03854598682605514, | |
| "grad_norm": 0.5201852917671204, | |
| "learning_rate": 2.4905163263243023e-07, | |
| "loss": 1.9607, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.038789948767992195, | |
| "grad_norm": 0.7386835813522339, | |
| "learning_rate": 2.4904541318031294e-07, | |
| "loss": 1.8633, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.03903391070992925, | |
| "grad_norm": 0.5655650496482849, | |
| "learning_rate": 2.49039190897598e-07, | |
| "loss": 1.9402, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03903391070992925, | |
| "eval_loss": 1.864166021347046, | |
| "eval_runtime": 158.2099, | |
| "eval_samples_per_second": 1.618, | |
| "eval_steps_per_second": 0.809, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03927787265186631, | |
| "grad_norm": 0.6714135408401489, | |
| "learning_rate": 2.490329657823525e-07, | |
| "loss": 1.7962, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.039521834593803366, | |
| "grad_norm": 0.685165524482727, | |
| "learning_rate": 2.490267378326419e-07, | |
| "loss": 1.9055, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.03976579653574042, | |
| "grad_norm": 0.5688671469688416, | |
| "learning_rate": 2.490205070465299e-07, | |
| "loss": 1.8434, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.04000975847767748, | |
| "grad_norm": 0.6001088619232178, | |
| "learning_rate": 2.4901427342207823e-07, | |
| "loss": 1.8715, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.04025372041961454, | |
| "grad_norm": 0.5576404929161072, | |
| "learning_rate": 2.490080369573472e-07, | |
| "loss": 1.8664, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.0404976823615516, | |
| "grad_norm": 0.4974159002304077, | |
| "learning_rate": 2.4900179765039496e-07, | |
| "loss": 1.7923, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.04074164430348866, | |
| "grad_norm": 0.48131653666496277, | |
| "learning_rate": 2.489955554992782e-07, | |
| "loss": 1.8561, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.040985606245425715, | |
| "grad_norm": 0.49776557087898254, | |
| "learning_rate": 2.489893105020518e-07, | |
| "loss": 1.798, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.04122956818736277, | |
| "grad_norm": 0.7587680220603943, | |
| "learning_rate": 2.489830626567686e-07, | |
| "loss": 1.9562, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.04147353012929983, | |
| "grad_norm": 0.6052951216697693, | |
| "learning_rate": 2.4897681196148e-07, | |
| "loss": 1.9305, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04147353012929983, | |
| "eval_loss": 1.8620600700378418, | |
| "eval_runtime": 157.5929, | |
| "eval_samples_per_second": 1.624, | |
| "eval_steps_per_second": 0.812, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.041717492071236886, | |
| "grad_norm": 0.5671830177307129, | |
| "learning_rate": 2.4897055841423537e-07, | |
| "loss": 1.8514, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.04196145401317394, | |
| "grad_norm": 0.4015696346759796, | |
| "learning_rate": 2.489643020130825e-07, | |
| "loss": 1.8889, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.042205415955111, | |
| "grad_norm": 0.8785597681999207, | |
| "learning_rate": 2.4895804275606724e-07, | |
| "loss": 1.8905, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.04244937789704806, | |
| "grad_norm": 0.573078453540802, | |
| "learning_rate": 2.489517806412337e-07, | |
| "loss": 2.0164, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.04269333983898512, | |
| "grad_norm": 0.48950624465942383, | |
| "learning_rate": 2.4894551566662435e-07, | |
| "loss": 2.0895, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.04293730178092218, | |
| "grad_norm": 0.5515138506889343, | |
| "learning_rate": 2.4893924783027967e-07, | |
| "loss": 1.9163, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.043181263722859235, | |
| "grad_norm": 0.4793028235435486, | |
| "learning_rate": 2.4893297713023835e-07, | |
| "loss": 1.8189, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.04342522566479629, | |
| "grad_norm": 0.5240328311920166, | |
| "learning_rate": 2.4892670356453745e-07, | |
| "loss": 1.9361, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.04366918760673335, | |
| "grad_norm": 0.5339527726173401, | |
| "learning_rate": 2.4892042713121207e-07, | |
| "loss": 1.9248, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.043913149548670406, | |
| "grad_norm": 0.468458890914917, | |
| "learning_rate": 2.4891414782829566e-07, | |
| "loss": 1.9061, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.043913149548670406, | |
| "eval_loss": 1.8581455945968628, | |
| "eval_runtime": 157.6293, | |
| "eval_samples_per_second": 1.624, | |
| "eval_steps_per_second": 0.812, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04415711149060746, | |
| "grad_norm": 0.5706861019134521, | |
| "learning_rate": 2.4890786565381976e-07, | |
| "loss": 1.8752, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.04440107343254452, | |
| "grad_norm": 0.573175311088562, | |
| "learning_rate": 2.489015806058142e-07, | |
| "loss": 1.9895, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.044645035374481584, | |
| "grad_norm": 1.2761479616165161, | |
| "learning_rate": 2.4889529268230683e-07, | |
| "loss": 1.9355, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.04488899731641864, | |
| "grad_norm": 3.7102456092834473, | |
| "learning_rate": 2.4888900188132405e-07, | |
| "loss": 1.9278, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.0451329592583557, | |
| "grad_norm": 0.5471494793891907, | |
| "learning_rate": 2.4888270820089003e-07, | |
| "loss": 1.9218, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.045376921200292755, | |
| "grad_norm": 0.9872457385063171, | |
| "learning_rate": 2.488764116390274e-07, | |
| "loss": 1.936, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.04562088314222981, | |
| "grad_norm": 0.528155505657196, | |
| "learning_rate": 2.488701121937568e-07, | |
| "loss": 1.9575, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.04586484508416687, | |
| "grad_norm": 0.51887446641922, | |
| "learning_rate": 2.488638098630973e-07, | |
| "loss": 1.8338, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.046108807026103926, | |
| "grad_norm": 0.4276951253414154, | |
| "learning_rate": 2.4885750464506606e-07, | |
| "loss": 2.0073, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.04635276896804098, | |
| "grad_norm": 0.5127749443054199, | |
| "learning_rate": 2.488511965376782e-07, | |
| "loss": 1.9237, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04635276896804098, | |
| "eval_loss": 1.856198787689209, | |
| "eval_runtime": 157.9524, | |
| "eval_samples_per_second": 1.621, | |
| "eval_steps_per_second": 0.81, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04659673090997804, | |
| "grad_norm": 0.5734567046165466, | |
| "learning_rate": 2.488448855389473e-07, | |
| "loss": 1.955, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.046840692851915104, | |
| "grad_norm": 0.4853633940219879, | |
| "learning_rate": 2.48838571646885e-07, | |
| "loss": 1.9313, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.04708465479385216, | |
| "grad_norm": 0.8106932044029236, | |
| "learning_rate": 2.488322548595012e-07, | |
| "loss": 1.9164, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.04732861673578922, | |
| "grad_norm": 0.6387647986412048, | |
| "learning_rate": 2.488259351748038e-07, | |
| "loss": 2.0275, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.047572578677726275, | |
| "grad_norm": 0.48080340027809143, | |
| "learning_rate": 2.48819612590799e-07, | |
| "loss": 1.966, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.04781654061966333, | |
| "grad_norm": 0.464213103055954, | |
| "learning_rate": 2.4881328710549126e-07, | |
| "loss": 1.8753, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.04806050256160039, | |
| "grad_norm": 0.7000899314880371, | |
| "learning_rate": 2.48806958716883e-07, | |
| "loss": 2.0136, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.048304464503537446, | |
| "grad_norm": 0.474881112575531, | |
| "learning_rate": 2.488006274229749e-07, | |
| "loss": 1.9193, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.0485484264454745, | |
| "grad_norm": 0.5639634132385254, | |
| "learning_rate": 2.4879429322176583e-07, | |
| "loss": 1.8432, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.04879238838741157, | |
| "grad_norm": 0.41461923718452454, | |
| "learning_rate": 2.4878795611125284e-07, | |
| "loss": 1.8943, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04879238838741157, | |
| "eval_loss": 1.8539921045303345, | |
| "eval_runtime": 157.8624, | |
| "eval_samples_per_second": 1.622, | |
| "eval_steps_per_second": 0.811, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.049036350329348624, | |
| "grad_norm": 0.5546320080757141, | |
| "learning_rate": 2.487816160894311e-07, | |
| "loss": 1.8561, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.04928031227128568, | |
| "grad_norm": 0.4563431441783905, | |
| "learning_rate": 2.4877527315429387e-07, | |
| "loss": 1.9516, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.04952427421322274, | |
| "grad_norm": 0.48537513613700867, | |
| "learning_rate": 2.4876892730383267e-07, | |
| "loss": 2.0183, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.049768236155159795, | |
| "grad_norm": 0.5398459434509277, | |
| "learning_rate": 2.4876257853603717e-07, | |
| "loss": 1.9771, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.05001219809709685, | |
| "grad_norm": 0.47974419593811035, | |
| "learning_rate": 2.4875622684889513e-07, | |
| "loss": 1.8562, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.05025616003903391, | |
| "grad_norm": 0.42705652117729187, | |
| "learning_rate": 2.4874987224039246e-07, | |
| "loss": 1.9547, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.050500121980970966, | |
| "grad_norm": 1.4771904945373535, | |
| "learning_rate": 2.4874351470851334e-07, | |
| "loss": 1.9176, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.05074408392290802, | |
| "grad_norm": 0.4936388432979584, | |
| "learning_rate": 2.4873715425123986e-07, | |
| "loss": 1.986, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.05098804586484509, | |
| "grad_norm": 0.45525163412094116, | |
| "learning_rate": 2.4873079086655244e-07, | |
| "loss": 1.9623, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.051232007806782144, | |
| "grad_norm": 0.429779052734375, | |
| "learning_rate": 2.487244245524296e-07, | |
| "loss": 1.7466, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.051232007806782144, | |
| "eval_loss": 1.8527003526687622, | |
| "eval_runtime": 157.4992, | |
| "eval_samples_per_second": 1.625, | |
| "eval_steps_per_second": 0.813, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0514759697487192, | |
| "grad_norm": 0.4459904432296753, | |
| "learning_rate": 2.487180553068481e-07, | |
| "loss": 1.9284, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.05171993169065626, | |
| "grad_norm": 0.8717539310455322, | |
| "learning_rate": 2.487116831277826e-07, | |
| "loss": 1.7543, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.051963893632593315, | |
| "grad_norm": 0.7227014303207397, | |
| "learning_rate": 2.4870530801320607e-07, | |
| "loss": 1.8261, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.05220785557453037, | |
| "grad_norm": 0.4853971302509308, | |
| "learning_rate": 2.486989299610895e-07, | |
| "loss": 1.9214, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.05245181751646743, | |
| "grad_norm": 0.5626842975616455, | |
| "learning_rate": 2.4869254896940207e-07, | |
| "loss": 1.8116, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.052695779458404486, | |
| "grad_norm": 0.4326629340648651, | |
| "learning_rate": 2.4868616503611124e-07, | |
| "loss": 1.7844, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.05293974140034155, | |
| "grad_norm": 0.43978720903396606, | |
| "learning_rate": 2.486797781591823e-07, | |
| "loss": 1.7327, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.05318370334227861, | |
| "grad_norm": 1.3520264625549316, | |
| "learning_rate": 2.4867338833657884e-07, | |
| "loss": 1.9084, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.053427665284215664, | |
| "grad_norm": 1.791759967803955, | |
| "learning_rate": 2.4866699556626256e-07, | |
| "loss": 2.0314, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.05367162722615272, | |
| "grad_norm": 0.7393069267272949, | |
| "learning_rate": 2.486605998461933e-07, | |
| "loss": 1.8518, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05367162722615272, | |
| "eval_loss": 1.850144386291504, | |
| "eval_runtime": 156.9992, | |
| "eval_samples_per_second": 1.631, | |
| "eval_steps_per_second": 0.815, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05391558916808978, | |
| "grad_norm": 0.4648591876029968, | |
| "learning_rate": 2.4865420117432884e-07, | |
| "loss": 1.9889, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.054159551110026835, | |
| "grad_norm": 0.4539943337440491, | |
| "learning_rate": 2.4864779954862536e-07, | |
| "loss": 1.8777, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.05440351305196389, | |
| "grad_norm": 23.188865661621094, | |
| "learning_rate": 2.486413949670369e-07, | |
| "loss": 1.9913, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.05464747499390095, | |
| "grad_norm": 0.5861213803291321, | |
| "learning_rate": 2.486349874275158e-07, | |
| "loss": 1.5643, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.054891436935838006, | |
| "grad_norm": 0.4710935056209564, | |
| "learning_rate": 2.486285769280123e-07, | |
| "loss": 1.9896, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.05513539887777507, | |
| "grad_norm": 0.5323078632354736, | |
| "learning_rate": 2.48622163466475e-07, | |
| "loss": 1.7714, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.05537936081971213, | |
| "grad_norm": 0.5247780680656433, | |
| "learning_rate": 2.486157470408504e-07, | |
| "loss": 1.9497, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.055623322761649184, | |
| "grad_norm": 0.48543304204940796, | |
| "learning_rate": 2.4860932764908314e-07, | |
| "loss": 1.9012, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.05586728470358624, | |
| "grad_norm": 0.5412744879722595, | |
| "learning_rate": 2.486029052891161e-07, | |
| "loss": 1.8044, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.0561112466455233, | |
| "grad_norm": 0.4210870563983917, | |
| "learning_rate": 2.4859647995889003e-07, | |
| "loss": 1.7522, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0561112466455233, | |
| "eval_loss": 1.845929741859436, | |
| "eval_runtime": 157.1033, | |
| "eval_samples_per_second": 1.63, | |
| "eval_steps_per_second": 0.815, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.056355208587460355, | |
| "grad_norm": 0.49198633432388306, | |
| "learning_rate": 2.4859005165634397e-07, | |
| "loss": 1.6787, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.05659917052939741, | |
| "grad_norm": 0.4444589912891388, | |
| "learning_rate": 2.4858362037941493e-07, | |
| "loss": 1.8522, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.05684313247133447, | |
| "grad_norm": 0.42611005902290344, | |
| "learning_rate": 2.485771861260381e-07, | |
| "loss": 1.773, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.05708709441327153, | |
| "grad_norm": 0.44933363795280457, | |
| "learning_rate": 2.485707488941467e-07, | |
| "loss": 1.839, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.05733105635520859, | |
| "grad_norm": 0.510879397392273, | |
| "learning_rate": 2.48564308681672e-07, | |
| "loss": 1.9736, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.05757501829714565, | |
| "grad_norm": 0.48234203457832336, | |
| "learning_rate": 2.485578654865435e-07, | |
| "loss": 1.8358, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.057818980239082704, | |
| "grad_norm": 0.5287805795669556, | |
| "learning_rate": 2.485514193066886e-07, | |
| "loss": 1.7455, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.05806294218101976, | |
| "grad_norm": 0.4200873374938965, | |
| "learning_rate": 2.485449701400329e-07, | |
| "loss": 1.8146, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.05830690412295682, | |
| "grad_norm": 0.42826953530311584, | |
| "learning_rate": 2.485385179845001e-07, | |
| "loss": 1.8783, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.058550866064893875, | |
| "grad_norm": 0.6160483360290527, | |
| "learning_rate": 2.4853206283801187e-07, | |
| "loss": 2.0157, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.058550866064893875, | |
| "eval_loss": 1.8428621292114258, | |
| "eval_runtime": 157.1726, | |
| "eval_samples_per_second": 1.629, | |
| "eval_steps_per_second": 0.814, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.05879482800683093, | |
| "grad_norm": 0.517240047454834, | |
| "learning_rate": 2.4852560469848794e-07, | |
| "loss": 1.8066, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.05903878994876799, | |
| "grad_norm": 0.45431217551231384, | |
| "learning_rate": 2.4851914356384624e-07, | |
| "loss": 1.763, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.05928275189070505, | |
| "grad_norm": 0.5374858975410461, | |
| "learning_rate": 2.485126794320027e-07, | |
| "loss": 1.7991, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.05952671383264211, | |
| "grad_norm": 0.4840785562992096, | |
| "learning_rate": 2.4850621230087125e-07, | |
| "loss": 1.9219, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.05977067577457917, | |
| "grad_norm": 0.6035332083702087, | |
| "learning_rate": 2.4849974216836405e-07, | |
| "loss": 1.8103, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.060014637716516224, | |
| "grad_norm": 0.44333499670028687, | |
| "learning_rate": 2.4849326903239115e-07, | |
| "loss": 1.8412, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.06025859965845328, | |
| "grad_norm": 0.7768390774726868, | |
| "learning_rate": 2.4848679289086074e-07, | |
| "loss": 1.9089, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.06050256160039034, | |
| "grad_norm": 0.5787532329559326, | |
| "learning_rate": 2.4848031374167913e-07, | |
| "loss": 1.9024, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.060746523542327395, | |
| "grad_norm": 0.4455646276473999, | |
| "learning_rate": 2.484738315827505e-07, | |
| "loss": 1.9293, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.06099048548426445, | |
| "grad_norm": 0.48859095573425293, | |
| "learning_rate": 2.484673464119773e-07, | |
| "loss": 1.8183, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06099048548426445, | |
| "eval_loss": 1.8416523933410645, | |
| "eval_runtime": 156.2376, | |
| "eval_samples_per_second": 1.639, | |
| "eval_steps_per_second": 0.819, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.061234447426201516, | |
| "grad_norm": 0.4281693398952484, | |
| "learning_rate": 2.484608582272598e-07, | |
| "loss": 1.9258, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.06147840936813857, | |
| "grad_norm": 0.43426513671875, | |
| "learning_rate": 2.4845436702649656e-07, | |
| "loss": 2.0341, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.06172237131007563, | |
| "grad_norm": 0.5216272473335266, | |
| "learning_rate": 2.48447872807584e-07, | |
| "loss": 1.8391, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.06196633325201269, | |
| "grad_norm": 0.4329265356063843, | |
| "learning_rate": 2.484413755684167e-07, | |
| "loss": 1.8692, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.062210295193949744, | |
| "grad_norm": 1.1542620658874512, | |
| "learning_rate": 2.484348753068872e-07, | |
| "loss": 1.9009, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.0624542571358868, | |
| "grad_norm": 0.44065535068511963, | |
| "learning_rate": 2.484283720208861e-07, | |
| "loss": 1.7906, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.06269821907782386, | |
| "grad_norm": 0.4028589129447937, | |
| "learning_rate": 2.4842186570830207e-07, | |
| "loss": 1.821, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.06294218101976091, | |
| "grad_norm": 0.5287508964538574, | |
| "learning_rate": 2.484153563670218e-07, | |
| "loss": 1.6887, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.06318614296169797, | |
| "grad_norm": 0.472429096698761, | |
| "learning_rate": 2.4840884399493006e-07, | |
| "loss": 1.8086, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.06343010490363503, | |
| "grad_norm": 0.40466898679733276, | |
| "learning_rate": 2.4840232858990943e-07, | |
| "loss": 1.8095, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06343010490363503, | |
| "eval_loss": 1.8428053855895996, | |
| "eval_runtime": 156.6484, | |
| "eval_samples_per_second": 1.634, | |
| "eval_steps_per_second": 0.817, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.06367406684557209, | |
| "grad_norm": 0.5649131536483765, | |
| "learning_rate": 2.4839581014984084e-07, | |
| "loss": 1.8726, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.06391802878750914, | |
| "grad_norm": 0.5180754065513611, | |
| "learning_rate": 2.48389288672603e-07, | |
| "loss": 1.9934, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.0641619907294462, | |
| "grad_norm": 0.4884182810783386, | |
| "learning_rate": 2.483827641560728e-07, | |
| "loss": 1.7776, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.06440595267138327, | |
| "grad_norm": 0.5376865267753601, | |
| "learning_rate": 2.48376236598125e-07, | |
| "loss": 1.7831, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.06464991461332033, | |
| "grad_norm": 0.7305421829223633, | |
| "learning_rate": 2.4836970599663255e-07, | |
| "loss": 1.8499, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.06489387655525738, | |
| "grad_norm": 0.4067825376987457, | |
| "learning_rate": 2.4836317234946626e-07, | |
| "loss": 1.9762, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.06513783849719444, | |
| "grad_norm": 1.1095890998840332, | |
| "learning_rate": 2.48356635654495e-07, | |
| "loss": 1.884, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.0653818004391315, | |
| "grad_norm": 1.5947470664978027, | |
| "learning_rate": 2.4835009590958575e-07, | |
| "loss": 1.8838, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.06562576238106856, | |
| "grad_norm": 0.5433115363121033, | |
| "learning_rate": 2.483435531126034e-07, | |
| "loss": 1.9129, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.06586972432300561, | |
| "grad_norm": 0.43899622559547424, | |
| "learning_rate": 2.483370072614108e-07, | |
| "loss": 1.7831, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06586972432300561, | |
| "eval_loss": 1.839111328125, | |
| "eval_runtime": 156.1734, | |
| "eval_samples_per_second": 1.639, | |
| "eval_steps_per_second": 0.82, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06611368626494267, | |
| "grad_norm": 0.44969475269317627, | |
| "learning_rate": 2.483304583538689e-07, | |
| "loss": 1.901, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.06635764820687973, | |
| "grad_norm": 0.42426538467407227, | |
| "learning_rate": 2.4832390638783666e-07, | |
| "loss": 1.8534, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.06660161014881678, | |
| "grad_norm": 0.511674702167511, | |
| "learning_rate": 2.4831735136117095e-07, | |
| "loss": 1.9139, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.06684557209075384, | |
| "grad_norm": 0.43454718589782715, | |
| "learning_rate": 2.4831079327172674e-07, | |
| "loss": 1.9442, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.0670895340326909, | |
| "grad_norm": 0.4460424780845642, | |
| "learning_rate": 2.4830423211735686e-07, | |
| "loss": 1.9378, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.06733349597462795, | |
| "grad_norm": 0.6298746466636658, | |
| "learning_rate": 2.482976678959123e-07, | |
| "loss": 1.8372, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.06757745791656501, | |
| "grad_norm": 0.44850224256515503, | |
| "learning_rate": 2.4829110060524197e-07, | |
| "loss": 1.8511, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.06782141985850207, | |
| "grad_norm": 0.4357118308544159, | |
| "learning_rate": 2.482845302431927e-07, | |
| "loss": 1.763, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.06806538180043913, | |
| "grad_norm": 0.3952440023422241, | |
| "learning_rate": 2.4827795680760933e-07, | |
| "loss": 1.9439, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.06830934374237618, | |
| "grad_norm": 0.4903910458087921, | |
| "learning_rate": 2.482713802963348e-07, | |
| "loss": 1.811, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06830934374237618, | |
| "eval_loss": 1.8365715742111206, | |
| "eval_runtime": 157.7942, | |
| "eval_samples_per_second": 1.622, | |
| "eval_steps_per_second": 0.811, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06855330568431325, | |
| "grad_norm": 0.5027759075164795, | |
| "learning_rate": 2.4826480070720985e-07, | |
| "loss": 1.9209, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.06879726762625031, | |
| "grad_norm": 0.4530917704105377, | |
| "learning_rate": 2.482582180380734e-07, | |
| "loss": 1.8037, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.06904122956818737, | |
| "grad_norm": 0.4016598165035248, | |
| "learning_rate": 2.482516322867622e-07, | |
| "loss": 1.8756, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.06928519151012442, | |
| "grad_norm": 0.4351702630519867, | |
| "learning_rate": 2.48245043451111e-07, | |
| "loss": 2.0021, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.06952915345206148, | |
| "grad_norm": 0.4535478949546814, | |
| "learning_rate": 2.482384515289525e-07, | |
| "loss": 1.8903, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.06977311539399854, | |
| "grad_norm": 0.4296678304672241, | |
| "learning_rate": 2.482318565181174e-07, | |
| "loss": 1.916, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.0700170773359356, | |
| "grad_norm": 0.6348395347595215, | |
| "learning_rate": 2.4822525841643453e-07, | |
| "loss": 1.895, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.07026103927787265, | |
| "grad_norm": 0.4949493706226349, | |
| "learning_rate": 2.482186572217303e-07, | |
| "loss": 2.07, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.07050500121980971, | |
| "grad_norm": 0.4145565927028656, | |
| "learning_rate": 2.482120529318294e-07, | |
| "loss": 1.8886, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.07074896316174677, | |
| "grad_norm": 0.5197605490684509, | |
| "learning_rate": 2.482054455445545e-07, | |
| "loss": 1.876, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07074896316174677, | |
| "eval_loss": 1.8359309434890747, | |
| "eval_runtime": 156.5279, | |
| "eval_samples_per_second": 1.635, | |
| "eval_steps_per_second": 0.818, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.07099292510368382, | |
| "grad_norm": 0.42653581500053406, | |
| "learning_rate": 2.481988350577259e-07, | |
| "loss": 1.8605, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.07123688704562088, | |
| "grad_norm": 0.3822322189807892, | |
| "learning_rate": 2.481922214691622e-07, | |
| "loss": 1.844, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.07148084898755794, | |
| "grad_norm": 0.4121018946170807, | |
| "learning_rate": 2.481856047766798e-07, | |
| "loss": 1.9521, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.071724810929495, | |
| "grad_norm": 0.3980840742588043, | |
| "learning_rate": 2.4817898497809304e-07, | |
| "loss": 1.8008, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.07196877287143205, | |
| "grad_norm": 0.7482399344444275, | |
| "learning_rate": 2.4817236207121427e-07, | |
| "loss": 1.8344, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.07221273481336911, | |
| "grad_norm": 0.5517648458480835, | |
| "learning_rate": 2.4816573605385374e-07, | |
| "loss": 1.9856, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.07245669675530617, | |
| "grad_norm": 0.3954029381275177, | |
| "learning_rate": 2.481591069238197e-07, | |
| "loss": 1.7306, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.07270065869724324, | |
| "grad_norm": 0.6213473677635193, | |
| "learning_rate": 2.481524746789182e-07, | |
| "loss": 1.873, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.0729446206391803, | |
| "grad_norm": 0.42206960916519165, | |
| "learning_rate": 2.4814583931695343e-07, | |
| "loss": 1.9073, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.07318858258111735, | |
| "grad_norm": 0.4138680100440979, | |
| "learning_rate": 2.4813920083572734e-07, | |
| "loss": 1.7581, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07318858258111735, | |
| "eval_loss": 1.8346822261810303, | |
| "eval_runtime": 156.8712, | |
| "eval_samples_per_second": 1.632, | |
| "eval_steps_per_second": 0.816, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.07343254452305441, | |
| "grad_norm": 0.9438842535018921, | |
| "learning_rate": 2.481325592330399e-07, | |
| "loss": 1.8472, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.07367650646499146, | |
| "grad_norm": 0.3860412538051605, | |
| "learning_rate": 2.4812591450668896e-07, | |
| "loss": 1.8402, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.07392046840692852, | |
| "grad_norm": 0.33647987246513367, | |
| "learning_rate": 2.4811926665447034e-07, | |
| "loss": 1.9474, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.07416443034886558, | |
| "grad_norm": 0.3667222559452057, | |
| "learning_rate": 2.481126156741779e-07, | |
| "loss": 1.8661, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.07440839229080264, | |
| "grad_norm": 0.47111183404922485, | |
| "learning_rate": 2.481059615636031e-07, | |
| "loss": 1.7963, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.07465235423273969, | |
| "grad_norm": 0.4970519244670868, | |
| "learning_rate": 2.480993043205356e-07, | |
| "loss": 1.7931, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.07489631617467675, | |
| "grad_norm": 0.43172699213027954, | |
| "learning_rate": 2.4809264394276297e-07, | |
| "loss": 1.8096, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.0751402781166138, | |
| "grad_norm": 1.3444660902023315, | |
| "learning_rate": 2.4808598042807057e-07, | |
| "loss": 1.9013, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.07538424005855086, | |
| "grad_norm": 0.39566361904144287, | |
| "learning_rate": 2.4807931377424167e-07, | |
| "loss": 1.8494, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.07562820200048792, | |
| "grad_norm": 0.37536919116973877, | |
| "learning_rate": 2.4807264397905757e-07, | |
| "loss": 1.9214, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07562820200048792, | |
| "eval_loss": 1.8326919078826904, | |
| "eval_runtime": 156.8066, | |
| "eval_samples_per_second": 1.633, | |
| "eval_steps_per_second": 0.816, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07587216394242498, | |
| "grad_norm": 0.515691339969635, | |
| "learning_rate": 2.480659710402974e-07, | |
| "loss": 1.8315, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.07611612588436203, | |
| "grad_norm": 0.5210254192352295, | |
| "learning_rate": 2.480592949557383e-07, | |
| "loss": 1.9244, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.07636008782629909, | |
| "grad_norm": 0.5208694338798523, | |
| "learning_rate": 2.4805261572315513e-07, | |
| "loss": 1.8838, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.07660404976823615, | |
| "grad_norm": 0.4405214786529541, | |
| "learning_rate": 2.480459333403207e-07, | |
| "loss": 1.816, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.07684801171017322, | |
| "grad_norm": 0.4438663423061371, | |
| "learning_rate": 2.480392478050059e-07, | |
| "loss": 1.7578, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.07709197365211028, | |
| "grad_norm": 0.4870030879974365, | |
| "learning_rate": 2.4803255911497927e-07, | |
| "loss": 2.0076, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.07733593559404733, | |
| "grad_norm": 0.44352516531944275, | |
| "learning_rate": 2.4802586726800744e-07, | |
| "loss": 1.8897, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.07757989753598439, | |
| "grad_norm": 0.40144485235214233, | |
| "learning_rate": 2.4801917226185476e-07, | |
| "loss": 1.9574, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.07782385947792145, | |
| "grad_norm": 0.4221437871456146, | |
| "learning_rate": 2.480124740942837e-07, | |
| "loss": 1.8748, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.0780678214198585, | |
| "grad_norm": 0.39843979477882385, | |
| "learning_rate": 2.480057727630543e-07, | |
| "loss": 1.996, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.0780678214198585, | |
| "eval_loss": 1.8313816785812378, | |
| "eval_runtime": 156.6502, | |
| "eval_samples_per_second": 1.634, | |
| "eval_steps_per_second": 0.817, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07831178336179556, | |
| "grad_norm": 0.7306655645370483, | |
| "learning_rate": 2.479990682659248e-07, | |
| "loss": 1.8732, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.07855574530373262, | |
| "grad_norm": 0.46410149335861206, | |
| "learning_rate": 2.4799236060065104e-07, | |
| "loss": 1.9037, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.07879970724566968, | |
| "grad_norm": 0.4528440833091736, | |
| "learning_rate": 2.47985649764987e-07, | |
| "loss": 1.8296, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.07904366918760673, | |
| "grad_norm": 0.5731680989265442, | |
| "learning_rate": 2.4797893575668437e-07, | |
| "loss": 1.839, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.07928763112954379, | |
| "grad_norm": 0.3977627456188202, | |
| "learning_rate": 2.4797221857349267e-07, | |
| "loss": 1.9664, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.07953159307148085, | |
| "grad_norm": 0.7255275249481201, | |
| "learning_rate": 2.4796549821315954e-07, | |
| "loss": 1.8649, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.0797755550134179, | |
| "grad_norm": 0.4904336929321289, | |
| "learning_rate": 2.479587746734302e-07, | |
| "loss": 1.945, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.08001951695535496, | |
| "grad_norm": 0.46819430589675903, | |
| "learning_rate": 2.4795204795204794e-07, | |
| "loss": 1.894, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.08026347889729202, | |
| "grad_norm": 0.8833802938461304, | |
| "learning_rate": 2.479453180467538e-07, | |
| "loss": 1.8628, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.08050744083922907, | |
| "grad_norm": 0.44334056973457336, | |
| "learning_rate": 2.479385849552867e-07, | |
| "loss": 1.8583, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08050744083922907, | |
| "eval_loss": 1.8302311897277832, | |
| "eval_runtime": 156.8163, | |
| "eval_samples_per_second": 1.632, | |
| "eval_steps_per_second": 0.816, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.08075140278116613, | |
| "grad_norm": 0.4154978394508362, | |
| "learning_rate": 2.479318486753834e-07, | |
| "loss": 1.7181, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.0809953647231032, | |
| "grad_norm": 0.5498473048210144, | |
| "learning_rate": 2.479251092047787e-07, | |
| "loss": 2.1092, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.08123932666504026, | |
| "grad_norm": 0.41959795355796814, | |
| "learning_rate": 2.4791836654120494e-07, | |
| "loss": 1.853, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.08148328860697732, | |
| "grad_norm": 0.48775970935821533, | |
| "learning_rate": 2.4791162068239256e-07, | |
| "loss": 1.878, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.08172725054891437, | |
| "grad_norm": 1.0387691259384155, | |
| "learning_rate": 2.4790487162606977e-07, | |
| "loss": 1.9639, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.08197121249085143, | |
| "grad_norm": 0.4307618737220764, | |
| "learning_rate": 2.478981193699626e-07, | |
| "loss": 1.798, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.08221517443278849, | |
| "grad_norm": 0.8073650598526001, | |
| "learning_rate": 2.478913639117949e-07, | |
| "loss": 1.8512, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.08245913637472554, | |
| "grad_norm": 0.785327136516571, | |
| "learning_rate": 2.478846052492885e-07, | |
| "loss": 1.8926, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.0827030983166626, | |
| "grad_norm": 0.4723658263683319, | |
| "learning_rate": 2.478778433801629e-07, | |
| "loss": 1.9997, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.08294706025859966, | |
| "grad_norm": 0.4107203185558319, | |
| "learning_rate": 2.478710783021355e-07, | |
| "loss": 1.8609, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08294706025859966, | |
| "eval_loss": 1.829516887664795, | |
| "eval_runtime": 156.5752, | |
| "eval_samples_per_second": 1.635, | |
| "eval_steps_per_second": 0.817, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.08319102220053672, | |
| "grad_norm": 0.40097326040267944, | |
| "learning_rate": 2.4786431001292156e-07, | |
| "loss": 1.7514, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.08343498414247377, | |
| "grad_norm": 0.39558151364326477, | |
| "learning_rate": 2.478575385102342e-07, | |
| "loss": 1.9019, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.08367894608441083, | |
| "grad_norm": 0.3937402367591858, | |
| "learning_rate": 2.4785076379178427e-07, | |
| "loss": 2.0703, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.08392290802634789, | |
| "grad_norm": 0.3737332820892334, | |
| "learning_rate": 2.478439858552805e-07, | |
| "loss": 1.8953, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.08416686996828494, | |
| "grad_norm": 0.3693140745162964, | |
| "learning_rate": 2.4783720469842943e-07, | |
| "loss": 1.8952, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.084410831910222, | |
| "grad_norm": 0.41011977195739746, | |
| "learning_rate": 2.4783042031893544e-07, | |
| "loss": 1.7306, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.08465479385215906, | |
| "grad_norm": 0.4407089352607727, | |
| "learning_rate": 2.478236327145007e-07, | |
| "loss": 1.8516, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.08489875579409611, | |
| "grad_norm": 0.4775758683681488, | |
| "learning_rate": 2.4781684188282526e-07, | |
| "loss": 1.8198, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.08514271773603319, | |
| "grad_norm": 0.37072694301605225, | |
| "learning_rate": 2.4781004782160693e-07, | |
| "loss": 1.9177, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.08538667967797024, | |
| "grad_norm": 0.3914446532726288, | |
| "learning_rate": 2.478032505285412e-07, | |
| "loss": 1.8334, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.08538667967797024, | |
| "eval_loss": 1.8291497230529785, | |
| "eval_runtime": 157.2832, | |
| "eval_samples_per_second": 1.628, | |
| "eval_steps_per_second": 0.814, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0856306416199073, | |
| "grad_norm": 0.40111953020095825, | |
| "learning_rate": 2.4779645000132166e-07, | |
| "loss": 1.9745, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.08587460356184436, | |
| "grad_norm": 0.4218769967556, | |
| "learning_rate": 2.477896462376395e-07, | |
| "loss": 1.7767, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.08611856550378141, | |
| "grad_norm": 1.2748806476593018, | |
| "learning_rate": 2.4778283923518366e-07, | |
| "loss": 1.9835, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.08636252744571847, | |
| "grad_norm": 0.9254433512687683, | |
| "learning_rate": 2.477760289916411e-07, | |
| "loss": 1.8909, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.08660648938765553, | |
| "grad_norm": 1.155629277229309, | |
| "learning_rate": 2.477692155046964e-07, | |
| "loss": 2.0672, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.08685045132959258, | |
| "grad_norm": 0.6299034357070923, | |
| "learning_rate": 2.47762398772032e-07, | |
| "loss": 1.9787, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.08709441327152964, | |
| "grad_norm": 0.7239134907722473, | |
| "learning_rate": 2.4775557879132803e-07, | |
| "loss": 1.7728, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.0873383752134667, | |
| "grad_norm": 0.4112605154514313, | |
| "learning_rate": 2.4774875556026265e-07, | |
| "loss": 1.824, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.08758233715540376, | |
| "grad_norm": 0.4959578812122345, | |
| "learning_rate": 2.477419290765115e-07, | |
| "loss": 1.7778, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.08782629909734081, | |
| "grad_norm": 0.4753192961215973, | |
| "learning_rate": 2.4773509933774833e-07, | |
| "loss": 1.6845, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08782629909734081, | |
| "eval_loss": 1.8272368907928467, | |
| "eval_runtime": 156.5455, | |
| "eval_samples_per_second": 1.635, | |
| "eval_steps_per_second": 0.818, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08807026103927787, | |
| "grad_norm": 0.39284539222717285, | |
| "learning_rate": 2.4772826634164435e-07, | |
| "loss": 1.6858, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.08831422298121493, | |
| "grad_norm": 0.48466554284095764, | |
| "learning_rate": 2.4772143008586876e-07, | |
| "loss": 1.9059, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.08855818492315198, | |
| "grad_norm": 0.4809161424636841, | |
| "learning_rate": 2.4771459056808844e-07, | |
| "loss": 1.9083, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.08880214686508904, | |
| "grad_norm": 0.5406439900398254, | |
| "learning_rate": 2.477077477859681e-07, | |
| "loss": 1.8219, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.0890461088070261, | |
| "grad_norm": 0.5194385647773743, | |
| "learning_rate": 2.4770090173717014e-07, | |
| "loss": 1.7921, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.08929007074896317, | |
| "grad_norm": 0.412882536649704, | |
| "learning_rate": 2.4769405241935484e-07, | |
| "loss": 1.7941, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.08953403269090023, | |
| "grad_norm": 0.37151506543159485, | |
| "learning_rate": 2.476871998301802e-07, | |
| "loss": 1.7942, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.08977799463283728, | |
| "grad_norm": 0.4231220483779907, | |
| "learning_rate": 2.476803439673019e-07, | |
| "loss": 1.8722, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.09002195657477434, | |
| "grad_norm": 0.5867494344711304, | |
| "learning_rate": 2.476734848283735e-07, | |
| "loss": 1.9138, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.0902659185167114, | |
| "grad_norm": 0.3956262171268463, | |
| "learning_rate": 2.476666224110462e-07, | |
| "loss": 1.9813, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.0902659185167114, | |
| "eval_loss": 1.826444149017334, | |
| "eval_runtime": 157.275, | |
| "eval_samples_per_second": 1.628, | |
| "eval_steps_per_second": 0.814, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.09050988045864845, | |
| "grad_norm": 0.42614656686782837, | |
| "learning_rate": 2.476597567129691e-07, | |
| "loss": 1.7726, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.09075384240058551, | |
| "grad_norm": 0.47062888741493225, | |
| "learning_rate": 2.4765288773178894e-07, | |
| "loss": 1.8998, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.09099780434252257, | |
| "grad_norm": 0.43838515877723694, | |
| "learning_rate": 2.476460154651503e-07, | |
| "loss": 1.8538, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.09124176628445962, | |
| "grad_norm": 0.6669487357139587, | |
| "learning_rate": 2.4763913991069527e-07, | |
| "loss": 1.8683, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.09148572822639668, | |
| "grad_norm": 0.4067532420158386, | |
| "learning_rate": 2.4763226106606407e-07, | |
| "loss": 1.8279, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.09172969016833374, | |
| "grad_norm": 1.4081276655197144, | |
| "learning_rate": 2.476253789288943e-07, | |
| "loss": 1.6806, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.0919736521102708, | |
| "grad_norm": 0.5126282572746277, | |
| "learning_rate": 2.4761849349682154e-07, | |
| "loss": 1.7196, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.09221761405220785, | |
| "grad_norm": 0.47513243556022644, | |
| "learning_rate": 2.4761160476747895e-07, | |
| "loss": 1.7233, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.09246157599414491, | |
| "grad_norm": 0.5680952072143555, | |
| "learning_rate": 2.4760471273849755e-07, | |
| "loss": 1.9624, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.09270553793608197, | |
| "grad_norm": 0.4912157654762268, | |
| "learning_rate": 2.47597817407506e-07, | |
| "loss": 1.961, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09270553793608197, | |
| "eval_loss": 1.8258123397827148, | |
| "eval_runtime": 156.3289, | |
| "eval_samples_per_second": 1.638, | |
| "eval_steps_per_second": 0.819, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.09294949987801902, | |
| "grad_norm": 0.5005534291267395, | |
| "learning_rate": 2.475909187721307e-07, | |
| "loss": 1.8626, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.09319346181995608, | |
| "grad_norm": 0.45611926913261414, | |
| "learning_rate": 2.4758401682999573e-07, | |
| "loss": 1.919, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.09343742376189315, | |
| "grad_norm": 0.5665335655212402, | |
| "learning_rate": 2.475771115787231e-07, | |
| "loss": 1.8476, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.09368138570383021, | |
| "grad_norm": 0.4179742634296417, | |
| "learning_rate": 2.475702030159322e-07, | |
| "loss": 1.7702, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.09392534764576727, | |
| "grad_norm": 0.44780439138412476, | |
| "learning_rate": 2.475632911392405e-07, | |
| "loss": 1.7905, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.09416930958770432, | |
| "grad_norm": 0.9271466732025146, | |
| "learning_rate": 2.475563759462629e-07, | |
| "loss": 1.976, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.09441327152964138, | |
| "grad_norm": 0.6895579099655151, | |
| "learning_rate": 2.475494574346122e-07, | |
| "loss": 1.9016, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.09465723347157844, | |
| "grad_norm": 0.4328395426273346, | |
| "learning_rate": 2.475425356018988e-07, | |
| "loss": 1.7875, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.0949011954135155, | |
| "grad_norm": 0.4196988344192505, | |
| "learning_rate": 2.475356104457307e-07, | |
| "loss": 1.7607, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.09514515735545255, | |
| "grad_norm": 0.4333524703979492, | |
| "learning_rate": 2.4752868196371393e-07, | |
| "loss": 1.9771, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.09514515735545255, | |
| "eval_loss": 1.8251597881317139, | |
| "eval_runtime": 157.0151, | |
| "eval_samples_per_second": 1.63, | |
| "eval_steps_per_second": 0.815, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.09538911929738961, | |
| "grad_norm": 0.6076596975326538, | |
| "learning_rate": 2.47521750153452e-07, | |
| "loss": 2.1356, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.09563308123932666, | |
| "grad_norm": 0.43572092056274414, | |
| "learning_rate": 2.4751481501254606e-07, | |
| "loss": 1.9217, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.09587704318126372, | |
| "grad_norm": 23.73161506652832, | |
| "learning_rate": 2.4750787653859505e-07, | |
| "loss": 2.1093, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.09612100512320078, | |
| "grad_norm": 0.46901410818099976, | |
| "learning_rate": 2.475009347291956e-07, | |
| "loss": 1.9877, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.09636496706513784, | |
| "grad_norm": 0.4053335189819336, | |
| "learning_rate": 2.47493989581942e-07, | |
| "loss": 1.9272, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.09660892900707489, | |
| "grad_norm": 0.4614839255809784, | |
| "learning_rate": 2.4748704109442635e-07, | |
| "loss": 1.885, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.09685289094901195, | |
| "grad_norm": 0.4277932047843933, | |
| "learning_rate": 2.4748008926423817e-07, | |
| "loss": 1.808, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.097096852890949, | |
| "grad_norm": 0.41171425580978394, | |
| "learning_rate": 2.474731340889649e-07, | |
| "loss": 1.928, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.09734081483288606, | |
| "grad_norm": 0.41549429297447205, | |
| "learning_rate": 2.4746617556619163e-07, | |
| "loss": 1.7844, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.09758477677482313, | |
| "grad_norm": 0.4279956817626953, | |
| "learning_rate": 2.4745921369350094e-07, | |
| "loss": 1.9173, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09758477677482313, | |
| "eval_loss": 1.823663353919983, | |
| "eval_runtime": 157.0142, | |
| "eval_samples_per_second": 1.63, | |
| "eval_steps_per_second": 0.815, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 4099, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0291845984681984e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |