{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.978930307941653,
  "eval_steps": 500,
  "global_step": 153,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019448946515397084,
      "grad_norm": 6.861098289489746,
      "learning_rate": 6.25e-07,
      "loss": 1.2627,
      "step": 1
    },
    {
      "epoch": 0.03889789303079417,
      "grad_norm": 6.737518310546875,
      "learning_rate": 1.25e-06,
      "loss": 1.2374,
      "step": 2
    },
    {
      "epoch": 0.05834683954619125,
      "grad_norm": 6.920078754425049,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.2864,
      "step": 3
    },
    {
      "epoch": 0.07779578606158834,
      "grad_norm": 6.555400371551514,
      "learning_rate": 2.5e-06,
      "loss": 1.2097,
      "step": 4
    },
    {
      "epoch": 0.09724473257698542,
      "grad_norm": 6.175917625427246,
      "learning_rate": 3.125e-06,
      "loss": 1.1939,
      "step": 5
    },
    {
      "epoch": 0.1166936790923825,
      "grad_norm": 4.607879638671875,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.1667,
      "step": 6
    },
    {
      "epoch": 0.13614262560777957,
      "grad_norm": 2.9558024406433105,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.1707,
      "step": 7
    },
    {
      "epoch": 0.15559157212317667,
      "grad_norm": 2.6493217945098877,
      "learning_rate": 5e-06,
      "loss": 1.0802,
      "step": 8
    },
    {
      "epoch": 0.17504051863857376,
      "grad_norm": 4.4779953956604,
      "learning_rate": 5.625e-06,
      "loss": 1.1318,
      "step": 9
    },
    {
      "epoch": 0.19448946515397084,
      "grad_norm": 4.794122695922852,
      "learning_rate": 6.25e-06,
      "loss": 1.162,
      "step": 10
    },
    {
      "epoch": 0.21393841166936792,
      "grad_norm": 4.527707099914551,
      "learning_rate": 6.875e-06,
      "loss": 1.1654,
      "step": 11
    },
    {
      "epoch": 0.233387358184765,
      "grad_norm": 3.3236396312713623,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.0345,
      "step": 12
    },
    {
      "epoch": 0.25283630470016205,
      "grad_norm": 3.3806722164154053,
      "learning_rate": 8.125000000000001e-06,
      "loss": 1.0519,
      "step": 13
    },
    {
      "epoch": 0.27228525121555913,
      "grad_norm": 2.476033926010132,
      "learning_rate": 8.750000000000001e-06,
      "loss": 1.0189,
      "step": 14
    },
    {
      "epoch": 0.2917341977309562,
      "grad_norm": 2.0179576873779297,
      "learning_rate": 9.375000000000001e-06,
      "loss": 0.9914,
      "step": 15
    },
    {
      "epoch": 0.31118314424635335,
      "grad_norm": 1.9065077304840088,
      "learning_rate": 1e-05,
      "loss": 1.0208,
      "step": 16
    },
    {
      "epoch": 0.33063209076175043,
      "grad_norm": 1.8468468189239502,
      "learning_rate": 9.998685442495921e-06,
      "loss": 0.9893,
      "step": 17
    },
    {
      "epoch": 0.3500810372771475,
      "grad_norm": 1.5280859470367432,
      "learning_rate": 9.994742461208251e-06,
      "loss": 0.9482,
      "step": 18
    },
    {
      "epoch": 0.3695299837925446,
      "grad_norm": 1.369767665863037,
      "learning_rate": 9.988173129447251e-06,
      "loss": 0.9942,
      "step": 19
    },
    {
      "epoch": 0.3889789303079417,
      "grad_norm": 1.207467794418335,
      "learning_rate": 9.978980901518663e-06,
      "loss": 0.9676,
      "step": 20
    },
    {
      "epoch": 0.40842787682333875,
      "grad_norm": 1.4069736003875732,
      "learning_rate": 9.96717061090737e-06,
      "loss": 0.9322,
      "step": 21
    },
    {
      "epoch": 0.42787682333873583,
      "grad_norm": 1.1611406803131104,
      "learning_rate": 9.95274846773583e-06,
      "loss": 0.9786,
      "step": 22
    },
    {
      "epoch": 0.4473257698541329,
      "grad_norm": 1.1889172792434692,
      "learning_rate": 9.935722055498655e-06,
      "loss": 0.9256,
      "step": 23
    },
    {
      "epoch": 0.46677471636953,
      "grad_norm": 1.1732343435287476,
      "learning_rate": 9.916100327075038e-06,
      "loss": 0.8946,
      "step": 24
    },
    {
      "epoch": 0.4862236628849271,
      "grad_norm": 1.3214962482452393,
      "learning_rate": 9.893893600021112e-06,
      "loss": 0.9087,
      "step": 25
    },
    {
      "epoch": 0.5056726094003241,
      "grad_norm": 1.1180990934371948,
      "learning_rate": 9.869113551144754e-06,
      "loss": 0.9407,
      "step": 26
    },
    {
      "epoch": 0.5251215559157212,
      "grad_norm": 0.9630658626556396,
      "learning_rate": 9.841773210365646e-06,
      "loss": 0.9329,
      "step": 27
    },
    {
      "epoch": 0.5445705024311183,
      "grad_norm": 0.9198795557022095,
      "learning_rate": 9.811886953863841e-06,
      "loss": 0.9378,
      "step": 28
    },
    {
      "epoch": 0.5640194489465153,
      "grad_norm": 0.9277356266975403,
      "learning_rate": 9.779470496520442e-06,
      "loss": 0.9195,
      "step": 29
    },
    {
      "epoch": 0.5834683954619124,
      "grad_norm": 0.80616694688797,
      "learning_rate": 9.744540883654348e-06,
      "loss": 0.932,
      "step": 30
    },
    {
      "epoch": 0.6029173419773096,
      "grad_norm": 0.8931137323379517,
      "learning_rate": 9.707116482059447e-06,
      "loss": 0.9024,
      "step": 31
    },
    {
      "epoch": 0.6223662884927067,
      "grad_norm": 0.8059653043746948,
      "learning_rate": 9.667216970346916e-06,
      "loss": 0.9376,
      "step": 32
    },
    {
      "epoch": 0.6418152350081038,
      "grad_norm": 0.7000671625137329,
      "learning_rate": 9.624863328597767e-06,
      "loss": 0.8836,
      "step": 33
    },
    {
      "epoch": 0.6612641815235009,
      "grad_norm": 0.788222074508667,
      "learning_rate": 9.580077827331038e-06,
      "loss": 0.8801,
      "step": 34
    },
    {
      "epoch": 0.6807131280388979,
      "grad_norm": 0.960985004901886,
      "learning_rate": 9.532884015793432e-06,
      "loss": 0.8777,
      "step": 35
    },
    {
      "epoch": 0.700162074554295,
      "grad_norm": 0.9860551357269287,
      "learning_rate": 9.48330670957659e-06,
      "loss": 0.905,
      "step": 36
    },
    {
      "epoch": 0.7196110210696921,
      "grad_norm": 0.8046407699584961,
      "learning_rate": 9.431371977568483e-06,
      "loss": 0.9257,
      "step": 37
    },
    {
      "epoch": 0.7390599675850892,
      "grad_norm": 0.9749986529350281,
      "learning_rate": 9.377107128245782e-06,
      "loss": 0.9026,
      "step": 38
    },
    {
      "epoch": 0.7585089141004863,
      "grad_norm": 1.0727578401565552,
      "learning_rate": 9.32054069531444e-06,
      "loss": 0.9418,
      "step": 39
    },
    {
      "epoch": 0.7779578606158833,
      "grad_norm": 0.8145267963409424,
      "learning_rate": 9.261702422706014e-06,
      "loss": 0.906,
      "step": 40
    },
    {
      "epoch": 0.7974068071312804,
      "grad_norm": 0.8878953456878662,
      "learning_rate": 9.200623248937619e-06,
      "loss": 0.911,
      "step": 41
    },
    {
      "epoch": 0.8168557536466775,
      "grad_norm": 0.8581916689872742,
      "learning_rate": 9.13733529084374e-06,
      "loss": 0.8873,
      "step": 42
    },
    {
      "epoch": 0.8363047001620746,
      "grad_norm": 0.6927335858345032,
      "learning_rate": 9.071871826688472e-06,
      "loss": 0.8726,
      "step": 43
    },
    {
      "epoch": 0.8557536466774717,
      "grad_norm": 0.6785300970077515,
      "learning_rate": 9.004267278667032e-06,
      "loss": 0.9282,
      "step": 44
    },
    {
      "epoch": 0.8752025931928687,
      "grad_norm": 0.735817015171051,
      "learning_rate": 8.934557194805787e-06,
      "loss": 0.9498,
      "step": 45
    },
    {
      "epoch": 0.8946515397082658,
      "grad_norm": 0.6314980387687683,
      "learning_rate": 8.862778230270276e-06,
      "loss": 0.8851,
      "step": 46
    },
    {
      "epoch": 0.9141004862236629,
      "grad_norm": 0.6786671280860901,
      "learning_rate": 8.788968128091084e-06,
      "loss": 0.8834,
      "step": 47
    },
    {
      "epoch": 0.93354943273906,
      "grad_norm": 0.7222251892089844,
      "learning_rate": 8.71316569931769e-06,
      "loss": 0.8915,
      "step": 48
    },
    {
      "epoch": 0.9529983792544571,
      "grad_norm": 0.7341165542602539,
      "learning_rate": 8.635410802610724e-06,
      "loss": 0.8392,
      "step": 49
    },
    {
      "epoch": 0.9724473257698542,
      "grad_norm": 0.7214636206626892,
      "learning_rate": 8.555744323283364e-06,
      "loss": 0.8845,
      "step": 50
    },
    {
      "epoch": 0.9918962722852512,
      "grad_norm": 0.848508358001709,
      "learning_rate": 8.474208151802898e-06,
      "loss": 0.8581,
      "step": 51
    },
    {
      "epoch": 1.012965964343598,
      "grad_norm": 1.2223843336105347,
      "learning_rate": 8.390845161763756e-06,
      "loss": 1.4462,
      "step": 52
    },
    {
      "epoch": 1.032414910858995,
      "grad_norm": 0.6608096361160278,
      "learning_rate": 8.305699187343586e-06,
      "loss": 0.9298,
      "step": 53
    },
    {
      "epoch": 1.0518638573743921,
      "grad_norm": 0.7156009674072266,
      "learning_rate": 8.218815000254233e-06,
      "loss": 0.7472,
      "step": 54
    },
    {
      "epoch": 1.0713128038897892,
      "grad_norm": 0.7926570177078247,
      "learning_rate": 8.130238286199747e-06,
      "loss": 0.8235,
      "step": 55
    },
    {
      "epoch": 1.0907617504051863,
      "grad_norm": 0.8245845437049866,
      "learning_rate": 8.04001562085379e-06,
      "loss": 0.9211,
      "step": 56
    },
    {
      "epoch": 1.1102106969205834,
      "grad_norm": 0.6690172553062439,
      "learning_rate": 7.948194445369065e-06,
      "loss": 0.9178,
      "step": 57
    },
    {
      "epoch": 1.1296596434359805,
      "grad_norm": 0.8226907849311829,
      "learning_rate": 7.85482304143168e-06,
      "loss": 0.7913,
      "step": 58
    },
    {
      "epoch": 1.1491085899513775,
      "grad_norm": 0.9551813006401062,
      "learning_rate": 7.759950505873523e-06,
      "loss": 0.8701,
      "step": 59
    },
    {
      "epoch": 1.1685575364667746,
      "grad_norm": 0.6174091696739197,
      "learning_rate": 7.66362672485601e-06,
      "loss": 0.8091,
      "step": 60
    },
    {
      "epoch": 1.1880064829821717,
      "grad_norm": 0.8523845076560974,
      "learning_rate": 7.565902347638806e-06,
      "loss": 0.8805,
      "step": 61
    },
    {
      "epoch": 1.2074554294975688,
      "grad_norm": 0.6692298054695129,
      "learning_rate": 7.466828759947271e-06,
      "loss": 0.8076,
      "step": 62
    },
    {
      "epoch": 1.2269043760129659,
      "grad_norm": 0.6524062156677246,
      "learning_rate": 7.366458056952668e-06,
      "loss": 0.7676,
      "step": 63
    },
    {
      "epoch": 1.246353322528363,
      "grad_norm": 0.7085561752319336,
      "learning_rate": 7.264843015879321e-06,
      "loss": 0.9008,
      "step": 64
    },
    {
      "epoch": 1.26580226904376,
      "grad_norm": 0.6387475728988647,
      "learning_rate": 7.162037068253141e-06,
      "loss": 0.7623,
      "step": 65
    },
    {
      "epoch": 1.2852512155591573,
      "grad_norm": 0.6962757110595703,
      "learning_rate": 7.058094271806091e-06,
      "loss": 0.8989,
      "step": 66
    },
    {
      "epoch": 1.3047001620745542,
      "grad_norm": 0.6996948719024658,
      "learning_rate": 6.953069282051397e-06,
      "loss": 0.8975,
      "step": 67
    },
    {
      "epoch": 1.3241491085899515,
      "grad_norm": 0.6775169968605042,
      "learning_rate": 6.84701732354442e-06,
      "loss": 0.744,
      "step": 68
    },
    {
      "epoch": 1.3435980551053484,
      "grad_norm": 0.5788021087646484,
      "learning_rate": 6.7399941608443096e-06,
      "loss": 0.8693,
      "step": 69
    },
    {
      "epoch": 1.3630470016207457,
      "grad_norm": 0.797266960144043,
      "learning_rate": 6.632056069191723e-06,
      "loss": 0.8185,
      "step": 70
    },
    {
      "epoch": 1.3824959481361425,
      "grad_norm": 0.6203992366790771,
      "learning_rate": 6.523259804918001e-06,
      "loss": 0.8219,
      "step": 71
    },
    {
      "epoch": 1.4019448946515398,
      "grad_norm": 0.5873000621795654,
      "learning_rate": 6.413662575601391e-06,
      "loss": 0.8675,
      "step": 72
    },
    {
      "epoch": 1.4213938411669367,
      "grad_norm": 0.5779026746749878,
      "learning_rate": 6.303322009985984e-06,
      "loss": 0.8004,
      "step": 73
    },
    {
      "epoch": 1.440842787682334,
      "grad_norm": 0.7285134792327881,
      "learning_rate": 6.1922961276791925e-06,
      "loss": 0.8649,
      "step": 74
    },
    {
      "epoch": 1.4602917341977308,
      "grad_norm": 0.5539451837539673,
      "learning_rate": 6.08064330864371e-06,
      "loss": 0.7551,
      "step": 75
    },
    {
      "epoch": 1.4797406807131281,
      "grad_norm": 0.530315101146698,
      "learning_rate": 5.968422262499983e-06,
      "loss": 0.798,
      "step": 76
    },
    {
      "epoch": 1.499189627228525,
      "grad_norm": 0.6449223756790161,
      "learning_rate": 5.85569199765534e-06,
      "loss": 0.8498,
      "step": 77
    },
    {
      "epoch": 1.5186385737439223,
      "grad_norm": 0.5452909469604492,
      "learning_rate": 5.7425117902760195e-06,
      "loss": 0.7277,
      "step": 78
    },
    {
      "epoch": 1.5380875202593192,
      "grad_norm": 0.6189656853675842,
      "learning_rate": 5.628941153118388e-06,
      "loss": 0.8525,
      "step": 79
    },
    {
      "epoch": 1.5575364667747165,
      "grad_norm": 0.5398715734481812,
      "learning_rate": 5.515039804235772e-06,
      "loss": 0.7927,
      "step": 80
    },
    {
      "epoch": 1.5769854132901133,
      "grad_norm": 0.5609137415885925,
      "learning_rate": 5.400867635577335e-06,
      "loss": 0.8495,
      "step": 81
    },
    {
      "epoch": 1.5964343598055106,
      "grad_norm": 0.6827127933502197,
      "learning_rate": 5.2864846814955e-06,
      "loss": 0.7973,
      "step": 82
    },
    {
      "epoch": 1.6158833063209075,
      "grad_norm": 0.6233929395675659,
      "learning_rate": 5.17195108717852e-06,
      "loss": 0.8542,
      "step": 83
    },
    {
      "epoch": 1.6353322528363048,
      "grad_norm": 0.46808454394340515,
      "learning_rate": 5.057327077024745e-06,
      "loss": 0.7751,
      "step": 84
    },
    {
      "epoch": 1.6547811993517016,
      "grad_norm": 0.5673578381538391,
      "learning_rate": 4.942672922975255e-06,
      "loss": 0.7718,
      "step": 85
    },
    {
      "epoch": 1.674230145867099,
      "grad_norm": 0.6883957386016846,
      "learning_rate": 4.82804891282148e-06,
      "loss": 0.7642,
      "step": 86
    },
    {
      "epoch": 1.6936790923824958,
      "grad_norm": 0.6758309006690979,
      "learning_rate": 4.713515318504501e-06,
      "loss": 0.9642,
      "step": 87
    },
    {
      "epoch": 1.7131280388978931,
      "grad_norm": 0.4832862913608551,
      "learning_rate": 4.599132364422666e-06,
      "loss": 0.7228,
      "step": 88
    },
    {
      "epoch": 1.73257698541329,
      "grad_norm": 0.7116863131523132,
      "learning_rate": 4.4849601957642295e-06,
      "loss": 0.8748,
      "step": 89
    },
    {
      "epoch": 1.7520259319286873,
      "grad_norm": 0.5377712845802307,
      "learning_rate": 4.371058846881614e-06,
      "loss": 0.8228,
      "step": 90
    },
    {
      "epoch": 1.7714748784440841,
      "grad_norm": 0.49350258708000183,
      "learning_rate": 4.257488209723981e-06,
      "loss": 0.7398,
      "step": 91
    },
    {
      "epoch": 1.7909238249594814,
      "grad_norm": 0.5975490212440491,
      "learning_rate": 4.1443080023446605e-06,
      "loss": 0.8464,
      "step": 92
    },
    {
      "epoch": 1.8103727714748783,
      "grad_norm": 0.5417091846466064,
      "learning_rate": 4.0315777375000185e-06,
      "loss": 0.8294,
      "step": 93
    },
    {
      "epoch": 1.8298217179902756,
      "grad_norm": 0.5055018663406372,
      "learning_rate": 3.9193566913562915e-06,
      "loss": 0.7465,
      "step": 94
    },
    {
      "epoch": 1.8492706645056725,
      "grad_norm": 0.5458024144172668,
      "learning_rate": 3.807703872320809e-06,
      "loss": 0.8468,
      "step": 95
    },
    {
      "epoch": 1.8687196110210698,
      "grad_norm": 0.5673462748527527,
      "learning_rate": 3.6966779900140193e-06,
      "loss": 0.83,
      "step": 96
    },
    {
      "epoch": 1.8881685575364666,
      "grad_norm": 0.4504033625125885,
      "learning_rate": 3.586337424398609e-06,
      "loss": 0.8286,
      "step": 97
    },
    {
      "epoch": 1.907617504051864,
      "grad_norm": 0.5393473505973816,
      "learning_rate": 3.4767401950820003e-06,
      "loss": 0.7724,
      "step": 98
    },
    {
      "epoch": 1.9270664505672608,
      "grad_norm": 0.6027775406837463,
      "learning_rate": 3.3679439308082777e-06,
      "loss": 0.8476,
      "step": 99
    },
    {
      "epoch": 1.946515397082658,
      "grad_norm": 0.5444150567054749,
      "learning_rate": 3.260005839155691e-06,
      "loss": 0.7916,
      "step": 100
    },
    {
      "epoch": 1.965964343598055,
      "grad_norm": 0.5643745064735413,
      "learning_rate": 3.152982676455581e-06,
      "loss": 0.7754,
      "step": 101
    },
    {
      "epoch": 1.9854132901134522,
      "grad_norm": 0.6249105334281921,
      "learning_rate": 3.046930717948604e-06,
      "loss": 0.8918,
      "step": 102
    },
    {
      "epoch": 2.006482982171799,
      "grad_norm": 0.8598921895027161,
      "learning_rate": 2.9419057281939106e-06,
      "loss": 1.2974,
      "step": 103
    },
    {
      "epoch": 2.025931928687196,
      "grad_norm": 0.5371376276016235,
      "learning_rate": 2.8379629317468604e-06,
      "loss": 0.7535,
      "step": 104
    },
    {
      "epoch": 2.0453808752025933,
      "grad_norm": 0.5264458656311035,
      "learning_rate": 2.7351569841206792e-06,
      "loss": 0.7966,
      "step": 105
    },
    {
      "epoch": 2.06482982171799,
      "grad_norm": 0.537636399269104,
      "learning_rate": 2.633541943047334e-06,
      "loss": 0.8117,
      "step": 106
    },
    {
      "epoch": 2.0842787682333874,
      "grad_norm": 0.49134597182273865,
      "learning_rate": 2.53317124005273e-06,
      "loss": 0.8327,
      "step": 107
    },
    {
      "epoch": 2.1037277147487843,
      "grad_norm": 0.5109895467758179,
      "learning_rate": 2.4340976523611957e-06,
      "loss": 0.7766,
      "step": 108
    },
    {
      "epoch": 2.1231766612641816,
      "grad_norm": 0.5269736051559448,
      "learning_rate": 2.3363732751439926e-06,
      "loss": 0.7372,
      "step": 109
    },
    {
      "epoch": 2.1426256077795784,
      "grad_norm": 0.5074533820152283,
      "learning_rate": 2.240049494126479e-06,
      "loss": 0.7629,
      "step": 110
    },
    {
      "epoch": 2.1620745542949757,
      "grad_norm": 0.5118080973625183,
      "learning_rate": 2.1451769585683196e-06,
      "loss": 0.6913,
      "step": 111
    },
    {
      "epoch": 2.1815235008103726,
      "grad_norm": 0.5797163248062134,
      "learning_rate": 2.0518055546309362e-06,
      "loss": 0.8875,
      "step": 112
    },
    {
      "epoch": 2.20097244732577,
      "grad_norm": 0.5009273886680603,
      "learning_rate": 1.9599843791462123e-06,
      "loss": 0.8202,
      "step": 113
    },
    {
      "epoch": 2.2204213938411668,
      "grad_norm": 0.4513159394264221,
      "learning_rate": 1.8697617138002545e-06,
      "loss": 0.6574,
      "step": 114
    },
    {
      "epoch": 2.239870340356564,
      "grad_norm": 0.483067125082016,
      "learning_rate": 1.7811849997457681e-06,
      "loss": 0.766,
      "step": 115
    },
    {
      "epoch": 2.259319286871961,
      "grad_norm": 0.5190160274505615,
      "learning_rate": 1.6943008126564164e-06,
      "loss": 0.86,
      "step": 116
    },
    {
      "epoch": 2.2787682333873582,
      "grad_norm": 0.45720207691192627,
      "learning_rate": 1.609154838236246e-06,
      "loss": 0.7715,
      "step": 117
    },
    {
      "epoch": 2.298217179902755,
      "grad_norm": 0.46229684352874756,
      "learning_rate": 1.5257918481971028e-06,
      "loss": 0.7503,
      "step": 118
    },
    {
      "epoch": 2.3176661264181524,
      "grad_norm": 0.4342985153198242,
      "learning_rate": 1.4442556767166371e-06,
      "loss": 0.7868,
      "step": 119
    },
    {
      "epoch": 2.3371150729335493,
      "grad_norm": 0.4891180098056793,
      "learning_rate": 1.3645891973892772e-06,
      "loss": 0.7287,
      "step": 120
    },
    {
      "epoch": 2.3565640194489466,
      "grad_norm": 0.451238214969635,
      "learning_rate": 1.2868343006823113e-06,
      "loss": 0.7135,
      "step": 121
    },
    {
      "epoch": 2.3760129659643434,
      "grad_norm": 0.5156052112579346,
      "learning_rate": 1.211031871908916e-06,
      "loss": 0.8523,
      "step": 122
    },
    {
      "epoch": 2.3954619124797407,
      "grad_norm": 0.4339147210121155,
      "learning_rate": 1.137221769729725e-06,
      "loss": 0.7157,
      "step": 123
    },
    {
      "epoch": 2.4149108589951376,
      "grad_norm": 0.4493626356124878,
      "learning_rate": 1.065442805194214e-06,
      "loss": 0.8451,
      "step": 124
    },
    {
      "epoch": 2.434359805510535,
      "grad_norm": 0.4212321937084198,
      "learning_rate": 9.957327213329687e-07,
      "loss": 0.7459,
      "step": 125
    },
    {
      "epoch": 2.4538087520259317,
      "grad_norm": 0.49532240629196167,
      "learning_rate": 9.281281733115288e-07,
      "loss": 0.8074,
      "step": 126
    },
    {
      "epoch": 2.473257698541329,
      "grad_norm": 0.4537409543991089,
      "learning_rate": 8.626647091562612e-07,
      "loss": 0.7581,
      "step": 127
    },
    {
      "epoch": 2.492706645056726,
      "grad_norm": 0.4133566617965698,
      "learning_rate": 7.993767510623834e-07,
      "loss": 0.761,
      "step": 128
    },
    {
      "epoch": 2.512155591572123,
      "grad_norm": 0.47344374656677246,
      "learning_rate": 7.382975772939866e-07,
      "loss": 0.783,
      "step": 129
    },
    {
      "epoch": 2.53160453808752,
      "grad_norm": 0.4699486494064331,
      "learning_rate": 6.794593046855613e-07,
      "loss": 0.7739,
      "step": 130
    },
    {
      "epoch": 2.5510534846029174,
      "grad_norm": 0.4478330910205841,
      "learning_rate": 6.228928717542205e-07,
      "loss": 0.7713,
      "step": 131
    },
    {
      "epoch": 2.5705024311183147,
      "grad_norm": 0.43865975737571716,
      "learning_rate": 5.686280224315189e-07,
      "loss": 0.7643,
      "step": 132
    },
    {
      "epoch": 2.5899513776337115,
      "grad_norm": 0.45029687881469727,
      "learning_rate": 5.166932904234101e-07,
      "loss": 0.8228,
      "step": 133
    },
    {
      "epoch": 2.6094003241491084,
      "grad_norm": 0.43190592527389526,
      "learning_rate": 4.671159842065698e-07,
      "loss": 0.8042,
      "step": 134
    },
    {
      "epoch": 2.6288492706645057,
      "grad_norm": 0.4283352494239807,
      "learning_rate": 4.199221726689634e-07,
      "loss": 0.6469,
      "step": 135
    },
    {
      "epoch": 2.648298217179903,
      "grad_norm": 0.444757342338562,
      "learning_rate": 3.751366714022342e-07,
      "loss": 0.84,
      "step": 136
    },
    {
      "epoch": 2.6677471636953,
      "grad_norm": 0.4214431941509247,
      "learning_rate": 3.3278302965308593e-07,
      "loss": 0.8341,
      "step": 137
    },
    {
      "epoch": 2.6871961102106967,
      "grad_norm": 0.44082775712013245,
      "learning_rate": 2.928835179405548e-07,
      "loss": 0.7816,
      "step": 138
    },
    {
      "epoch": 2.706645056726094,
      "grad_norm": 0.4211124777793884,
      "learning_rate": 2.5545911634565266e-07,
      "loss": 0.7624,
      "step": 139
    },
    {
      "epoch": 2.7260940032414913,
      "grad_norm": 0.4169267416000366,
      "learning_rate": 2.205295034795596e-07,
      "loss": 0.7452,
      "step": 140
    },
    {
      "epoch": 2.745542949756888,
      "grad_norm": 0.4236930012702942,
      "learning_rate": 1.881130461361591e-07,
      "loss": 0.7443,
      "step": 141
    },
    {
      "epoch": 2.764991896272285,
      "grad_norm": 0.4934384822845459,
      "learning_rate": 1.5822678963435479e-07,
      "loss": 0.8023,
      "step": 142
    },
    {
      "epoch": 2.7844408427876823,
      "grad_norm": 0.43219056725502014,
      "learning_rate": 1.3088644885524637e-07,
      "loss": 0.7411,
      "step": 143
    },
    {
      "epoch": 2.8038897893030796,
      "grad_norm": 0.4079039990901947,
      "learning_rate": 1.0610639997888917e-07,
      "loss": 0.742,
      "step": 144
    },
    {
      "epoch": 2.8233387358184765,
      "grad_norm": 0.39644140005111694,
      "learning_rate": 8.38996729249636e-08,
      "loss": 0.7157,
      "step": 145
    },
    {
      "epoch": 2.8427876823338734,
      "grad_norm": 0.4225790500640869,
      "learning_rate": 6.427794450134529e-08,
      "loss": 0.8331,
      "step": 146
    },
    {
      "epoch": 2.8622366288492707,
      "grad_norm": 0.4538624882698059,
      "learning_rate": 4.72515322641709e-08,
      "loss": 0.9018,
      "step": 147
    },
    {
      "epoch": 2.881685575364668,
      "grad_norm": 0.4045908451080322,
      "learning_rate": 3.282938909263122e-08,
      "loss": 0.6865,
      "step": 148
    },
    {
      "epoch": 2.901134521880065,
      "grad_norm": 0.4469752609729767,
      "learning_rate": 2.101909848133743e-08,
      "loss": 0.7197,
      "step": 149
    },
    {
      "epoch": 2.9205834683954617,
      "grad_norm": 0.520662248134613,
      "learning_rate": 1.1826870552749669e-08,
      "loss": 0.8062,
      "step": 150
    },
    {
      "epoch": 2.940032414910859,
      "grad_norm": 0.42809349298477173,
      "learning_rate": 5.257538791749173e-09,
      "loss": 0.799,
      "step": 151
    },
    {
      "epoch": 2.9594813614262563,
      "grad_norm": 0.4747503101825714,
      "learning_rate": 1.3145575040801605e-09,
      "loss": 0.8033,
      "step": 152
    },
    {
      "epoch": 2.978930307941653,
      "grad_norm": 0.41074883937835693,
      "learning_rate": 0.0,
      "loss": 0.6416,
      "step": 153
    },
    {
      "epoch": 2.978930307941653,
      "step": 153,
      "total_flos": 132192675135488.0,
      "train_loss": 0.8684253560172187,
      "train_runtime": 7585.4265,
      "train_samples_per_second": 1.952,
      "train_steps_per_second": 0.02
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 153,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 132192675135488.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}