Training in progress, step 9500, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:72b069175149869f318a48bd011ed6c0026b2c123ef90c0d91ce6c0713bbf92d
 size 536223056
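The weights file itself lives in Git LFS, so the change above touches only the three-line pointer file: the spec version, the SHA-256 object ID of the blob, and its byte size. As a minimal sketch of what that pointer promises, a downloaded blob can be checked against the new OID and size like this (Python; the local path is an assumption about where the file was fetched to):

import hashlib

def verify_lfs_pointer(pointer_text: str, blob_path: str) -> bool:
    """Check a downloaded blob against a Git LFS pointer (oid + size)."""
    fields = dict(line.split(" ", 1)
                  for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_oid and size == expected_size

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:72b069175149869f318a48bd011ed6c0026b2c123ef90c0d91ce6c0713bbf92d
size 536223056
"""
print(verify_lfs_pointer(pointer, "last-checkpoint/model.safetensors"))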
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ff22e875e6a914c0bc7bfb1c7e787c769c8414739be0f07bf5f2faaae0c3727f
 size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:96a89b82d40a4e75a0ac37545280e3be68c54204263336c42598e8db051948b3
 size 1465
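Note that optimizer.pt is roughly twice the size of model.safetensors (1,072,594,443 vs 536,223,056 bytes), which is consistent with an Adam-style optimizer keeping two moment buffers per trained parameter, while scheduler.pt is a 1,465-byte learning-rate-scheduler state dict. A minimal sketch for inspecting them locally (assumes the checkpoint directory has been downloaded; torch.load is the standard PyTorch API):

import torch

# Both .pt files are ordinary torch.save() artifacts holding state dicts.
# weights_only=True (recent PyTorch) is a safer unpickling mode for files
# you did not produce yourself.
optim = torch.load("last-checkpoint/optimizer.pt",
                   map_location="cpu", weights_only=True)
sched = torch.load("last-checkpoint/scheduler.pt",
                   map_location="cpu", weights_only=True)

print(list(optim.keys()))                  # typically ['state', 'param_groups']
print(optim["param_groups"][0].get("lr"))  # current learning rate, if present
print(sched)                               # small dict, e.g. last_epoch / _last_lr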
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.
+  "epoch": 1.9141648196655248,
   "eval_steps": 500,
-  "global_step":
+  "global_step": 9500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -8108,6 +8108,456 @@
       "mean_token_accuracy": 0.7758583545684814,
       "num_tokens": 9969639.0,
       "step": 9000
+    },
+    {
+      "epoch": 1.8154342131775136,
+      "grad_norm": 10.5625,
+      "learning_rate": 7.898448519040904e-06,
+      "loss": 0.8608,
+      "mean_token_accuracy": 0.7903220117092132,
+      "num_tokens": 9980897.0,
+      "step": 9010
+    },
+    {
+      "epoch": 1.8174491235140038,
+      "grad_norm": 13.75,
+      "learning_rate": 7.885015783464303e-06,
+      "loss": 0.7981,
+      "mean_token_accuracy": 0.7931015849113464,
+      "num_tokens": 9992007.0,
+      "step": 9020
+    },
+    {
+      "epoch": 1.8194640338504935,
+      "grad_norm": 13.375,
+      "learning_rate": 7.871583047887703e-06,
+      "loss": 0.7756,
+      "mean_token_accuracy": 0.8115738570690155,
+      "num_tokens": 10001735.0,
+      "step": 9030
+    },
+    {
+      "epoch": 1.8214789441869836,
+      "grad_norm": 11.25,
+      "learning_rate": 7.858150312311102e-06,
+      "loss": 0.8252,
+      "mean_token_accuracy": 0.7951979100704193,
+      "num_tokens": 10012981.0,
+      "step": 9040
+    },
+    {
+      "epoch": 1.8234938545234738,
+      "grad_norm": 13.25,
+      "learning_rate": 7.844717576734503e-06,
+      "loss": 0.9316,
+      "mean_token_accuracy": 0.7766720294952393,
+      "num_tokens": 10024728.0,
+      "step": 9050
+    },
+    {
+      "epoch": 1.8255087648599637,
+      "grad_norm": 12.1875,
+      "learning_rate": 7.831284841157902e-06,
+      "loss": 0.809,
+      "mean_token_accuracy": 0.7951194763183593,
+      "num_tokens": 10035125.0,
+      "step": 9060
+    },
+    {
+      "epoch": 1.8275236751964536,
+      "grad_norm": 9.125,
+      "learning_rate": 7.817852105581302e-06,
+      "loss": 0.7939,
+      "mean_token_accuracy": 0.8017635881900788,
+      "num_tokens": 10046031.0,
+      "step": 9070
+    },
+    {
+      "epoch": 1.8295385855329438,
+      "grad_norm": 9.375,
+      "learning_rate": 7.804419370004703e-06,
+      "loss": 0.8358,
+      "mean_token_accuracy": 0.7958399653434753,
+      "num_tokens": 10057704.0,
+      "step": 9080
+    },
+    {
+      "epoch": 1.831553495869434,
+      "grad_norm": 11.625,
+      "learning_rate": 7.790986634428102e-06,
+      "loss": 0.739,
+      "mean_token_accuracy": 0.8191445827484131,
+      "num_tokens": 10068849.0,
+      "step": 9090
+    },
+    {
+      "epoch": 1.8335684062059239,
+      "grad_norm": 14.0,
+      "learning_rate": 7.777553898851502e-06,
+      "loss": 0.8177,
+      "mean_token_accuracy": 0.7937594950199127,
+      "num_tokens": 10080412.0,
+      "step": 9100
+    },
+    {
+      "epoch": 1.8355833165424138,
+      "grad_norm": 13.75,
+      "learning_rate": 7.764121163274901e-06,
+      "loss": 0.8874,
+      "mean_token_accuracy": 0.7819468438625335,
+      "num_tokens": 10091110.0,
+      "step": 9110
+    },
+    {
+      "epoch": 1.837598226878904,
+      "grad_norm": 11.9375,
+      "learning_rate": 7.750688427698301e-06,
+      "loss": 0.7289,
+      "mean_token_accuracy": 0.8155353426933288,
+      "num_tokens": 10101270.0,
+      "step": 9120
+    },
+    {
+      "epoch": 1.8396131372153939,
+      "grad_norm": 10.5,
+      "learning_rate": 7.737255692121702e-06,
+      "loss": 0.8756,
+      "mean_token_accuracy": 0.7849370181560517,
+      "num_tokens": 10113436.0,
+      "step": 9130
+    },
+    {
+      "epoch": 1.8416280475518838,
+      "grad_norm": 11.25,
+      "learning_rate": 7.7238229565451e-06,
+      "loss": 0.9212,
+      "mean_token_accuracy": 0.7761917889118195,
+      "num_tokens": 10123689.0,
+      "step": 9140
+    },
+    {
+      "epoch": 1.843642957888374,
+      "grad_norm": 10.6875,
+      "learning_rate": 7.710390220968501e-06,
+      "loss": 0.8504,
+      "mean_token_accuracy": 0.7979696393013,
+      "num_tokens": 10135000.0,
+      "step": 9150
+    },
+    {
+      "epoch": 1.845657868224864,
+      "grad_norm": 10.0625,
+      "learning_rate": 7.6969574853919e-06,
+      "loss": 0.7885,
+      "mean_token_accuracy": 0.8057900547981263,
+      "num_tokens": 10146460.0,
+      "step": 9160
+    },
+    {
+      "epoch": 1.847672778561354,
+      "grad_norm": 12.1875,
+      "learning_rate": 7.6835247498153e-06,
+      "loss": 0.7174,
+      "mean_token_accuracy": 0.8195405840873718,
+      "num_tokens": 10156971.0,
+      "step": 9170
+    },
+    {
+      "epoch": 1.849687688897844,
+      "grad_norm": 9.625,
+      "learning_rate": 7.6700920142387e-06,
+      "loss": 0.8307,
+      "mean_token_accuracy": 0.7955503463745117,
+      "num_tokens": 10168870.0,
+      "step": 9180
+    },
+    {
+      "epoch": 1.8517025992343341,
+      "grad_norm": 10.1875,
+      "learning_rate": 7.6566592786621e-06,
+      "loss": 0.753,
+      "mean_token_accuracy": 0.8073143362998962,
+      "num_tokens": 10180116.0,
+      "step": 9190
+    },
+    {
+      "epoch": 1.8537175095708243,
+      "grad_norm": 12.4375,
+      "learning_rate": 7.6432265430855e-06,
+      "loss": 0.7821,
+      "mean_token_accuracy": 0.8077682852745056,
+      "num_tokens": 10191372.0,
+      "step": 9200
+    },
+    {
+      "epoch": 1.855732419907314,
+      "grad_norm": 10.8125,
+      "learning_rate": 7.6297938075089e-06,
+      "loss": 0.8415,
+      "mean_token_accuracy": 0.7897944033145905,
+      "num_tokens": 10202628.0,
+      "step": 9210
+    },
+    {
+      "epoch": 1.8577473302438041,
+      "grad_norm": 11.1875,
+      "learning_rate": 7.6163610719323e-06,
+      "loss": 0.7986,
+      "mean_token_accuracy": 0.7961230039596557,
+      "num_tokens": 10212786.0,
+      "step": 9220
+    },
+    {
+      "epoch": 1.8597622405802943,
+      "grad_norm": 8.625,
+      "learning_rate": 7.6029283363557e-06,
+      "loss": 0.8377,
+      "mean_token_accuracy": 0.7946724176406861,
+      "num_tokens": 10223326.0,
+      "step": 9230
+    },
+    {
+      "epoch": 1.8617771509167842,
+      "grad_norm": 13.8125,
+      "learning_rate": 7.589495600779098e-06,
+      "loss": 0.7952,
+      "mean_token_accuracy": 0.7984604299068451,
+      "num_tokens": 10234476.0,
+      "step": 9240
+    },
+    {
+      "epoch": 1.8637920612532741,
+      "grad_norm": 11.5625,
+      "learning_rate": 7.576062865202499e-06,
+      "loss": 0.814,
+      "mean_token_accuracy": 0.792439204454422,
+      "num_tokens": 10245659.0,
+      "step": 9250
+    },
+    {
+      "epoch": 1.8658069715897643,
+      "grad_norm": 11.8125,
+      "learning_rate": 7.562630129625899e-06,
+      "loss": 0.8127,
+      "mean_token_accuracy": 0.8034590363502503,
+      "num_tokens": 10256084.0,
+      "step": 9260
+    },
+    {
+      "epoch": 1.8678218819262544,
+      "grad_norm": 11.6875,
+      "learning_rate": 7.549197394049299e-06,
+      "loss": 0.7521,
+      "mean_token_accuracy": 0.8105040609836578,
+      "num_tokens": 10266693.0,
+      "step": 9270
+    },
+    {
+      "epoch": 1.8698367922627444,
+      "grad_norm": 11.625,
+      "learning_rate": 7.535764658472699e-06,
+      "loss": 0.7934,
+      "mean_token_accuracy": 0.8005593240261077,
+      "num_tokens": 10277746.0,
+      "step": 9280
+    },
+    {
+      "epoch": 1.8718517025992343,
+      "grad_norm": 12.5,
+      "learning_rate": 7.5223319228960985e-06,
+      "loss": 0.8947,
+      "mean_token_accuracy": 0.776383513212204,
+      "num_tokens": 10290031.0,
+      "step": 9290
+    },
+    {
+      "epoch": 1.8738666129357244,
+      "grad_norm": 12.125,
+      "learning_rate": 7.508899187319498e-06,
+      "loss": 0.8513,
+      "mean_token_accuracy": 0.7838102102279663,
+      "num_tokens": 10300283.0,
+      "step": 9300
+    },
+    {
+      "epoch": 1.8758815232722144,
+      "grad_norm": 9.5,
+      "learning_rate": 7.495466451742898e-06,
+      "loss": 0.7765,
+      "mean_token_accuracy": 0.8086275160312653,
+      "num_tokens": 10312347.0,
+      "step": 9310
+    },
+    {
+      "epoch": 1.8778964336087043,
+      "grad_norm": 11.1875,
+      "learning_rate": 7.482033716166298e-06,
+      "loss": 0.8356,
+      "mean_token_accuracy": 0.7951161444187165,
+      "num_tokens": 10322835.0,
+      "step": 9320
+    },
+    {
+      "epoch": 1.8799113439451944,
+      "grad_norm": 14.125,
+      "learning_rate": 7.468600980589697e-06,
+      "loss": 0.8884,
+      "mean_token_accuracy": 0.7830281972885131,
+      "num_tokens": 10333529.0,
+      "step": 9330
+    },
+    {
+      "epoch": 1.8819262542816846,
+      "grad_norm": 8.5625,
+      "learning_rate": 7.455168245013098e-06,
+      "loss": 0.7518,
+      "mean_token_accuracy": 0.8141887187957764,
+      "num_tokens": 10345013.0,
+      "step": 9340
+    },
+    {
+      "epoch": 1.8839411646181745,
+      "grad_norm": 14.0,
+      "learning_rate": 7.4417355094364975e-06,
+      "loss": 0.8164,
+      "mean_token_accuracy": 0.7980745792388916,
+      "num_tokens": 10355765.0,
+      "step": 9350
+    },
+    {
+      "epoch": 1.8859560749546644,
+      "grad_norm": 12.25,
+      "learning_rate": 7.428302773859897e-06,
+      "loss": 0.852,
+      "mean_token_accuracy": 0.793835461139679,
+      "num_tokens": 10367992.0,
+      "step": 9360
+    },
+    {
+      "epoch": 1.8879709852911546,
+      "grad_norm": 12.75,
+      "learning_rate": 7.414870038283297e-06,
+      "loss": 0.7216,
+      "mean_token_accuracy": 0.8170075476169586,
+      "num_tokens": 10378311.0,
+      "step": 9370
+    },
+    {
+      "epoch": 1.8899858956276445,
+      "grad_norm": 11.625,
+      "learning_rate": 7.4014373027066965e-06,
+      "loss": 0.8858,
+      "mean_token_accuracy": 0.779736053943634,
+      "num_tokens": 10389989.0,
+      "step": 9380
+    },
+    {
+      "epoch": 1.8920008059641344,
+      "grad_norm": 12.75,
+      "learning_rate": 7.388004567130097e-06,
+      "loss": 0.8378,
+      "mean_token_accuracy": 0.7875830888748169,
+      "num_tokens": 10400859.0,
+      "step": 9390
+    },
+    {
+      "epoch": 1.8940157163006246,
+      "grad_norm": 14.0,
+      "learning_rate": 7.374571831553497e-06,
+      "loss": 0.7966,
+      "mean_token_accuracy": 0.8022767186164856,
+      "num_tokens": 10411388.0,
+      "step": 9400
+    },
+    {
+      "epoch": 1.8960306266371147,
+      "grad_norm": 13.1875,
+      "learning_rate": 7.3611390959768956e-06,
+      "loss": 0.8626,
+      "mean_token_accuracy": 0.7839694082736969,
+      "num_tokens": 10422407.0,
+      "step": 9410
+    },
+    {
+      "epoch": 1.8980455369736047,
+      "grad_norm": 13.6875,
+      "learning_rate": 7.347706360400296e-06,
+      "loss": 0.8651,
+      "mean_token_accuracy": 0.786309540271759,
+      "num_tokens": 10432219.0,
+      "step": 9420
+    },
+    {
+      "epoch": 1.9000604473100946,
+      "grad_norm": 12.25,
+      "learning_rate": 7.334273624823696e-06,
+      "loss": 0.7416,
+      "mean_token_accuracy": 0.809950202703476,
+      "num_tokens": 10442843.0,
+      "step": 9430
+    },
+    {
+      "epoch": 1.9020753576465848,
+      "grad_norm": 9.75,
+      "learning_rate": 7.320840889247096e-06,
+      "loss": 0.8379,
+      "mean_token_accuracy": 0.7921497166156769,
+      "num_tokens": 10454398.0,
+      "step": 9440
+    },
+    {
+      "epoch": 1.904090267983075,
+      "grad_norm": 11.3125,
+      "learning_rate": 7.307408153670495e-06,
+      "loss": 0.7609,
+      "mean_token_accuracy": 0.812008547782898,
+      "num_tokens": 10465437.0,
+      "step": 9450
+    },
+    {
+      "epoch": 1.9061051783195648,
+      "grad_norm": 13.1875,
+      "learning_rate": 7.293975418093895e-06,
+      "loss": 0.7676,
+      "mean_token_accuracy": 0.8095838546752929,
+      "num_tokens": 10475130.0,
+      "step": 9460
+    },
+    {
+      "epoch": 1.9081200886560548,
+      "grad_norm": 10.0625,
+      "learning_rate": 7.280542682517295e-06,
+      "loss": 0.759,
+      "mean_token_accuracy": 0.8101568818092346,
+      "num_tokens": 10486391.0,
+      "step": 9470
+    },
+    {
+      "epoch": 1.910134998992545,
+      "grad_norm": 12.25,
+      "learning_rate": 7.267109946940695e-06,
+      "loss": 0.7881,
+      "mean_token_accuracy": 0.801960825920105,
+      "num_tokens": 10498930.0,
+      "step": 9480
+    },
+    {
+      "epoch": 1.9121499093290348,
+      "grad_norm": 10.625,
+      "learning_rate": 7.2536772113640956e-06,
+      "loss": 0.7299,
+      "mean_token_accuracy": 0.8158387124538422,
+      "num_tokens": 10510083.0,
+      "step": 9490
+    },
+    {
+      "epoch": 1.9141648196655248,
+      "grad_norm": 11.875,
+      "learning_rate": 7.240244475787494e-06,
+      "loss": 0.803,
+      "mean_token_accuracy": 0.8004867613315583,
+      "num_tokens": 10520466.0,
+      "step": 9500
     }
   ],
   "logging_steps": 10,
@@ -8127,7 +8577,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.
+  "total_flos": 1.2727359994976256e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null
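Taken together, the updated state pins the run at global_step 9500 and epoch ≈ 1.9142, i.e. roughly 9500 / 1.9141648 ≈ 4963 optimizer steps per epoch, and the logged learning rate is still decaying linearly by about 1.343e-09 per step (1.343e-08 per 10-step logging interval). A minimal sketch for pulling such numbers out of trainer_state.json (key names as in the diff above; the local path is an assumption):

import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

steps_per_epoch = state["global_step"] / state["epoch"]
print(f"~{steps_per_epoch:.0f} optimizer steps per epoch")

# log_history holds one dict per logging interval (logging_steps = 10).
logs = [e for e in state["log_history"] if "loss" in e]
tail = logs[-50:]  # the entries added by this commit (steps 9010-9500)
avg = sum(e["loss"] for e in tail) / len(tail)
print(f"mean loss over steps {tail[0]['step']}-{tail[-1]['step']}: {avg:.4f}")
print(f"latest lr: {logs[-1]['learning_rate']:.3e}")

Training can be resumed from this directory with the standard Hugging Face Trainer API, e.g. trainer.train(resume_from_checkpoint="last-checkpoint"), which restores the model weights, optimizer, scheduler, and this state file together.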