Plofski commited on
Commit
87946dd
·
verified ·
1 Parent(s): 72c0f75

Training in progress, step 9500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ab362d2b3f9dedf1f0f43335f7b06eefee0b16e014fc83df80bc46c1b6044cf
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72b069175149869f318a48bd011ed6c0026b2c123ef90c0d91ce6c0713bbf92d
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b116e5cf316372406a0b75f20675173ce00a1448ad26470e8baba7a28543337c
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff22e875e6a914c0bc7bfb1c7e787c769c8414739be0f07bf5f2faaae0c3727f
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:553711fa7348e1460e8e11ff55c1e2ba08096c9266ea56894e269e1a647bd7f3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a89b82d40a4e75a0ac37545280e3be68c54204263336c42598e8db051948b3
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.8134193028410235,
6
  "eval_steps": 500,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8108,6 +8108,456 @@
8108
  "mean_token_accuracy": 0.7758583545684814,
8109
  "num_tokens": 9969639.0,
8110
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8111
  }
8112
  ],
8113
  "logging_steps": 10,
@@ -8127,7 +8577,7 @@
8127
  "attributes": {}
8128
  }
8129
  },
8130
- "total_flos": 1.2065001216479232e+16,
8131
  "train_batch_size": 8,
8132
  "trial_name": null,
8133
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.9141648196655248,
6
  "eval_steps": 500,
7
+ "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8108
  "mean_token_accuracy": 0.7758583545684814,
8109
  "num_tokens": 9969639.0,
8110
  "step": 9000
8111
+ },
8112
+ {
8113
+ "epoch": 1.8154342131775136,
8114
+ "grad_norm": 10.5625,
8115
+ "learning_rate": 7.898448519040904e-06,
8116
+ "loss": 0.8608,
8117
+ "mean_token_accuracy": 0.7903220117092132,
8118
+ "num_tokens": 9980897.0,
8119
+ "step": 9010
8120
+ },
8121
+ {
8122
+ "epoch": 1.8174491235140038,
8123
+ "grad_norm": 13.75,
8124
+ "learning_rate": 7.885015783464303e-06,
8125
+ "loss": 0.7981,
8126
+ "mean_token_accuracy": 0.7931015849113464,
8127
+ "num_tokens": 9992007.0,
8128
+ "step": 9020
8129
+ },
8130
+ {
8131
+ "epoch": 1.8194640338504935,
8132
+ "grad_norm": 13.375,
8133
+ "learning_rate": 7.871583047887703e-06,
8134
+ "loss": 0.7756,
8135
+ "mean_token_accuracy": 0.8115738570690155,
8136
+ "num_tokens": 10001735.0,
8137
+ "step": 9030
8138
+ },
8139
+ {
8140
+ "epoch": 1.8214789441869836,
8141
+ "grad_norm": 11.25,
8142
+ "learning_rate": 7.858150312311102e-06,
8143
+ "loss": 0.8252,
8144
+ "mean_token_accuracy": 0.7951979100704193,
8145
+ "num_tokens": 10012981.0,
8146
+ "step": 9040
8147
+ },
8148
+ {
8149
+ "epoch": 1.8234938545234738,
8150
+ "grad_norm": 13.25,
8151
+ "learning_rate": 7.844717576734503e-06,
8152
+ "loss": 0.9316,
8153
+ "mean_token_accuracy": 0.7766720294952393,
8154
+ "num_tokens": 10024728.0,
8155
+ "step": 9050
8156
+ },
8157
+ {
8158
+ "epoch": 1.8255087648599637,
8159
+ "grad_norm": 12.1875,
8160
+ "learning_rate": 7.831284841157902e-06,
8161
+ "loss": 0.809,
8162
+ "mean_token_accuracy": 0.7951194763183593,
8163
+ "num_tokens": 10035125.0,
8164
+ "step": 9060
8165
+ },
8166
+ {
8167
+ "epoch": 1.8275236751964536,
8168
+ "grad_norm": 9.125,
8169
+ "learning_rate": 7.817852105581302e-06,
8170
+ "loss": 0.7939,
8171
+ "mean_token_accuracy": 0.8017635881900788,
8172
+ "num_tokens": 10046031.0,
8173
+ "step": 9070
8174
+ },
8175
+ {
8176
+ "epoch": 1.8295385855329438,
8177
+ "grad_norm": 9.375,
8178
+ "learning_rate": 7.804419370004703e-06,
8179
+ "loss": 0.8358,
8180
+ "mean_token_accuracy": 0.7958399653434753,
8181
+ "num_tokens": 10057704.0,
8182
+ "step": 9080
8183
+ },
8184
+ {
8185
+ "epoch": 1.831553495869434,
8186
+ "grad_norm": 11.625,
8187
+ "learning_rate": 7.790986634428102e-06,
8188
+ "loss": 0.739,
8189
+ "mean_token_accuracy": 0.8191445827484131,
8190
+ "num_tokens": 10068849.0,
8191
+ "step": 9090
8192
+ },
8193
+ {
8194
+ "epoch": 1.8335684062059239,
8195
+ "grad_norm": 14.0,
8196
+ "learning_rate": 7.777553898851502e-06,
8197
+ "loss": 0.8177,
8198
+ "mean_token_accuracy": 0.7937594950199127,
8199
+ "num_tokens": 10080412.0,
8200
+ "step": 9100
8201
+ },
8202
+ {
8203
+ "epoch": 1.8355833165424138,
8204
+ "grad_norm": 13.75,
8205
+ "learning_rate": 7.764121163274901e-06,
8206
+ "loss": 0.8874,
8207
+ "mean_token_accuracy": 0.7819468438625335,
8208
+ "num_tokens": 10091110.0,
8209
+ "step": 9110
8210
+ },
8211
+ {
8212
+ "epoch": 1.837598226878904,
8213
+ "grad_norm": 11.9375,
8214
+ "learning_rate": 7.750688427698301e-06,
8215
+ "loss": 0.7289,
8216
+ "mean_token_accuracy": 0.8155353426933288,
8217
+ "num_tokens": 10101270.0,
8218
+ "step": 9120
8219
+ },
8220
+ {
8221
+ "epoch": 1.8396131372153939,
8222
+ "grad_norm": 10.5,
8223
+ "learning_rate": 7.737255692121702e-06,
8224
+ "loss": 0.8756,
8225
+ "mean_token_accuracy": 0.7849370181560517,
8226
+ "num_tokens": 10113436.0,
8227
+ "step": 9130
8228
+ },
8229
+ {
8230
+ "epoch": 1.8416280475518838,
8231
+ "grad_norm": 11.25,
8232
+ "learning_rate": 7.7238229565451e-06,
8233
+ "loss": 0.9212,
8234
+ "mean_token_accuracy": 0.7761917889118195,
8235
+ "num_tokens": 10123689.0,
8236
+ "step": 9140
8237
+ },
8238
+ {
8239
+ "epoch": 1.843642957888374,
8240
+ "grad_norm": 10.6875,
8241
+ "learning_rate": 7.710390220968501e-06,
8242
+ "loss": 0.8504,
8243
+ "mean_token_accuracy": 0.7979696393013,
8244
+ "num_tokens": 10135000.0,
8245
+ "step": 9150
8246
+ },
8247
+ {
8248
+ "epoch": 1.845657868224864,
8249
+ "grad_norm": 10.0625,
8250
+ "learning_rate": 7.6969574853919e-06,
8251
+ "loss": 0.7885,
8252
+ "mean_token_accuracy": 0.8057900547981263,
8253
+ "num_tokens": 10146460.0,
8254
+ "step": 9160
8255
+ },
8256
+ {
8257
+ "epoch": 1.847672778561354,
8258
+ "grad_norm": 12.1875,
8259
+ "learning_rate": 7.6835247498153e-06,
8260
+ "loss": 0.7174,
8261
+ "mean_token_accuracy": 0.8195405840873718,
8262
+ "num_tokens": 10156971.0,
8263
+ "step": 9170
8264
+ },
8265
+ {
8266
+ "epoch": 1.849687688897844,
8267
+ "grad_norm": 9.625,
8268
+ "learning_rate": 7.6700920142387e-06,
8269
+ "loss": 0.8307,
8270
+ "mean_token_accuracy": 0.7955503463745117,
8271
+ "num_tokens": 10168870.0,
8272
+ "step": 9180
8273
+ },
8274
+ {
8275
+ "epoch": 1.8517025992343341,
8276
+ "grad_norm": 10.1875,
8277
+ "learning_rate": 7.6566592786621e-06,
8278
+ "loss": 0.753,
8279
+ "mean_token_accuracy": 0.8073143362998962,
8280
+ "num_tokens": 10180116.0,
8281
+ "step": 9190
8282
+ },
8283
+ {
8284
+ "epoch": 1.8537175095708243,
8285
+ "grad_norm": 12.4375,
8286
+ "learning_rate": 7.6432265430855e-06,
8287
+ "loss": 0.7821,
8288
+ "mean_token_accuracy": 0.8077682852745056,
8289
+ "num_tokens": 10191372.0,
8290
+ "step": 9200
8291
+ },
8292
+ {
8293
+ "epoch": 1.855732419907314,
8294
+ "grad_norm": 10.8125,
8295
+ "learning_rate": 7.6297938075089e-06,
8296
+ "loss": 0.8415,
8297
+ "mean_token_accuracy": 0.7897944033145905,
8298
+ "num_tokens": 10202628.0,
8299
+ "step": 9210
8300
+ },
8301
+ {
8302
+ "epoch": 1.8577473302438041,
8303
+ "grad_norm": 11.1875,
8304
+ "learning_rate": 7.6163610719323e-06,
8305
+ "loss": 0.7986,
8306
+ "mean_token_accuracy": 0.7961230039596557,
8307
+ "num_tokens": 10212786.0,
8308
+ "step": 9220
8309
+ },
8310
+ {
8311
+ "epoch": 1.8597622405802943,
8312
+ "grad_norm": 8.625,
8313
+ "learning_rate": 7.6029283363557e-06,
8314
+ "loss": 0.8377,
8315
+ "mean_token_accuracy": 0.7946724176406861,
8316
+ "num_tokens": 10223326.0,
8317
+ "step": 9230
8318
+ },
8319
+ {
8320
+ "epoch": 1.8617771509167842,
8321
+ "grad_norm": 13.8125,
8322
+ "learning_rate": 7.589495600779098e-06,
8323
+ "loss": 0.7952,
8324
+ "mean_token_accuracy": 0.7984604299068451,
8325
+ "num_tokens": 10234476.0,
8326
+ "step": 9240
8327
+ },
8328
+ {
8329
+ "epoch": 1.8637920612532741,
8330
+ "grad_norm": 11.5625,
8331
+ "learning_rate": 7.576062865202499e-06,
8332
+ "loss": 0.814,
8333
+ "mean_token_accuracy": 0.792439204454422,
8334
+ "num_tokens": 10245659.0,
8335
+ "step": 9250
8336
+ },
8337
+ {
8338
+ "epoch": 1.8658069715897643,
8339
+ "grad_norm": 11.8125,
8340
+ "learning_rate": 7.562630129625899e-06,
8341
+ "loss": 0.8127,
8342
+ "mean_token_accuracy": 0.8034590363502503,
8343
+ "num_tokens": 10256084.0,
8344
+ "step": 9260
8345
+ },
8346
+ {
8347
+ "epoch": 1.8678218819262544,
8348
+ "grad_norm": 11.6875,
8349
+ "learning_rate": 7.549197394049299e-06,
8350
+ "loss": 0.7521,
8351
+ "mean_token_accuracy": 0.8105040609836578,
8352
+ "num_tokens": 10266693.0,
8353
+ "step": 9270
8354
+ },
8355
+ {
8356
+ "epoch": 1.8698367922627444,
8357
+ "grad_norm": 11.625,
8358
+ "learning_rate": 7.535764658472699e-06,
8359
+ "loss": 0.7934,
8360
+ "mean_token_accuracy": 0.8005593240261077,
8361
+ "num_tokens": 10277746.0,
8362
+ "step": 9280
8363
+ },
8364
+ {
8365
+ "epoch": 1.8718517025992343,
8366
+ "grad_norm": 12.5,
8367
+ "learning_rate": 7.5223319228960985e-06,
8368
+ "loss": 0.8947,
8369
+ "mean_token_accuracy": 0.776383513212204,
8370
+ "num_tokens": 10290031.0,
8371
+ "step": 9290
8372
+ },
8373
+ {
8374
+ "epoch": 1.8738666129357244,
8375
+ "grad_norm": 12.125,
8376
+ "learning_rate": 7.508899187319498e-06,
8377
+ "loss": 0.8513,
8378
+ "mean_token_accuracy": 0.7838102102279663,
8379
+ "num_tokens": 10300283.0,
8380
+ "step": 9300
8381
+ },
8382
+ {
8383
+ "epoch": 1.8758815232722144,
8384
+ "grad_norm": 9.5,
8385
+ "learning_rate": 7.495466451742898e-06,
8386
+ "loss": 0.7765,
8387
+ "mean_token_accuracy": 0.8086275160312653,
8388
+ "num_tokens": 10312347.0,
8389
+ "step": 9310
8390
+ },
8391
+ {
8392
+ "epoch": 1.8778964336087043,
8393
+ "grad_norm": 11.1875,
8394
+ "learning_rate": 7.482033716166298e-06,
8395
+ "loss": 0.8356,
8396
+ "mean_token_accuracy": 0.7951161444187165,
8397
+ "num_tokens": 10322835.0,
8398
+ "step": 9320
8399
+ },
8400
+ {
8401
+ "epoch": 1.8799113439451944,
8402
+ "grad_norm": 14.125,
8403
+ "learning_rate": 7.468600980589697e-06,
8404
+ "loss": 0.8884,
8405
+ "mean_token_accuracy": 0.7830281972885131,
8406
+ "num_tokens": 10333529.0,
8407
+ "step": 9330
8408
+ },
8409
+ {
8410
+ "epoch": 1.8819262542816846,
8411
+ "grad_norm": 8.5625,
8412
+ "learning_rate": 7.455168245013098e-06,
8413
+ "loss": 0.7518,
8414
+ "mean_token_accuracy": 0.8141887187957764,
8415
+ "num_tokens": 10345013.0,
8416
+ "step": 9340
8417
+ },
8418
+ {
8419
+ "epoch": 1.8839411646181745,
8420
+ "grad_norm": 14.0,
8421
+ "learning_rate": 7.4417355094364975e-06,
8422
+ "loss": 0.8164,
8423
+ "mean_token_accuracy": 0.7980745792388916,
8424
+ "num_tokens": 10355765.0,
8425
+ "step": 9350
8426
+ },
8427
+ {
8428
+ "epoch": 1.8859560749546644,
8429
+ "grad_norm": 12.25,
8430
+ "learning_rate": 7.428302773859897e-06,
8431
+ "loss": 0.852,
8432
+ "mean_token_accuracy": 0.793835461139679,
8433
+ "num_tokens": 10367992.0,
8434
+ "step": 9360
8435
+ },
8436
+ {
8437
+ "epoch": 1.8879709852911546,
8438
+ "grad_norm": 12.75,
8439
+ "learning_rate": 7.414870038283297e-06,
8440
+ "loss": 0.7216,
8441
+ "mean_token_accuracy": 0.8170075476169586,
8442
+ "num_tokens": 10378311.0,
8443
+ "step": 9370
8444
+ },
8445
+ {
8446
+ "epoch": 1.8899858956276445,
8447
+ "grad_norm": 11.625,
8448
+ "learning_rate": 7.4014373027066965e-06,
8449
+ "loss": 0.8858,
8450
+ "mean_token_accuracy": 0.779736053943634,
8451
+ "num_tokens": 10389989.0,
8452
+ "step": 9380
8453
+ },
8454
+ {
8455
+ "epoch": 1.8920008059641344,
8456
+ "grad_norm": 12.75,
8457
+ "learning_rate": 7.388004567130097e-06,
8458
+ "loss": 0.8378,
8459
+ "mean_token_accuracy": 0.7875830888748169,
8460
+ "num_tokens": 10400859.0,
8461
+ "step": 9390
8462
+ },
8463
+ {
8464
+ "epoch": 1.8940157163006246,
8465
+ "grad_norm": 14.0,
8466
+ "learning_rate": 7.374571831553497e-06,
8467
+ "loss": 0.7966,
8468
+ "mean_token_accuracy": 0.8022767186164856,
8469
+ "num_tokens": 10411388.0,
8470
+ "step": 9400
8471
+ },
8472
+ {
8473
+ "epoch": 1.8960306266371147,
8474
+ "grad_norm": 13.1875,
8475
+ "learning_rate": 7.3611390959768956e-06,
8476
+ "loss": 0.8626,
8477
+ "mean_token_accuracy": 0.7839694082736969,
8478
+ "num_tokens": 10422407.0,
8479
+ "step": 9410
8480
+ },
8481
+ {
8482
+ "epoch": 1.8980455369736047,
8483
+ "grad_norm": 13.6875,
8484
+ "learning_rate": 7.347706360400296e-06,
8485
+ "loss": 0.8651,
8486
+ "mean_token_accuracy": 0.786309540271759,
8487
+ "num_tokens": 10432219.0,
8488
+ "step": 9420
8489
+ },
8490
+ {
8491
+ "epoch": 1.9000604473100946,
8492
+ "grad_norm": 12.25,
8493
+ "learning_rate": 7.334273624823696e-06,
8494
+ "loss": 0.7416,
8495
+ "mean_token_accuracy": 0.809950202703476,
8496
+ "num_tokens": 10442843.0,
8497
+ "step": 9430
8498
+ },
8499
+ {
8500
+ "epoch": 1.9020753576465848,
8501
+ "grad_norm": 9.75,
8502
+ "learning_rate": 7.320840889247096e-06,
8503
+ "loss": 0.8379,
8504
+ "mean_token_accuracy": 0.7921497166156769,
8505
+ "num_tokens": 10454398.0,
8506
+ "step": 9440
8507
+ },
8508
+ {
8509
+ "epoch": 1.904090267983075,
8510
+ "grad_norm": 11.3125,
8511
+ "learning_rate": 7.307408153670495e-06,
8512
+ "loss": 0.7609,
8513
+ "mean_token_accuracy": 0.812008547782898,
8514
+ "num_tokens": 10465437.0,
8515
+ "step": 9450
8516
+ },
8517
+ {
8518
+ "epoch": 1.9061051783195648,
8519
+ "grad_norm": 13.1875,
8520
+ "learning_rate": 7.293975418093895e-06,
8521
+ "loss": 0.7676,
8522
+ "mean_token_accuracy": 0.8095838546752929,
8523
+ "num_tokens": 10475130.0,
8524
+ "step": 9460
8525
+ },
8526
+ {
8527
+ "epoch": 1.9081200886560548,
8528
+ "grad_norm": 10.0625,
8529
+ "learning_rate": 7.280542682517295e-06,
8530
+ "loss": 0.759,
8531
+ "mean_token_accuracy": 0.8101568818092346,
8532
+ "num_tokens": 10486391.0,
8533
+ "step": 9470
8534
+ },
8535
+ {
8536
+ "epoch": 1.910134998992545,
8537
+ "grad_norm": 12.25,
8538
+ "learning_rate": 7.267109946940695e-06,
8539
+ "loss": 0.7881,
8540
+ "mean_token_accuracy": 0.801960825920105,
8541
+ "num_tokens": 10498930.0,
8542
+ "step": 9480
8543
+ },
8544
+ {
8545
+ "epoch": 1.9121499093290348,
8546
+ "grad_norm": 10.625,
8547
+ "learning_rate": 7.2536772113640956e-06,
8548
+ "loss": 0.7299,
8549
+ "mean_token_accuracy": 0.8158387124538422,
8550
+ "num_tokens": 10510083.0,
8551
+ "step": 9490
8552
+ },
8553
+ {
8554
+ "epoch": 1.9141648196655248,
8555
+ "grad_norm": 11.875,
8556
+ "learning_rate": 7.240244475787494e-06,
8557
+ "loss": 0.803,
8558
+ "mean_token_accuracy": 0.8004867613315583,
8559
+ "num_tokens": 10520466.0,
8560
+ "step": 9500
8561
  }
8562
  ],
8563
  "logging_steps": 10,
 
8577
  "attributes": {}
8578
  }
8579
  },
8580
+ "total_flos": 1.2727359994976256e+16,
8581
  "train_batch_size": 8,
8582
  "trial_name": null,
8583
  "trial_params": null