Training in progress, step 10000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25829a343b7e06cb4e4167e9b46a367935f8229a77e72a1421998542e27d1c90
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5d82f068e68971eb9728724c53cc1a345fe8d815fa606c2f3450b9b39b939104
|
| 3 |
size 1072594443
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14645
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
|
| 3 |
size 14645
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2aa24cd194618e57510eb16be4a4510b1af7e8497163286c5cb19c98f052ca0
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8558,6 +8558,456 @@
|
|
| 8558 |
"mean_token_accuracy": 0.8004867613315583,
|
| 8559 |
"num_tokens": 10520466.0,
|
| 8560 |
"step": 9500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8561 |
}
|
| 8562 |
],
|
| 8563 |
"logging_steps": 10,
|
|
@@ -8577,7 +9027,7 @@
|
|
| 8577 |
"attributes": {}
|
| 8578 |
}
|
| 8579 |
},
|
| 8580 |
-
"total_flos": 1.
|
| 8581 |
"train_batch_size": 8,
|
| 8582 |
"trial_name": null,
|
| 8583 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.014910336490026,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 10000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8558 |
"mean_token_accuracy": 0.8004867613315583,
|
| 8559 |
"num_tokens": 10520466.0,
|
| 8560 |
"step": 9500
|
| 8561 |
+
},
|
| 8562 |
+
{
|
| 8563 |
+
"epoch": 1.916179730002015,
|
| 8564 |
+
"grad_norm": 13.8125,
|
| 8565 |
+
"learning_rate": 7.226811740210895e-06,
|
| 8566 |
+
"loss": 0.8085,
|
| 8567 |
+
"mean_token_accuracy": 0.8049318194389343,
|
| 8568 |
+
"num_tokens": 10532072.0,
|
| 8569 |
+
"step": 9510
|
| 8570 |
+
},
|
| 8571 |
+
{
|
| 8572 |
+
"epoch": 1.918194640338505,
|
| 8573 |
+
"grad_norm": 11.5,
|
| 8574 |
+
"learning_rate": 7.213379004634295e-06,
|
| 8575 |
+
"loss": 0.8021,
|
| 8576 |
+
"mean_token_accuracy": 0.802433705329895,
|
| 8577 |
+
"num_tokens": 10542914.0,
|
| 8578 |
+
"step": 9520
|
| 8579 |
+
},
|
| 8580 |
+
{
|
| 8581 |
+
"epoch": 1.920209550674995,
|
| 8582 |
+
"grad_norm": 10.6875,
|
| 8583 |
+
"learning_rate": 7.1999462690576934e-06,
|
| 8584 |
+
"loss": 0.8222,
|
| 8585 |
+
"mean_token_accuracy": 0.7974632799625396,
|
| 8586 |
+
"num_tokens": 10552708.0,
|
| 8587 |
+
"step": 9530
|
| 8588 |
+
},
|
| 8589 |
+
{
|
| 8590 |
+
"epoch": 1.922224461011485,
|
| 8591 |
+
"grad_norm": 11.9375,
|
| 8592 |
+
"learning_rate": 7.186513533481094e-06,
|
| 8593 |
+
"loss": 0.9031,
|
| 8594 |
+
"mean_token_accuracy": 0.7883083343505859,
|
| 8595 |
+
"num_tokens": 10563446.0,
|
| 8596 |
+
"step": 9540
|
| 8597 |
+
},
|
| 8598 |
+
{
|
| 8599 |
+
"epoch": 1.924239371347975,
|
| 8600 |
+
"grad_norm": 9.875,
|
| 8601 |
+
"learning_rate": 7.173080797904494e-06,
|
| 8602 |
+
"loss": 0.7346,
|
| 8603 |
+
"mean_token_accuracy": 0.8066632449626923,
|
| 8604 |
+
"num_tokens": 10575598.0,
|
| 8605 |
+
"step": 9550
|
| 8606 |
+
},
|
| 8607 |
+
{
|
| 8608 |
+
"epoch": 1.926254281684465,
|
| 8609 |
+
"grad_norm": 12.125,
|
| 8610 |
+
"learning_rate": 7.159648062327894e-06,
|
| 8611 |
+
"loss": 0.8905,
|
| 8612 |
+
"mean_token_accuracy": 0.7765018343925476,
|
| 8613 |
+
"num_tokens": 10586523.0,
|
| 8614 |
+
"step": 9560
|
| 8615 |
+
},
|
| 8616 |
+
{
|
| 8617 |
+
"epoch": 1.928269192020955,
|
| 8618 |
+
"grad_norm": 11.375,
|
| 8619 |
+
"learning_rate": 7.146215326751294e-06,
|
| 8620 |
+
"loss": 0.8584,
|
| 8621 |
+
"mean_token_accuracy": 0.7868084013462067,
|
| 8622 |
+
"num_tokens": 10597941.0,
|
| 8623 |
+
"step": 9570
|
| 8624 |
+
},
|
| 8625 |
+
{
|
| 8626 |
+
"epoch": 1.930284102357445,
|
| 8627 |
+
"grad_norm": 12.8125,
|
| 8628 |
+
"learning_rate": 7.132782591174693e-06,
|
| 8629 |
+
"loss": 0.8567,
|
| 8630 |
+
"mean_token_accuracy": 0.7957649648189544,
|
| 8631 |
+
"num_tokens": 10609135.0,
|
| 8632 |
+
"step": 9580
|
| 8633 |
+
},
|
| 8634 |
+
{
|
| 8635 |
+
"epoch": 1.9322990126939352,
|
| 8636 |
+
"grad_norm": 13.25,
|
| 8637 |
+
"learning_rate": 7.119349855598093e-06,
|
| 8638 |
+
"loss": 0.7257,
|
| 8639 |
+
"mean_token_accuracy": 0.8163708746433258,
|
| 8640 |
+
"num_tokens": 10620680.0,
|
| 8641 |
+
"step": 9590
|
| 8642 |
+
},
|
| 8643 |
+
{
|
| 8644 |
+
"epoch": 1.9343139230304252,
|
| 8645 |
+
"grad_norm": 13.5,
|
| 8646 |
+
"learning_rate": 7.105917120021493e-06,
|
| 8647 |
+
"loss": 0.832,
|
| 8648 |
+
"mean_token_accuracy": 0.7922478914260864,
|
| 8649 |
+
"num_tokens": 10630244.0,
|
| 8650 |
+
"step": 9600
|
| 8651 |
+
},
|
| 8652 |
+
{
|
| 8653 |
+
"epoch": 1.936328833366915,
|
| 8654 |
+
"grad_norm": 13.0,
|
| 8655 |
+
"learning_rate": 7.0924843844448934e-06,
|
| 8656 |
+
"loss": 0.8223,
|
| 8657 |
+
"mean_token_accuracy": 0.7944608926773071,
|
| 8658 |
+
"num_tokens": 10642124.0,
|
| 8659 |
+
"step": 9610
|
| 8660 |
+
},
|
| 8661 |
+
{
|
| 8662 |
+
"epoch": 1.9383437437034052,
|
| 8663 |
+
"grad_norm": 10.8125,
|
| 8664 |
+
"learning_rate": 7.079051648868292e-06,
|
| 8665 |
+
"loss": 0.8406,
|
| 8666 |
+
"mean_token_accuracy": 0.7907250881195068,
|
| 8667 |
+
"num_tokens": 10652833.0,
|
| 8668 |
+
"step": 9620
|
| 8669 |
+
},
|
| 8670 |
+
{
|
| 8671 |
+
"epoch": 1.9403586540398954,
|
| 8672 |
+
"grad_norm": 8.9375,
|
| 8673 |
+
"learning_rate": 7.065618913291692e-06,
|
| 8674 |
+
"loss": 0.7735,
|
| 8675 |
+
"mean_token_accuracy": 0.8065800249576569,
|
| 8676 |
+
"num_tokens": 10665301.0,
|
| 8677 |
+
"step": 9630
|
| 8678 |
+
},
|
| 8679 |
+
{
|
| 8680 |
+
"epoch": 1.942373564376385,
|
| 8681 |
+
"grad_norm": 11.0625,
|
| 8682 |
+
"learning_rate": 7.0521861777150925e-06,
|
| 8683 |
+
"loss": 0.8182,
|
| 8684 |
+
"mean_token_accuracy": 0.7966114640235901,
|
| 8685 |
+
"num_tokens": 10676994.0,
|
| 8686 |
+
"step": 9640
|
| 8687 |
+
},
|
| 8688 |
+
{
|
| 8689 |
+
"epoch": 1.9443884747128752,
|
| 8690 |
+
"grad_norm": 11.0,
|
| 8691 |
+
"learning_rate": 7.038753442138492e-06,
|
| 8692 |
+
"loss": 0.8937,
|
| 8693 |
+
"mean_token_accuracy": 0.7835995197296143,
|
| 8694 |
+
"num_tokens": 10688577.0,
|
| 8695 |
+
"step": 9650
|
| 8696 |
+
},
|
| 8697 |
+
{
|
| 8698 |
+
"epoch": 1.9464033850493654,
|
| 8699 |
+
"grad_norm": 11.0,
|
| 8700 |
+
"learning_rate": 7.025320706561892e-06,
|
| 8701 |
+
"loss": 0.8766,
|
| 8702 |
+
"mean_token_accuracy": 0.7863976120948791,
|
| 8703 |
+
"num_tokens": 10699498.0,
|
| 8704 |
+
"step": 9660
|
| 8705 |
+
},
|
| 8706 |
+
{
|
| 8707 |
+
"epoch": 1.9484182953858553,
|
| 8708 |
+
"grad_norm": 13.0625,
|
| 8709 |
+
"learning_rate": 7.0118879709852915e-06,
|
| 8710 |
+
"loss": 0.8543,
|
| 8711 |
+
"mean_token_accuracy": 0.7946681499481201,
|
| 8712 |
+
"num_tokens": 10711155.0,
|
| 8713 |
+
"step": 9670
|
| 8714 |
+
},
|
| 8715 |
+
{
|
| 8716 |
+
"epoch": 1.9504332057223452,
|
| 8717 |
+
"grad_norm": 15.5,
|
| 8718 |
+
"learning_rate": 6.998455235408692e-06,
|
| 8719 |
+
"loss": 0.905,
|
| 8720 |
+
"mean_token_accuracy": 0.7806779563426971,
|
| 8721 |
+
"num_tokens": 10722357.0,
|
| 8722 |
+
"step": 9680
|
| 8723 |
+
},
|
| 8724 |
+
{
|
| 8725 |
+
"epoch": 1.9524481160588354,
|
| 8726 |
+
"grad_norm": 12.25,
|
| 8727 |
+
"learning_rate": 6.985022499832092e-06,
|
| 8728 |
+
"loss": 0.8497,
|
| 8729 |
+
"mean_token_accuracy": 0.78778578042984,
|
| 8730 |
+
"num_tokens": 10734398.0,
|
| 8731 |
+
"step": 9690
|
| 8732 |
+
},
|
| 8733 |
+
{
|
| 8734 |
+
"epoch": 1.9544630263953255,
|
| 8735 |
+
"grad_norm": 11.9375,
|
| 8736 |
+
"learning_rate": 6.9715897642554906e-06,
|
| 8737 |
+
"loss": 0.8426,
|
| 8738 |
+
"mean_token_accuracy": 0.7919103622436523,
|
| 8739 |
+
"num_tokens": 10745256.0,
|
| 8740 |
+
"step": 9700
|
| 8741 |
+
},
|
| 8742 |
+
{
|
| 8743 |
+
"epoch": 1.9564779367318155,
|
| 8744 |
+
"grad_norm": 11.0,
|
| 8745 |
+
"learning_rate": 6.958157028678891e-06,
|
| 8746 |
+
"loss": 0.8311,
|
| 8747 |
+
"mean_token_accuracy": 0.7953451931476593,
|
| 8748 |
+
"num_tokens": 10755529.0,
|
| 8749 |
+
"step": 9710
|
| 8750 |
+
},
|
| 8751 |
+
{
|
| 8752 |
+
"epoch": 1.9584928470683054,
|
| 8753 |
+
"grad_norm": 11.6875,
|
| 8754 |
+
"learning_rate": 6.944724293102291e-06,
|
| 8755 |
+
"loss": 0.9166,
|
| 8756 |
+
"mean_token_accuracy": 0.7762543320655823,
|
| 8757 |
+
"num_tokens": 10767833.0,
|
| 8758 |
+
"step": 9720
|
| 8759 |
+
},
|
| 8760 |
+
{
|
| 8761 |
+
"epoch": 1.9605077574047955,
|
| 8762 |
+
"grad_norm": 11.0625,
|
| 8763 |
+
"learning_rate": 6.931291557525691e-06,
|
| 8764 |
+
"loss": 0.8342,
|
| 8765 |
+
"mean_token_accuracy": 0.7902640163898468,
|
| 8766 |
+
"num_tokens": 10778381.0,
|
| 8767 |
+
"step": 9730
|
| 8768 |
+
},
|
| 8769 |
+
{
|
| 8770 |
+
"epoch": 1.9625226677412855,
|
| 8771 |
+
"grad_norm": 11.1875,
|
| 8772 |
+
"learning_rate": 6.91785882194909e-06,
|
| 8773 |
+
"loss": 0.8793,
|
| 8774 |
+
"mean_token_accuracy": 0.7860461592674255,
|
| 8775 |
+
"num_tokens": 10789792.0,
|
| 8776 |
+
"step": 9740
|
| 8777 |
+
},
|
| 8778 |
+
{
|
| 8779 |
+
"epoch": 1.9645375780777754,
|
| 8780 |
+
"grad_norm": 9.625,
|
| 8781 |
+
"learning_rate": 6.90442608637249e-06,
|
| 8782 |
+
"loss": 0.8833,
|
| 8783 |
+
"mean_token_accuracy": 0.7854184091091156,
|
| 8784 |
+
"num_tokens": 10801905.0,
|
| 8785 |
+
"step": 9750
|
| 8786 |
+
},
|
| 8787 |
+
{
|
| 8788 |
+
"epoch": 1.9665524884142656,
|
| 8789 |
+
"grad_norm": 12.75,
|
| 8790 |
+
"learning_rate": 6.89099335079589e-06,
|
| 8791 |
+
"loss": 0.8529,
|
| 8792 |
+
"mean_token_accuracy": 0.7941727995872497,
|
| 8793 |
+
"num_tokens": 10812223.0,
|
| 8794 |
+
"step": 9760
|
| 8795 |
+
},
|
| 8796 |
+
{
|
| 8797 |
+
"epoch": 1.9685673987507557,
|
| 8798 |
+
"grad_norm": 11.125,
|
| 8799 |
+
"learning_rate": 6.87756061521929e-06,
|
| 8800 |
+
"loss": 0.7802,
|
| 8801 |
+
"mean_token_accuracy": 0.8009257316589355,
|
| 8802 |
+
"num_tokens": 10823248.0,
|
| 8803 |
+
"step": 9770
|
| 8804 |
+
},
|
| 8805 |
+
{
|
| 8806 |
+
"epoch": 1.9705823090872456,
|
| 8807 |
+
"grad_norm": 14.125,
|
| 8808 |
+
"learning_rate": 6.8641278796426906e-06,
|
| 8809 |
+
"loss": 0.8616,
|
| 8810 |
+
"mean_token_accuracy": 0.7909869194030762,
|
| 8811 |
+
"num_tokens": 10834683.0,
|
| 8812 |
+
"step": 9780
|
| 8813 |
+
},
|
| 8814 |
+
{
|
| 8815 |
+
"epoch": 1.9725972194237356,
|
| 8816 |
+
"grad_norm": 12.25,
|
| 8817 |
+
"learning_rate": 6.850695144066089e-06,
|
| 8818 |
+
"loss": 0.8485,
|
| 8819 |
+
"mean_token_accuracy": 0.792462158203125,
|
| 8820 |
+
"num_tokens": 10845744.0,
|
| 8821 |
+
"step": 9790
|
| 8822 |
+
},
|
| 8823 |
+
{
|
| 8824 |
+
"epoch": 1.9746121297602257,
|
| 8825 |
+
"grad_norm": 10.1875,
|
| 8826 |
+
"learning_rate": 6.837262408489489e-06,
|
| 8827 |
+
"loss": 0.7906,
|
| 8828 |
+
"mean_token_accuracy": 0.8116903901100159,
|
| 8829 |
+
"num_tokens": 10857457.0,
|
| 8830 |
+
"step": 9800
|
| 8831 |
+
},
|
| 8832 |
+
{
|
| 8833 |
+
"epoch": 1.9766270400967159,
|
| 8834 |
+
"grad_norm": 9.8125,
|
| 8835 |
+
"learning_rate": 6.82382967291289e-06,
|
| 8836 |
+
"loss": 0.7862,
|
| 8837 |
+
"mean_token_accuracy": 0.8046435177326202,
|
| 8838 |
+
"num_tokens": 10868106.0,
|
| 8839 |
+
"step": 9810
|
| 8840 |
+
},
|
| 8841 |
+
{
|
| 8842 |
+
"epoch": 1.9786419504332056,
|
| 8843 |
+
"grad_norm": 11.6875,
|
| 8844 |
+
"learning_rate": 6.8103969373362884e-06,
|
| 8845 |
+
"loss": 0.7895,
|
| 8846 |
+
"mean_token_accuracy": 0.8022366106510163,
|
| 8847 |
+
"num_tokens": 10879315.0,
|
| 8848 |
+
"step": 9820
|
| 8849 |
+
},
|
| 8850 |
+
{
|
| 8851 |
+
"epoch": 1.9806568607696957,
|
| 8852 |
+
"grad_norm": 11.75,
|
| 8853 |
+
"learning_rate": 6.796964201759689e-06,
|
| 8854 |
+
"loss": 0.9483,
|
| 8855 |
+
"mean_token_accuracy": 0.7742224156856536,
|
| 8856 |
+
"num_tokens": 10890523.0,
|
| 8857 |
+
"step": 9830
|
| 8858 |
+
},
|
| 8859 |
+
{
|
| 8860 |
+
"epoch": 1.9826717711061859,
|
| 8861 |
+
"grad_norm": 12.625,
|
| 8862 |
+
"learning_rate": 6.783531466183089e-06,
|
| 8863 |
+
"loss": 0.7439,
|
| 8864 |
+
"mean_token_accuracy": 0.8124743521213531,
|
| 8865 |
+
"num_tokens": 10901250.0,
|
| 8866 |
+
"step": 9840
|
| 8867 |
+
},
|
| 8868 |
+
{
|
| 8869 |
+
"epoch": 1.9846866814426758,
|
| 8870 |
+
"grad_norm": 13.4375,
|
| 8871 |
+
"learning_rate": 6.770098730606488e-06,
|
| 8872 |
+
"loss": 0.7185,
|
| 8873 |
+
"mean_token_accuracy": 0.8186926007270813,
|
| 8874 |
+
"num_tokens": 10912951.0,
|
| 8875 |
+
"step": 9850
|
| 8876 |
+
},
|
| 8877 |
+
{
|
| 8878 |
+
"epoch": 1.9867015917791657,
|
| 8879 |
+
"grad_norm": 11.3125,
|
| 8880 |
+
"learning_rate": 6.756665995029889e-06,
|
| 8881 |
+
"loss": 0.8252,
|
| 8882 |
+
"mean_token_accuracy": 0.7952579975128173,
|
| 8883 |
+
"num_tokens": 10924651.0,
|
| 8884 |
+
"step": 9860
|
| 8885 |
+
},
|
| 8886 |
+
{
|
| 8887 |
+
"epoch": 1.9887165021156559,
|
| 8888 |
+
"grad_norm": 10.1875,
|
| 8889 |
+
"learning_rate": 6.743233259453288e-06,
|
| 8890 |
+
"loss": 0.937,
|
| 8891 |
+
"mean_token_accuracy": 0.7707946419715881,
|
| 8892 |
+
"num_tokens": 10936427.0,
|
| 8893 |
+
"step": 9870
|
| 8894 |
+
},
|
| 8895 |
+
{
|
| 8896 |
+
"epoch": 1.990731412452146,
|
| 8897 |
+
"grad_norm": 15.375,
|
| 8898 |
+
"learning_rate": 6.729800523876688e-06,
|
| 8899 |
+
"loss": 0.7189,
|
| 8900 |
+
"mean_token_accuracy": 0.822449779510498,
|
| 8901 |
+
"num_tokens": 10946645.0,
|
| 8902 |
+
"step": 9880
|
| 8903 |
+
},
|
| 8904 |
+
{
|
| 8905 |
+
"epoch": 1.992746322788636,
|
| 8906 |
+
"grad_norm": 11.625,
|
| 8907 |
+
"learning_rate": 6.716367788300088e-06,
|
| 8908 |
+
"loss": 0.798,
|
| 8909 |
+
"mean_token_accuracy": 0.7974645853042602,
|
| 8910 |
+
"num_tokens": 10959231.0,
|
| 8911 |
+
"step": 9890
|
| 8912 |
+
},
|
| 8913 |
+
{
|
| 8914 |
+
"epoch": 1.9947612331251259,
|
| 8915 |
+
"grad_norm": 15.875,
|
| 8916 |
+
"learning_rate": 6.7029350527234884e-06,
|
| 8917 |
+
"loss": 0.825,
|
| 8918 |
+
"mean_token_accuracy": 0.7955174386501312,
|
| 8919 |
+
"num_tokens": 10970209.0,
|
| 8920 |
+
"step": 9900
|
| 8921 |
+
},
|
| 8922 |
+
{
|
| 8923 |
+
"epoch": 1.996776143461616,
|
| 8924 |
+
"grad_norm": 12.0625,
|
| 8925 |
+
"learning_rate": 6.689502317146887e-06,
|
| 8926 |
+
"loss": 0.8865,
|
| 8927 |
+
"mean_token_accuracy": 0.779301130771637,
|
| 8928 |
+
"num_tokens": 10981095.0,
|
| 8929 |
+
"step": 9910
|
| 8930 |
+
},
|
| 8931 |
+
{
|
| 8932 |
+
"epoch": 1.998791053798106,
|
| 8933 |
+
"grad_norm": 10.3125,
|
| 8934 |
+
"learning_rate": 6.676069581570287e-06,
|
| 8935 |
+
"loss": 0.8739,
|
| 8936 |
+
"mean_token_accuracy": 0.7855922758579255,
|
| 8937 |
+
"num_tokens": 10992314.0,
|
| 8938 |
+
"step": 9920
|
| 8939 |
+
},
|
| 8940 |
+
{
|
| 8941 |
+
"epoch": 2.000805964134596,
|
| 8942 |
+
"grad_norm": 9.75,
|
| 8943 |
+
"learning_rate": 6.6626368459936875e-06,
|
| 8944 |
+
"loss": 0.7853,
|
| 8945 |
+
"mean_token_accuracy": 0.7989992260932922,
|
| 8946 |
+
"num_tokens": 11002971.0,
|
| 8947 |
+
"step": 9930
|
| 8948 |
+
},
|
| 8949 |
+
{
|
| 8950 |
+
"epoch": 2.002820874471086,
|
| 8951 |
+
"grad_norm": 10.5,
|
| 8952 |
+
"learning_rate": 6.649204110417087e-06,
|
| 8953 |
+
"loss": 0.9011,
|
| 8954 |
+
"mean_token_accuracy": 0.7816862404346466,
|
| 8955 |
+
"num_tokens": 11014291.0,
|
| 8956 |
+
"step": 9940
|
| 8957 |
+
},
|
| 8958 |
+
{
|
| 8959 |
+
"epoch": 2.004835784807576,
|
| 8960 |
+
"grad_norm": 10.125,
|
| 8961 |
+
"learning_rate": 6.635771374840488e-06,
|
| 8962 |
+
"loss": 0.8415,
|
| 8963 |
+
"mean_token_accuracy": 0.7934383928775788,
|
| 8964 |
+
"num_tokens": 11024051.0,
|
| 8965 |
+
"step": 9950
|
| 8966 |
+
},
|
| 8967 |
+
{
|
| 8968 |
+
"epoch": 2.006850695144066,
|
| 8969 |
+
"grad_norm": 12.125,
|
| 8970 |
+
"learning_rate": 6.6223386392638865e-06,
|
| 8971 |
+
"loss": 0.8009,
|
| 8972 |
+
"mean_token_accuracy": 0.7968979775905609,
|
| 8973 |
+
"num_tokens": 11033995.0,
|
| 8974 |
+
"step": 9960
|
| 8975 |
+
},
|
| 8976 |
+
{
|
| 8977 |
+
"epoch": 2.008865605480556,
|
| 8978 |
+
"grad_norm": 11.75,
|
| 8979 |
+
"learning_rate": 6.608905903687286e-06,
|
| 8980 |
+
"loss": 0.8248,
|
| 8981 |
+
"mean_token_accuracy": 0.7963380098342896,
|
| 8982 |
+
"num_tokens": 11044468.0,
|
| 8983 |
+
"step": 9970
|
| 8984 |
+
},
|
| 8985 |
+
{
|
| 8986 |
+
"epoch": 2.010880515817046,
|
| 8987 |
+
"grad_norm": 12.625,
|
| 8988 |
+
"learning_rate": 6.595473168110687e-06,
|
| 8989 |
+
"loss": 0.7848,
|
| 8990 |
+
"mean_token_accuracy": 0.8049242258071899,
|
| 8991 |
+
"num_tokens": 11055288.0,
|
| 8992 |
+
"step": 9980
|
| 8993 |
+
},
|
| 8994 |
+
{
|
| 8995 |
+
"epoch": 2.0128954261535363,
|
| 8996 |
+
"grad_norm": 14.6875,
|
| 8997 |
+
"learning_rate": 6.5820404325340856e-06,
|
| 8998 |
+
"loss": 0.7735,
|
| 8999 |
+
"mean_token_accuracy": 0.8060416877269745,
|
| 9000 |
+
"num_tokens": 11066104.0,
|
| 9001 |
+
"step": 9990
|
| 9002 |
+
},
|
| 9003 |
+
{
|
| 9004 |
+
"epoch": 2.014910336490026,
|
| 9005 |
+
"grad_norm": 15.0625,
|
| 9006 |
+
"learning_rate": 6.568607696957486e-06,
|
| 9007 |
+
"loss": 0.7981,
|
| 9008 |
+
"mean_token_accuracy": 0.801843786239624,
|
| 9009 |
+
"num_tokens": 11076275.0,
|
| 9010 |
+
"step": 10000
|
| 9011 |
}
|
| 9012 |
],
|
| 9013 |
"logging_steps": 10,
|
|
|
|
| 9027 |
"attributes": {}
|
| 9028 |
}
|
| 9029 |
},
|
| 9030 |
+
"total_flos": 1.3397361208068096e+16,
|
| 9031 |
"train_batch_size": 8,
|
| 9032 |
"trial_name": null,
|
| 9033 |
"trial_params": null
|