Plofski commited on
Commit
d994af2
·
verified ·
1 Parent(s): 87946dd

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72b069175149869f318a48bd011ed6c0026b2c123ef90c0d91ce6c0713bbf92d
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25829a343b7e06cb4e4167e9b46a367935f8229a77e72a1421998542e27d1c90
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff22e875e6a914c0bc7bfb1c7e787c769c8414739be0f07bf5f2faaae0c3727f
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d82f068e68971eb9728724c53cc1a345fe8d815fa606c2f3450b9b39b939104
3
  size 1072594443
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96a89b82d40a4e75a0ac37545280e3be68c54204263336c42598e8db051948b3
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2aa24cd194618e57510eb16be4a4510b1af7e8497163286c5cb19c98f052ca0
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9141648196655248,
6
  "eval_steps": 500,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8558,6 +8558,456 @@
8558
  "mean_token_accuracy": 0.8004867613315583,
8559
  "num_tokens": 10520466.0,
8560
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8561
  }
8562
  ],
8563
  "logging_steps": 10,
@@ -8577,7 +9027,7 @@
8577
  "attributes": {}
8578
  }
8579
  },
8580
- "total_flos": 1.2727359994976256e+16,
8581
  "train_batch_size": 8,
8582
  "trial_name": null,
8583
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.014910336490026,
6
  "eval_steps": 500,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8558
  "mean_token_accuracy": 0.8004867613315583,
8559
  "num_tokens": 10520466.0,
8560
  "step": 9500
8561
+ },
8562
+ {
8563
+ "epoch": 1.916179730002015,
8564
+ "grad_norm": 13.8125,
8565
+ "learning_rate": 7.226811740210895e-06,
8566
+ "loss": 0.8085,
8567
+ "mean_token_accuracy": 0.8049318194389343,
8568
+ "num_tokens": 10532072.0,
8569
+ "step": 9510
8570
+ },
8571
+ {
8572
+ "epoch": 1.918194640338505,
8573
+ "grad_norm": 11.5,
8574
+ "learning_rate": 7.213379004634295e-06,
8575
+ "loss": 0.8021,
8576
+ "mean_token_accuracy": 0.802433705329895,
8577
+ "num_tokens": 10542914.0,
8578
+ "step": 9520
8579
+ },
8580
+ {
8581
+ "epoch": 1.920209550674995,
8582
+ "grad_norm": 10.6875,
8583
+ "learning_rate": 7.1999462690576934e-06,
8584
+ "loss": 0.8222,
8585
+ "mean_token_accuracy": 0.7974632799625396,
8586
+ "num_tokens": 10552708.0,
8587
+ "step": 9530
8588
+ },
8589
+ {
8590
+ "epoch": 1.922224461011485,
8591
+ "grad_norm": 11.9375,
8592
+ "learning_rate": 7.186513533481094e-06,
8593
+ "loss": 0.9031,
8594
+ "mean_token_accuracy": 0.7883083343505859,
8595
+ "num_tokens": 10563446.0,
8596
+ "step": 9540
8597
+ },
8598
+ {
8599
+ "epoch": 1.924239371347975,
8600
+ "grad_norm": 9.875,
8601
+ "learning_rate": 7.173080797904494e-06,
8602
+ "loss": 0.7346,
8603
+ "mean_token_accuracy": 0.8066632449626923,
8604
+ "num_tokens": 10575598.0,
8605
+ "step": 9550
8606
+ },
8607
+ {
8608
+ "epoch": 1.926254281684465,
8609
+ "grad_norm": 12.125,
8610
+ "learning_rate": 7.159648062327894e-06,
8611
+ "loss": 0.8905,
8612
+ "mean_token_accuracy": 0.7765018343925476,
8613
+ "num_tokens": 10586523.0,
8614
+ "step": 9560
8615
+ },
8616
+ {
8617
+ "epoch": 1.928269192020955,
8618
+ "grad_norm": 11.375,
8619
+ "learning_rate": 7.146215326751294e-06,
8620
+ "loss": 0.8584,
8621
+ "mean_token_accuracy": 0.7868084013462067,
8622
+ "num_tokens": 10597941.0,
8623
+ "step": 9570
8624
+ },
8625
+ {
8626
+ "epoch": 1.930284102357445,
8627
+ "grad_norm": 12.8125,
8628
+ "learning_rate": 7.132782591174693e-06,
8629
+ "loss": 0.8567,
8630
+ "mean_token_accuracy": 0.7957649648189544,
8631
+ "num_tokens": 10609135.0,
8632
+ "step": 9580
8633
+ },
8634
+ {
8635
+ "epoch": 1.9322990126939352,
8636
+ "grad_norm": 13.25,
8637
+ "learning_rate": 7.119349855598093e-06,
8638
+ "loss": 0.7257,
8639
+ "mean_token_accuracy": 0.8163708746433258,
8640
+ "num_tokens": 10620680.0,
8641
+ "step": 9590
8642
+ },
8643
+ {
8644
+ "epoch": 1.9343139230304252,
8645
+ "grad_norm": 13.5,
8646
+ "learning_rate": 7.105917120021493e-06,
8647
+ "loss": 0.832,
8648
+ "mean_token_accuracy": 0.7922478914260864,
8649
+ "num_tokens": 10630244.0,
8650
+ "step": 9600
8651
+ },
8652
+ {
8653
+ "epoch": 1.936328833366915,
8654
+ "grad_norm": 13.0,
8655
+ "learning_rate": 7.0924843844448934e-06,
8656
+ "loss": 0.8223,
8657
+ "mean_token_accuracy": 0.7944608926773071,
8658
+ "num_tokens": 10642124.0,
8659
+ "step": 9610
8660
+ },
8661
+ {
8662
+ "epoch": 1.9383437437034052,
8663
+ "grad_norm": 10.8125,
8664
+ "learning_rate": 7.079051648868292e-06,
8665
+ "loss": 0.8406,
8666
+ "mean_token_accuracy": 0.7907250881195068,
8667
+ "num_tokens": 10652833.0,
8668
+ "step": 9620
8669
+ },
8670
+ {
8671
+ "epoch": 1.9403586540398954,
8672
+ "grad_norm": 8.9375,
8673
+ "learning_rate": 7.065618913291692e-06,
8674
+ "loss": 0.7735,
8675
+ "mean_token_accuracy": 0.8065800249576569,
8676
+ "num_tokens": 10665301.0,
8677
+ "step": 9630
8678
+ },
8679
+ {
8680
+ "epoch": 1.942373564376385,
8681
+ "grad_norm": 11.0625,
8682
+ "learning_rate": 7.0521861777150925e-06,
8683
+ "loss": 0.8182,
8684
+ "mean_token_accuracy": 0.7966114640235901,
8685
+ "num_tokens": 10676994.0,
8686
+ "step": 9640
8687
+ },
8688
+ {
8689
+ "epoch": 1.9443884747128752,
8690
+ "grad_norm": 11.0,
8691
+ "learning_rate": 7.038753442138492e-06,
8692
+ "loss": 0.8937,
8693
+ "mean_token_accuracy": 0.7835995197296143,
8694
+ "num_tokens": 10688577.0,
8695
+ "step": 9650
8696
+ },
8697
+ {
8698
+ "epoch": 1.9464033850493654,
8699
+ "grad_norm": 11.0,
8700
+ "learning_rate": 7.025320706561892e-06,
8701
+ "loss": 0.8766,
8702
+ "mean_token_accuracy": 0.7863976120948791,
8703
+ "num_tokens": 10699498.0,
8704
+ "step": 9660
8705
+ },
8706
+ {
8707
+ "epoch": 1.9484182953858553,
8708
+ "grad_norm": 13.0625,
8709
+ "learning_rate": 7.0118879709852915e-06,
8710
+ "loss": 0.8543,
8711
+ "mean_token_accuracy": 0.7946681499481201,
8712
+ "num_tokens": 10711155.0,
8713
+ "step": 9670
8714
+ },
8715
+ {
8716
+ "epoch": 1.9504332057223452,
8717
+ "grad_norm": 15.5,
8718
+ "learning_rate": 6.998455235408692e-06,
8719
+ "loss": 0.905,
8720
+ "mean_token_accuracy": 0.7806779563426971,
8721
+ "num_tokens": 10722357.0,
8722
+ "step": 9680
8723
+ },
8724
+ {
8725
+ "epoch": 1.9524481160588354,
8726
+ "grad_norm": 12.25,
8727
+ "learning_rate": 6.985022499832092e-06,
8728
+ "loss": 0.8497,
8729
+ "mean_token_accuracy": 0.78778578042984,
8730
+ "num_tokens": 10734398.0,
8731
+ "step": 9690
8732
+ },
8733
+ {
8734
+ "epoch": 1.9544630263953255,
8735
+ "grad_norm": 11.9375,
8736
+ "learning_rate": 6.9715897642554906e-06,
8737
+ "loss": 0.8426,
8738
+ "mean_token_accuracy": 0.7919103622436523,
8739
+ "num_tokens": 10745256.0,
8740
+ "step": 9700
8741
+ },
8742
+ {
8743
+ "epoch": 1.9564779367318155,
8744
+ "grad_norm": 11.0,
8745
+ "learning_rate": 6.958157028678891e-06,
8746
+ "loss": 0.8311,
8747
+ "mean_token_accuracy": 0.7953451931476593,
8748
+ "num_tokens": 10755529.0,
8749
+ "step": 9710
8750
+ },
8751
+ {
8752
+ "epoch": 1.9584928470683054,
8753
+ "grad_norm": 11.6875,
8754
+ "learning_rate": 6.944724293102291e-06,
8755
+ "loss": 0.9166,
8756
+ "mean_token_accuracy": 0.7762543320655823,
8757
+ "num_tokens": 10767833.0,
8758
+ "step": 9720
8759
+ },
8760
+ {
8761
+ "epoch": 1.9605077574047955,
8762
+ "grad_norm": 11.0625,
8763
+ "learning_rate": 6.931291557525691e-06,
8764
+ "loss": 0.8342,
8765
+ "mean_token_accuracy": 0.7902640163898468,
8766
+ "num_tokens": 10778381.0,
8767
+ "step": 9730
8768
+ },
8769
+ {
8770
+ "epoch": 1.9625226677412855,
8771
+ "grad_norm": 11.1875,
8772
+ "learning_rate": 6.91785882194909e-06,
8773
+ "loss": 0.8793,
8774
+ "mean_token_accuracy": 0.7860461592674255,
8775
+ "num_tokens": 10789792.0,
8776
+ "step": 9740
8777
+ },
8778
+ {
8779
+ "epoch": 1.9645375780777754,
8780
+ "grad_norm": 9.625,
8781
+ "learning_rate": 6.90442608637249e-06,
8782
+ "loss": 0.8833,
8783
+ "mean_token_accuracy": 0.7854184091091156,
8784
+ "num_tokens": 10801905.0,
8785
+ "step": 9750
8786
+ },
8787
+ {
8788
+ "epoch": 1.9665524884142656,
8789
+ "grad_norm": 12.75,
8790
+ "learning_rate": 6.89099335079589e-06,
8791
+ "loss": 0.8529,
8792
+ "mean_token_accuracy": 0.7941727995872497,
8793
+ "num_tokens": 10812223.0,
8794
+ "step": 9760
8795
+ },
8796
+ {
8797
+ "epoch": 1.9685673987507557,
8798
+ "grad_norm": 11.125,
8799
+ "learning_rate": 6.87756061521929e-06,
8800
+ "loss": 0.7802,
8801
+ "mean_token_accuracy": 0.8009257316589355,
8802
+ "num_tokens": 10823248.0,
8803
+ "step": 9770
8804
+ },
8805
+ {
8806
+ "epoch": 1.9705823090872456,
8807
+ "grad_norm": 14.125,
8808
+ "learning_rate": 6.8641278796426906e-06,
8809
+ "loss": 0.8616,
8810
+ "mean_token_accuracy": 0.7909869194030762,
8811
+ "num_tokens": 10834683.0,
8812
+ "step": 9780
8813
+ },
8814
+ {
8815
+ "epoch": 1.9725972194237356,
8816
+ "grad_norm": 12.25,
8817
+ "learning_rate": 6.850695144066089e-06,
8818
+ "loss": 0.8485,
8819
+ "mean_token_accuracy": 0.792462158203125,
8820
+ "num_tokens": 10845744.0,
8821
+ "step": 9790
8822
+ },
8823
+ {
8824
+ "epoch": 1.9746121297602257,
8825
+ "grad_norm": 10.1875,
8826
+ "learning_rate": 6.837262408489489e-06,
8827
+ "loss": 0.7906,
8828
+ "mean_token_accuracy": 0.8116903901100159,
8829
+ "num_tokens": 10857457.0,
8830
+ "step": 9800
8831
+ },
8832
+ {
8833
+ "epoch": 1.9766270400967159,
8834
+ "grad_norm": 9.8125,
8835
+ "learning_rate": 6.82382967291289e-06,
8836
+ "loss": 0.7862,
8837
+ "mean_token_accuracy": 0.8046435177326202,
8838
+ "num_tokens": 10868106.0,
8839
+ "step": 9810
8840
+ },
8841
+ {
8842
+ "epoch": 1.9786419504332056,
8843
+ "grad_norm": 11.6875,
8844
+ "learning_rate": 6.8103969373362884e-06,
8845
+ "loss": 0.7895,
8846
+ "mean_token_accuracy": 0.8022366106510163,
8847
+ "num_tokens": 10879315.0,
8848
+ "step": 9820
8849
+ },
8850
+ {
8851
+ "epoch": 1.9806568607696957,
8852
+ "grad_norm": 11.75,
8853
+ "learning_rate": 6.796964201759689e-06,
8854
+ "loss": 0.9483,
8855
+ "mean_token_accuracy": 0.7742224156856536,
8856
+ "num_tokens": 10890523.0,
8857
+ "step": 9830
8858
+ },
8859
+ {
8860
+ "epoch": 1.9826717711061859,
8861
+ "grad_norm": 12.625,
8862
+ "learning_rate": 6.783531466183089e-06,
8863
+ "loss": 0.7439,
8864
+ "mean_token_accuracy": 0.8124743521213531,
8865
+ "num_tokens": 10901250.0,
8866
+ "step": 9840
8867
+ },
8868
+ {
8869
+ "epoch": 1.9846866814426758,
8870
+ "grad_norm": 13.4375,
8871
+ "learning_rate": 6.770098730606488e-06,
8872
+ "loss": 0.7185,
8873
+ "mean_token_accuracy": 0.8186926007270813,
8874
+ "num_tokens": 10912951.0,
8875
+ "step": 9850
8876
+ },
8877
+ {
8878
+ "epoch": 1.9867015917791657,
8879
+ "grad_norm": 11.3125,
8880
+ "learning_rate": 6.756665995029889e-06,
8881
+ "loss": 0.8252,
8882
+ "mean_token_accuracy": 0.7952579975128173,
8883
+ "num_tokens": 10924651.0,
8884
+ "step": 9860
8885
+ },
8886
+ {
8887
+ "epoch": 1.9887165021156559,
8888
+ "grad_norm": 10.1875,
8889
+ "learning_rate": 6.743233259453288e-06,
8890
+ "loss": 0.937,
8891
+ "mean_token_accuracy": 0.7707946419715881,
8892
+ "num_tokens": 10936427.0,
8893
+ "step": 9870
8894
+ },
8895
+ {
8896
+ "epoch": 1.990731412452146,
8897
+ "grad_norm": 15.375,
8898
+ "learning_rate": 6.729800523876688e-06,
8899
+ "loss": 0.7189,
8900
+ "mean_token_accuracy": 0.822449779510498,
8901
+ "num_tokens": 10946645.0,
8902
+ "step": 9880
8903
+ },
8904
+ {
8905
+ "epoch": 1.992746322788636,
8906
+ "grad_norm": 11.625,
8907
+ "learning_rate": 6.716367788300088e-06,
8908
+ "loss": 0.798,
8909
+ "mean_token_accuracy": 0.7974645853042602,
8910
+ "num_tokens": 10959231.0,
8911
+ "step": 9890
8912
+ },
8913
+ {
8914
+ "epoch": 1.9947612331251259,
8915
+ "grad_norm": 15.875,
8916
+ "learning_rate": 6.7029350527234884e-06,
8917
+ "loss": 0.825,
8918
+ "mean_token_accuracy": 0.7955174386501312,
8919
+ "num_tokens": 10970209.0,
8920
+ "step": 9900
8921
+ },
8922
+ {
8923
+ "epoch": 1.996776143461616,
8924
+ "grad_norm": 12.0625,
8925
+ "learning_rate": 6.689502317146887e-06,
8926
+ "loss": 0.8865,
8927
+ "mean_token_accuracy": 0.779301130771637,
8928
+ "num_tokens": 10981095.0,
8929
+ "step": 9910
8930
+ },
8931
+ {
8932
+ "epoch": 1.998791053798106,
8933
+ "grad_norm": 10.3125,
8934
+ "learning_rate": 6.676069581570287e-06,
8935
+ "loss": 0.8739,
8936
+ "mean_token_accuracy": 0.7855922758579255,
8937
+ "num_tokens": 10992314.0,
8938
+ "step": 9920
8939
+ },
8940
+ {
8941
+ "epoch": 2.000805964134596,
8942
+ "grad_norm": 9.75,
8943
+ "learning_rate": 6.6626368459936875e-06,
8944
+ "loss": 0.7853,
8945
+ "mean_token_accuracy": 0.7989992260932922,
8946
+ "num_tokens": 11002971.0,
8947
+ "step": 9930
8948
+ },
8949
+ {
8950
+ "epoch": 2.002820874471086,
8951
+ "grad_norm": 10.5,
8952
+ "learning_rate": 6.649204110417087e-06,
8953
+ "loss": 0.9011,
8954
+ "mean_token_accuracy": 0.7816862404346466,
8955
+ "num_tokens": 11014291.0,
8956
+ "step": 9940
8957
+ },
8958
+ {
8959
+ "epoch": 2.004835784807576,
8960
+ "grad_norm": 10.125,
8961
+ "learning_rate": 6.635771374840488e-06,
8962
+ "loss": 0.8415,
8963
+ "mean_token_accuracy": 0.7934383928775788,
8964
+ "num_tokens": 11024051.0,
8965
+ "step": 9950
8966
+ },
8967
+ {
8968
+ "epoch": 2.006850695144066,
8969
+ "grad_norm": 12.125,
8970
+ "learning_rate": 6.6223386392638865e-06,
8971
+ "loss": 0.8009,
8972
+ "mean_token_accuracy": 0.7968979775905609,
8973
+ "num_tokens": 11033995.0,
8974
+ "step": 9960
8975
+ },
8976
+ {
8977
+ "epoch": 2.008865605480556,
8978
+ "grad_norm": 11.75,
8979
+ "learning_rate": 6.608905903687286e-06,
8980
+ "loss": 0.8248,
8981
+ "mean_token_accuracy": 0.7963380098342896,
8982
+ "num_tokens": 11044468.0,
8983
+ "step": 9970
8984
+ },
8985
+ {
8986
+ "epoch": 2.010880515817046,
8987
+ "grad_norm": 12.625,
8988
+ "learning_rate": 6.595473168110687e-06,
8989
+ "loss": 0.7848,
8990
+ "mean_token_accuracy": 0.8049242258071899,
8991
+ "num_tokens": 11055288.0,
8992
+ "step": 9980
8993
+ },
8994
+ {
8995
+ "epoch": 2.0128954261535363,
8996
+ "grad_norm": 14.6875,
8997
+ "learning_rate": 6.5820404325340856e-06,
8998
+ "loss": 0.7735,
8999
+ "mean_token_accuracy": 0.8060416877269745,
9000
+ "num_tokens": 11066104.0,
9001
+ "step": 9990
9002
+ },
9003
+ {
9004
+ "epoch": 2.014910336490026,
9005
+ "grad_norm": 15.0625,
9006
+ "learning_rate": 6.568607696957486e-06,
9007
+ "loss": 0.7981,
9008
+ "mean_token_accuracy": 0.801843786239624,
9009
+ "num_tokens": 11076275.0,
9010
+ "step": 10000
9011
  }
9012
  ],
9013
  "logging_steps": 10,
 
9027
  "attributes": {}
9028
  }
9029
  },
9030
+ "total_flos": 1.3397361208068096e+16,
9031
  "train_batch_size": 8,
9032
  "trial_name": null,
9033
  "trial_params": null