fats-fme commited on
Commit
8da97d6
1 Parent(s): 48392aa

Training in progress, step 141, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52a3b51954c721f4e0f1a70f69c82689a0d13a3cfd8545a9ff2347d07b1107e7
3
  size 50503544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4c3aa1b7c7e2131be00257b3a8ad8d8a5cc00d488de310e3aa2ab739b792e83
3
  size 50503544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:960d4c684267a7b67d6e53f747c2044cfbbd699f1e9a423e22ecc02c0a59fba2
3
  size 101184122
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db6b4c4d50e731ccf97aba3d5220dc59dc330f0eba15762debc58c947b14c121
3
  size 101184122
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1de59d07a68b3279e0fc3a797a6401f62f72e97ad97b47278800d6d6ffbb91e1
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4b688646a1843cb3001738b5b25a88991e005c9c88cb1e32423e5d4a76cb0fc
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13281551d9d2bd9876ec10c3b51e9aaa8758f61508cf9259298bab8540ebb4ab
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a3f1a7b3324e65a1d3becf5bb547ac5dc1f1e4bf3ec0e53f905de1e26d2dee
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67273009a321cf39fd65d544bab368c783d106793a780845817d1f7f88ead9de
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9344f562fad06c4f4d31fd318ba7c558b48f7df5b7e58f8a207127dca92aacd
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5023380093520374,
5
  "eval_steps": 47,
6
- "global_step": 94,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -689,6 +689,343 @@
689
  "eval_samples_per_second": 19.547,
690
  "eval_steps_per_second": 4.887,
691
  "step": 94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
  }
693
  ],
694
  "logging_steps": 1,
@@ -708,7 +1045,7 @@
708
  "attributes": {}
709
  }
710
  },
711
- "total_flos": 7.740853165699891e+16,
712
  "train_batch_size": 2,
713
  "trial_name": null,
714
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7535070140280561,
5
  "eval_steps": 47,
6
+ "global_step": 141,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
689
  "eval_samples_per_second": 19.547,
690
  "eval_steps_per_second": 4.887,
691
  "step": 94
692
+ },
693
+ {
694
+ "epoch": 0.5076820307281229,
695
+ "grad_norm": 2.303241014480591,
696
+ "learning_rate": 0.00011592208874310426,
697
+ "loss": 1.4493,
698
+ "step": 95
699
+ },
700
+ {
701
+ "epoch": 0.5130260521042084,
702
+ "grad_norm": 4.363836288452148,
703
+ "learning_rate": 0.00011406218624648985,
704
+ "loss": 3.1983,
705
+ "step": 96
706
+ },
707
+ {
708
+ "epoch": 0.518370073480294,
709
+ "grad_norm": 6.261653423309326,
710
+ "learning_rate": 0.00011219730744658921,
711
+ "loss": 3.5712,
712
+ "step": 97
713
+ },
714
+ {
715
+ "epoch": 0.5237140948563794,
716
+ "grad_norm": 6.892831802368164,
717
+ "learning_rate": 0.00011032811228363766,
718
+ "loss": 3.5572,
719
+ "step": 98
720
+ },
721
+ {
722
+ "epoch": 0.5290581162324649,
723
+ "grad_norm": 5.144479751586914,
724
+ "learning_rate": 0.00010845526222533791,
725
+ "loss": 3.1015,
726
+ "step": 99
727
+ },
728
+ {
729
+ "epoch": 0.5344021376085505,
730
+ "grad_norm": 7.790161609649658,
731
+ "learning_rate": 0.00010657942003278107,
732
+ "loss": 2.8553,
733
+ "step": 100
734
+ },
735
+ {
736
+ "epoch": 0.539746158984636,
737
+ "grad_norm": 7.2072434425354,
738
+ "learning_rate": 0.00010470124952590977,
739
+ "loss": 3.8293,
740
+ "step": 101
741
+ },
742
+ {
743
+ "epoch": 0.5450901803607214,
744
+ "grad_norm": 6.349959850311279,
745
+ "learning_rate": 0.0001028214153486066,
746
+ "loss": 3.9364,
747
+ "step": 102
748
+ },
749
+ {
750
+ "epoch": 0.5504342017368069,
751
+ "grad_norm": 6.868096351623535,
752
+ "learning_rate": 0.00010094058273349125,
753
+ "loss": 3.3003,
754
+ "step": 103
755
+ },
756
+ {
757
+ "epoch": 0.5557782231128925,
758
+ "grad_norm": 5.542849063873291,
759
+ "learning_rate": 9.90594172665088e-05,
760
+ "loss": 2.9885,
761
+ "step": 104
762
+ },
763
+ {
764
+ "epoch": 0.561122244488978,
765
+ "grad_norm": 3.5966439247131348,
766
+ "learning_rate": 9.717858465139342e-05,
767
+ "loss": 3.389,
768
+ "step": 105
769
+ },
770
+ {
771
+ "epoch": 0.5664662658650634,
772
+ "grad_norm": 3.587754726409912,
773
+ "learning_rate": 9.529875047409027e-05,
774
+ "loss": 3.3237,
775
+ "step": 106
776
+ },
777
+ {
778
+ "epoch": 0.571810287241149,
779
+ "grad_norm": 2.8883140087127686,
780
+ "learning_rate": 9.342057996721894e-05,
781
+ "loss": 2.8259,
782
+ "step": 107
783
+ },
784
+ {
785
+ "epoch": 0.5771543086172345,
786
+ "grad_norm": 3.080749988555908,
787
+ "learning_rate": 9.15447377746621e-05,
788
+ "loss": 2.0154,
789
+ "step": 108
790
+ },
791
+ {
792
+ "epoch": 0.58249832999332,
793
+ "grad_norm": 3.2097012996673584,
794
+ "learning_rate": 8.967188771636236e-05,
795
+ "loss": 2.4829,
796
+ "step": 109
797
+ },
798
+ {
799
+ "epoch": 0.5878423513694054,
800
+ "grad_norm": 2.596616268157959,
801
+ "learning_rate": 8.78026925534108e-05,
802
+ "loss": 2.9976,
803
+ "step": 110
804
+ },
805
+ {
806
+ "epoch": 0.593186372745491,
807
+ "grad_norm": 2.7296924591064453,
808
+ "learning_rate": 8.59378137535102e-05,
809
+ "loss": 3.1661,
810
+ "step": 111
811
+ },
812
+ {
813
+ "epoch": 0.5985303941215765,
814
+ "grad_norm": 2.597951889038086,
815
+ "learning_rate": 8.407791125689578e-05,
816
+ "loss": 2.9017,
817
+ "step": 112
818
+ },
819
+ {
820
+ "epoch": 0.603874415497662,
821
+ "grad_norm": 2.5481317043304443,
822
+ "learning_rate": 8.222364324279689e-05,
823
+ "loss": 3.0536,
824
+ "step": 113
825
+ },
826
+ {
827
+ "epoch": 0.6092184368737475,
828
+ "grad_norm": 3.0392074584960938,
829
+ "learning_rate": 8.037566589652141e-05,
830
+ "loss": 3.2358,
831
+ "step": 114
832
+ },
833
+ {
834
+ "epoch": 0.614562458249833,
835
+ "grad_norm": 2.675720453262329,
836
+ "learning_rate": 7.853463317724614e-05,
837
+ "loss": 2.8082,
838
+ "step": 115
839
+ },
840
+ {
841
+ "epoch": 0.6199064796259185,
842
+ "grad_norm": 3.3868227005004883,
843
+ "learning_rate": 7.67011965865947e-05,
844
+ "loss": 1.7784,
845
+ "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.625250501002004,
849
+ "grad_norm": 3.886552572250366,
850
+ "learning_rate": 7.487600493808513e-05,
851
+ "loss": 1.4271,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.6305945223780896,
856
+ "grad_norm": 3.350705862045288,
857
+ "learning_rate": 7.305970412752909e-05,
858
+ "loss": 1.421,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.635938543754175,
863
+ "grad_norm": 2.7557640075683594,
864
+ "learning_rate": 7.125293690446306e-05,
865
+ "loss": 1.2332,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.6412825651302605,
870
+ "grad_norm": 1.7134552001953125,
871
+ "learning_rate": 6.945634264469339e-05,
872
+ "loss": 1.9718,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.6466265865063461,
877
+ "grad_norm": 2.961225748062134,
878
+ "learning_rate": 6.76705571240348e-05,
879
+ "loss": 3.0191,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.6519706078824316,
884
+ "grad_norm": 4.059014320373535,
885
+ "learning_rate": 6.58962122933234e-05,
886
+ "loss": 3.1156,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.657314629258517,
891
+ "grad_norm": 4.71870231628418,
892
+ "learning_rate": 6.413393605478275e-05,
893
+ "loss": 3.3435,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.6626586506346025,
898
+ "grad_norm": 5.368070125579834,
899
+ "learning_rate": 6.238435203982278e-05,
900
+ "loss": 3.0588,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.6680026720106881,
905
+ "grad_norm": 7.744447708129883,
906
+ "learning_rate": 6.0648079388350466e-05,
907
+ "loss": 3.0363,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.6733466933867736,
912
+ "grad_norm": 2.309891700744629,
913
+ "learning_rate": 5.892573252966926e-05,
914
+ "loss": 3.3485,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 0.678690714762859,
919
+ "grad_norm": 3.1183762550354004,
920
+ "learning_rate": 5.721792096504611e-05,
921
+ "loss": 3.7819,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 0.6840347361389446,
926
+ "grad_norm": 3.1708717346191406,
927
+ "learning_rate": 5.5525249052022076e-05,
928
+ "loss": 2.6204,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 0.6893787575150301,
933
+ "grad_norm": 2.9361085891723633,
934
+ "learning_rate": 5.3848315790543126e-05,
935
+ "loss": 2.9473,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 0.6947227788911156,
940
+ "grad_norm": 2.8519327640533447,
941
+ "learning_rate": 5.218771461098733e-05,
942
+ "loss": 3.3713,
943
+ "step": 130
944
+ },
945
+ {
946
+ "epoch": 0.700066800267201,
947
+ "grad_norm": 3.078104019165039,
948
+ "learning_rate": 5.054403316416247e-05,
949
+ "loss": 3.2767,
950
+ "step": 131
951
+ },
952
+ {
953
+ "epoch": 0.7054108216432866,
954
+ "grad_norm": 2.610368013381958,
955
+ "learning_rate": 4.891785311334923e-05,
956
+ "loss": 3.1482,
957
+ "step": 132
958
+ },
959
+ {
960
+ "epoch": 0.7107548430193721,
961
+ "grad_norm": 2.4465456008911133,
962
+ "learning_rate": 4.7309749928463035e-05,
963
+ "loss": 2.4185,
964
+ "step": 133
965
+ },
966
+ {
967
+ "epoch": 0.7160988643954576,
968
+ "grad_norm": 2.264397621154785,
969
+ "learning_rate": 4.5720292682407874e-05,
970
+ "loss": 2.0626,
971
+ "step": 134
972
+ },
973
+ {
974
+ "epoch": 0.7214428857715431,
975
+ "grad_norm": 2.2613508701324463,
976
+ "learning_rate": 4.41500438496937e-05,
977
+ "loss": 2.7535,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 0.7267869071476286,
982
+ "grad_norm": 2.047032356262207,
983
+ "learning_rate": 4.2599559107388645e-05,
984
+ "loss": 2.9348,
985
+ "step": 136
986
+ },
987
+ {
988
+ "epoch": 0.7321309285237141,
989
+ "grad_norm": 2.202378273010254,
990
+ "learning_rate": 4.10693871384773e-05,
991
+ "loss": 2.674,
992
+ "step": 137
993
+ },
994
+ {
995
+ "epoch": 0.7374749498997996,
996
+ "grad_norm": 2.3247923851013184,
997
+ "learning_rate": 3.95600694376933e-05,
998
+ "loss": 3.1481,
999
+ "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.7428189712758851,
1003
+ "grad_norm": 2.374443531036377,
1004
+ "learning_rate": 3.8072140119896504e-05,
1005
+ "loss": 3.0967,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.7481629926519706,
1010
+ "grad_norm": 2.3245203495025635,
1011
+ "learning_rate": 3.660612573106081e-05,
1012
+ "loss": 2.9844,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.7535070140280561,
1017
+ "grad_norm": 2.2515153884887695,
1018
+ "learning_rate": 3.5162545061941335e-05,
1019
+ "loss": 1.9046,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.7535070140280561,
1024
+ "eval_loss": 2.6246707439422607,
1025
+ "eval_runtime": 15.9998,
1026
+ "eval_samples_per_second": 19.75,
1027
+ "eval_steps_per_second": 4.938,
1028
+ "step": 141
1029
  }
1030
  ],
1031
  "logging_steps": 1,
 
1045
  "attributes": {}
1046
  }
1047
  },
1048
+ "total_flos": 1.1611279748549837e+17,
1049
  "train_batch_size": 2,
1050
  "trial_name": null,
1051
  "trial_params": null