fats-fme commited on
Commit
18e9868
·
verified ·
1 Parent(s): 52566bc

Training in progress, step 174, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:898721c74f3f6db9754d0e51763590d601ef6f6aa1ea960702dec11ce90f7aa8
3
  size 97307544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abcde8ac3d04c98a861549359d1224345d51b37788f3b8a385d055e42467481e
3
  size 97307544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3bbc704f2d554dba80c269377a8d05a954a71224277b51a710ffc1c8c2bcf47
3
  size 194840426
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e12dbbc77c83e3643e10bdd747b43283d299a4a0443c375284bff3cb8b034d78
3
  size 194840426
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b16142a47bc7a326beb92cd8c6e770378643df7060bf0910e73d114e53bbb34
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6885c98f9944031b1d47a617f1a2d46af56909da93ca8c4ac4a873f90d3142fe
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d7200ae41ef4ca4e25b400cb2c31ee104706220a889be74188adae29a4f900f
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3901384dd9ab7f4272cbe89ec0e7d7be7b55f7e04d725cfcd750d27555d4c8c0
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab7b6fce0ffd68999dfaed3e79782a798f64a449077f4f179771fda8a7a023e3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cfc363eda5dfe78796b361134c848de53d3bd2047f481ddb99265e158e573b4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5032537960954447,
5
  "eval_steps": 58,
6
- "global_step": 116,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -843,6 +843,420 @@
843
  "eval_samples_per_second": 8.853,
844
  "eval_steps_per_second": 2.23,
845
  "step": 116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  }
847
  ],
848
  "logging_steps": 1,
@@ -862,7 +1276,7 @@
862
  "attributes": {}
863
  }
864
  },
865
- "total_flos": 2.5936142383290778e+17,
866
  "train_batch_size": 2,
867
  "trial_name": null,
868
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.754880694143167,
5
  "eval_steps": 58,
6
+ "global_step": 174,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
843
  "eval_samples_per_second": 8.853,
844
  "eval_steps_per_second": 2.23,
845
  "step": 116
846
+ },
847
+ {
848
+ "epoch": 0.5075921908893709,
849
+ "grad_norm": 1.3170753717422485,
850
+ "learning_rate": 0.00013907311284892736,
851
+ "loss": 2.9572,
852
+ "step": 117
853
+ },
854
+ {
855
+ "epoch": 0.5119305856832972,
856
+ "grad_norm": 1.5243107080459595,
857
+ "learning_rate": 0.00013746065934159123,
858
+ "loss": 3.3082,
859
+ "step": 118
860
+ },
861
+ {
862
+ "epoch": 0.5162689804772235,
863
+ "grad_norm": 1.5845880508422852,
864
+ "learning_rate": 0.00013583679495453,
865
+ "loss": 3.4819,
866
+ "step": 119
867
+ },
868
+ {
869
+ "epoch": 0.5206073752711496,
870
+ "grad_norm": 1.66307532787323,
871
+ "learning_rate": 0.00013420201433256689,
872
+ "loss": 3.1131,
873
+ "step": 120
874
+ },
875
+ {
876
+ "epoch": 0.5249457700650759,
877
+ "grad_norm": 1.6470588445663452,
878
+ "learning_rate": 0.00013255681544571568,
879
+ "loss": 3.2847,
880
+ "step": 121
881
+ },
882
+ {
883
+ "epoch": 0.5292841648590022,
884
+ "grad_norm": 2.1118075847625732,
885
+ "learning_rate": 0.00013090169943749476,
886
+ "loss": 3.4669,
887
+ "step": 122
888
+ },
889
+ {
890
+ "epoch": 0.5336225596529284,
891
+ "grad_norm": 2.056396722793579,
892
+ "learning_rate": 0.00012923717047227368,
893
+ "loss": 3.1136,
894
+ "step": 123
895
+ },
896
+ {
897
+ "epoch": 0.5379609544468547,
898
+ "grad_norm": 2.2389657497406006,
899
+ "learning_rate": 0.0001275637355816999,
900
+ "loss": 2.9323,
901
+ "step": 124
902
+ },
903
+ {
904
+ "epoch": 0.5422993492407809,
905
+ "grad_norm": 2.863621711730957,
906
+ "learning_rate": 0.00012588190451025207,
907
+ "loss": 2.9585,
908
+ "step": 125
909
+ },
910
+ {
911
+ "epoch": 0.5466377440347071,
912
+ "grad_norm": 0.8712321519851685,
913
+ "learning_rate": 0.00012419218955996676,
914
+ "loss": 3.1439,
915
+ "step": 126
916
+ },
917
+ {
918
+ "epoch": 0.5509761388286334,
919
+ "grad_norm": 1.0713740587234497,
920
+ "learning_rate": 0.0001224951054343865,
921
+ "loss": 3.2213,
922
+ "step": 127
923
+ },
924
+ {
925
+ "epoch": 0.5553145336225597,
926
+ "grad_norm": 1.104315996170044,
927
+ "learning_rate": 0.00012079116908177593,
928
+ "loss": 3.4522,
929
+ "step": 128
930
+ },
931
+ {
932
+ "epoch": 0.559652928416486,
933
+ "grad_norm": 1.0883917808532715,
934
+ "learning_rate": 0.00011908089953765449,
935
+ "loss": 3.3503,
936
+ "step": 129
937
+ },
938
+ {
939
+ "epoch": 0.5639913232104121,
940
+ "grad_norm": 1.0000834465026855,
941
+ "learning_rate": 0.00011736481776669306,
942
+ "loss": 3.4036,
943
+ "step": 130
944
+ },
945
+ {
946
+ "epoch": 0.5683297180043384,
947
+ "grad_norm": 0.8869354128837585,
948
+ "learning_rate": 0.0001156434465040231,
949
+ "loss": 3.2749,
950
+ "step": 131
951
+ },
952
+ {
953
+ "epoch": 0.5726681127982647,
954
+ "grad_norm": 0.8651937246322632,
955
+ "learning_rate": 0.00011391731009600654,
956
+ "loss": 3.3679,
957
+ "step": 132
958
+ },
959
+ {
960
+ "epoch": 0.5770065075921909,
961
+ "grad_norm": 0.9174556136131287,
962
+ "learning_rate": 0.00011218693434051475,
963
+ "loss": 3.311,
964
+ "step": 133
965
+ },
966
+ {
967
+ "epoch": 0.5813449023861171,
968
+ "grad_norm": 0.930533230304718,
969
+ "learning_rate": 0.00011045284632676536,
970
+ "loss": 3.3761,
971
+ "step": 134
972
+ },
973
+ {
974
+ "epoch": 0.5856832971800434,
975
+ "grad_norm": 0.9851680994033813,
976
+ "learning_rate": 0.00010871557427476583,
977
+ "loss": 3.2752,
978
+ "step": 135
979
+ },
980
+ {
981
+ "epoch": 0.5900216919739696,
982
+ "grad_norm": 0.9633740782737732,
983
+ "learning_rate": 0.00010697564737441252,
984
+ "loss": 3.2373,
985
+ "step": 136
986
+ },
987
+ {
988
+ "epoch": 0.5943600867678959,
989
+ "grad_norm": 1.132585048675537,
990
+ "learning_rate": 0.0001052335956242944,
991
+ "loss": 3.2323,
992
+ "step": 137
993
+ },
994
+ {
995
+ "epoch": 0.5986984815618221,
996
+ "grad_norm": 1.1232091188430786,
997
+ "learning_rate": 0.00010348994967025012,
998
+ "loss": 3.2874,
999
+ "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.6030368763557483,
1003
+ "grad_norm": 1.2559125423431396,
1004
+ "learning_rate": 0.00010174524064372837,
1005
+ "loss": 3.2367,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.6073752711496746,
1010
+ "grad_norm": 1.2623041868209839,
1011
+ "learning_rate": 0.0001,
1012
+ "loss": 3.2243,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.6117136659436009,
1017
+ "grad_norm": 1.3554457426071167,
1018
+ "learning_rate": 9.825475935627165e-05,
1019
+ "loss": 3.4802,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.6160520607375272,
1024
+ "grad_norm": 1.4170132875442505,
1025
+ "learning_rate": 9.651005032974994e-05,
1026
+ "loss": 3.354,
1027
+ "step": 142
1028
+ },
1029
+ {
1030
+ "epoch": 0.6203904555314533,
1031
+ "grad_norm": 1.4309097528457642,
1032
+ "learning_rate": 9.476640437570562e-05,
1033
+ "loss": 3.1352,
1034
+ "step": 143
1035
+ },
1036
+ {
1037
+ "epoch": 0.6247288503253796,
1038
+ "grad_norm": 1.5829153060913086,
1039
+ "learning_rate": 9.302435262558747e-05,
1040
+ "loss": 3.2455,
1041
+ "step": 144
1042
+ },
1043
+ {
1044
+ "epoch": 0.6290672451193059,
1045
+ "grad_norm": 1.8210502862930298,
1046
+ "learning_rate": 9.128442572523417e-05,
1047
+ "loss": 3.2991,
1048
+ "step": 145
1049
+ },
1050
+ {
1051
+ "epoch": 0.6334056399132321,
1052
+ "grad_norm": 1.842761516571045,
1053
+ "learning_rate": 8.954715367323468e-05,
1054
+ "loss": 3.2255,
1055
+ "step": 146
1056
+ },
1057
+ {
1058
+ "epoch": 0.6377440347071583,
1059
+ "grad_norm": 1.9258646965026855,
1060
+ "learning_rate": 8.781306565948528e-05,
1061
+ "loss": 3.1397,
1062
+ "step": 147
1063
+ },
1064
+ {
1065
+ "epoch": 0.6420824295010846,
1066
+ "grad_norm": 2.1189215183258057,
1067
+ "learning_rate": 8.608268990399349e-05,
1068
+ "loss": 3.0414,
1069
+ "step": 148
1070
+ },
1071
+ {
1072
+ "epoch": 0.6464208242950108,
1073
+ "grad_norm": 2.4063761234283447,
1074
+ "learning_rate": 8.435655349597689e-05,
1075
+ "loss": 2.8524,
1076
+ "step": 149
1077
+ },
1078
+ {
1079
+ "epoch": 0.6507592190889371,
1080
+ "grad_norm": 3.6420836448669434,
1081
+ "learning_rate": 8.263518223330697e-05,
1082
+ "loss": 3.0156,
1083
+ "step": 150
1084
+ },
1085
+ {
1086
+ "epoch": 0.6550976138828634,
1087
+ "grad_norm": 0.7080674171447754,
1088
+ "learning_rate": 8.091910046234552e-05,
1089
+ "loss": 3.1636,
1090
+ "step": 151
1091
+ },
1092
+ {
1093
+ "epoch": 0.6594360086767896,
1094
+ "grad_norm": 0.798520565032959,
1095
+ "learning_rate": 7.920883091822408e-05,
1096
+ "loss": 3.212,
1097
+ "step": 152
1098
+ },
1099
+ {
1100
+ "epoch": 0.6637744034707158,
1101
+ "grad_norm": 0.8640486001968384,
1102
+ "learning_rate": 7.750489456561352e-05,
1103
+ "loss": 3.1644,
1104
+ "step": 153
1105
+ },
1106
+ {
1107
+ "epoch": 0.6681127982646421,
1108
+ "grad_norm": 0.870906412601471,
1109
+ "learning_rate": 7.580781044003324e-05,
1110
+ "loss": 3.1876,
1111
+ "step": 154
1112
+ },
1113
+ {
1114
+ "epoch": 0.6724511930585684,
1115
+ "grad_norm": 0.8581348061561584,
1116
+ "learning_rate": 7.411809548974792e-05,
1117
+ "loss": 3.2739,
1118
+ "step": 155
1119
+ },
1120
+ {
1121
+ "epoch": 0.6767895878524945,
1122
+ "grad_norm": 0.8691614270210266,
1123
+ "learning_rate": 7.243626441830009e-05,
1124
+ "loss": 3.2444,
1125
+ "step": 156
1126
+ },
1127
+ {
1128
+ "epoch": 0.6811279826464208,
1129
+ "grad_norm": 0.9455673098564148,
1130
+ "learning_rate": 7.076282952772633e-05,
1131
+ "loss": 3.3004,
1132
+ "step": 157
1133
+ },
1134
+ {
1135
+ "epoch": 0.6854663774403471,
1136
+ "grad_norm": 0.8873337507247925,
1137
+ "learning_rate": 6.909830056250527e-05,
1138
+ "loss": 3.1778,
1139
+ "step": 158
1140
+ },
1141
+ {
1142
+ "epoch": 0.6898047722342733,
1143
+ "grad_norm": 0.910775363445282,
1144
+ "learning_rate": 6.744318455428436e-05,
1145
+ "loss": 3.1346,
1146
+ "step": 159
1147
+ },
1148
+ {
1149
+ "epoch": 0.6941431670281996,
1150
+ "grad_norm": 0.9872409105300903,
1151
+ "learning_rate": 6.579798566743314e-05,
1152
+ "loss": 3.1665,
1153
+ "step": 160
1154
+ },
1155
+ {
1156
+ "epoch": 0.6984815618221258,
1157
+ "grad_norm": 1.0516481399536133,
1158
+ "learning_rate": 6.416320504546997e-05,
1159
+ "loss": 3.3064,
1160
+ "step": 161
1161
+ },
1162
+ {
1163
+ "epoch": 0.702819956616052,
1164
+ "grad_norm": 1.0263571739196777,
1165
+ "learning_rate": 6.25393406584088e-05,
1166
+ "loss": 3.3698,
1167
+ "step": 162
1168
+ },
1169
+ {
1170
+ "epoch": 0.7071583514099783,
1171
+ "grad_norm": 1.1050878763198853,
1172
+ "learning_rate": 6.092688715107264e-05,
1173
+ "loss": 3.2436,
1174
+ "step": 163
1175
+ },
1176
+ {
1177
+ "epoch": 0.7114967462039046,
1178
+ "grad_norm": 1.1121841669082642,
1179
+ "learning_rate": 5.9326335692419995e-05,
1180
+ "loss": 2.9433,
1181
+ "step": 164
1182
+ },
1183
+ {
1184
+ "epoch": 0.7158351409978309,
1185
+ "grad_norm": 1.2424358129501343,
1186
+ "learning_rate": 5.773817382593008e-05,
1187
+ "loss": 3.3616,
1188
+ "step": 165
1189
+ },
1190
+ {
1191
+ "epoch": 0.720173535791757,
1192
+ "grad_norm": 1.1899327039718628,
1193
+ "learning_rate": 5.616288532109225e-05,
1194
+ "loss": 3.0392,
1195
+ "step": 166
1196
+ },
1197
+ {
1198
+ "epoch": 0.7245119305856833,
1199
+ "grad_norm": 1.3395730257034302,
1200
+ "learning_rate": 5.4600950026045326e-05,
1201
+ "loss": 3.0905,
1202
+ "step": 167
1203
+ },
1204
+ {
1205
+ "epoch": 0.7288503253796096,
1206
+ "grad_norm": 1.4268842935562134,
1207
+ "learning_rate": 5.305284372141095e-05,
1208
+ "loss": 3.1247,
1209
+ "step": 168
1210
+ },
1211
+ {
1212
+ "epoch": 0.7331887201735358,
1213
+ "grad_norm": 1.5514875650405884,
1214
+ "learning_rate": 5.15190379753663e-05,
1215
+ "loss": 3.3772,
1216
+ "step": 169
1217
+ },
1218
+ {
1219
+ "epoch": 0.737527114967462,
1220
+ "grad_norm": 1.8371058702468872,
1221
+ "learning_rate": 5.000000000000002e-05,
1222
+ "loss": 2.9262,
1223
+ "step": 170
1224
+ },
1225
+ {
1226
+ "epoch": 0.7418655097613883,
1227
+ "grad_norm": 1.7641676664352417,
1228
+ "learning_rate": 4.8496192508994576e-05,
1229
+ "loss": 3.0113,
1230
+ "step": 171
1231
+ },
1232
+ {
1233
+ "epoch": 0.7462039045553145,
1234
+ "grad_norm": 1.8325039148330688,
1235
+ "learning_rate": 4.700807357667952e-05,
1236
+ "loss": 3.0551,
1237
+ "step": 172
1238
+ },
1239
+ {
1240
+ "epoch": 0.7505422993492408,
1241
+ "grad_norm": 1.9740185737609863,
1242
+ "learning_rate": 4.5536096498497295e-05,
1243
+ "loss": 3.1479,
1244
+ "step": 173
1245
+ },
1246
+ {
1247
+ "epoch": 0.754880694143167,
1248
+ "grad_norm": 2.327420234680176,
1249
+ "learning_rate": 4.4080709652925336e-05,
1250
+ "loss": 3.2149,
1251
+ "step": 174
1252
+ },
1253
+ {
1254
+ "epoch": 0.754880694143167,
1255
+ "eval_loss": 3.163884162902832,
1256
+ "eval_runtime": 43.9914,
1257
+ "eval_samples_per_second": 8.843,
1258
+ "eval_steps_per_second": 2.228,
1259
+ "step": 174
1260
  }
1261
  ],
1262
  "logging_steps": 1,
 
1276
  "attributes": {}
1277
  }
1278
  },
1279
+ "total_flos": 3.8904213574936166e+17,
1280
  "train_batch_size": 2,
1281
  "trial_name": null,
1282
  "trial_params": null