fats-fme committed
Commit e8c2181 · verified · 1 Parent(s): eb64d07

Training in progress, step 282, checkpoint
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ca8ab5d9608b907960fe2e4cec1b282565bba34ba8caa259adf031cec50b5fd9
+ oid sha256:41c04db13401440bb120e3569a23dbda67cd78267d7c0b1c77f3d3b3cee4cdee
  size 101752088
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9433992d4fe48d07ea678c15456a035b339f7ea76d22edcf4a5d4401482a6809
+ oid sha256:f1a032c7471714a5d4a253e904e854da99f9722e45c96bc0da82257681a15490
  size 203713238
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:00a86fe0c8b02bbcee8b3561ea3d9506dec8e361138bfc545d38955d42f5be26
+ oid sha256:45144a3e80d33a7835b701c1b7b63faebde586b75158a47eb826cd0228136ec0
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:acadf30df93695037d7a7f90add51398566ee38c1556f28a30733e5785f52c0d
+ oid sha256:feb6925b0db33b6f02f0ccbd50be336d8d47178a933641d2c637051d854a6c60
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e243ddb0ff8a57f4dea8942d7b9c1152f7f318f4ab23c0adbfc8e8e068c72ba
+ oid sha256:c28833a5c9fe2e108390575900c0ade8d470ff95484328f12052b199c28b6360
  size 1064
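
Each checkpoint file above is stored as a Git LFS pointer: the repository tracks only the version, oid sha256, and size fields, while the actual blob lives in LFS storage, so an updated checkpoint shows up as a one-line oid swap. A minimal sketch of how a downloaded blob could be checked against such a pointer (the file paths are hypothetical, not part of this repository's tooling):

import hashlib
from pathlib import Path

def parse_pointer(text: str) -> dict:
    # A pointer is a few "key value" lines: version, oid, size.
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

def verify(pointer_path: str, blob_path: str) -> bool:
    fields = parse_pointer(Path(pointer_path).read_text())
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    digest, size = hashlib.sha256(), 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
            size += len(chunk)
    return size == expected_size and digest.hexdigest() == expected_oid

# e.g. scheduler.pt should be 1064 bytes and hash to c28833a5... after this commit
print(verify("scheduler.pt.pointer", "scheduler.pt"))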
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.2503884572697003,
+ "epoch": 0.5007769145394007,
  "eval_steps": 141,
- "global_step": 141,
+ "global_step": 282,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1010,6 +1010,1001 @@
  "eval_samples_per_second": 5.938,
  "eval_steps_per_second": 1.489,
  "step": 141
+ },
+ {
+ "epoch": 0.25216426193118757,
+ "grad_norm": 0.3713844120502472,
+ "learning_rate": 0.0001845441764722514,
+ "loss": 0.7688,
+ "step": 142
+ },
+ {
+ "epoch": 0.2539400665926748,
+ "grad_norm": 0.352450430393219,
+ "learning_rate": 0.00018421553219875658,
+ "loss": 0.7769,
+ "step": 143
+ },
+ {
+ "epoch": 0.25571587125416206,
+ "grad_norm": 0.3609173893928528,
+ "learning_rate": 0.00018388372960726228,
+ "loss": 0.7718,
+ "step": 144
+ },
+ {
+ "epoch": 0.25749167591564925,
+ "grad_norm": 0.36195874214172363,
+ "learning_rate": 0.00018354878114129367,
+ "loss": 0.7375,
+ "step": 145
+ },
+ {
+ "epoch": 0.2592674805771365,
+ "grad_norm": 0.3802485466003418,
+ "learning_rate": 0.00018321069936235503,
+ "loss": 0.7778,
+ "step": 146
+ },
+ {
+ "epoch": 0.26104328523862375,
+ "grad_norm": 0.38449469208717346,
+ "learning_rate": 0.00018286949694945866,
+ "loss": 0.7458,
+ "step": 147
+ },
+ {
+ "epoch": 0.262819089900111,
+ "grad_norm": 0.3975572884082794,
+ "learning_rate": 0.00018252518669864936,
+ "loss": 0.7367,
+ "step": 148
+ },
+ {
+ "epoch": 0.26459489456159824,
+ "grad_norm": 0.49581316113471985,
+ "learning_rate": 0.0001821777815225245,
+ "loss": 0.7948,
+ "step": 149
+ },
+ {
+ "epoch": 0.2663706992230855,
+ "grad_norm": 0.5556712746620178,
+ "learning_rate": 0.00018182729444974992,
+ "loss": 0.8143,
+ "step": 150
+ },
+ {
+ "epoch": 0.2681465038845727,
+ "grad_norm": 0.3207700848579407,
+ "learning_rate": 0.00018147373862457107,
+ "loss": 0.8578,
+ "step": 151
+ },
+ {
+ "epoch": 0.2699223085460599,
+ "grad_norm": 0.3484250605106354,
+ "learning_rate": 0.00018111712730632022,
+ "loss": 0.8757,
+ "step": 152
+ },
+ {
+ "epoch": 0.27169811320754716,
+ "grad_norm": 0.33792024850845337,
+ "learning_rate": 0.0001807574738689193,
+ "loss": 0.8464,
+ "step": 153
+ },
+ {
+ "epoch": 0.2734739178690344,
+ "grad_norm": 0.3430371582508087,
+ "learning_rate": 0.000180394791800378,
+ "loss": 0.8607,
+ "step": 154
+ },
+ {
+ "epoch": 0.27524972253052166,
+ "grad_norm": 0.3120534420013428,
+ "learning_rate": 0.00018002909470228842,
+ "loss": 0.8392,
+ "step": 155
+ },
+ {
+ "epoch": 0.2770255271920089,
+ "grad_norm": 0.3126620054244995,
+ "learning_rate": 0.00017966039628931446,
+ "loss": 0.8191,
+ "step": 156
+ },
+ {
+ "epoch": 0.2788013318534961,
+ "grad_norm": 0.32269468903541565,
+ "learning_rate": 0.00017928871038867784,
+ "loss": 0.8164,
+ "step": 157
+ },
+ {
+ "epoch": 0.28057713651498334,
+ "grad_norm": 0.3052617907524109,
+ "learning_rate": 0.00017891405093963938,
+ "loss": 0.8268,
+ "step": 158
+ },
+ {
+ "epoch": 0.2823529411764706,
+ "grad_norm": 0.29926028847694397,
+ "learning_rate": 0.00017853643199297633,
+ "loss": 0.7847,
+ "step": 159
+ },
+ {
+ "epoch": 0.28412874583795783,
+ "grad_norm": 0.2997240722179413,
+ "learning_rate": 0.00017815586771045535,
+ "loss": 0.8143,
+ "step": 160
+ },
+ {
+ "epoch": 0.2859045504994451,
+ "grad_norm": 0.29772111773490906,
+ "learning_rate": 0.0001777723723643014,
+ "loss": 0.7412,
+ "step": 161
+ },
+ {
+ "epoch": 0.2876803551609323,
+ "grad_norm": 0.3138352632522583,
+ "learning_rate": 0.0001773859603366626,
+ "loss": 0.7747,
+ "step": 162
+ },
+ {
+ "epoch": 0.2894561598224195,
+ "grad_norm": 0.32726818323135376,
+ "learning_rate": 0.00017699664611907072,
+ "loss": 0.8123,
+ "step": 163
+ },
+ {
+ "epoch": 0.29123196448390676,
+ "grad_norm": 0.3244825005531311,
+ "learning_rate": 0.0001766044443118978,
+ "loss": 0.7705,
+ "step": 164
+ },
+ {
+ "epoch": 0.293007769145394,
+ "grad_norm": 0.35875847935676575,
+ "learning_rate": 0.00017620936962380856,
+ "loss": 0.7881,
+ "step": 165
+ },
+ {
+ "epoch": 0.29478357380688125,
+ "grad_norm": 0.36488401889801025,
+ "learning_rate": 0.00017581143687120875,
+ "loss": 0.7956,
+ "step": 166
+ },
+ {
+ "epoch": 0.2965593784683685,
+ "grad_norm": 0.33817097544670105,
+ "learning_rate": 0.00017541066097768963,
+ "loss": 0.7719,
+ "step": 167
+ },
+ {
+ "epoch": 0.29833518312985574,
+ "grad_norm": 0.36390411853790283,
+ "learning_rate": 0.0001750070569734681,
+ "loss": 0.8172,
+ "step": 168
+ },
+ {
+ "epoch": 0.30011098779134293,
+ "grad_norm": 0.34076422452926636,
+ "learning_rate": 0.00017460063999482316,
+ "loss": 0.7419,
+ "step": 169
+ },
+ {
+ "epoch": 0.3018867924528302,
+ "grad_norm": 0.39437592029571533,
+ "learning_rate": 0.00017419142528352817,
+ "loss": 0.7519,
+ "step": 170
+ },
+ {
+ "epoch": 0.3036625971143174,
+ "grad_norm": 0.4019312560558319,
+ "learning_rate": 0.00017377942818627942,
+ "loss": 0.7944,
+ "step": 171
+ },
+ {
+ "epoch": 0.30543840177580467,
+ "grad_norm": 0.40751898288726807,
+ "learning_rate": 0.00017336466415412028,
+ "loss": 0.7827,
+ "step": 172
+ },
+ {
+ "epoch": 0.3072142064372919,
+ "grad_norm": 0.4780448079109192,
+ "learning_rate": 0.0001729471487418621,
+ "loss": 0.7872,
+ "step": 173
+ },
+ {
+ "epoch": 0.30899001109877916,
+ "grad_norm": 0.40511685609817505,
+ "learning_rate": 0.0001725268976075005,
+ "loss": 0.7642,
+ "step": 174
+ },
+ {
+ "epoch": 0.31076581576026635,
+ "grad_norm": 0.5618127584457397,
+ "learning_rate": 0.0001721039265116285,
+ "loss": 0.872,
+ "step": 175
+ },
+ {
+ "epoch": 0.3125416204217536,
+ "grad_norm": 0.294917494058609,
+ "learning_rate": 0.00017167825131684513,
+ "loss": 0.8545,
+ "step": 176
+ },
+ {
+ "epoch": 0.31431742508324084,
+ "grad_norm": 0.3281805217266083,
+ "learning_rate": 0.00017124988798716083,
+ "loss": 0.8404,
+ "step": 177
+ },
+ {
+ "epoch": 0.3160932297447281,
+ "grad_norm": 0.33336278796195984,
+ "learning_rate": 0.00017081885258739846,
+ "loss": 0.8495,
+ "step": 178
+ },
+ {
+ "epoch": 0.31786903440621533,
+ "grad_norm": 0.3366440236568451,
+ "learning_rate": 0.00017038516128259115,
+ "loss": 0.8659,
+ "step": 179
+ },
+ {
+ "epoch": 0.3196448390677026,
+ "grad_norm": 0.32397955656051636,
+ "learning_rate": 0.00016994883033737582,
+ "loss": 0.8292,
+ "step": 180
+ },
+ {
+ "epoch": 0.32142064372918977,
+ "grad_norm": 0.2874945402145386,
+ "learning_rate": 0.00016950987611538324,
+ "loss": 0.7949,
+ "step": 181
+ },
+ {
+ "epoch": 0.323196448390677,
+ "grad_norm": 0.3074096143245697,
+ "learning_rate": 0.00016906831507862443,
+ "loss": 0.8076,
+ "step": 182
+ },
+ {
+ "epoch": 0.32497225305216426,
+ "grad_norm": 0.30116966366767883,
+ "learning_rate": 0.0001686241637868734,
+ "loss": 0.8058,
+ "step": 183
+ },
+ {
+ "epoch": 0.3267480577136515,
+ "grad_norm": 0.3052218556404114,
+ "learning_rate": 0.00016817743889704565,
+ "loss": 0.8067,
+ "step": 184
+ },
+ {
+ "epoch": 0.32852386237513875,
+ "grad_norm": 0.3073555827140808,
+ "learning_rate": 0.00016772815716257412,
+ "loss": 0.8496,
+ "step": 185
+ },
+ {
+ "epoch": 0.33029966703662594,
+ "grad_norm": 0.289145290851593,
+ "learning_rate": 0.0001672763354327804,
+ "loss": 0.7362,
+ "step": 186
+ },
+ {
+ "epoch": 0.3320754716981132,
+ "grad_norm": 0.31561294198036194,
+ "learning_rate": 0.00016682199065224307,
+ "loss": 0.802,
+ "step": 187
+ },
+ {
+ "epoch": 0.33385127635960044,
+ "grad_norm": 0.2900339365005493,
+ "learning_rate": 0.00016636513986016213,
+ "loss": 0.7432,
+ "step": 188
+ },
+ {
+ "epoch": 0.3356270810210877,
+ "grad_norm": 0.3267146646976471,
+ "learning_rate": 0.0001659058001897201,
+ "loss": 0.7771,
+ "step": 189
+ },
+ {
+ "epoch": 0.3374028856825749,
+ "grad_norm": 0.3258307874202728,
+ "learning_rate": 0.00016544398886743933,
+ "loss": 0.7345,
+ "step": 190
+ },
+ {
+ "epoch": 0.3391786903440622,
+ "grad_norm": 0.32989659905433655,
+ "learning_rate": 0.000164979723212536,
+ "loss": 0.7383,
+ "step": 191
+ },
+ {
+ "epoch": 0.34095449500554936,
+ "grad_norm": 0.3265599310398102,
+ "learning_rate": 0.00016451302063627066,
+ "loss": 0.6977,
+ "step": 192
+ },
+ {
+ "epoch": 0.3427302996670366,
+ "grad_norm": 0.39376598596572876,
+ "learning_rate": 0.00016404389864129533,
+ "loss": 0.7851,
+ "step": 193
+ },
+ {
+ "epoch": 0.34450610432852385,
+ "grad_norm": 0.40358301997184753,
+ "learning_rate": 0.00016357237482099684,
+ "loss": 0.7928,
+ "step": 194
+ },
+ {
+ "epoch": 0.3462819089900111,
+ "grad_norm": 0.3747034966945648,
+ "learning_rate": 0.00016309846685883726,
+ "loss": 0.7751,
+ "step": 195
+ },
+ {
+ "epoch": 0.34805771365149835,
+ "grad_norm": 0.4160248041152954,
+ "learning_rate": 0.00016262219252769064,
+ "loss": 0.8035,
+ "step": 196
+ },
+ {
+ "epoch": 0.3498335183129856,
+ "grad_norm": 0.39067476987838745,
+ "learning_rate": 0.00016214356968917648,
+ "loss": 0.6726,
+ "step": 197
+ },
+ {
+ "epoch": 0.3516093229744728,
+ "grad_norm": 0.4980023205280304,
+ "learning_rate": 0.00016166261629298995,
+ "loss": 0.7917,
+ "step": 198
+ },
+ {
+ "epoch": 0.35338512763596003,
+ "grad_norm": 0.4774058163166046,
+ "learning_rate": 0.0001611793503762285,
+ "loss": 0.7599,
+ "step": 199
+ },
+ {
+ "epoch": 0.3551609322974473,
+ "grad_norm": 0.5196167230606079,
+ "learning_rate": 0.00016069379006271566,
+ "loss": 0.7608,
+ "step": 200
+ },
+ {
+ "epoch": 0.3569367369589345,
+ "grad_norm": 0.2735799551010132,
+ "learning_rate": 0.00016020595356232135,
+ "loss": 0.8588,
+ "step": 201
+ },
+ {
+ "epoch": 0.35871254162042177,
+ "grad_norm": 0.30770814418792725,
+ "learning_rate": 0.00015971585917027862,
+ "loss": 0.8222,
+ "step": 202
+ },
+ {
+ "epoch": 0.360488346281909,
+ "grad_norm": 0.317123144865036,
+ "learning_rate": 0.00015922352526649803,
+ "loss": 0.7941,
+ "step": 203
+ },
+ {
+ "epoch": 0.3622641509433962,
+ "grad_norm": 0.32672154903411865,
+ "learning_rate": 0.00015872897031487791,
+ "loss": 0.867,
+ "step": 204
+ },
+ {
+ "epoch": 0.36403995560488345,
+ "grad_norm": 0.3169744610786438,
+ "learning_rate": 0.00015823221286261215,
+ "loss": 0.8781,
+ "step": 205
+ },
+ {
+ "epoch": 0.3658157602663707,
+ "grad_norm": 0.30588722229003906,
+ "learning_rate": 0.00015773327153949465,
+ "loss": 0.7827,
+ "step": 206
+ },
+ {
+ "epoch": 0.36759156492785794,
+ "grad_norm": 0.3179618716239929,
+ "learning_rate": 0.0001572321650572205,
+ "loss": 0.8178,
+ "step": 207
+ },
+ {
+ "epoch": 0.3693673695893452,
+ "grad_norm": 0.3094286322593689,
+ "learning_rate": 0.00015672891220868432,
+ "loss": 0.7966,
+ "step": 208
+ },
+ {
+ "epoch": 0.37114317425083243,
+ "grad_norm": 0.31584280729293823,
+ "learning_rate": 0.00015622353186727544,
+ "loss": 0.7982,
+ "step": 209
+ },
+ {
+ "epoch": 0.3729189789123196,
+ "grad_norm": 0.29120850563049316,
+ "learning_rate": 0.0001557160429861702,
+ "loss": 0.7789,
+ "step": 210
+ },
+ {
+ "epoch": 0.37469478357380687,
+ "grad_norm": 0.29743698239326477,
+ "learning_rate": 0.000155206464597621,
+ "loss": 0.7799,
+ "step": 211
+ },
+ {
+ "epoch": 0.3764705882352941,
+ "grad_norm": 0.31440189480781555,
+ "learning_rate": 0.00015469481581224272,
+ "loss": 0.7661,
+ "step": 212
+ },
+ {
+ "epoch": 0.37824639289678136,
+ "grad_norm": 0.3395606279373169,
+ "learning_rate": 0.00015418111581829574,
+ "loss": 0.7657,
+ "step": 213
+ },
+ {
+ "epoch": 0.3800221975582686,
+ "grad_norm": 0.31749066710472107,
+ "learning_rate": 0.0001536653838809667,
+ "loss": 0.7913,
+ "step": 214
+ },
+ {
+ "epoch": 0.38179800221975585,
+ "grad_norm": 0.3586166501045227,
+ "learning_rate": 0.0001531476393416456,
+ "loss": 0.7774,
+ "step": 215
+ },
+ {
+ "epoch": 0.38357380688124304,
+ "grad_norm": 0.32895100116729736,
+ "learning_rate": 0.0001526279016172008,
+ "loss": 0.7882,
+ "step": 216
+ },
+ {
+ "epoch": 0.3853496115427303,
+ "grad_norm": 0.3541489839553833,
+ "learning_rate": 0.00015210619019925066,
+ "loss": 0.7708,
+ "step": 217
+ },
+ {
+ "epoch": 0.38712541620421753,
+ "grad_norm": 0.3232908546924591,
+ "learning_rate": 0.00015158252465343242,
+ "loss": 0.7238,
+ "step": 218
+ },
+ {
+ "epoch": 0.3889012208657048,
+ "grad_norm": 0.36565467715263367,
+ "learning_rate": 0.00015105692461866874,
+ "loss": 0.7685,
+ "step": 219
+ },
+ {
+ "epoch": 0.390677025527192,
+ "grad_norm": 0.3799486756324768,
+ "learning_rate": 0.000150529409806431,
+ "loss": 0.7296,
+ "step": 220
+ },
+ {
+ "epoch": 0.39245283018867927,
+ "grad_norm": 0.4193985164165497,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 0.7731,
+ "step": 221
+ },
+ {
+ "epoch": 0.39422863485016646,
+ "grad_norm": 0.4226386845111847,
+ "learning_rate": 0.00014946871505372425,
+ "loss": 0.8048,
+ "step": 222
+ },
+ {
+ "epoch": 0.3960044395116537,
+ "grad_norm": 0.40805166959762573,
+ "learning_rate": 0.00014893557489227517,
+ "loss": 0.7389,
+ "step": 223
+ },
+ {
+ "epoch": 0.39778024417314095,
+ "grad_norm": 0.5135468244552612,
+ "learning_rate": 0.0001484005995098999,
+ "loss": 0.779,
+ "step": 224
+ },
+ {
+ "epoch": 0.3995560488346282,
+ "grad_norm": 0.6674650311470032,
+ "learning_rate": 0.0001478638089696716,
+ "loss": 0.82,
+ "step": 225
+ },
+ {
+ "epoch": 0.40133185349611544,
+ "grad_norm": 0.3206911087036133,
+ "learning_rate": 0.00014732522340273684,
+ "loss": 0.8985,
+ "step": 226
+ },
+ {
+ "epoch": 0.4031076581576027,
+ "grad_norm": 0.33583980798721313,
+ "learning_rate": 0.0001467848630075608,
+ "loss": 0.8171,
+ "step": 227
+ },
+ {
+ "epoch": 0.4048834628190899,
+ "grad_norm": 0.3324304223060608,
+ "learning_rate": 0.00014624274804916958,
+ "loss": 0.8531,
+ "step": 228
+ },
+ {
+ "epoch": 0.4066592674805771,
+ "grad_norm": 0.32210710644721985,
+ "learning_rate": 0.00014569889885839037,
+ "loss": 0.8349,
+ "step": 229
+ },
+ {
+ "epoch": 0.40843507214206437,
+ "grad_norm": 0.30829885601997375,
+ "learning_rate": 0.00014515333583108896,
+ "loss": 0.8176,
+ "step": 230
+ },
+ {
+ "epoch": 0.4102108768035516,
+ "grad_norm": 0.31730225682258606,
+ "learning_rate": 0.00014460607942740468,
+ "loss": 0.8109,
+ "step": 231
+ },
+ {
+ "epoch": 0.41198668146503886,
+ "grad_norm": 0.32128164172172546,
+ "learning_rate": 0.00014405715017098335,
+ "loss": 0.8049,
+ "step": 232
+ },
+ {
+ "epoch": 0.4137624861265261,
+ "grad_norm": 0.32257241010665894,
+ "learning_rate": 0.00014350656864820733,
+ "loss": 0.79,
+ "step": 233
+ },
+ {
+ "epoch": 0.4155382907880133,
+ "grad_norm": 0.29663363099098206,
+ "learning_rate": 0.0001429543555074237,
+ "loss": 0.7606,
+ "step": 234
+ },
+ {
+ "epoch": 0.41731409544950054,
+ "grad_norm": 0.3175968527793884,
+ "learning_rate": 0.00014240053145816967,
+ "loss": 0.8093,
+ "step": 235
+ },
+ {
+ "epoch": 0.4190899001109878,
+ "grad_norm": 0.30839797854423523,
+ "learning_rate": 0.00014184511727039612,
+ "loss": 0.8033,
+ "step": 236
+ },
+ {
+ "epoch": 0.42086570477247504,
+ "grad_norm": 0.32169485092163086,
+ "learning_rate": 0.0001412881337736885,
+ "loss": 0.7583,
+ "step": 237
+ },
+ {
+ "epoch": 0.4226415094339623,
+ "grad_norm": 0.3165202736854553,
+ "learning_rate": 0.00014072960185648577,
+ "loss": 0.7864,
+ "step": 238
+ },
+ {
+ "epoch": 0.4244173140954495,
+ "grad_norm": 0.3507262170314789,
+ "learning_rate": 0.00014016954246529696,
+ "loss": 0.8196,
+ "step": 239
+ },
+ {
+ "epoch": 0.4261931187569367,
+ "grad_norm": 0.3330634534358978,
+ "learning_rate": 0.0001396079766039157,
+ "loss": 0.7356,
+ "step": 240
+ },
+ {
+ "epoch": 0.42796892341842396,
+ "grad_norm": 0.3456502854824066,
+ "learning_rate": 0.00013904492533263244,
+ "loss": 0.7636,
+ "step": 241
+ },
+ {
+ "epoch": 0.4297447280799112,
+ "grad_norm": 0.3290559649467468,
+ "learning_rate": 0.00013848040976744457,
+ "loss": 0.6921,
+ "step": 242
+ },
+ {
+ "epoch": 0.43152053274139845,
+ "grad_norm": 0.34343284368515015,
+ "learning_rate": 0.00013791445107926478,
+ "loss": 0.7661,
+ "step": 243
+ },
+ {
+ "epoch": 0.4332963374028857,
+ "grad_norm": 0.34806933999061584,
+ "learning_rate": 0.00013734707049312673,
+ "loss": 0.7266,
+ "step": 244
+ },
+ {
+ "epoch": 0.43507214206437295,
+ "grad_norm": 0.3577682375907898,
+ "learning_rate": 0.00013677828928738934,
+ "loss": 0.7337,
+ "step": 245
+ },
+ {
+ "epoch": 0.43684794672586014,
+ "grad_norm": 0.37708649039268494,
+ "learning_rate": 0.00013620812879293863,
+ "loss": 0.6949,
+ "step": 246
+ },
+ {
+ "epoch": 0.4386237513873474,
+ "grad_norm": 0.3661216199398041,
+ "learning_rate": 0.00013563661039238785,
+ "loss": 0.7049,
+ "step": 247
+ },
+ {
+ "epoch": 0.44039955604883463,
+ "grad_norm": 0.4453539550304413,
+ "learning_rate": 0.00013506375551927547,
+ "loss": 0.7957,
+ "step": 248
+ },
+ {
+ "epoch": 0.4421753607103219,
+ "grad_norm": 0.46171826124191284,
+ "learning_rate": 0.00013448958565726144,
+ "loss": 0.7175,
+ "step": 249
+ },
+ {
+ "epoch": 0.4439511653718091,
+ "grad_norm": 0.6314205527305603,
+ "learning_rate": 0.00013391412233932149,
+ "loss": 0.8853,
+ "step": 250
+ },
+ {
+ "epoch": 0.4457269700332963,
+ "grad_norm": 0.29680782556533813,
+ "learning_rate": 0.00013333738714693956,
+ "loss": 0.8789,
+ "step": 251
+ },
+ {
+ "epoch": 0.44750277469478356,
+ "grad_norm": 0.30771735310554504,
+ "learning_rate": 0.00013275940170929843,
+ "loss": 0.8126,
+ "step": 252
+ },
+ {
+ "epoch": 0.4492785793562708,
+ "grad_norm": 0.3242880403995514,
+ "learning_rate": 0.00013218018770246858,
+ "loss": 0.7787,
+ "step": 253
+ },
+ {
+ "epoch": 0.45105438401775805,
+ "grad_norm": 0.33549076318740845,
+ "learning_rate": 0.00013159976684859527,
+ "loss": 0.8113,
+ "step": 254
+ },
+ {
+ "epoch": 0.4528301886792453,
+ "grad_norm": 0.34281155467033386,
+ "learning_rate": 0.00013101816091508388,
+ "loss": 0.8371,
+ "step": 255
+ },
+ {
+ "epoch": 0.45460599334073254,
+ "grad_norm": 0.3422442078590393,
+ "learning_rate": 0.0001304353917137836,
+ "loss": 0.8362,
+ "step": 256
+ },
+ {
+ "epoch": 0.45638179800221973,
+ "grad_norm": 0.3019155263900757,
+ "learning_rate": 0.00012985148110016947,
+ "loss": 0.7317,
+ "step": 257
+ },
+ {
+ "epoch": 0.458157602663707,
+ "grad_norm": 0.32793429493904114,
+ "learning_rate": 0.0001292664509725226,
+ "loss": 0.7861,
+ "step": 258
+ },
+ {
+ "epoch": 0.4599334073251942,
+ "grad_norm": 0.32433855533599854,
+ "learning_rate": 0.00012868032327110904,
+ "loss": 0.7708,
+ "step": 259
+ },
+ {
+ "epoch": 0.46170921198668147,
+ "grad_norm": 0.31858816742897034,
+ "learning_rate": 0.00012809311997735696,
+ "loss": 0.7754,
+ "step": 260
+ },
+ {
+ "epoch": 0.4634850166481687,
+ "grad_norm": 0.3172609210014343,
+ "learning_rate": 0.00012750486311303218,
+ "loss": 0.7839,
+ "step": 261
+ },
+ {
+ "epoch": 0.46526082130965596,
+ "grad_norm": 0.2951931953430176,
+ "learning_rate": 0.00012691557473941243,
+ "loss": 0.7261,
+ "step": 262
+ },
+ {
+ "epoch": 0.46703662597114315,
+ "grad_norm": 0.31385374069213867,
+ "learning_rate": 0.00012632527695645993,
+ "loss": 0.8221,
+ "step": 263
+ },
+ {
+ "epoch": 0.4688124306326304,
+ "grad_norm": 0.31157392263412476,
+ "learning_rate": 0.0001257339919019925,
+ "loss": 0.7711,
+ "step": 264
+ },
+ {
+ "epoch": 0.47058823529411764,
+ "grad_norm": 0.32580870389938354,
+ "learning_rate": 0.00012514174175085345,
+ "loss": 0.7592,
+ "step": 265
+ },
+ {
+ "epoch": 0.4723640399556049,
+ "grad_norm": 0.33285781741142273,
+ "learning_rate": 0.00012454854871407994,
+ "loss": 0.7349,
+ "step": 266
+ },
+ {
+ "epoch": 0.47413984461709213,
+ "grad_norm": 0.3179035186767578,
+ "learning_rate": 0.0001239544350380699,
+ "loss": 0.7338,
+ "step": 267
+ },
+ {
+ "epoch": 0.4759156492785794,
+ "grad_norm": 0.31393003463745117,
+ "learning_rate": 0.00012335942300374788,
+ "loss": 0.7088,
+ "step": 268
+ },
+ {
+ "epoch": 0.47769145394006657,
+ "grad_norm": 0.33285436034202576,
+ "learning_rate": 0.00012276353492572935,
+ "loss": 0.7069,
+ "step": 269
+ },
+ {
+ "epoch": 0.4794672586015538,
+ "grad_norm": 0.38329485058784485,
+ "learning_rate": 0.00012216679315148386,
+ "loss": 0.7093,
+ "step": 270
+ },
+ {
+ "epoch": 0.48124306326304106,
+ "grad_norm": 0.3584016263484955,
+ "learning_rate": 0.00012156922006049702,
+ "loss": 0.7513,
+ "step": 271
+ },
+ {
+ "epoch": 0.4830188679245283,
+ "grad_norm": 0.3995126187801361,
+ "learning_rate": 0.00012097083806343103,
+ "loss": 0.7384,
+ "step": 272
+ },
+ {
+ "epoch": 0.48479467258601555,
+ "grad_norm": 0.4097007215023041,
+ "learning_rate": 0.00012037166960128443,
+ "loss": 0.7794,
+ "step": 273
+ },
+ {
+ "epoch": 0.4865704772475028,
+ "grad_norm": 0.4780315160751343,
+ "learning_rate": 0.00011977173714455034,
+ "loss": 0.7437,
+ "step": 274
+ },
+ {
+ "epoch": 0.48834628190899,
+ "grad_norm": 0.5396427512168884,
+ "learning_rate": 0.00011917106319237386,
+ "loss": 0.7542,
+ "step": 275
+ },
+ {
+ "epoch": 0.49012208657047723,
+ "grad_norm": 0.29439178109169006,
+ "learning_rate": 0.00011856967027170818,
+ "loss": 0.8389,
+ "step": 276
+ },
+ {
+ "epoch": 0.4918978912319645,
+ "grad_norm": 0.3243663012981415,
+ "learning_rate": 0.00011796758093646989,
+ "loss": 0.8767,
+ "step": 277
+ },
+ {
+ "epoch": 0.4936736958934517,
+ "grad_norm": 0.342454195022583,
+ "learning_rate": 0.00011736481776669306,
+ "loss": 0.8538,
+ "step": 278
+ },
+ {
+ "epoch": 0.49544950055493897,
+ "grad_norm": 0.30882903933525085,
+ "learning_rate": 0.00011676140336768236,
+ "loss": 0.7766,
+ "step": 279
+ },
+ {
+ "epoch": 0.4972253052164262,
+ "grad_norm": 0.3247200548648834,
+ "learning_rate": 0.00011615736036916549,
+ "loss": 0.8268,
+ "step": 280
+ },
+ {
+ "epoch": 0.4990011098779134,
+ "grad_norm": 0.3077162504196167,
+ "learning_rate": 0.00011555271142444433,
+ "loss": 0.7786,
+ "step": 281
+ },
+ {
+ "epoch": 0.5007769145394007,
+ "grad_norm": 0.3300260603427887,
+ "learning_rate": 0.00011494747920954545,
+ "loss": 0.7853,
+ "step": 282
+ },
+ {
+ "epoch": 0.5007769145394007,
+ "eval_loss": 0.7658749222755432,
+ "eval_runtime": 158.4653,
+ "eval_samples_per_second": 5.989,
+ "eval_steps_per_second": 1.502,
+ "step": 282
  }
  ],
  "logging_steps": 1,
@@ -1029,7 +2024,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.8570783162472858e+17,
+ "total_flos": 3.7141566324945715e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null