cwaud committed
Commit d92f206
1 Parent(s): 33fa2a1

Training in progress, step 200, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:844e860d2831d5f1c9dac4baa5e060deae878fc65580708da4d721525515e244
+ oid sha256:daa6a5dd07b72ef9430ace44e750fcdb4b7759bb0894a8b53a989328b0f60392
  size 147770496
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:17545806be4cc91e5b8716eceb6cc561c3f16d33445eae457b88bbe10d85d9b4
+ oid sha256:ee8d3047990aa86dc0a36c1ef1cd2bb44cd433c10bb8c7e8d5f0f851a8fd47e6
  size 295765866
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:23cf5c6a9723d3e0888f1d57d10b7875593a7b5ca622d73faef8dfd02c8188b6
+ oid sha256:e26f5521fc1e6154bc9e3b1f3aa7674a376b8de111db48b8d3988bf77187f582
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:89903de470ab92c49c93b06b8eee1fdb914dd305343743334145482322d2079b
+ oid sha256:fd15b1b91b4e9d24520dd50941a3a4e7796b5305fbac3c66fe134a0d0a7ca684
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:89bc779d49cd08996d91f2e4fd7feac65f39235a68fefd44802519c4293df22e
+ oid sha256:f8ec05176df7a0c2aa9818822dcf5a3c91459baab3140e17c9bffba86c07f82f
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:eb78b5918aefa5aa6e0fe0980bb3e0af4218a3955ad60ed3e2eed068948e6115
+ oid sha256:f70ac3e835d56a709928da1914610ebaf09c78697ec6a3898b27108602ce80de
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1b01fa28747f99e5f3057ec8cf64e211a75a94a55d24d1c5d7fa3e34600433ef
+ oid sha256:78f280e0e658edbdc4fcfbaf05da6eb84d8d86c74ef9e6edc7763096efc3a439
  size 1064
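
Each checkpoint file above is tracked with Git LFS: the repository only versions a small pointer holding the payload's sha256 oid and byte size, which is why every diff is a one-line oid change. As a rough sketch (not part of this repository), one might verify a downloaded file against its pointer like this; the local path is an assumption:

import hashlib
import os

def verify_lfs_pointer(path, expected_oid, expected_size):
    # Compare the local file's size and sha256 digest against the LFS pointer fields.
    if os.path.getsize(path) != expected_size:
        return False
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# oid and size taken from the adapter_model.safetensors pointer above; the path is hypothetical.
print(verify_lfs_pointer(
    "last-checkpoint/adapter_model.safetensors",
    "daa6a5dd07b72ef9430ace44e750fcdb4b7759bb0894a8b53a989328b0f60392",
    147770496,
))
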
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 1.2015514373779297,
- "best_model_checkpoint": "miner_id_24/checkpoint-150",
- "epoch": 1.29136400322841,
+ "best_metric": 1.1887668371200562,
+ "best_model_checkpoint": "miner_id_24/checkpoint-200",
+ "epoch": 1.7218186709712133,
  "eval_steps": 25,
- "global_step": 150,
+ "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1113,6 +1113,372 @@
  "eval_samples_per_second": 39.35,
  "eval_steps_per_second": 10.231,
  "step": 150
+ },
+ {
+ "epoch": 1.299973096583266,
+ "grad_norm": 0.7098111510276794,
+ "learning_rate": 8.262628577071638e-05,
+ "loss": 0.9429,
+ "step": 151
+ },
+ {
+ "epoch": 1.308582189938122,
+ "grad_norm": 0.705226719379425,
+ "learning_rate": 8.237311363287896e-05,
+ "loss": 0.9092,
+ "step": 152
+ },
+ {
+ "epoch": 1.3171912832929782,
+ "grad_norm": 1.0044783353805542,
+ "learning_rate": 8.211855863706654e-05,
+ "loss": 1.0479,
+ "step": 153
+ },
+ {
+ "epoch": 1.3258003766478343,
+ "grad_norm": 1.1548088788986206,
+ "learning_rate": 8.18626336431025e-05,
+ "loss": 1.5881,
+ "step": 154
+ },
+ {
+ "epoch": 1.3344094700026903,
+ "grad_norm": 0.8297902941703796,
+ "learning_rate": 8.160535158002092e-05,
+ "loss": 1.1529,
+ "step": 155
+ },
+ {
+ "epoch": 1.3430185633575464,
+ "grad_norm": 0.9534130096435547,
+ "learning_rate": 8.13467254454134e-05,
+ "loss": 1.1669,
+ "step": 156
+ },
+ {
+ "epoch": 1.3516276567124024,
+ "grad_norm": 0.8651900291442871,
+ "learning_rate": 8.108676830477255e-05,
+ "loss": 1.0837,
+ "step": 157
+ },
+ {
+ "epoch": 1.3602367500672585,
+ "grad_norm": 0.7334545254707336,
+ "learning_rate": 8.082549329083179e-05,
+ "loss": 0.9547,
+ "step": 158
+ },
+ {
+ "epoch": 1.3688458434221147,
+ "grad_norm": 0.7457680702209473,
+ "learning_rate": 8.056291360290201e-05,
+ "loss": 1.0481,
+ "step": 159
+ },
+ {
+ "epoch": 1.3774549367769706,
+ "grad_norm": 0.6921712756156921,
+ "learning_rate": 8.029904250620473e-05,
+ "loss": 0.8894,
+ "step": 160
+ },
+ {
+ "epoch": 1.3860640301318268,
+ "grad_norm": 0.7528761625289917,
+ "learning_rate": 8.003389333120192e-05,
+ "loss": 0.9376,
+ "step": 161
+ },
+ {
+ "epoch": 1.3946731234866827,
+ "grad_norm": 0.8016011714935303,
+ "learning_rate": 7.976747947292258e-05,
+ "loss": 0.867,
+ "step": 162
+ },
+ {
+ "epoch": 1.4032822168415389,
+ "grad_norm": 0.8211402893066406,
+ "learning_rate": 7.949981439028605e-05,
+ "loss": 0.8881,
+ "step": 163
+ },
+ {
+ "epoch": 1.411891310196395,
+ "grad_norm": 0.8662030696868896,
+ "learning_rate": 7.923091160542212e-05,
+ "loss": 1.0727,
+ "step": 164
+ },
+ {
+ "epoch": 1.420500403551251,
+ "grad_norm": 0.9291104078292847,
+ "learning_rate": 7.896078470298774e-05,
+ "loss": 0.9085,
+ "step": 165
+ },
+ {
+ "epoch": 1.4291094969061071,
+ "grad_norm": 0.9801819324493408,
+ "learning_rate": 7.868944732948101e-05,
+ "loss": 1.1554,
+ "step": 166
+ },
+ {
+ "epoch": 1.437718590260963,
+ "grad_norm": 1.1721428632736206,
+ "learning_rate": 7.841691319255154e-05,
+ "loss": 1.363,
+ "step": 167
+ },
+ {
+ "epoch": 1.4463276836158192,
+ "grad_norm": 1.0181959867477417,
+ "learning_rate": 7.814319606030803e-05,
+ "loss": 1.135,
+ "step": 168
+ },
+ {
+ "epoch": 1.4549367769706754,
+ "grad_norm": 0.757427453994751,
+ "learning_rate": 7.78683097606228e-05,
+ "loss": 0.9332,
+ "step": 169
+ },
+ {
+ "epoch": 1.4635458703255313,
+ "grad_norm": 0.8897256255149841,
+ "learning_rate": 7.759226818043309e-05,
+ "loss": 1.0629,
+ "step": 170
+ },
+ {
+ "epoch": 1.4721549636803875,
+ "grad_norm": 1.0402635335922241,
+ "learning_rate": 7.73150852650396e-05,
+ "loss": 0.9096,
+ "step": 171
+ },
+ {
+ "epoch": 1.4807640570352434,
+ "grad_norm": 0.6742547750473022,
+ "learning_rate": 7.703677501740194e-05,
+ "loss": 0.9271,
+ "step": 172
+ },
+ {
+ "epoch": 1.4893731503900995,
+ "grad_norm": 0.8658159375190735,
+ "learning_rate": 7.675735149743131e-05,
+ "loss": 0.7919,
+ "step": 173
+ },
+ {
+ "epoch": 1.4979822437449557,
+ "grad_norm": 0.8262009024620056,
+ "learning_rate": 7.647682882128002e-05,
+ "loss": 0.9107,
+ "step": 174
+ },
+ {
+ "epoch": 1.5065913370998116,
+ "grad_norm": 0.6974323391914368,
+ "learning_rate": 7.619522116062857e-05,
+ "loss": 0.8795,
+ "step": 175
+ },
+ {
+ "epoch": 1.5065913370998116,
+ "eval_loss": 1.192854642868042,
+ "eval_runtime": 1.2757,
+ "eval_samples_per_second": 39.194,
+ "eval_steps_per_second": 10.19,
+ "step": 175
+ },
+ {
+ "epoch": 1.5152004304546678,
+ "grad_norm": 0.8274783492088318,
+ "learning_rate": 7.591254274196959e-05,
+ "loss": 0.9152,
+ "step": 176
+ },
+ {
+ "epoch": 1.5238095238095237,
+ "grad_norm": 1.0990360975265503,
+ "learning_rate": 7.562880784588916e-05,
+ "loss": 0.8394,
+ "step": 177
+ },
+ {
+ "epoch": 1.5324186171643799,
+ "grad_norm": 0.8963367342948914,
+ "learning_rate": 7.534403080634538e-05,
+ "loss": 0.7276,
+ "step": 178
+ },
+ {
+ "epoch": 1.541027710519236,
+ "grad_norm": 1.506883978843689,
+ "learning_rate": 7.505822600994424e-05,
+ "loss": 1.4906,
+ "step": 179
+ },
+ {
+ "epoch": 1.549636803874092,
+ "grad_norm": 1.6557178497314453,
+ "learning_rate": 7.477140789521276e-05,
+ "loss": 1.1603,
+ "step": 180
+ },
+ {
+ "epoch": 1.5582458972289481,
+ "grad_norm": 0.9448726773262024,
+ "learning_rate": 7.448359095186973e-05,
+ "loss": 1.1379,
+ "step": 181
+ },
+ {
+ "epoch": 1.566854990583804,
+ "grad_norm": 0.754417359828949,
+ "learning_rate": 7.419478972009348e-05,
+ "loss": 1.0106,
+ "step": 182
+ },
+ {
+ "epoch": 1.5754640839386602,
+ "grad_norm": 0.844013512134552,
+ "learning_rate": 7.390501878978759e-05,
+ "loss": 0.938,
+ "step": 183
+ },
+ {
+ "epoch": 1.5840731772935164,
+ "grad_norm": 0.8120993971824646,
+ "learning_rate": 7.361429279984355e-05,
+ "loss": 0.9858,
+ "step": 184
+ },
+ {
+ "epoch": 1.5926822706483723,
+ "grad_norm": 0.8605924844741821,
+ "learning_rate": 7.332262643740144e-05,
+ "loss": 0.9036,
+ "step": 185
+ },
+ {
+ "epoch": 1.6012913640032282,
+ "grad_norm": 0.8876140117645264,
+ "learning_rate": 7.303003443710784e-05,
+ "loss": 0.8838,
+ "step": 186
+ },
+ {
+ "epoch": 1.6099004573580844,
+ "grad_norm": 0.9637414216995239,
+ "learning_rate": 7.273653158037151e-05,
+ "loss": 0.828,
+ "step": 187
+ },
+ {
+ "epoch": 1.6185095507129406,
+ "grad_norm": 0.8308393955230713,
+ "learning_rate": 7.244213269461656e-05,
+ "loss": 0.9496,
+ "step": 188
+ },
+ {
+ "epoch": 1.6271186440677967,
+ "grad_norm": 0.8119847178459167,
+ "learning_rate": 7.214685265253351e-05,
+ "loss": 0.9974,
+ "step": 189
+ },
+ {
+ "epoch": 1.6357277374226527,
+ "grad_norm": 0.8133478760719299,
+ "learning_rate": 7.185070637132787e-05,
+ "loss": 0.7914,
+ "step": 190
+ },
+ {
+ "epoch": 1.6443368307775086,
+ "grad_norm": 1.075223445892334,
+ "learning_rate": 7.15537088119665e-05,
+ "loss": 1.095,
+ "step": 191
+ },
+ {
+ "epoch": 1.6529459241323647,
+ "grad_norm": 1.0103572607040405,
+ "learning_rate": 7.12558749784219e-05,
+ "loss": 1.2195,
+ "step": 192
+ },
+ {
+ "epoch": 1.661555017487221,
+ "grad_norm": 1.1441484689712524,
+ "learning_rate": 7.095721991691411e-05,
+ "loss": 1.0929,
+ "step": 193
+ },
+ {
+ "epoch": 1.670164110842077,
+ "grad_norm": 1.2061221599578857,
+ "learning_rate": 7.065775871515072e-05,
+ "loss": 1.1353,
+ "step": 194
+ },
+ {
+ "epoch": 1.678773204196933,
+ "grad_norm": 1.0299855470657349,
+ "learning_rate": 7.035750650156458e-05,
+ "loss": 1.0423,
+ "step": 195
+ },
+ {
+ "epoch": 1.687382297551789,
+ "grad_norm": 0.7968188524246216,
+ "learning_rate": 7.005647844454949e-05,
+ "loss": 1.0405,
+ "step": 196
+ },
+ {
+ "epoch": 1.695991390906645,
+ "grad_norm": 0.7397557497024536,
+ "learning_rate": 6.975468975169402e-05,
+ "loss": 0.9024,
+ "step": 197
+ },
+ {
+ "epoch": 1.7046004842615012,
+ "grad_norm": 0.7559933662414551,
+ "learning_rate": 6.945215566901315e-05,
+ "loss": 0.8593,
+ "step": 198
+ },
+ {
+ "epoch": 1.7132095776163574,
+ "grad_norm": 0.7246858477592468,
+ "learning_rate": 6.914889148017809e-05,
+ "loss": 0.9244,
+ "step": 199
+ },
+ {
+ "epoch": 1.7218186709712133,
+ "grad_norm": 0.6950869560241699,
+ "learning_rate": 6.884491250574415e-05,
+ "loss": 0.797,
+ "step": 200
+ },
+ {
+ "epoch": 1.7218186709712133,
+ "eval_loss": 1.1887668371200562,
+ "eval_runtime": 1.2738,
+ "eval_samples_per_second": 39.253,
+ "eval_steps_per_second": 10.206,
+ "step": 200
  }
  ],
  "logging_steps": 1,
 
@@ -1141,7 +1507,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3.186390028423004e+17,
+ "total_flos": 4.248409672001782e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
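
The trainer_state.json changes above extend the Hugging Face Trainer's log history from step 150 through step 200 and record a new best eval_loss of 1.1887668371200562 at miner_id_24/checkpoint-200. A minimal sketch for inspecting the updated state after downloading the checkpoint (the local path is an assumption):

import json

with open("last-checkpoint/trainer_state.json") as f:  # hypothetical local path
    state = json.load(f)

print(state["best_metric"])            # 1.1887668371200562
print(state["best_model_checkpoint"])  # miner_id_24/checkpoint-200
print(state["global_step"])            # 200

# The per-step loss and eval records shown in the diff live under "log_history".
for entry in state["log_history"][-3:]:
    print(entry)
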