Nexspear committed on
Commit
baf04f6
·
verified ·
1 Parent(s): 58fab32

Training in progress, step 250, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06ce627cdf978313cef135b956261249dacd1bd1df2c373f6ad783346e0af483
3
  size 156926880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0220fc1d90ea8fc616a75a349e3a3de3f88c807fd95fbaeb193fce07d25ce28e
3
  size 156926880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:18d9fdddc307660017992a02560363ce71fd150bf9e6b4a56622008485a71007
3
  size 79968772
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8aa25bcfa9e3a2765287904893589e75a733b33907eb0862824d2de06ddf0b
3
  size 79968772
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a2b134c170c3861762b86730d8fb738bec4fdfede8e45ec8de681871bebc097
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cda862f419774a7c8b4ce1fb67a9fbe630bd62f435694c988f3646fa0602c1e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:606e244bd4e377995f514a9afd58b985a53e5ab6166ef4b49183e16b2f949d47
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e60969e1684cc952ca893a4a3214ee3106c2aedea456babc2aab138cf120dc6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.6099563241004944,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 1.6618257261410787,
5
  "eval_steps": 50,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1447,6 +1447,364 @@
1447
  "eval_samples_per_second": 28.342,
1448
  "eval_steps_per_second": 7.12,
1449
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1450
  }
1451
  ],
1452
  "logging_steps": 1,
@@ -1475,7 +1833,7 @@
1475
  "attributes": {}
1476
  }
1477
  },
1478
- "total_flos": 7.94731890081792e+16,
1479
  "train_batch_size": 8,
1480
  "trial_name": null,
1481
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.4097523093223572,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-250",
4
+ "epoch": 2.078838174273859,
5
  "eval_steps": 50,
6
+ "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1447
  "eval_samples_per_second": 28.342,
1448
  "eval_steps_per_second": 7.12,
1449
  "step": 200
1450
+ },
1451
+ {
1452
+ "epoch": 1.6701244813278007,
1453
+ "grad_norm": 7.207304954528809,
1454
+ "learning_rate": 4.308566424176336e-05,
1455
+ "loss": 0.5757,
1456
+ "step": 201
1457
+ },
1458
+ {
1459
+ "epoch": 1.6784232365145229,
1460
+ "grad_norm": 8.592957496643066,
1461
+ "learning_rate": 4.264272645841419e-05,
1462
+ "loss": 0.487,
1463
+ "step": 202
1464
+ },
1465
+ {
1466
+ "epoch": 1.6867219917012448,
1467
+ "grad_norm": 4.523548603057861,
1468
+ "learning_rate": 4.2200378060659116e-05,
1469
+ "loss": 0.1985,
1470
+ "step": 203
1471
+ },
1472
+ {
1473
+ "epoch": 1.6950207468879668,
1474
+ "grad_norm": 3.577657699584961,
1475
+ "learning_rate": 4.1758654484692186e-05,
1476
+ "loss": 0.217,
1477
+ "step": 204
1478
+ },
1479
+ {
1480
+ "epoch": 1.703319502074689,
1481
+ "grad_norm": 5.221662998199463,
1482
+ "learning_rate": 4.131759111665349e-05,
1483
+ "loss": 0.3242,
1484
+ "step": 205
1485
+ },
1486
+ {
1487
+ "epoch": 1.711618257261411,
1488
+ "grad_norm": 4.403975009918213,
1489
+ "learning_rate": 4.087722328979438e-05,
1490
+ "loss": 0.2363,
1491
+ "step": 206
1492
+ },
1493
+ {
1494
+ "epoch": 1.7199170124481329,
1495
+ "grad_norm": 4.955993175506592,
1496
+ "learning_rate": 4.043758628164688e-05,
1497
+ "loss": 0.4589,
1498
+ "step": 207
1499
+ },
1500
+ {
1501
+ "epoch": 1.7282157676348548,
1502
+ "grad_norm": 5.290609836578369,
1503
+ "learning_rate": 3.9998715311197785e-05,
1504
+ "loss": 0.304,
1505
+ "step": 208
1506
+ },
1507
+ {
1508
+ "epoch": 1.7365145228215768,
1509
+ "grad_norm": 4.659958362579346,
1510
+ "learning_rate": 3.956064553606708e-05,
1511
+ "loss": 0.1433,
1512
+ "step": 209
1513
+ },
1514
+ {
1515
+ "epoch": 1.7448132780082988,
1516
+ "grad_norm": 6.113132953643799,
1517
+ "learning_rate": 3.912341204969164e-05,
1518
+ "loss": 0.2607,
1519
+ "step": 210
1520
+ },
1521
+ {
1522
+ "epoch": 1.7531120331950207,
1523
+ "grad_norm": 7.161942005157471,
1524
+ "learning_rate": 3.86870498785139e-05,
1525
+ "loss": 0.5274,
1526
+ "step": 211
1527
+ },
1528
+ {
1529
+ "epoch": 1.7614107883817427,
1530
+ "grad_norm": 7.1513671875,
1531
+ "learning_rate": 3.825159397917589e-05,
1532
+ "loss": 0.4634,
1533
+ "step": 212
1534
+ },
1535
+ {
1536
+ "epoch": 1.7697095435684647,
1537
+ "grad_norm": 5.32570219039917,
1538
+ "learning_rate": 3.781707923571891e-05,
1539
+ "loss": 0.1875,
1540
+ "step": 213
1541
+ },
1542
+ {
1543
+ "epoch": 1.7780082987551866,
1544
+ "grad_norm": 4.187331199645996,
1545
+ "learning_rate": 3.738354045678891e-05,
1546
+ "loss": 0.1516,
1547
+ "step": 214
1548
+ },
1549
+ {
1550
+ "epoch": 1.7863070539419086,
1551
+ "grad_norm": 7.296047687530518,
1552
+ "learning_rate": 3.695101237284815e-05,
1553
+ "loss": 0.3847,
1554
+ "step": 215
1555
+ },
1556
+ {
1557
+ "epoch": 1.7946058091286305,
1558
+ "grad_norm": 5.951739311218262,
1559
+ "learning_rate": 3.651952963339282e-05,
1560
+ "loss": 0.2656,
1561
+ "step": 216
1562
+ },
1563
+ {
1564
+ "epoch": 1.8029045643153527,
1565
+ "grad_norm": 6.873353958129883,
1566
+ "learning_rate": 3.608912680417737e-05,
1567
+ "loss": 0.165,
1568
+ "step": 217
1569
+ },
1570
+ {
1571
+ "epoch": 1.8112033195020747,
1572
+ "grad_norm": 6.497419834136963,
1573
+ "learning_rate": 3.5659838364445505e-05,
1574
+ "loss": 0.2504,
1575
+ "step": 218
1576
+ },
1577
+ {
1578
+ "epoch": 1.8195020746887967,
1579
+ "grad_norm": 7.605969429016113,
1580
+ "learning_rate": 3.523169870416795e-05,
1581
+ "loss": 0.3794,
1582
+ "step": 219
1583
+ },
1584
+ {
1585
+ "epoch": 1.8278008298755186,
1586
+ "grad_norm": 7.117875576019287,
1587
+ "learning_rate": 3.480474212128766e-05,
1588
+ "loss": 0.3687,
1589
+ "step": 220
1590
+ },
1591
+ {
1592
+ "epoch": 1.8360995850622408,
1593
+ "grad_norm": 4.442015647888184,
1594
+ "learning_rate": 3.4379002818972124e-05,
1595
+ "loss": 0.2042,
1596
+ "step": 221
1597
+ },
1598
+ {
1599
+ "epoch": 1.8443983402489628,
1600
+ "grad_norm": 8.238639831542969,
1601
+ "learning_rate": 3.3954514902873425e-05,
1602
+ "loss": 0.5648,
1603
+ "step": 222
1604
+ },
1605
+ {
1606
+ "epoch": 1.8526970954356847,
1607
+ "grad_norm": 7.620614051818848,
1608
+ "learning_rate": 3.3531312378396026e-05,
1609
+ "loss": 0.3973,
1610
+ "step": 223
1611
+ },
1612
+ {
1613
+ "epoch": 1.8609958506224067,
1614
+ "grad_norm": 5.214080333709717,
1615
+ "learning_rate": 3.310942914797265e-05,
1616
+ "loss": 0.2133,
1617
+ "step": 224
1618
+ },
1619
+ {
1620
+ "epoch": 1.8692946058091287,
1621
+ "grad_norm": 5.911538600921631,
1622
+ "learning_rate": 3.2688899008348386e-05,
1623
+ "loss": 0.219,
1624
+ "step": 225
1625
+ },
1626
+ {
1627
+ "epoch": 1.8775933609958506,
1628
+ "grad_norm": 10.658268928527832,
1629
+ "learning_rate": 3.226975564787322e-05,
1630
+ "loss": 0.338,
1631
+ "step": 226
1632
+ },
1633
+ {
1634
+ "epoch": 1.8858921161825726,
1635
+ "grad_norm": 6.180187702178955,
1636
+ "learning_rate": 3.185203264380338e-05,
1637
+ "loss": 0.2234,
1638
+ "step": 227
1639
+ },
1640
+ {
1641
+ "epoch": 1.8941908713692945,
1642
+ "grad_norm": 5.702350616455078,
1643
+ "learning_rate": 3.143576345961132e-05,
1644
+ "loss": 0.2215,
1645
+ "step": 228
1646
+ },
1647
+ {
1648
+ "epoch": 1.9024896265560165,
1649
+ "grad_norm": 2.7787413597106934,
1650
+ "learning_rate": 3.1020981442305184e-05,
1651
+ "loss": 0.0856,
1652
+ "step": 229
1653
+ },
1654
+ {
1655
+ "epoch": 1.9107883817427385,
1656
+ "grad_norm": 5.721746921539307,
1657
+ "learning_rate": 3.060771981975726e-05,
1658
+ "loss": 0.3339,
1659
+ "step": 230
1660
+ },
1661
+ {
1662
+ "epoch": 1.9190871369294604,
1663
+ "grad_norm": 6.0323567390441895,
1664
+ "learning_rate": 3.019601169804216e-05,
1665
+ "loss": 0.1757,
1666
+ "step": 231
1667
+ },
1668
+ {
1669
+ "epoch": 1.9273858921161826,
1670
+ "grad_norm": 8.973467826843262,
1671
+ "learning_rate": 2.978589005878476e-05,
1672
+ "loss": 0.262,
1673
+ "step": 232
1674
+ },
1675
+ {
1676
+ "epoch": 1.9356846473029046,
1677
+ "grad_norm": 3.4507062435150146,
1678
+ "learning_rate": 2.9377387756517982e-05,
1679
+ "loss": 0.1235,
1680
+ "step": 233
1681
+ },
1682
+ {
1683
+ "epoch": 1.9439834024896265,
1684
+ "grad_norm": 5.919799327850342,
1685
+ "learning_rate": 2.897053751605093e-05,
1686
+ "loss": 0.2862,
1687
+ "step": 234
1688
+ },
1689
+ {
1690
+ "epoch": 1.9522821576763485,
1691
+ "grad_norm": 4.9692487716674805,
1692
+ "learning_rate": 2.8565371929847284e-05,
1693
+ "loss": 0.1764,
1694
+ "step": 235
1695
+ },
1696
+ {
1697
+ "epoch": 1.9605809128630707,
1698
+ "grad_norm": 5.740837574005127,
1699
+ "learning_rate": 2.8161923455414367e-05,
1700
+ "loss": 0.1705,
1701
+ "step": 236
1702
+ },
1703
+ {
1704
+ "epoch": 1.9688796680497926,
1705
+ "grad_norm": 5.595933437347412,
1706
+ "learning_rate": 2.776022441270295e-05,
1707
+ "loss": 0.2613,
1708
+ "step": 237
1709
+ },
1710
+ {
1711
+ "epoch": 1.9771784232365146,
1712
+ "grad_norm": 4.591209888458252,
1713
+ "learning_rate": 2.7360306981518146e-05,
1714
+ "loss": 0.1906,
1715
+ "step": 238
1716
+ },
1717
+ {
1718
+ "epoch": 1.9854771784232366,
1719
+ "grad_norm": 7.699284553527832,
1720
+ "learning_rate": 2.6962203198941587e-05,
1721
+ "loss": 0.4312,
1722
+ "step": 239
1723
+ },
1724
+ {
1725
+ "epoch": 1.9937759336099585,
1726
+ "grad_norm": 6.995236873626709,
1727
+ "learning_rate": 2.656594495676482e-05,
1728
+ "loss": 0.3351,
1729
+ "step": 240
1730
+ },
1731
+ {
1732
+ "epoch": 2.004149377593361,
1733
+ "grad_norm": 6.956718921661377,
1734
+ "learning_rate": 2.6171563998934605e-05,
1735
+ "loss": 0.2213,
1736
+ "step": 241
1737
+ },
1738
+ {
1739
+ "epoch": 2.012448132780083,
1740
+ "grad_norm": 3.839200258255005,
1741
+ "learning_rate": 2.5779091919009877e-05,
1742
+ "loss": 0.2139,
1743
+ "step": 242
1744
+ },
1745
+ {
1746
+ "epoch": 2.020746887966805,
1747
+ "grad_norm": 1.65491783618927,
1748
+ "learning_rate": 2.5388560157630765e-05,
1749
+ "loss": 0.0616,
1750
+ "step": 243
1751
+ },
1752
+ {
1753
+ "epoch": 2.029045643153527,
1754
+ "grad_norm": 3.041247606277466,
1755
+ "learning_rate": 2.500000000000001e-05,
1756
+ "loss": 0.0922,
1757
+ "step": 244
1758
+ },
1759
+ {
1760
+ "epoch": 2.037344398340249,
1761
+ "grad_norm": 1.837149739265442,
1762
+ "learning_rate": 2.461344257337662e-05,
1763
+ "loss": 0.0552,
1764
+ "step": 245
1765
+ },
1766
+ {
1767
+ "epoch": 2.045643153526971,
1768
+ "grad_norm": 3.2361204624176025,
1769
+ "learning_rate": 2.422891884458241e-05,
1770
+ "loss": 0.0536,
1771
+ "step": 246
1772
+ },
1773
+ {
1774
+ "epoch": 2.0539419087136928,
1775
+ "grad_norm": 4.026633262634277,
1776
+ "learning_rate": 2.3846459617521128e-05,
1777
+ "loss": 0.2692,
1778
+ "step": 247
1779
+ },
1780
+ {
1781
+ "epoch": 2.0622406639004147,
1782
+ "grad_norm": 3.3415215015411377,
1783
+ "learning_rate": 2.346609553071093e-05,
1784
+ "loss": 0.0902,
1785
+ "step": 248
1786
+ },
1787
+ {
1788
+ "epoch": 2.070539419087137,
1789
+ "grad_norm": 0.9877287745475769,
1790
+ "learning_rate": 2.308785705482982e-05,
1791
+ "loss": 0.0222,
1792
+ "step": 249
1793
+ },
1794
+ {
1795
+ "epoch": 2.078838174273859,
1796
+ "grad_norm": 1.6253547668457031,
1797
+ "learning_rate": 2.2711774490274766e-05,
1798
+ "loss": 0.0311,
1799
+ "step": 250
1800
+ },
1801
+ {
1802
+ "epoch": 2.078838174273859,
1803
+ "eval_loss": 0.4097523093223572,
1804
+ "eval_runtime": 7.1697,
1805
+ "eval_samples_per_second": 28.314,
1806
+ "eval_steps_per_second": 7.113,
1807
+ "step": 250
1808
  }
1809
  ],
1810
  "logging_steps": 1,
 
1833
  "attributes": {}
1834
  }
1835
  },
1836
+ "total_flos": 9.9341486260224e+16,
1837
  "train_batch_size": 8,
1838
  "trial_name": null,
1839
  "trial_params": null