fats-fme commited on
Commit
90b49c8
·
verified ·
1 Parent(s): 73feb1a

Training in progress, step 303, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a2250d36748bb7d399966be9e1ac96a9989a297e888397e3e7ebcf51fac95e
3
  size 200068512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0427f86d7d5760f9f2892185120d52ea905f189a23778690b640c6e7b2bfee0
3
  size 200068512
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:465489ed56c14e7c881e482d47c72f5a1c0e9572aadcab5d66cba1ebb7f9ff9f
3
  size 400361770
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fda5403109d1815513cd307c5d04da79ff238c94f57ef60264630aad1a0537c0
3
  size 400361770
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b113f3c97acb6b80373cbd3c63d2c9966ff4fe4f9787a4fba50fe48a6ceff302
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd63a3f186d7d9668e94f89f556dd29bd07623e521811d733335bd7a7673589
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7798188a3b0517541e640967d23618db3978bee71a27396e53ea24d979b35816
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a3e48ee3b6e1ca20c61b24db65efd5ba71f40aca5455a3e5c4ff8759187809
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8b5a959d9459f2fc9422d29e6b89c56e276e4fcd74ca81bc84eb9dc629ceec0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd637ac2493efabb59c857ff309da18b7f1fa1c1f76d6f3f94628c5048f8d88
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.14887365328109697,
5
  "eval_steps": 76,
6
- "global_step": 228,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1635,6 +1635,531 @@
1635
  "eval_samples_per_second": 3.672,
1636
  "eval_steps_per_second": 0.918,
1637
  "step": 228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1638
  }
1639
  ],
1640
  "logging_steps": 1,
@@ -1649,12 +2174,12 @@
1649
  "should_evaluate": false,
1650
  "should_log": false,
1651
  "should_save": true,
1652
- "should_training_stop": false
1653
  },
1654
  "attributes": {}
1655
  }
1656
  },
1657
- "total_flos": 6.994076271252603e+17,
1658
  "train_batch_size": 2,
1659
  "trial_name": null,
1660
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.197845249755142,
5
  "eval_steps": 76,
6
+ "global_step": 303,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1635
  "eval_samples_per_second": 3.672,
1636
  "eval_steps_per_second": 0.918,
1637
  "step": 228
1638
+ },
1639
+ {
1640
+ "epoch": 0.1495266079007509,
1641
+ "grad_norm": 2.8979952335357666,
1642
+ "learning_rate": 3.932929715102863e-05,
1643
+ "loss": 1.4961,
1644
+ "step": 229
1645
+ },
1646
+ {
1647
+ "epoch": 0.15017956252040482,
1648
+ "grad_norm": 2.8216731548309326,
1649
+ "learning_rate": 3.834691207696649e-05,
1650
+ "loss": 1.5215,
1651
+ "step": 230
1652
+ },
1653
+ {
1654
+ "epoch": 0.15083251714005877,
1655
+ "grad_norm": 2.5655124187469482,
1656
+ "learning_rate": 3.7374033224987084e-05,
1657
+ "loss": 1.5517,
1658
+ "step": 231
1659
+ },
1660
+ {
1661
+ "epoch": 0.1514854717597127,
1662
+ "grad_norm": 2.572943687438965,
1663
+ "learning_rate": 3.6410810602214684e-05,
1664
+ "loss": 1.2292,
1665
+ "step": 232
1666
+ },
1667
+ {
1668
+ "epoch": 0.15213842637936664,
1669
+ "grad_norm": 2.6719319820404053,
1670
+ "learning_rate": 3.5457392726890236e-05,
1671
+ "loss": 1.1804,
1672
+ "step": 233
1673
+ },
1674
+ {
1675
+ "epoch": 0.15279138099902057,
1676
+ "grad_norm": 2.3251240253448486,
1677
+ "learning_rate": 3.45139266054715e-05,
1678
+ "loss": 0.9235,
1679
+ "step": 234
1680
+ },
1681
+ {
1682
+ "epoch": 0.15344433561867452,
1683
+ "grad_norm": 2.7189831733703613,
1684
+ "learning_rate": 3.3580557709966066e-05,
1685
+ "loss": 0.9769,
1686
+ "step": 235
1687
+ },
1688
+ {
1689
+ "epoch": 0.15409729023832844,
1690
+ "grad_norm": 2.5098495483398438,
1691
+ "learning_rate": 3.2657429955501394e-05,
1692
+ "loss": 1.1983,
1693
+ "step": 236
1694
+ },
1695
+ {
1696
+ "epoch": 0.15475024485798236,
1697
+ "grad_norm": 2.41629695892334,
1698
+ "learning_rate": 3.174468567813461e-05,
1699
+ "loss": 1.7956,
1700
+ "step": 237
1701
+ },
1702
+ {
1703
+ "epoch": 0.1554031994776363,
1704
+ "grad_norm": 2.5007240772247314,
1705
+ "learning_rate": 3.0842465612905837e-05,
1706
+ "loss": 1.7452,
1707
+ "step": 238
1708
+ },
1709
+ {
1710
+ "epoch": 0.15605615409729023,
1711
+ "grad_norm": 1.9491175413131714,
1712
+ "learning_rate": 2.9950908872138584e-05,
1713
+ "loss": 1.3075,
1714
+ "step": 239
1715
+ },
1716
+ {
1717
+ "epoch": 0.15670910871694418,
1718
+ "grad_norm": 2.146371841430664,
1719
+ "learning_rate": 2.9070152923989946e-05,
1720
+ "loss": 1.7472,
1721
+ "step": 240
1722
+ },
1723
+ {
1724
+ "epoch": 0.1573620633365981,
1725
+ "grad_norm": 2.1137588024139404,
1726
+ "learning_rate": 2.82003335712546e-05,
1727
+ "loss": 1.8301,
1728
+ "step": 241
1729
+ },
1730
+ {
1731
+ "epoch": 0.15801501795625203,
1732
+ "grad_norm": 2.4170126914978027,
1733
+ "learning_rate": 2.7341584930425657e-05,
1734
+ "loss": 1.8519,
1735
+ "step": 242
1736
+ },
1737
+ {
1738
+ "epoch": 0.15866797257590598,
1739
+ "grad_norm": 2.6382076740264893,
1740
+ "learning_rate": 2.6494039411015193e-05,
1741
+ "loss": 2.012,
1742
+ "step": 243
1743
+ },
1744
+ {
1745
+ "epoch": 0.1593209271955599,
1746
+ "grad_norm": 2.1745855808258057,
1747
+ "learning_rate": 2.5657827695138372e-05,
1748
+ "loss": 1.7181,
1749
+ "step": 244
1750
+ },
1751
+ {
1752
+ "epoch": 0.15997388181521385,
1753
+ "grad_norm": 2.9943010807037354,
1754
+ "learning_rate": 2.4833078717363544e-05,
1755
+ "loss": 1.5537,
1756
+ "step": 245
1757
+ },
1758
+ {
1759
+ "epoch": 0.16062683643486778,
1760
+ "grad_norm": 3.608306407928467,
1761
+ "learning_rate": 2.4019919644832023e-05,
1762
+ "loss": 1.7291,
1763
+ "step": 246
1764
+ },
1765
+ {
1766
+ "epoch": 0.1612797910545217,
1767
+ "grad_norm": 2.9223315715789795,
1768
+ "learning_rate": 2.3218475857650346e-05,
1769
+ "loss": 1.7161,
1770
+ "step": 247
1771
+ },
1772
+ {
1773
+ "epoch": 0.16193274567417565,
1774
+ "grad_norm": 3.305222749710083,
1775
+ "learning_rate": 2.242887092955801e-05,
1776
+ "loss": 1.4356,
1777
+ "step": 248
1778
+ },
1779
+ {
1780
+ "epoch": 0.16258570029382957,
1781
+ "grad_norm": 2.8326313495635986,
1782
+ "learning_rate": 2.1651226608873877e-05,
1783
+ "loss": 1.3055,
1784
+ "step": 249
1785
+ },
1786
+ {
1787
+ "epoch": 0.16323865491348352,
1788
+ "grad_norm": 3.042375087738037,
1789
+ "learning_rate": 2.0885662799723804e-05,
1790
+ "loss": 0.8947,
1791
+ "step": 250
1792
+ },
1793
+ {
1794
+ "epoch": 0.16389160953313744,
1795
+ "grad_norm": 2.6567325592041016,
1796
+ "learning_rate": 2.0132297543552757e-05,
1797
+ "loss": 1.776,
1798
+ "step": 251
1799
+ },
1800
+ {
1801
+ "epoch": 0.16454456415279137,
1802
+ "grad_norm": 2.997938871383667,
1803
+ "learning_rate": 1.939124700092423e-05,
1804
+ "loss": 1.8796,
1805
+ "step": 252
1806
+ },
1807
+ {
1808
+ "epoch": 0.16519751877244532,
1809
+ "grad_norm": 2.9053895473480225,
1810
+ "learning_rate": 1.866262543360958e-05,
1811
+ "loss": 1.662,
1812
+ "step": 253
1813
+ },
1814
+ {
1815
+ "epoch": 0.16585047339209924,
1816
+ "grad_norm": 2.6591832637786865,
1817
+ "learning_rate": 1.7946545186970022e-05,
1818
+ "loss": 1.6189,
1819
+ "step": 254
1820
+ },
1821
+ {
1822
+ "epoch": 0.1665034280117532,
1823
+ "grad_norm": 2.9311482906341553,
1824
+ "learning_rate": 1.7243116672634262e-05,
1825
+ "loss": 1.5663,
1826
+ "step": 255
1827
+ },
1828
+ {
1829
+ "epoch": 0.1671563826314071,
1830
+ "grad_norm": 2.9493162631988525,
1831
+ "learning_rate": 1.6552448351474304e-05,
1832
+ "loss": 1.5125,
1833
+ "step": 256
1834
+ },
1835
+ {
1836
+ "epoch": 0.16780933725106106,
1837
+ "grad_norm": 2.19980525970459,
1838
+ "learning_rate": 1.587464671688187e-05,
1839
+ "loss": 1.171,
1840
+ "step": 257
1841
+ },
1842
+ {
1843
+ "epoch": 0.168462291870715,
1844
+ "grad_norm": 3.4264473915100098,
1845
+ "learning_rate": 1.520981627834851e-05,
1846
+ "loss": 1.0619,
1847
+ "step": 258
1848
+ },
1849
+ {
1850
+ "epoch": 0.1691152464903689,
1851
+ "grad_norm": 2.755953550338745,
1852
+ "learning_rate": 1.4558059545351143e-05,
1853
+ "loss": 1.3709,
1854
+ "step": 259
1855
+ },
1856
+ {
1857
+ "epoch": 0.16976820111002286,
1858
+ "grad_norm": 3.0671868324279785,
1859
+ "learning_rate": 1.3919477011546423e-05,
1860
+ "loss": 1.3373,
1861
+ "step": 260
1862
+ },
1863
+ {
1864
+ "epoch": 0.17042115572967678,
1865
+ "grad_norm": 2.703378200531006,
1866
+ "learning_rate": 1.3294167139275593e-05,
1867
+ "loss": 1.4812,
1868
+ "step": 261
1869
+ },
1870
+ {
1871
+ "epoch": 0.17107411034933073,
1872
+ "grad_norm": 2.7871146202087402,
1873
+ "learning_rate": 1.2682226344382796e-05,
1874
+ "loss": 1.5082,
1875
+ "step": 262
1876
+ },
1877
+ {
1878
+ "epoch": 0.17172706496898466,
1879
+ "grad_norm": 2.1928627490997314,
1880
+ "learning_rate": 1.208374898134883e-05,
1881
+ "loss": 1.7859,
1882
+ "step": 263
1883
+ },
1884
+ {
1885
+ "epoch": 0.17238001958863858,
1886
+ "grad_norm": 2.330217123031616,
1887
+ "learning_rate": 1.1498827328742623e-05,
1888
+ "loss": 1.9261,
1889
+ "step": 264
1890
+ },
1891
+ {
1892
+ "epoch": 0.17303297420829253,
1893
+ "grad_norm": 2.532562732696533,
1894
+ "learning_rate": 1.0927551574992967e-05,
1895
+ "loss": 1.744,
1896
+ "step": 265
1897
+ },
1898
+ {
1899
+ "epoch": 0.17368592882794645,
1900
+ "grad_norm": 2.453615188598633,
1901
+ "learning_rate": 1.0370009804482483e-05,
1902
+ "loss": 1.8084,
1903
+ "step": 266
1904
+ },
1905
+ {
1906
+ "epoch": 0.1743388834476004,
1907
+ "grad_norm": 2.09717059135437,
1908
+ "learning_rate": 9.82628798396592e-06,
1909
+ "loss": 1.7786,
1910
+ "step": 267
1911
+ },
1912
+ {
1913
+ "epoch": 0.17499183806725432,
1914
+ "grad_norm": 3.074509620666504,
1915
+ "learning_rate": 9.296469949315156e-06,
1916
+ "loss": 1.9725,
1917
+ "step": 268
1918
+ },
1919
+ {
1920
+ "epoch": 0.17564479268690825,
1921
+ "grad_norm": 2.6540865898132324,
1922
+ "learning_rate": 8.780637392592495e-06,
1923
+ "loss": 1.6455,
1924
+ "step": 269
1925
+ },
1926
+ {
1927
+ "epoch": 0.1762977473065622,
1928
+ "grad_norm": 3.0181193351745605,
1929
+ "learning_rate": 8.278869849454718e-06,
1930
+ "loss": 1.7383,
1931
+ "step": 270
1932
+ },
1933
+ {
1934
+ "epoch": 0.17695070192621612,
1935
+ "grad_norm": 3.034897565841675,
1936
+ "learning_rate": 7.791244686889588e-06,
1937
+ "loss": 1.7912,
1938
+ "step": 271
1939
+ },
1940
+ {
1941
+ "epoch": 0.17760365654587007,
1942
+ "grad_norm": 3.0905239582061768,
1943
+ "learning_rate": 7.317837091286706e-06,
1944
+ "loss": 1.5027,
1945
+ "step": 272
1946
+ },
1947
+ {
1948
+ "epoch": 0.178256611165524,
1949
+ "grad_norm": 3.1407511234283447,
1950
+ "learning_rate": 6.858720056844614e-06,
1951
+ "loss": 1.2953,
1952
+ "step": 273
1953
+ },
1954
+ {
1955
+ "epoch": 0.17890956578517794,
1956
+ "grad_norm": 2.7985000610351562,
1957
+ "learning_rate": 6.413964374315851e-06,
1958
+ "loss": 1.4958,
1959
+ "step": 274
1960
+ },
1961
+ {
1962
+ "epoch": 0.17956252040483187,
1963
+ "grad_norm": 2.70698881149292,
1964
+ "learning_rate": 5.983638620091858e-06,
1965
+ "loss": 1.1345,
1966
+ "step": 275
1967
+ },
1968
+ {
1969
+ "epoch": 0.1802154750244858,
1970
+ "grad_norm": 3.302675485610962,
1971
+ "learning_rate": 5.567809145629244e-06,
1972
+ "loss": 1.8954,
1973
+ "step": 276
1974
+ },
1975
+ {
1976
+ "epoch": 0.18086842964413974,
1977
+ "grad_norm": 2.7587976455688477,
1978
+ "learning_rate": 5.1665400672190725e-06,
1979
+ "loss": 1.6744,
1980
+ "step": 277
1981
+ },
1982
+ {
1983
+ "epoch": 0.18152138426379366,
1984
+ "grad_norm": 2.4605860710144043,
1985
+ "learning_rate": 4.7798932561009865e-06,
1986
+ "loss": 1.4846,
1987
+ "step": 278
1988
+ },
1989
+ {
1990
+ "epoch": 0.1821743388834476,
1991
+ "grad_norm": 3.0369081497192383,
1992
+ "learning_rate": 4.407928328923194e-06,
1993
+ "loss": 1.5561,
1994
+ "step": 279
1995
+ },
1996
+ {
1997
+ "epoch": 0.18282729350310153,
1998
+ "grad_norm": 2.6894164085388184,
1999
+ "learning_rate": 4.050702638550275e-06,
2000
+ "loss": 1.1031,
2001
+ "step": 280
2002
+ },
2003
+ {
2004
+ "epoch": 0.18348024812275546,
2005
+ "grad_norm": 2.849666118621826,
2006
+ "learning_rate": 3.7082712652200867e-06,
2007
+ "loss": 1.366,
2008
+ "step": 281
2009
+ },
2010
+ {
2011
+ "epoch": 0.1841332027424094,
2012
+ "grad_norm": 2.834463119506836,
2013
+ "learning_rate": 3.380687008050909e-06,
2014
+ "loss": 1.1118,
2015
+ "step": 282
2016
+ },
2017
+ {
2018
+ "epoch": 0.18478615736206333,
2019
+ "grad_norm": 2.5449914932250977,
2020
+ "learning_rate": 3.068000376900515e-06,
2021
+ "loss": 1.0414,
2022
+ "step": 283
2023
+ },
2024
+ {
2025
+ "epoch": 0.18543911198171728,
2026
+ "grad_norm": 2.6311261653900146,
2027
+ "learning_rate": 2.770259584577972e-06,
2028
+ "loss": 0.9975,
2029
+ "step": 284
2030
+ },
2031
+ {
2032
+ "epoch": 0.1860920666013712,
2033
+ "grad_norm": 2.644763946533203,
2034
+ "learning_rate": 2.4875105394098654e-06,
2035
+ "loss": 1.1698,
2036
+ "step": 285
2037
+ },
2038
+ {
2039
+ "epoch": 0.18674502122102513,
2040
+ "grad_norm": 2.4958488941192627,
2041
+ "learning_rate": 2.219796838161681e-06,
2042
+ "loss": 1.6067,
2043
+ "step": 286
2044
+ },
2045
+ {
2046
+ "epoch": 0.18739797584067908,
2047
+ "grad_norm": 2.388773202896118,
2048
+ "learning_rate": 1.967159759315751e-06,
2049
+ "loss": 1.9297,
2050
+ "step": 287
2051
+ },
2052
+ {
2053
+ "epoch": 0.188050930460333,
2054
+ "grad_norm": 2.2818784713745117,
2055
+ "learning_rate": 1.7296382567064672e-06,
2056
+ "loss": 1.7271,
2057
+ "step": 288
2058
+ },
2059
+ {
2060
+ "epoch": 0.18870388507998695,
2061
+ "grad_norm": 2.581960439682007,
2062
+ "learning_rate": 1.5072689535141072e-06,
2063
+ "loss": 1.7268,
2064
+ "step": 289
2065
+ },
2066
+ {
2067
+ "epoch": 0.18935683969964087,
2068
+ "grad_norm": 2.5180134773254395,
2069
+ "learning_rate": 1.3000861366179062e-06,
2070
+ "loss": 1.9191,
2071
+ "step": 290
2072
+ },
2073
+ {
2074
+ "epoch": 0.19000979431929482,
2075
+ "grad_norm": 2.125568151473999,
2076
+ "learning_rate": 1.1081217513094212e-06,
2077
+ "loss": 1.7848,
2078
+ "step": 291
2079
+ },
2080
+ {
2081
+ "epoch": 0.19066274893894875,
2082
+ "grad_norm": 2.7064099311828613,
2083
+ "learning_rate": 9.314053963669245e-07,
2084
+ "loss": 1.6782,
2085
+ "step": 292
2086
+ },
2087
+ {
2088
+ "epoch": 0.19131570355860267,
2089
+ "grad_norm": 2.437058448791504,
2090
+ "learning_rate": 7.699643194915784e-07,
2091
+ "loss": 1.8157,
2092
+ "step": 293
2093
+ },
2094
+ {
2095
+ "epoch": 0.19196865817825662,
2096
+ "grad_norm": 3.013352870941162,
2097
+ "learning_rate": 6.238234131061616e-07,
2098
+ "loss": 1.3813,
2099
+ "step": 294
2100
+ },
2101
+ {
2102
+ "epoch": 0.19262161279791054,
2103
+ "grad_norm": 2.7739081382751465,
2104
+ "learning_rate": 4.93005210516928e-07,
2105
+ "loss": 1.7656,
2106
+ "step": 295
2107
+ },
2108
+ {
2109
+ "epoch": 0.1932745674175645,
2110
+ "grad_norm": 3.043750286102295,
2111
+ "learning_rate": 3.775298824391982e-07,
2112
+ "loss": 1.5386,
2113
+ "step": 296
2114
+ },
2115
+ {
2116
+ "epoch": 0.1939275220372184,
2117
+ "grad_norm": 2.856957197189331,
2118
+ "learning_rate": 2.774152338873126e-07,
2119
+ "loss": 1.4946,
2120
+ "step": 297
2121
+ },
2122
+ {
2123
+ "epoch": 0.19458047665687234,
2124
+ "grad_norm": 2.9437901973724365,
2125
+ "learning_rate": 1.9267670142926187e-07,
2126
+ "loss": 1.3205,
2127
+ "step": 298
2128
+ },
2129
+ {
2130
+ "epoch": 0.1952334312765263,
2131
+ "grad_norm": 2.6561126708984375,
2132
+ "learning_rate": 1.2332735080651248e-07,
2133
+ "loss": 1.1922,
2134
+ "step": 299
2135
+ },
2136
+ {
2137
+ "epoch": 0.1958863858961802,
2138
+ "grad_norm": 2.5239169597625732,
2139
+ "learning_rate": 6.9377874919474e-08,
2140
+ "loss": 0.956,
2141
+ "step": 300
2142
+ },
2143
+ {
2144
+ "epoch": 0.19653934051583416,
2145
+ "grad_norm": 2.8316755294799805,
2146
+ "learning_rate": 3.0836592178717926e-08,
2147
+ "loss": 1.8631,
2148
+ "step": 301
2149
+ },
2150
+ {
2151
+ "epoch": 0.19719229513548808,
2152
+ "grad_norm": 2.5579283237457275,
2153
+ "learning_rate": 7.709445222403577e-09,
2154
+ "loss": 1.4522,
2155
+ "step": 302
2156
+ },
2157
+ {
2158
+ "epoch": 0.197845249755142,
2159
+ "grad_norm": 2.6677441596984863,
2160
+ "learning_rate": 0.0,
2161
+ "loss": 1.7175,
2162
+ "step": 303
2163
  }
2164
  ],
2165
  "logging_steps": 1,
 
2174
  "should_evaluate": false,
2175
  "should_log": false,
2176
  "should_save": true,
2177
+ "should_training_stop": true
2178
  },
2179
  "attributes": {}
2180
  }
2181
  },
2182
+ "total_flos": 9.294759255217275e+17,
2183
  "train_batch_size": 2,
2184
  "trial_name": null,
2185
  "trial_params": null