g4rg commited on
Commit
61145c7
1 Parent(s): a5eadd1

Training in progress, step 326, checkpoint

Browse files
Files changed (28) hide show
  1. last-checkpoint/adapter_model.safetensors +1 -1
  2. last-checkpoint/global_step326/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  3. last-checkpoint/global_step326/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  4. last-checkpoint/global_step326/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  5. last-checkpoint/global_step326/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  6. last-checkpoint/global_step326/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  7. last-checkpoint/global_step326/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  8. last-checkpoint/global_step326/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  9. last-checkpoint/global_step326/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  10. last-checkpoint/global_step326/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  11. last-checkpoint/global_step326/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  12. last-checkpoint/global_step326/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  13. last-checkpoint/global_step326/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  14. last-checkpoint/global_step326/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  15. last-checkpoint/global_step326/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  16. last-checkpoint/global_step326/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  17. last-checkpoint/global_step326/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  18. last-checkpoint/latest +1 -1
  19. last-checkpoint/rng_state_0.pth +1 -1
  20. last-checkpoint/rng_state_1.pth +1 -1
  21. last-checkpoint/rng_state_2.pth +1 -1
  22. last-checkpoint/rng_state_3.pth +1 -1
  23. last-checkpoint/rng_state_4.pth +1 -1
  24. last-checkpoint/rng_state_5.pth +1 -1
  25. last-checkpoint/rng_state_6.pth +1 -1
  26. last-checkpoint/rng_state_7.pth +1 -1
  27. last-checkpoint/scheduler.pt +1 -1
  28. last-checkpoint/trainer_state.json +438 -4
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6b7fbaf2d6a6e1654728bf2b64ff7a097f615d5247c146dd31d3eccfa8fc30f
3
  size 763470136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa8b2208e339d0966b30c53e09a99df61e2311b931ce1a0d629c8ac892f616a5
3
  size 763470136
last-checkpoint/global_step326/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43b3137f947c6139cf829b67978cdba814ebd02eaee9fdcc3c0f167fd35fd35b
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6db33c332d8581358d602504c1c91f0d3a883b074cf661ac66a15b03dd40abd
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2873da28d4036072b8fb47729fcb07fd661ca126c821a1edd1b5e0e0e30097
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2310adddf2072beb2ec0a8d7336c7313d0522f5d558ddf1404c757f814fa692
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c0a3de1b7fabd73a2f8e028bc47bd34d2cadec6c6a7e0f55c1ec00eaa8d5f09
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65446abe8198c8cdae4ef4d6043c5b7cc00aaa255a4eb5fde655e3a2c814d26
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99b1a452b0ea7e88f778e0e3e688314048124e3e186d2dbefa61af7e7fcb6d38
3
+ size 289065424
last-checkpoint/global_step326/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:feac3121144190c7b1081674f7d997011edfb3697de551be384f00cdc0ba3d16
3
+ size 289065424
last-checkpoint/global_step326/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99cf2bb29fbbb9b7a3cdf4a3f999edea48aca7423658b0613e3cad205c425e76
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ae71c4246c7a236ee1d9aafa93f4d8184b878bddd8411262e99f470ba6a22d6
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f353e9171125c04ba586a5e1399b5e436346795c2d781358b392ae0949ba32f3
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b3bac07cff339e40e4d293b58834fa924ecd46e0a7004f2a7b23e4ae2dccd50
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdcdd82841eb104d9e8a4621cac38919d3eb554fb1fff0a673fde94a7dd6e2ed
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baf08c5cca6ee378c601258dcdcdc6750f8263163766efc8715e728d7374f16b
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04351dfca49e5d3ecaa0f61d4f3ff75c1018f5ac923af4f043a17b42b16aa183
3
+ size 348711830
last-checkpoint/global_step326/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7323dd294955fbd4f4d06dbac88975d8a0dfb8cf0c1a2c0e35064347930eed11
3
+ size 348711830
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step264
 
1
+ global_step326
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:756188867614fe144ce7bb4100b8fdc4a53793718efdbfd597ab9a7af1127cb3
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7773f084902535989bdb41582efe57404415ae441c0e941b91e35ed5bef8d6c
3
  size 15920
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9326dda8ccb88256fea16bdb08bf3d8ee2d7890d74941621ea0ae79baad53127
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb6f1e872eaa090ac7fcbb7390762ebd32f4720fffac3f24df60938a27e68cd4
3
  size 15920
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dc7ea8107c02800ceda5d3219d8139cc0c46423c770369f8d482750d2ee66b59
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:168f0069d86758b09cb8707be4dc71abfea652954fd7c1fc7710c08989d444bb
3
  size 15920
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5100775819feb4598b355aaf5ae7a2d05f1e6c33d82585848692501430716b79
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6f3f1877afae4463c0da7af29b5016c2a4b26f8ab03a4bb94b21beefb8705ac
3
  size 15920
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:375d7beb01cab64b2715fb3d805593967127e2433072776577d1a22535bc71f6
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc6bc1a489faad48156164ba681062284f4ce06e78099aed3eb21be38bdcae8
3
  size 15920
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be4bc162636adeba1331e40da73f3fb1fde2fb44472545ff46bc3e2a6588d115
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3874e1aa39cf2ac616290be1045cac257b998568136e9a70f9a79d503a77c1be
3
  size 15920
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e4ec9613f9c318e718457c34ba482fb1b487745cd80d6e26c4479f47030f964
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08d35009470b1536e71f50cca3e4f2587ed7caac64c4ff1c8286f89f2bdbd9d9
3
  size 15920
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6c5785e3656da35a0034b82ee38c2b260ac87d57dc93498957445739f27c017
3
  size 15920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6aec33103c51dff2dd3527e0d1edfb46d84c375b17676323ddceb55412f0047
3
  size 15920
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26eca587873b25805521ebb406b132a4ba3e54d5f099d35d9e497769da91dcd6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c1397d76155071779653df2de895577183fdb8d7655b1d6346b073c3c09830d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8098159509202454,
5
  "eval_steps": 66,
6
- "global_step": 264,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1895,6 +1895,440 @@
1895
  "eval_samples_per_second": 1.794,
1896
  "eval_steps_per_second": 0.126,
1897
  "step": 264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1898
  }
1899
  ],
1900
  "logging_steps": 1,
@@ -1909,12 +2343,12 @@
1909
  "should_evaluate": false,
1910
  "should_log": false,
1911
  "should_save": true,
1912
- "should_training_stop": false
1913
  },
1914
  "attributes": {}
1915
  }
1916
  },
1917
- "total_flos": 288286794842112.0,
1918
  "train_batch_size": 2,
1919
  "trial_name": null,
1920
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 66,
6
+ "global_step": 326,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1895
  "eval_samples_per_second": 1.794,
1896
  "eval_steps_per_second": 0.126,
1897
  "step": 264
1898
+ },
1899
+ {
1900
+ "epoch": 0.8128834355828221,
1901
+ "grad_norm": 0.2830531876648041,
1902
+ "learning_rate": 1.8540000807185192e-05,
1903
+ "loss": 1.9384,
1904
+ "step": 265
1905
+ },
1906
+ {
1907
+ "epoch": 0.8159509202453987,
1908
+ "grad_norm": 0.2670241830579454,
1909
+ "learning_rate": 1.827113894397003e-05,
1910
+ "loss": 1.8443,
1911
+ "step": 266
1912
+ },
1913
+ {
1914
+ "epoch": 0.8190184049079755,
1915
+ "grad_norm": 0.5199599677205632,
1916
+ "learning_rate": 1.800614841383898e-05,
1917
+ "loss": 1.9262,
1918
+ "step": 267
1919
+ },
1920
+ {
1921
+ "epoch": 0.8220858895705522,
1922
+ "grad_norm": 0.2979059774589199,
1923
+ "learning_rate": 1.7745057147595694e-05,
1924
+ "loss": 1.8408,
1925
+ "step": 268
1926
+ },
1927
+ {
1928
+ "epoch": 0.8251533742331288,
1929
+ "grad_norm": 0.3369017601149041,
1930
+ "learning_rate": 1.7487892665049627e-05,
1931
+ "loss": 1.9671,
1932
+ "step": 269
1933
+ },
1934
+ {
1935
+ "epoch": 0.8282208588957055,
1936
+ "grad_norm": 0.24208825522114308,
1937
+ "learning_rate": 1.7234682072115305e-05,
1938
+ "loss": 1.9101,
1939
+ "step": 270
1940
+ },
1941
+ {
1942
+ "epoch": 0.8312883435582822,
1943
+ "grad_norm": 0.3809834134932596,
1944
+ "learning_rate": 1.698545205795536e-05,
1945
+ "loss": 1.8445,
1946
+ "step": 271
1947
+ },
1948
+ {
1949
+ "epoch": 0.8343558282208589,
1950
+ "grad_norm": 0.27384739149228576,
1951
+ "learning_rate": 1.674022889216737e-05,
1952
+ "loss": 1.9337,
1953
+ "step": 272
1954
+ },
1955
+ {
1956
+ "epoch": 0.8374233128834356,
1957
+ "grad_norm": 0.25542052798806203,
1958
+ "learning_rate": 1.6499038422014962e-05,
1959
+ "loss": 1.8697,
1960
+ "step": 273
1961
+ },
1962
+ {
1963
+ "epoch": 0.8404907975460123,
1964
+ "grad_norm": 0.30649006891608727,
1965
+ "learning_rate": 1.626190606970346e-05,
1966
+ "loss": 1.8985,
1967
+ "step": 274
1968
+ },
1969
+ {
1970
+ "epoch": 0.843558282208589,
1971
+ "grad_norm": 0.27648461915446576,
1972
+ "learning_rate": 1.602885682970026e-05,
1973
+ "loss": 1.8851,
1974
+ "step": 275
1975
+ },
1976
+ {
1977
+ "epoch": 0.8466257668711656,
1978
+ "grad_norm": 1.1533982638871452,
1979
+ "learning_rate": 1.57999152661004e-05,
1980
+ "loss": 1.9318,
1981
+ "step": 276
1982
+ },
1983
+ {
1984
+ "epoch": 0.8496932515337423,
1985
+ "grad_norm": 0.33969524913455146,
1986
+ "learning_rate": 1.5575105510037396e-05,
1987
+ "loss": 2.0149,
1988
+ "step": 277
1989
+ },
1990
+ {
1991
+ "epoch": 0.852760736196319,
1992
+ "grad_norm": 0.5956725111127443,
1993
+ "learning_rate": 1.53544512571397e-05,
1994
+ "loss": 1.8834,
1995
+ "step": 278
1996
+ },
1997
+ {
1998
+ "epoch": 0.8558282208588958,
1999
+ "grad_norm": 0.5892298656241596,
2000
+ "learning_rate": 1.5137975765033205e-05,
2001
+ "loss": 1.8972,
2002
+ "step": 279
2003
+ },
2004
+ {
2005
+ "epoch": 0.8588957055214724,
2006
+ "grad_norm": 0.41593605055209165,
2007
+ "learning_rate": 1.4925701850889772e-05,
2008
+ "loss": 1.9427,
2009
+ "step": 280
2010
+ },
2011
+ {
2012
+ "epoch": 0.8619631901840491,
2013
+ "grad_norm": 0.2630748817948859,
2014
+ "learning_rate": 1.4717651889022202e-05,
2015
+ "loss": 1.9469,
2016
+ "step": 281
2017
+ },
2018
+ {
2019
+ "epoch": 0.8650306748466258,
2020
+ "grad_norm": 0.2232832403928089,
2021
+ "learning_rate": 1.4513847808525969e-05,
2022
+ "loss": 1.9662,
2023
+ "step": 282
2024
+ },
2025
+ {
2026
+ "epoch": 0.8680981595092024,
2027
+ "grad_norm": 0.31719749827250515,
2028
+ "learning_rate": 1.4314311090967786e-05,
2029
+ "loss": 1.9091,
2030
+ "step": 283
2031
+ },
2032
+ {
2033
+ "epoch": 0.8711656441717791,
2034
+ "grad_norm": 0.301123405840287,
2035
+ "learning_rate": 1.4119062768121433e-05,
2036
+ "loss": 1.8862,
2037
+ "step": 284
2038
+ },
2039
+ {
2040
+ "epoch": 0.8742331288343558,
2041
+ "grad_norm": 0.6726088360165043,
2042
+ "learning_rate": 1.3928123419750888e-05,
2043
+ "loss": 1.8739,
2044
+ "step": 285
2045
+ },
2046
+ {
2047
+ "epoch": 0.8773006134969326,
2048
+ "grad_norm": 0.4202167476604764,
2049
+ "learning_rate": 1.3741513171441176e-05,
2050
+ "loss": 1.9232,
2051
+ "step": 286
2052
+ },
2053
+ {
2054
+ "epoch": 0.8803680981595092,
2055
+ "grad_norm": 0.304988395998919,
2056
+ "learning_rate": 1.3559251692477087e-05,
2057
+ "loss": 1.9318,
2058
+ "step": 287
2059
+ },
2060
+ {
2061
+ "epoch": 0.8834355828220859,
2062
+ "grad_norm": 0.274507041819108,
2063
+ "learning_rate": 1.3381358193769976e-05,
2064
+ "loss": 1.8499,
2065
+ "step": 288
2066
+ },
2067
+ {
2068
+ "epoch": 0.8865030674846626,
2069
+ "grad_norm": 0.47861538421593386,
2070
+ "learning_rate": 1.320785142583284e-05,
2071
+ "loss": 1.9518,
2072
+ "step": 289
2073
+ },
2074
+ {
2075
+ "epoch": 0.8895705521472392,
2076
+ "grad_norm": 0.45942646770952145,
2077
+ "learning_rate": 1.3038749676803994e-05,
2078
+ "loss": 1.9109,
2079
+ "step": 290
2080
+ },
2081
+ {
2082
+ "epoch": 0.8926380368098159,
2083
+ "grad_norm": 0.27087716251353355,
2084
+ "learning_rate": 1.2874070770519428e-05,
2085
+ "loss": 1.8813,
2086
+ "step": 291
2087
+ },
2088
+ {
2089
+ "epoch": 0.8957055214723927,
2090
+ "grad_norm": 0.255203728473793,
2091
+ "learning_rate": 1.2713832064634126e-05,
2092
+ "loss": 1.873,
2093
+ "step": 292
2094
+ },
2095
+ {
2096
+ "epoch": 0.8987730061349694,
2097
+ "grad_norm": 0.40071001023936836,
2098
+ "learning_rate": 1.2558050448792515e-05,
2099
+ "loss": 1.9324,
2100
+ "step": 293
2101
+ },
2102
+ {
2103
+ "epoch": 0.901840490797546,
2104
+ "grad_norm": 0.33237213114045755,
2105
+ "learning_rate": 1.2406742342848248e-05,
2106
+ "loss": 1.96,
2107
+ "step": 294
2108
+ },
2109
+ {
2110
+ "epoch": 0.9049079754601227,
2111
+ "grad_norm": 0.2921583930232282,
2112
+ "learning_rate": 1.2259923695133503e-05,
2113
+ "loss": 1.8696,
2114
+ "step": 295
2115
+ },
2116
+ {
2117
+ "epoch": 0.9079754601226994,
2118
+ "grad_norm": 0.2753105203678559,
2119
+ "learning_rate": 1.2117609980777959e-05,
2120
+ "loss": 1.9038,
2121
+ "step": 296
2122
+ },
2123
+ {
2124
+ "epoch": 0.911042944785276,
2125
+ "grad_norm": 0.497963211949326,
2126
+ "learning_rate": 1.1979816200077707e-05,
2127
+ "loss": 1.9388,
2128
+ "step": 297
2129
+ },
2130
+ {
2131
+ "epoch": 0.9141104294478528,
2132
+ "grad_norm": 0.2474786285871462,
2133
+ "learning_rate": 1.1846556876914151e-05,
2134
+ "loss": 1.9544,
2135
+ "step": 298
2136
+ },
2137
+ {
2138
+ "epoch": 0.9171779141104295,
2139
+ "grad_norm": 0.26791445026050176,
2140
+ "learning_rate": 1.1717846057223144e-05,
2141
+ "loss": 1.9231,
2142
+ "step": 299
2143
+ },
2144
+ {
2145
+ "epoch": 0.9202453987730062,
2146
+ "grad_norm": 0.3923236183364779,
2147
+ "learning_rate": 1.159369730751452e-05,
2148
+ "loss": 1.8686,
2149
+ "step": 300
2150
+ },
2151
+ {
2152
+ "epoch": 0.9233128834355828,
2153
+ "grad_norm": 0.36556731516768504,
2154
+ "learning_rate": 1.1474123713442137e-05,
2155
+ "loss": 1.9278,
2156
+ "step": 301
2157
+ },
2158
+ {
2159
+ "epoch": 0.9263803680981595,
2160
+ "grad_norm": 0.24192425833135245,
2161
+ "learning_rate": 1.1359137878424578e-05,
2162
+ "loss": 1.8853,
2163
+ "step": 302
2164
+ },
2165
+ {
2166
+ "epoch": 0.9294478527607362,
2167
+ "grad_norm": 0.31690600810620534,
2168
+ "learning_rate": 1.1248751922316776e-05,
2169
+ "loss": 1.9523,
2170
+ "step": 303
2171
+ },
2172
+ {
2173
+ "epoch": 0.9325153374233128,
2174
+ "grad_norm": 0.27955140199036155,
2175
+ "learning_rate": 1.1142977480132493e-05,
2176
+ "loss": 1.8225,
2177
+ "step": 304
2178
+ },
2179
+ {
2180
+ "epoch": 0.9355828220858896,
2181
+ "grad_norm": 0.2831264739725871,
2182
+ "learning_rate": 1.104182570081797e-05,
2183
+ "loss": 1.9258,
2184
+ "step": 305
2185
+ },
2186
+ {
2187
+ "epoch": 0.9386503067484663,
2188
+ "grad_norm": 0.26580496177825247,
2189
+ "learning_rate": 1.0945307246076797e-05,
2190
+ "loss": 1.9327,
2191
+ "step": 306
2192
+ },
2193
+ {
2194
+ "epoch": 0.941717791411043,
2195
+ "grad_norm": 0.30887069355917346,
2196
+ "learning_rate": 1.0853432289246138e-05,
2197
+ "loss": 1.9412,
2198
+ "step": 307
2199
+ },
2200
+ {
2201
+ "epoch": 0.9447852760736196,
2202
+ "grad_norm": 0.44810137462917216,
2203
+ "learning_rate": 1.076621051422442e-05,
2204
+ "loss": 1.9057,
2205
+ "step": 308
2206
+ },
2207
+ {
2208
+ "epoch": 0.9478527607361963,
2209
+ "grad_norm": 0.27583855429775517,
2210
+ "learning_rate": 1.0683651114450641e-05,
2211
+ "loss": 1.9357,
2212
+ "step": 309
2213
+ },
2214
+ {
2215
+ "epoch": 0.950920245398773,
2216
+ "grad_norm": 0.26050390516719396,
2217
+ "learning_rate": 1.0605762791935325e-05,
2218
+ "loss": 1.8674,
2219
+ "step": 310
2220
+ },
2221
+ {
2222
+ "epoch": 0.9539877300613497,
2223
+ "grad_norm": 0.26034125726942287,
2224
+ "learning_rate": 1.0532553756343328e-05,
2225
+ "loss": 1.8837,
2226
+ "step": 311
2227
+ },
2228
+ {
2229
+ "epoch": 0.9570552147239264,
2230
+ "grad_norm": 0.380331760419281,
2231
+ "learning_rate": 1.0464031724128512e-05,
2232
+ "loss": 1.9202,
2233
+ "step": 312
2234
+ },
2235
+ {
2236
+ "epoch": 0.9601226993865031,
2237
+ "grad_norm": 0.3024899052220286,
2238
+ "learning_rate": 1.0400203917720394e-05,
2239
+ "loss": 1.833,
2240
+ "step": 313
2241
+ },
2242
+ {
2243
+ "epoch": 0.9631901840490797,
2244
+ "grad_norm": 0.26156906536760005,
2245
+ "learning_rate": 1.0341077064762893e-05,
2246
+ "loss": 1.8538,
2247
+ "step": 314
2248
+ },
2249
+ {
2250
+ "epoch": 0.9662576687116564,
2251
+ "grad_norm": 0.5419644400783428,
2252
+ "learning_rate": 1.0286657397405204e-05,
2253
+ "loss": 1.8956,
2254
+ "step": 315
2255
+ },
2256
+ {
2257
+ "epoch": 0.9693251533742331,
2258
+ "grad_norm": 0.2754473793756419,
2259
+ "learning_rate": 1.0236950651644922e-05,
2260
+ "loss": 1.8821,
2261
+ "step": 316
2262
+ },
2263
+ {
2264
+ "epoch": 0.9723926380368099,
2265
+ "grad_norm": 0.32743295245170423,
2266
+ "learning_rate": 1.019196206672345e-05,
2267
+ "loss": 1.8669,
2268
+ "step": 317
2269
+ },
2270
+ {
2271
+ "epoch": 0.9754601226993865,
2272
+ "grad_norm": 0.2983793501294546,
2273
+ "learning_rate": 1.0151696384573753e-05,
2274
+ "loss": 1.8806,
2275
+ "step": 318
2276
+ },
2277
+ {
2278
+ "epoch": 0.9785276073619632,
2279
+ "grad_norm": 0.274678179585171,
2280
+ "learning_rate": 1.011615784932056e-05,
2281
+ "loss": 1.9428,
2282
+ "step": 319
2283
+ },
2284
+ {
2285
+ "epoch": 0.9815950920245399,
2286
+ "grad_norm": 0.802831711997894,
2287
+ "learning_rate": 1.0085350206833016e-05,
2288
+ "loss": 1.8988,
2289
+ "step": 320
2290
+ },
2291
+ {
2292
+ "epoch": 0.9846625766871165,
2293
+ "grad_norm": 0.36523952422202455,
2294
+ "learning_rate": 1.0059276704329856e-05,
2295
+ "loss": 1.8695,
2296
+ "step": 321
2297
+ },
2298
+ {
2299
+ "epoch": 0.9877300613496932,
2300
+ "grad_norm": 0.2857793976397457,
2301
+ "learning_rate": 1.003794009003713e-05,
2302
+ "loss": 1.8923,
2303
+ "step": 322
2304
+ },
2305
+ {
2306
+ "epoch": 0.99079754601227,
2307
+ "grad_norm": 0.306887686398712,
2308
+ "learning_rate": 1.0021342612898534e-05,
2309
+ "loss": 1.9541,
2310
+ "step": 323
2311
+ },
2312
+ {
2313
+ "epoch": 0.9938650306748467,
2314
+ "grad_norm": 0.5124292513803443,
2315
+ "learning_rate": 1.0009486022338391e-05,
2316
+ "loss": 1.9622,
2317
+ "step": 324
2318
+ },
2319
+ {
2320
+ "epoch": 0.9969325153374233,
2321
+ "grad_norm": 0.27281561169770374,
2322
+ "learning_rate": 1.0002371568077212e-05,
2323
+ "loss": 1.9336,
2324
+ "step": 325
2325
+ },
2326
+ {
2327
+ "epoch": 1.0,
2328
+ "grad_norm": 0.28851290398135704,
2329
+ "learning_rate": 1e-05,
2330
+ "loss": 1.8766,
2331
+ "step": 326
2332
  }
2333
  ],
2334
  "logging_steps": 1,
 
2343
  "should_evaluate": false,
2344
  "should_log": false,
2345
  "should_save": true,
2346
+ "should_training_stop": true
2347
  },
2348
  "attributes": {}
2349
  }
2350
  },
2351
+ "total_flos": 355990511812608.0,
2352
  "train_batch_size": 2,
2353
  "trial_name": null,
2354
  "trial_params": null