jdannem6 commited on
Commit
75243b0
1 Parent(s): 15f7c85

Uploaded checkpoint-17500

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1793 -3
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72488cd54366854c06a01fafe7066eaa10428f64cfdfa8ca4f9c680c16b0c7ca
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:162241111a35b4dd2bd8251eb44c4f9ed485c39f432082deec2b913318be26b3
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24f80cbc86f212c40a3aca37f4d46ec54ba6068eb48c70c79ac1b990ed212bde
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2640cb6f400e7439fff7ee437394f2676dd7a329f43b4ef033bc2e958e48c385
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49329d32f6b2bd4cf3b30fe0a52e76812bcd564ae6f0febf2a8baa96581aa016
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6ca59958578f29e636921c5d01edf609d279634685f5c1700ffd019a9a229f9
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cf4bd40b0e3062c56584d972e9743cac19669a0283ba7de8c76540e6d58df00
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb29930cc2c8e69e7c76b92867840499fb9c566b9d6b348753e567d4e680bb99
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 1.3866758346557617,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-12500",
4
- "epoch": 0.375,
5
  "eval_steps": 500,
6
- "global_step": 15000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -10747,6 +10747,1796 @@
10747
  "eval_samples_per_second": 15.121,
10748
  "eval_steps_per_second": 15.121,
10749
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10750
  }
10751
  ],
10752
  "logging_steps": 10,
@@ -10754,7 +12544,7 @@
10754
  "num_input_tokens_seen": 0,
10755
  "num_train_epochs": 1,
10756
  "save_steps": 2500,
10757
- "total_flos": 2.4153188990976e+17,
10758
  "train_batch_size": 1,
10759
  "trial_name": null,
10760
  "trial_params": null
 
1
  {
2
  "best_metric": 1.3866758346557617,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240422-165831/checkpoint-12500",
4
+ "epoch": 0.4375,
5
  "eval_steps": 500,
6
+ "global_step": 17500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
10747
  "eval_samples_per_second": 15.121,
10748
  "eval_steps_per_second": 15.121,
10749
  "step": 15000
10750
+ },
10751
+ {
10752
+ "epoch": 0.38,
10753
+ "grad_norm": 3.327223777770996,
10754
+ "learning_rate": 1.016271186440678e-05,
10755
+ "loss": 1.4933,
10756
+ "step": 15010
10757
+ },
10758
+ {
10759
+ "epoch": 0.38,
10760
+ "grad_norm": 5.010586738586426,
10761
+ "learning_rate": 1.015593220338983e-05,
10762
+ "loss": 1.2914,
10763
+ "step": 15020
10764
+ },
10765
+ {
10766
+ "epoch": 0.38,
10767
+ "grad_norm": 4.52363395690918,
10768
+ "learning_rate": 1.0149152542372882e-05,
10769
+ "loss": 1.3007,
10770
+ "step": 15030
10771
+ },
10772
+ {
10773
+ "epoch": 0.38,
10774
+ "grad_norm": 4.301648139953613,
10775
+ "learning_rate": 1.0142372881355933e-05,
10776
+ "loss": 1.3652,
10777
+ "step": 15040
10778
+ },
10779
+ {
10780
+ "epoch": 0.38,
10781
+ "grad_norm": 8.124075889587402,
10782
+ "learning_rate": 1.0135593220338985e-05,
10783
+ "loss": 1.3957,
10784
+ "step": 15050
10785
+ },
10786
+ {
10787
+ "epoch": 0.38,
10788
+ "grad_norm": 4.588447093963623,
10789
+ "learning_rate": 1.0128813559322034e-05,
10790
+ "loss": 1.1597,
10791
+ "step": 15060
10792
+ },
10793
+ {
10794
+ "epoch": 0.38,
10795
+ "grad_norm": 3.6226577758789062,
10796
+ "learning_rate": 1.0122033898305086e-05,
10797
+ "loss": 1.5604,
10798
+ "step": 15070
10799
+ },
10800
+ {
10801
+ "epoch": 0.38,
10802
+ "grad_norm": 7.039438724517822,
10803
+ "learning_rate": 1.0115254237288137e-05,
10804
+ "loss": 1.2913,
10805
+ "step": 15080
10806
+ },
10807
+ {
10808
+ "epoch": 0.38,
10809
+ "grad_norm": 1.9853217601776123,
10810
+ "learning_rate": 1.0108474576271189e-05,
10811
+ "loss": 1.2087,
10812
+ "step": 15090
10813
+ },
10814
+ {
10815
+ "epoch": 0.38,
10816
+ "grad_norm": 6.037110805511475,
10817
+ "learning_rate": 1.0101694915254238e-05,
10818
+ "loss": 1.3358,
10819
+ "step": 15100
10820
+ },
10821
+ {
10822
+ "epoch": 0.38,
10823
+ "grad_norm": 1.2350672483444214,
10824
+ "learning_rate": 1.009491525423729e-05,
10825
+ "loss": 1.4253,
10826
+ "step": 15110
10827
+ },
10828
+ {
10829
+ "epoch": 0.38,
10830
+ "grad_norm": 4.601665019989014,
10831
+ "learning_rate": 1.0088135593220341e-05,
10832
+ "loss": 1.3471,
10833
+ "step": 15120
10834
+ },
10835
+ {
10836
+ "epoch": 0.38,
10837
+ "grad_norm": 7.822099208831787,
10838
+ "learning_rate": 1.008135593220339e-05,
10839
+ "loss": 1.4255,
10840
+ "step": 15130
10841
+ },
10842
+ {
10843
+ "epoch": 0.38,
10844
+ "grad_norm": 2.1710550785064697,
10845
+ "learning_rate": 1.0074576271186442e-05,
10846
+ "loss": 1.4419,
10847
+ "step": 15140
10848
+ },
10849
+ {
10850
+ "epoch": 0.38,
10851
+ "grad_norm": 6.896235942840576,
10852
+ "learning_rate": 1.0067796610169492e-05,
10853
+ "loss": 1.3166,
10854
+ "step": 15150
10855
+ },
10856
+ {
10857
+ "epoch": 0.38,
10858
+ "grad_norm": 2.7912566661834717,
10859
+ "learning_rate": 1.0061016949152542e-05,
10860
+ "loss": 1.3577,
10861
+ "step": 15160
10862
+ },
10863
+ {
10864
+ "epoch": 0.38,
10865
+ "grad_norm": 2.8911972045898438,
10866
+ "learning_rate": 1.0054237288135593e-05,
10867
+ "loss": 1.3415,
10868
+ "step": 15170
10869
+ },
10870
+ {
10871
+ "epoch": 0.38,
10872
+ "grad_norm": 1.8997740745544434,
10873
+ "learning_rate": 1.0047457627118644e-05,
10874
+ "loss": 1.3933,
10875
+ "step": 15180
10876
+ },
10877
+ {
10878
+ "epoch": 0.38,
10879
+ "grad_norm": 5.039108753204346,
10880
+ "learning_rate": 1.0040677966101696e-05,
10881
+ "loss": 1.3395,
10882
+ "step": 15190
10883
+ },
10884
+ {
10885
+ "epoch": 0.38,
10886
+ "grad_norm": 5.744147300720215,
10887
+ "learning_rate": 1.0033898305084746e-05,
10888
+ "loss": 1.4215,
10889
+ "step": 15200
10890
+ },
10891
+ {
10892
+ "epoch": 0.38,
10893
+ "grad_norm": 18.610450744628906,
10894
+ "learning_rate": 1.0027118644067797e-05,
10895
+ "loss": 1.4639,
10896
+ "step": 15210
10897
+ },
10898
+ {
10899
+ "epoch": 0.38,
10900
+ "grad_norm": 4.890042304992676,
10901
+ "learning_rate": 1.0020338983050848e-05,
10902
+ "loss": 1.34,
10903
+ "step": 15220
10904
+ },
10905
+ {
10906
+ "epoch": 0.38,
10907
+ "grad_norm": 2.5182347297668457,
10908
+ "learning_rate": 1.00135593220339e-05,
10909
+ "loss": 1.5222,
10910
+ "step": 15230
10911
+ },
10912
+ {
10913
+ "epoch": 0.38,
10914
+ "grad_norm": 2.647080183029175,
10915
+ "learning_rate": 1.000677966101695e-05,
10916
+ "loss": 1.3716,
10917
+ "step": 15240
10918
+ },
10919
+ {
10920
+ "epoch": 0.38,
10921
+ "grad_norm": 7.10032320022583,
10922
+ "learning_rate": 1e-05,
10923
+ "loss": 1.4433,
10924
+ "step": 15250
10925
+ },
10926
+ {
10927
+ "epoch": 0.38,
10928
+ "grad_norm": 10.124835968017578,
10929
+ "learning_rate": 9.993220338983052e-06,
10930
+ "loss": 1.402,
10931
+ "step": 15260
10932
+ },
10933
+ {
10934
+ "epoch": 0.38,
10935
+ "grad_norm": 13.186834335327148,
10936
+ "learning_rate": 9.986440677966102e-06,
10937
+ "loss": 1.4211,
10938
+ "step": 15270
10939
+ },
10940
+ {
10941
+ "epoch": 0.38,
10942
+ "grad_norm": 3.1523687839508057,
10943
+ "learning_rate": 9.979661016949153e-06,
10944
+ "loss": 1.3585,
10945
+ "step": 15280
10946
+ },
10947
+ {
10948
+ "epoch": 0.38,
10949
+ "grad_norm": 5.048830032348633,
10950
+ "learning_rate": 9.972881355932205e-06,
10951
+ "loss": 1.4079,
10952
+ "step": 15290
10953
+ },
10954
+ {
10955
+ "epoch": 0.38,
10956
+ "grad_norm": 9.165743827819824,
10957
+ "learning_rate": 9.966101694915256e-06,
10958
+ "loss": 1.4847,
10959
+ "step": 15300
10960
+ },
10961
+ {
10962
+ "epoch": 0.38,
10963
+ "grad_norm": 10.89134693145752,
10964
+ "learning_rate": 9.959322033898306e-06,
10965
+ "loss": 1.4922,
10966
+ "step": 15310
10967
+ },
10968
+ {
10969
+ "epoch": 0.38,
10970
+ "grad_norm": 2.4734840393066406,
10971
+ "learning_rate": 9.952542372881356e-06,
10972
+ "loss": 1.496,
10973
+ "step": 15320
10974
+ },
10975
+ {
10976
+ "epoch": 0.38,
10977
+ "grad_norm": 2.6290500164031982,
10978
+ "learning_rate": 9.945762711864407e-06,
10979
+ "loss": 1.4658,
10980
+ "step": 15330
10981
+ },
10982
+ {
10983
+ "epoch": 0.38,
10984
+ "grad_norm": 2.3270649909973145,
10985
+ "learning_rate": 9.938983050847458e-06,
10986
+ "loss": 1.3203,
10987
+ "step": 15340
10988
+ },
10989
+ {
10990
+ "epoch": 0.38,
10991
+ "grad_norm": 3.256084442138672,
10992
+ "learning_rate": 9.93220338983051e-06,
10993
+ "loss": 1.3317,
10994
+ "step": 15350
10995
+ },
10996
+ {
10997
+ "epoch": 0.38,
10998
+ "grad_norm": 7.33984899520874,
10999
+ "learning_rate": 9.92542372881356e-06,
11000
+ "loss": 1.2692,
11001
+ "step": 15360
11002
+ },
11003
+ {
11004
+ "epoch": 0.38,
11005
+ "grad_norm": 5.8543877601623535,
11006
+ "learning_rate": 9.918644067796611e-06,
11007
+ "loss": 1.4074,
11008
+ "step": 15370
11009
+ },
11010
+ {
11011
+ "epoch": 0.38,
11012
+ "grad_norm": 3.914440870285034,
11013
+ "learning_rate": 9.911864406779662e-06,
11014
+ "loss": 1.4998,
11015
+ "step": 15380
11016
+ },
11017
+ {
11018
+ "epoch": 0.38,
11019
+ "grad_norm": 4.911335468292236,
11020
+ "learning_rate": 9.905084745762714e-06,
11021
+ "loss": 1.2676,
11022
+ "step": 15390
11023
+ },
11024
+ {
11025
+ "epoch": 0.39,
11026
+ "grad_norm": 2.9057512283325195,
11027
+ "learning_rate": 9.898305084745763e-06,
11028
+ "loss": 1.4411,
11029
+ "step": 15400
11030
+ },
11031
+ {
11032
+ "epoch": 0.39,
11033
+ "grad_norm": 6.507194519042969,
11034
+ "learning_rate": 9.891525423728813e-06,
11035
+ "loss": 1.3075,
11036
+ "step": 15410
11037
+ },
11038
+ {
11039
+ "epoch": 0.39,
11040
+ "grad_norm": 2.6351258754730225,
11041
+ "learning_rate": 9.884745762711864e-06,
11042
+ "loss": 1.368,
11043
+ "step": 15420
11044
+ },
11045
+ {
11046
+ "epoch": 0.39,
11047
+ "grad_norm": 10.542738914489746,
11048
+ "learning_rate": 9.877966101694916e-06,
11049
+ "loss": 1.3388,
11050
+ "step": 15430
11051
+ },
11052
+ {
11053
+ "epoch": 0.39,
11054
+ "grad_norm": 5.773244857788086,
11055
+ "learning_rate": 9.871186440677967e-06,
11056
+ "loss": 1.3478,
11057
+ "step": 15440
11058
+ },
11059
+ {
11060
+ "epoch": 0.39,
11061
+ "grad_norm": 3.678858757019043,
11062
+ "learning_rate": 9.864406779661017e-06,
11063
+ "loss": 1.3866,
11064
+ "step": 15450
11065
+ },
11066
+ {
11067
+ "epoch": 0.39,
11068
+ "grad_norm": 6.312896251678467,
11069
+ "learning_rate": 9.857627118644068e-06,
11070
+ "loss": 1.418,
11071
+ "step": 15460
11072
+ },
11073
+ {
11074
+ "epoch": 0.39,
11075
+ "grad_norm": 6.026124954223633,
11076
+ "learning_rate": 9.85084745762712e-06,
11077
+ "loss": 1.584,
11078
+ "step": 15470
11079
+ },
11080
+ {
11081
+ "epoch": 0.39,
11082
+ "grad_norm": 3.8132076263427734,
11083
+ "learning_rate": 9.844067796610171e-06,
11084
+ "loss": 1.4433,
11085
+ "step": 15480
11086
+ },
11087
+ {
11088
+ "epoch": 0.39,
11089
+ "grad_norm": 3.9010612964630127,
11090
+ "learning_rate": 9.837288135593221e-06,
11091
+ "loss": 1.3962,
11092
+ "step": 15490
11093
+ },
11094
+ {
11095
+ "epoch": 0.39,
11096
+ "grad_norm": 2.6610724925994873,
11097
+ "learning_rate": 9.830508474576272e-06,
11098
+ "loss": 1.3506,
11099
+ "step": 15500
11100
+ },
11101
+ {
11102
+ "epoch": 0.39,
11103
+ "eval_loss": 1.4206969738006592,
11104
+ "eval_runtime": 66.1225,
11105
+ "eval_samples_per_second": 15.123,
11106
+ "eval_steps_per_second": 15.123,
11107
+ "step": 15500
11108
+ },
11109
+ {
11110
+ "epoch": 0.39,
11111
+ "grad_norm": 10.500436782836914,
11112
+ "learning_rate": 9.823728813559322e-06,
11113
+ "loss": 1.4373,
11114
+ "step": 15510
11115
+ },
11116
+ {
11117
+ "epoch": 0.39,
11118
+ "grad_norm": 4.49819278717041,
11119
+ "learning_rate": 9.816949152542373e-06,
11120
+ "loss": 1.3353,
11121
+ "step": 15520
11122
+ },
11123
+ {
11124
+ "epoch": 0.39,
11125
+ "grad_norm": 4.517664909362793,
11126
+ "learning_rate": 9.810169491525425e-06,
11127
+ "loss": 1.4062,
11128
+ "step": 15530
11129
+ },
11130
+ {
11131
+ "epoch": 0.39,
11132
+ "grad_norm": 11.253808975219727,
11133
+ "learning_rate": 9.803389830508474e-06,
11134
+ "loss": 1.4507,
11135
+ "step": 15540
11136
+ },
11137
+ {
11138
+ "epoch": 0.39,
11139
+ "grad_norm": 2.5719611644744873,
11140
+ "learning_rate": 9.796610169491526e-06,
11141
+ "loss": 1.2784,
11142
+ "step": 15550
11143
+ },
11144
+ {
11145
+ "epoch": 0.39,
11146
+ "grad_norm": 2.9650001525878906,
11147
+ "learning_rate": 9.789830508474577e-06,
11148
+ "loss": 1.3021,
11149
+ "step": 15560
11150
+ },
11151
+ {
11152
+ "epoch": 0.39,
11153
+ "grad_norm": 4.1970319747924805,
11154
+ "learning_rate": 9.783050847457629e-06,
11155
+ "loss": 1.4252,
11156
+ "step": 15570
11157
+ },
11158
+ {
11159
+ "epoch": 0.39,
11160
+ "grad_norm": 1.926850438117981,
11161
+ "learning_rate": 9.776271186440678e-06,
11162
+ "loss": 1.2455,
11163
+ "step": 15580
11164
+ },
11165
+ {
11166
+ "epoch": 0.39,
11167
+ "grad_norm": 8.753373146057129,
11168
+ "learning_rate": 9.76949152542373e-06,
11169
+ "loss": 1.1841,
11170
+ "step": 15590
11171
+ },
11172
+ {
11173
+ "epoch": 0.39,
11174
+ "grad_norm": 11.411351203918457,
11175
+ "learning_rate": 9.762711864406781e-06,
11176
+ "loss": 1.391,
11177
+ "step": 15600
11178
+ },
11179
+ {
11180
+ "epoch": 0.39,
11181
+ "grad_norm": 9.503700256347656,
11182
+ "learning_rate": 9.755932203389833e-06,
11183
+ "loss": 1.4071,
11184
+ "step": 15610
11185
+ },
11186
+ {
11187
+ "epoch": 0.39,
11188
+ "grad_norm": 3.8297741413116455,
11189
+ "learning_rate": 9.749152542372882e-06,
11190
+ "loss": 1.4042,
11191
+ "step": 15620
11192
+ },
11193
+ {
11194
+ "epoch": 0.39,
11195
+ "grad_norm": 7.153141498565674,
11196
+ "learning_rate": 9.742372881355932e-06,
11197
+ "loss": 1.3474,
11198
+ "step": 15630
11199
+ },
11200
+ {
11201
+ "epoch": 0.39,
11202
+ "grad_norm": 2.806121826171875,
11203
+ "learning_rate": 9.735593220338983e-06,
11204
+ "loss": 1.4126,
11205
+ "step": 15640
11206
+ },
11207
+ {
11208
+ "epoch": 0.39,
11209
+ "grad_norm": 7.874058246612549,
11210
+ "learning_rate": 9.728813559322035e-06,
11211
+ "loss": 1.5157,
11212
+ "step": 15650
11213
+ },
11214
+ {
11215
+ "epoch": 0.39,
11216
+ "grad_norm": 3.6346802711486816,
11217
+ "learning_rate": 9.722033898305086e-06,
11218
+ "loss": 1.3617,
11219
+ "step": 15660
11220
+ },
11221
+ {
11222
+ "epoch": 0.39,
11223
+ "grad_norm": 19.185550689697266,
11224
+ "learning_rate": 9.715254237288136e-06,
11225
+ "loss": 1.4801,
11226
+ "step": 15670
11227
+ },
11228
+ {
11229
+ "epoch": 0.39,
11230
+ "grad_norm": 15.9682035446167,
11231
+ "learning_rate": 9.708474576271187e-06,
11232
+ "loss": 1.1923,
11233
+ "step": 15680
11234
+ },
11235
+ {
11236
+ "epoch": 0.39,
11237
+ "grad_norm": 2.7673888206481934,
11238
+ "learning_rate": 9.701694915254239e-06,
11239
+ "loss": 1.5568,
11240
+ "step": 15690
11241
+ },
11242
+ {
11243
+ "epoch": 0.39,
11244
+ "grad_norm": 7.633556842803955,
11245
+ "learning_rate": 9.69491525423729e-06,
11246
+ "loss": 1.4391,
11247
+ "step": 15700
11248
+ },
11249
+ {
11250
+ "epoch": 0.39,
11251
+ "grad_norm": 11.961231231689453,
11252
+ "learning_rate": 9.68813559322034e-06,
11253
+ "loss": 1.4212,
11254
+ "step": 15710
11255
+ },
11256
+ {
11257
+ "epoch": 0.39,
11258
+ "grad_norm": 2.426793098449707,
11259
+ "learning_rate": 9.68135593220339e-06,
11260
+ "loss": 1.1991,
11261
+ "step": 15720
11262
+ },
11263
+ {
11264
+ "epoch": 0.39,
11265
+ "grad_norm": 7.8566107749938965,
11266
+ "learning_rate": 9.674576271186441e-06,
11267
+ "loss": 1.2991,
11268
+ "step": 15730
11269
+ },
11270
+ {
11271
+ "epoch": 0.39,
11272
+ "grad_norm": 10.158808708190918,
11273
+ "learning_rate": 9.667796610169492e-06,
11274
+ "loss": 1.4826,
11275
+ "step": 15740
11276
+ },
11277
+ {
11278
+ "epoch": 0.39,
11279
+ "grad_norm": 6.697884559631348,
11280
+ "learning_rate": 9.661016949152544e-06,
11281
+ "loss": 1.4448,
11282
+ "step": 15750
11283
+ },
11284
+ {
11285
+ "epoch": 0.39,
11286
+ "grad_norm": 3.4867446422576904,
11287
+ "learning_rate": 9.654237288135593e-06,
11288
+ "loss": 1.4839,
11289
+ "step": 15760
11290
+ },
11291
+ {
11292
+ "epoch": 0.39,
11293
+ "grad_norm": 1.6158361434936523,
11294
+ "learning_rate": 9.647457627118645e-06,
11295
+ "loss": 1.2674,
11296
+ "step": 15770
11297
+ },
11298
+ {
11299
+ "epoch": 0.39,
11300
+ "grad_norm": 3.913695812225342,
11301
+ "learning_rate": 9.640677966101696e-06,
11302
+ "loss": 1.3351,
11303
+ "step": 15780
11304
+ },
11305
+ {
11306
+ "epoch": 0.39,
11307
+ "grad_norm": 1.9007806777954102,
11308
+ "learning_rate": 9.633898305084746e-06,
11309
+ "loss": 1.3478,
11310
+ "step": 15790
11311
+ },
11312
+ {
11313
+ "epoch": 0.4,
11314
+ "grad_norm": 2.1645798683166504,
11315
+ "learning_rate": 9.627118644067797e-06,
11316
+ "loss": 1.3018,
11317
+ "step": 15800
11318
+ },
11319
+ {
11320
+ "epoch": 0.4,
11321
+ "grad_norm": 3.865454912185669,
11322
+ "learning_rate": 9.620338983050849e-06,
11323
+ "loss": 1.5398,
11324
+ "step": 15810
11325
+ },
11326
+ {
11327
+ "epoch": 0.4,
11328
+ "grad_norm": 4.120809555053711,
11329
+ "learning_rate": 9.6135593220339e-06,
11330
+ "loss": 1.4546,
11331
+ "step": 15820
11332
+ },
11333
+ {
11334
+ "epoch": 0.4,
11335
+ "grad_norm": 5.410675525665283,
11336
+ "learning_rate": 9.60677966101695e-06,
11337
+ "loss": 1.4648,
11338
+ "step": 15830
11339
+ },
11340
+ {
11341
+ "epoch": 0.4,
11342
+ "grad_norm": 8.03393268585205,
11343
+ "learning_rate": 9.600000000000001e-06,
11344
+ "loss": 1.2891,
11345
+ "step": 15840
11346
+ },
11347
+ {
11348
+ "epoch": 0.4,
11349
+ "grad_norm": 7.1069655418396,
11350
+ "learning_rate": 9.593220338983051e-06,
11351
+ "loss": 1.314,
11352
+ "step": 15850
11353
+ },
11354
+ {
11355
+ "epoch": 0.4,
11356
+ "grad_norm": 6.312065601348877,
11357
+ "learning_rate": 9.586440677966102e-06,
11358
+ "loss": 1.2608,
11359
+ "step": 15860
11360
+ },
11361
+ {
11362
+ "epoch": 0.4,
11363
+ "grad_norm": 3.499418258666992,
11364
+ "learning_rate": 9.579661016949154e-06,
11365
+ "loss": 1.3816,
11366
+ "step": 15870
11367
+ },
11368
+ {
11369
+ "epoch": 0.4,
11370
+ "grad_norm": 5.211281776428223,
11371
+ "learning_rate": 9.572881355932203e-06,
11372
+ "loss": 1.1021,
11373
+ "step": 15880
11374
+ },
11375
+ {
11376
+ "epoch": 0.4,
11377
+ "grad_norm": 4.372194290161133,
11378
+ "learning_rate": 9.566101694915255e-06,
11379
+ "loss": 1.4179,
11380
+ "step": 15890
11381
+ },
11382
+ {
11383
+ "epoch": 0.4,
11384
+ "grad_norm": 5.44555139541626,
11385
+ "learning_rate": 9.559322033898306e-06,
11386
+ "loss": 1.4848,
11387
+ "step": 15900
11388
+ },
11389
+ {
11390
+ "epoch": 0.4,
11391
+ "grad_norm": 3.483454942703247,
11392
+ "learning_rate": 9.552542372881358e-06,
11393
+ "loss": 1.2985,
11394
+ "step": 15910
11395
+ },
11396
+ {
11397
+ "epoch": 0.4,
11398
+ "grad_norm": 3.8721625804901123,
11399
+ "learning_rate": 9.545762711864407e-06,
11400
+ "loss": 1.4731,
11401
+ "step": 15920
11402
+ },
11403
+ {
11404
+ "epoch": 0.4,
11405
+ "grad_norm": 4.317465305328369,
11406
+ "learning_rate": 9.538983050847457e-06,
11407
+ "loss": 1.501,
11408
+ "step": 15930
11409
+ },
11410
+ {
11411
+ "epoch": 0.4,
11412
+ "grad_norm": 7.075793743133545,
11413
+ "learning_rate": 9.532203389830508e-06,
11414
+ "loss": 1.5138,
11415
+ "step": 15940
11416
+ },
11417
+ {
11418
+ "epoch": 0.4,
11419
+ "grad_norm": 5.606925964355469,
11420
+ "learning_rate": 9.52542372881356e-06,
11421
+ "loss": 1.3771,
11422
+ "step": 15950
11423
+ },
11424
+ {
11425
+ "epoch": 0.4,
11426
+ "grad_norm": 2.937833547592163,
11427
+ "learning_rate": 9.518644067796611e-06,
11428
+ "loss": 1.4764,
11429
+ "step": 15960
11430
+ },
11431
+ {
11432
+ "epoch": 0.4,
11433
+ "grad_norm": 4.39876651763916,
11434
+ "learning_rate": 9.511864406779661e-06,
11435
+ "loss": 1.4022,
11436
+ "step": 15970
11437
+ },
11438
+ {
11439
+ "epoch": 0.4,
11440
+ "grad_norm": 6.455230712890625,
11441
+ "learning_rate": 9.505084745762712e-06,
11442
+ "loss": 1.3569,
11443
+ "step": 15980
11444
+ },
11445
+ {
11446
+ "epoch": 0.4,
11447
+ "grad_norm": 4.89456033706665,
11448
+ "learning_rate": 9.498305084745764e-06,
11449
+ "loss": 1.3426,
11450
+ "step": 15990
11451
+ },
11452
+ {
11453
+ "epoch": 0.4,
11454
+ "grad_norm": 5.696287631988525,
11455
+ "learning_rate": 9.491525423728815e-06,
11456
+ "loss": 1.5381,
11457
+ "step": 16000
11458
+ },
11459
+ {
11460
+ "epoch": 0.4,
11461
+ "eval_loss": 1.3579893112182617,
11462
+ "eval_runtime": 66.2433,
11463
+ "eval_samples_per_second": 15.096,
11464
+ "eval_steps_per_second": 15.096,
11465
+ "step": 16000
11466
+ },
11467
+ {
11468
+ "epoch": 0.4,
11469
+ "grad_norm": 6.06011962890625,
11470
+ "learning_rate": 9.484745762711865e-06,
11471
+ "loss": 1.4422,
11472
+ "step": 16010
11473
+ },
11474
+ {
11475
+ "epoch": 0.4,
11476
+ "grad_norm": 0.9439699053764343,
11477
+ "learning_rate": 9.477966101694916e-06,
11478
+ "loss": 1.3661,
11479
+ "step": 16020
11480
+ },
11481
+ {
11482
+ "epoch": 0.4,
11483
+ "grad_norm": 2.9158008098602295,
11484
+ "learning_rate": 9.471186440677966e-06,
11485
+ "loss": 1.1349,
11486
+ "step": 16030
11487
+ },
11488
+ {
11489
+ "epoch": 0.4,
11490
+ "grad_norm": 3.3317673206329346,
11491
+ "learning_rate": 9.464406779661017e-06,
11492
+ "loss": 1.4161,
11493
+ "step": 16040
11494
+ },
11495
+ {
11496
+ "epoch": 0.4,
11497
+ "grad_norm": 2.866821050643921,
11498
+ "learning_rate": 9.457627118644069e-06,
11499
+ "loss": 1.2952,
11500
+ "step": 16050
11501
+ },
11502
+ {
11503
+ "epoch": 0.4,
11504
+ "grad_norm": 8.00590991973877,
11505
+ "learning_rate": 9.450847457627119e-06,
11506
+ "loss": 1.3493,
11507
+ "step": 16060
11508
+ },
11509
+ {
11510
+ "epoch": 0.4,
11511
+ "grad_norm": 2.1362667083740234,
11512
+ "learning_rate": 9.44406779661017e-06,
11513
+ "loss": 1.3385,
11514
+ "step": 16070
11515
+ },
11516
+ {
11517
+ "epoch": 0.4,
11518
+ "grad_norm": 4.567724227905273,
11519
+ "learning_rate": 9.437288135593221e-06,
11520
+ "loss": 1.2988,
11521
+ "step": 16080
11522
+ },
11523
+ {
11524
+ "epoch": 0.4,
11525
+ "grad_norm": 3.382605791091919,
11526
+ "learning_rate": 9.430508474576273e-06,
11527
+ "loss": 1.2131,
11528
+ "step": 16090
11529
+ },
11530
+ {
11531
+ "epoch": 0.4,
11532
+ "grad_norm": 13.581380844116211,
11533
+ "learning_rate": 9.423728813559322e-06,
11534
+ "loss": 1.3962,
11535
+ "step": 16100
11536
+ },
11537
+ {
11538
+ "epoch": 0.4,
11539
+ "grad_norm": 6.365314483642578,
11540
+ "learning_rate": 9.416949152542374e-06,
11541
+ "loss": 1.4928,
11542
+ "step": 16110
11543
+ },
11544
+ {
11545
+ "epoch": 0.4,
11546
+ "grad_norm": 5.413196563720703,
11547
+ "learning_rate": 9.410169491525425e-06,
11548
+ "loss": 1.2728,
11549
+ "step": 16120
11550
+ },
11551
+ {
11552
+ "epoch": 0.4,
11553
+ "grad_norm": 3.6787874698638916,
11554
+ "learning_rate": 9.403389830508477e-06,
11555
+ "loss": 1.1831,
11556
+ "step": 16130
11557
+ },
11558
+ {
11559
+ "epoch": 0.4,
11560
+ "grad_norm": 2.8850882053375244,
11561
+ "learning_rate": 9.396610169491526e-06,
11562
+ "loss": 1.3025,
11563
+ "step": 16140
11564
+ },
11565
+ {
11566
+ "epoch": 0.4,
11567
+ "grad_norm": 3.6025898456573486,
11568
+ "learning_rate": 9.389830508474576e-06,
11569
+ "loss": 1.3553,
11570
+ "step": 16150
11571
+ },
11572
+ {
11573
+ "epoch": 0.4,
11574
+ "grad_norm": 3.00011944770813,
11575
+ "learning_rate": 9.383050847457627e-06,
11576
+ "loss": 1.5127,
11577
+ "step": 16160
11578
+ },
11579
+ {
11580
+ "epoch": 0.4,
11581
+ "grad_norm": 4.33295202255249,
11582
+ "learning_rate": 9.376271186440679e-06,
11583
+ "loss": 1.5001,
11584
+ "step": 16170
11585
+ },
11586
+ {
11587
+ "epoch": 0.4,
11588
+ "grad_norm": 3.6982882022857666,
11589
+ "learning_rate": 9.36949152542373e-06,
11590
+ "loss": 1.4729,
11591
+ "step": 16180
11592
+ },
11593
+ {
11594
+ "epoch": 0.4,
11595
+ "grad_norm": 7.049408435821533,
11596
+ "learning_rate": 9.36271186440678e-06,
11597
+ "loss": 1.276,
11598
+ "step": 16190
11599
+ },
11600
+ {
11601
+ "epoch": 0.41,
11602
+ "grad_norm": 1.6569970846176147,
11603
+ "learning_rate": 9.355932203389831e-06,
11604
+ "loss": 1.3321,
11605
+ "step": 16200
11606
+ },
11607
+ {
11608
+ "epoch": 0.41,
11609
+ "grad_norm": 2.4531731605529785,
11610
+ "learning_rate": 9.349152542372883e-06,
11611
+ "loss": 1.435,
11612
+ "step": 16210
11613
+ },
11614
+ {
11615
+ "epoch": 0.41,
11616
+ "grad_norm": 2.776325225830078,
11617
+ "learning_rate": 9.342372881355934e-06,
11618
+ "loss": 1.2586,
11619
+ "step": 16220
11620
+ },
11621
+ {
11622
+ "epoch": 0.41,
11623
+ "grad_norm": 2.805823564529419,
11624
+ "learning_rate": 9.335593220338984e-06,
11625
+ "loss": 1.3393,
11626
+ "step": 16230
11627
+ },
11628
+ {
11629
+ "epoch": 0.41,
11630
+ "grad_norm": 2.3886725902557373,
11631
+ "learning_rate": 9.328813559322034e-06,
11632
+ "loss": 1.4241,
11633
+ "step": 16240
11634
+ },
11635
+ {
11636
+ "epoch": 0.41,
11637
+ "grad_norm": 10.983514785766602,
11638
+ "learning_rate": 9.322033898305085e-06,
11639
+ "loss": 1.5032,
11640
+ "step": 16250
11641
+ },
11642
+ {
11643
+ "epoch": 0.41,
11644
+ "grad_norm": 2.922609329223633,
11645
+ "learning_rate": 9.315254237288136e-06,
11646
+ "loss": 1.2289,
11647
+ "step": 16260
11648
+ },
11649
+ {
11650
+ "epoch": 0.41,
11651
+ "grad_norm": 3.623088836669922,
11652
+ "learning_rate": 9.308474576271188e-06,
11653
+ "loss": 1.2854,
11654
+ "step": 16270
11655
+ },
11656
+ {
11657
+ "epoch": 0.41,
11658
+ "grad_norm": 3.6580920219421387,
11659
+ "learning_rate": 9.301694915254237e-06,
11660
+ "loss": 1.3371,
11661
+ "step": 16280
11662
+ },
11663
+ {
11664
+ "epoch": 0.41,
11665
+ "grad_norm": 3.010573625564575,
11666
+ "learning_rate": 9.294915254237289e-06,
11667
+ "loss": 1.465,
11668
+ "step": 16290
11669
+ },
11670
+ {
11671
+ "epoch": 0.41,
11672
+ "grad_norm": 3.1502161026000977,
11673
+ "learning_rate": 9.28813559322034e-06,
11674
+ "loss": 1.2245,
11675
+ "step": 16300
11676
+ },
11677
+ {
11678
+ "epoch": 0.41,
11679
+ "grad_norm": 2.7607476711273193,
11680
+ "learning_rate": 9.28135593220339e-06,
11681
+ "loss": 1.4668,
11682
+ "step": 16310
11683
+ },
11684
+ {
11685
+ "epoch": 0.41,
11686
+ "grad_norm": 6.022114276885986,
11687
+ "learning_rate": 9.274576271186441e-06,
11688
+ "loss": 1.1924,
11689
+ "step": 16320
11690
+ },
11691
+ {
11692
+ "epoch": 0.41,
11693
+ "grad_norm": 5.824821949005127,
11694
+ "learning_rate": 9.267796610169493e-06,
11695
+ "loss": 1.3773,
11696
+ "step": 16330
11697
+ },
11698
+ {
11699
+ "epoch": 0.41,
11700
+ "grad_norm": 3.7949423789978027,
11701
+ "learning_rate": 9.261016949152544e-06,
11702
+ "loss": 1.3334,
11703
+ "step": 16340
11704
+ },
11705
+ {
11706
+ "epoch": 0.41,
11707
+ "grad_norm": 1.7965031862258911,
11708
+ "learning_rate": 9.254237288135594e-06,
11709
+ "loss": 1.5022,
11710
+ "step": 16350
11711
+ },
11712
+ {
11713
+ "epoch": 0.41,
11714
+ "grad_norm": 4.245846271514893,
11715
+ "learning_rate": 9.247457627118645e-06,
11716
+ "loss": 1.2312,
11717
+ "step": 16360
11718
+ },
11719
+ {
11720
+ "epoch": 0.41,
11721
+ "grad_norm": 7.283750057220459,
11722
+ "learning_rate": 9.240677966101695e-06,
11723
+ "loss": 1.2448,
11724
+ "step": 16370
11725
+ },
11726
+ {
11727
+ "epoch": 0.41,
11728
+ "grad_norm": 3.51558780670166,
11729
+ "learning_rate": 9.233898305084746e-06,
11730
+ "loss": 1.4823,
11731
+ "step": 16380
11732
+ },
11733
+ {
11734
+ "epoch": 0.41,
11735
+ "grad_norm": 7.842286586761475,
11736
+ "learning_rate": 9.227118644067798e-06,
11737
+ "loss": 1.3931,
11738
+ "step": 16390
11739
+ },
11740
+ {
11741
+ "epoch": 0.41,
11742
+ "grad_norm": 2.6128389835357666,
11743
+ "learning_rate": 9.220338983050847e-06,
11744
+ "loss": 1.4359,
11745
+ "step": 16400
11746
+ },
11747
+ {
11748
+ "epoch": 0.41,
11749
+ "grad_norm": 2.5427112579345703,
11750
+ "learning_rate": 9.213559322033899e-06,
11751
+ "loss": 1.4364,
11752
+ "step": 16410
11753
+ },
11754
+ {
11755
+ "epoch": 0.41,
11756
+ "grad_norm": 3.0635411739349365,
11757
+ "learning_rate": 9.20677966101695e-06,
11758
+ "loss": 1.4804,
11759
+ "step": 16420
11760
+ },
11761
+ {
11762
+ "epoch": 0.41,
11763
+ "grad_norm": 4.00955057144165,
11764
+ "learning_rate": 9.200000000000002e-06,
11765
+ "loss": 1.1819,
11766
+ "step": 16430
11767
+ },
11768
+ {
11769
+ "epoch": 0.41,
11770
+ "grad_norm": 2.0021607875823975,
11771
+ "learning_rate": 9.193220338983051e-06,
11772
+ "loss": 1.312,
11773
+ "step": 16440
11774
+ },
11775
+ {
11776
+ "epoch": 0.41,
11777
+ "grad_norm": 6.373915672302246,
11778
+ "learning_rate": 9.186440677966101e-06,
11779
+ "loss": 1.5638,
11780
+ "step": 16450
11781
+ },
11782
+ {
11783
+ "epoch": 0.41,
11784
+ "grad_norm": 7.486032962799072,
11785
+ "learning_rate": 9.179661016949153e-06,
11786
+ "loss": 1.5156,
11787
+ "step": 16460
11788
+ },
11789
+ {
11790
+ "epoch": 0.41,
11791
+ "grad_norm": 7.894711971282959,
11792
+ "learning_rate": 9.172881355932204e-06,
11793
+ "loss": 1.3042,
11794
+ "step": 16470
11795
+ },
11796
+ {
11797
+ "epoch": 0.41,
11798
+ "grad_norm": 4.769121170043945,
11799
+ "learning_rate": 9.166101694915255e-06,
11800
+ "loss": 1.5031,
11801
+ "step": 16480
11802
+ },
11803
+ {
11804
+ "epoch": 0.41,
11805
+ "grad_norm": 5.979970455169678,
11806
+ "learning_rate": 9.159322033898305e-06,
11807
+ "loss": 1.2542,
11808
+ "step": 16490
11809
+ },
11810
+ {
11811
+ "epoch": 0.41,
11812
+ "grad_norm": 6.4072651863098145,
11813
+ "learning_rate": 9.152542372881356e-06,
11814
+ "loss": 1.319,
11815
+ "step": 16500
11816
+ },
11817
+ {
11818
+ "epoch": 0.41,
11819
+ "eval_loss": 1.3969231843948364,
11820
+ "eval_runtime": 66.1684,
11821
+ "eval_samples_per_second": 15.113,
11822
+ "eval_steps_per_second": 15.113,
11823
+ "step": 16500
11824
+ },
11825
+ {
11826
+ "epoch": 0.41,
11827
+ "grad_norm": 6.439510822296143,
11828
+ "learning_rate": 9.145762711864408e-06,
11829
+ "loss": 1.6106,
11830
+ "step": 16510
11831
+ },
11832
+ {
11833
+ "epoch": 0.41,
11834
+ "grad_norm": 3.3735623359680176,
11835
+ "learning_rate": 9.13898305084746e-06,
11836
+ "loss": 1.3179,
11837
+ "step": 16520
11838
+ },
11839
+ {
11840
+ "epoch": 0.41,
11841
+ "grad_norm": 4.679867267608643,
11842
+ "learning_rate": 9.132203389830509e-06,
11843
+ "loss": 1.465,
11844
+ "step": 16530
11845
+ },
11846
+ {
11847
+ "epoch": 0.41,
11848
+ "grad_norm": 3.878396987915039,
11849
+ "learning_rate": 9.12542372881356e-06,
11850
+ "loss": 1.2999,
11851
+ "step": 16540
11852
+ },
11853
+ {
11854
+ "epoch": 0.41,
11855
+ "grad_norm": 5.397853374481201,
11856
+ "learning_rate": 9.11864406779661e-06,
11857
+ "loss": 1.3575,
11858
+ "step": 16550
11859
+ },
11860
+ {
11861
+ "epoch": 0.41,
11862
+ "grad_norm": 3.2240700721740723,
11863
+ "learning_rate": 9.111864406779661e-06,
11864
+ "loss": 1.5086,
11865
+ "step": 16560
11866
+ },
11867
+ {
11868
+ "epoch": 0.41,
11869
+ "grad_norm": 4.239155292510986,
11870
+ "learning_rate": 9.105084745762713e-06,
11871
+ "loss": 1.3695,
11872
+ "step": 16570
11873
+ },
11874
+ {
11875
+ "epoch": 0.41,
11876
+ "grad_norm": 9.791718482971191,
11877
+ "learning_rate": 9.098305084745763e-06,
11878
+ "loss": 1.4928,
11879
+ "step": 16580
11880
+ },
11881
+ {
11882
+ "epoch": 0.41,
11883
+ "grad_norm": 4.839406967163086,
11884
+ "learning_rate": 9.091525423728814e-06,
11885
+ "loss": 1.4183,
11886
+ "step": 16590
11887
+ },
11888
+ {
11889
+ "epoch": 0.41,
11890
+ "grad_norm": 3.728440046310425,
11891
+ "learning_rate": 9.084745762711865e-06,
11892
+ "loss": 1.2819,
11893
+ "step": 16600
11894
+ },
11895
+ {
11896
+ "epoch": 0.42,
11897
+ "grad_norm": 2.5298233032226562,
11898
+ "learning_rate": 9.077966101694917e-06,
11899
+ "loss": 1.3922,
11900
+ "step": 16610
11901
+ },
11902
+ {
11903
+ "epoch": 0.42,
11904
+ "grad_norm": 4.599859714508057,
11905
+ "learning_rate": 9.071186440677966e-06,
11906
+ "loss": 1.4627,
11907
+ "step": 16620
11908
+ },
11909
+ {
11910
+ "epoch": 0.42,
11911
+ "grad_norm": 2.5592544078826904,
11912
+ "learning_rate": 9.064406779661018e-06,
11913
+ "loss": 1.544,
11914
+ "step": 16630
11915
+ },
11916
+ {
11917
+ "epoch": 0.42,
11918
+ "grad_norm": 8.281981468200684,
11919
+ "learning_rate": 9.05762711864407e-06,
11920
+ "loss": 1.4084,
11921
+ "step": 16640
11922
+ },
11923
+ {
11924
+ "epoch": 0.42,
11925
+ "grad_norm": 5.964704990386963,
11926
+ "learning_rate": 9.05084745762712e-06,
11927
+ "loss": 1.4186,
11928
+ "step": 16650
11929
+ },
11930
+ {
11931
+ "epoch": 0.42,
11932
+ "grad_norm": 4.577818393707275,
11933
+ "learning_rate": 9.04406779661017e-06,
11934
+ "loss": 1.2691,
11935
+ "step": 16660
11936
+ },
11937
+ {
11938
+ "epoch": 0.42,
11939
+ "grad_norm": 3.4889256954193115,
11940
+ "learning_rate": 9.03728813559322e-06,
11941
+ "loss": 1.3494,
11942
+ "step": 16670
11943
+ },
11944
+ {
11945
+ "epoch": 0.42,
11946
+ "grad_norm": 4.458433628082275,
11947
+ "learning_rate": 9.030508474576271e-06,
11948
+ "loss": 1.4639,
11949
+ "step": 16680
11950
+ },
11951
+ {
11952
+ "epoch": 0.42,
11953
+ "grad_norm": 5.813331127166748,
11954
+ "learning_rate": 9.023728813559323e-06,
11955
+ "loss": 1.2206,
11956
+ "step": 16690
11957
+ },
11958
+ {
11959
+ "epoch": 0.42,
11960
+ "grad_norm": 2.583115339279175,
11961
+ "learning_rate": 9.016949152542374e-06,
11962
+ "loss": 1.4299,
11963
+ "step": 16700
11964
+ },
11965
+ {
11966
+ "epoch": 0.42,
11967
+ "grad_norm": 7.3467206954956055,
11968
+ "learning_rate": 9.010169491525424e-06,
11969
+ "loss": 1.3001,
11970
+ "step": 16710
11971
+ },
11972
+ {
11973
+ "epoch": 0.42,
11974
+ "grad_norm": 3.896238088607788,
11975
+ "learning_rate": 9.003389830508475e-06,
11976
+ "loss": 1.3541,
11977
+ "step": 16720
11978
+ },
11979
+ {
11980
+ "epoch": 0.42,
11981
+ "grad_norm": 3.967496156692505,
11982
+ "learning_rate": 8.996610169491527e-06,
11983
+ "loss": 1.3315,
11984
+ "step": 16730
11985
+ },
11986
+ {
11987
+ "epoch": 0.42,
11988
+ "grad_norm": 2.9316656589508057,
11989
+ "learning_rate": 8.989830508474578e-06,
11990
+ "loss": 1.3935,
11991
+ "step": 16740
11992
+ },
11993
+ {
11994
+ "epoch": 0.42,
11995
+ "grad_norm": 1.619555950164795,
11996
+ "learning_rate": 8.983050847457628e-06,
11997
+ "loss": 1.2879,
11998
+ "step": 16750
11999
+ },
12000
+ {
12001
+ "epoch": 0.42,
12002
+ "grad_norm": 3.6497581005096436,
12003
+ "learning_rate": 8.976271186440678e-06,
12004
+ "loss": 1.5195,
12005
+ "step": 16760
12006
+ },
12007
+ {
12008
+ "epoch": 0.42,
12009
+ "grad_norm": 5.33650016784668,
12010
+ "learning_rate": 8.969491525423729e-06,
12011
+ "loss": 1.4568,
12012
+ "step": 16770
12013
+ },
12014
+ {
12015
+ "epoch": 0.42,
12016
+ "grad_norm": 7.370872974395752,
12017
+ "learning_rate": 8.96271186440678e-06,
12018
+ "loss": 1.5122,
12019
+ "step": 16780
12020
+ },
12021
+ {
12022
+ "epoch": 0.42,
12023
+ "grad_norm": 6.295271873474121,
12024
+ "learning_rate": 8.955932203389832e-06,
12025
+ "loss": 1.177,
12026
+ "step": 16790
12027
+ },
12028
+ {
12029
+ "epoch": 0.42,
12030
+ "grad_norm": 7.165903091430664,
12031
+ "learning_rate": 8.949152542372881e-06,
12032
+ "loss": 1.3597,
12033
+ "step": 16800
12034
+ },
12035
+ {
12036
+ "epoch": 0.42,
12037
+ "grad_norm": 4.152050018310547,
12038
+ "learning_rate": 8.942372881355933e-06,
12039
+ "loss": 1.3837,
12040
+ "step": 16810
12041
+ },
12042
+ {
12043
+ "epoch": 0.42,
12044
+ "grad_norm": 2.2240993976593018,
12045
+ "learning_rate": 8.935593220338984e-06,
12046
+ "loss": 1.3974,
12047
+ "step": 16820
12048
+ },
12049
+ {
12050
+ "epoch": 0.42,
12051
+ "grad_norm": 2.854851245880127,
12052
+ "learning_rate": 8.928813559322036e-06,
12053
+ "loss": 1.4478,
12054
+ "step": 16830
12055
+ },
12056
+ {
12057
+ "epoch": 0.42,
12058
+ "grad_norm": 4.584380149841309,
12059
+ "learning_rate": 8.922033898305085e-06,
12060
+ "loss": 1.4808,
12061
+ "step": 16840
12062
+ },
12063
+ {
12064
+ "epoch": 0.42,
12065
+ "grad_norm": 5.496336460113525,
12066
+ "learning_rate": 8.915254237288137e-06,
12067
+ "loss": 1.4587,
12068
+ "step": 16850
12069
+ },
12070
+ {
12071
+ "epoch": 0.42,
12072
+ "grad_norm": 9.56225872039795,
12073
+ "learning_rate": 8.908474576271188e-06,
12074
+ "loss": 1.2726,
12075
+ "step": 16860
12076
+ },
12077
+ {
12078
+ "epoch": 0.42,
12079
+ "grad_norm": 3.6302268505096436,
12080
+ "learning_rate": 8.901694915254238e-06,
12081
+ "loss": 1.4753,
12082
+ "step": 16870
12083
+ },
12084
+ {
12085
+ "epoch": 0.42,
12086
+ "grad_norm": 9.815473556518555,
12087
+ "learning_rate": 8.89491525423729e-06,
12088
+ "loss": 1.3163,
12089
+ "step": 16880
12090
+ },
12091
+ {
12092
+ "epoch": 0.42,
12093
+ "grad_norm": 6.138949871063232,
12094
+ "learning_rate": 8.888135593220339e-06,
12095
+ "loss": 1.3637,
12096
+ "step": 16890
12097
+ },
12098
+ {
12099
+ "epoch": 0.42,
12100
+ "grad_norm": 5.0360212326049805,
12101
+ "learning_rate": 8.88135593220339e-06,
12102
+ "loss": 1.5961,
12103
+ "step": 16900
12104
+ },
12105
+ {
12106
+ "epoch": 0.42,
12107
+ "grad_norm": 11.983102798461914,
12108
+ "learning_rate": 8.874576271186442e-06,
12109
+ "loss": 1.3359,
12110
+ "step": 16910
12111
+ },
12112
+ {
12113
+ "epoch": 0.42,
12114
+ "grad_norm": 3.8704030513763428,
12115
+ "learning_rate": 8.867796610169492e-06,
12116
+ "loss": 1.2624,
12117
+ "step": 16920
12118
+ },
12119
+ {
12120
+ "epoch": 0.42,
12121
+ "grad_norm": 8.61357593536377,
12122
+ "learning_rate": 8.861016949152543e-06,
12123
+ "loss": 1.4325,
12124
+ "step": 16930
12125
+ },
12126
+ {
12127
+ "epoch": 0.42,
12128
+ "grad_norm": 1.4079731702804565,
12129
+ "learning_rate": 8.854237288135594e-06,
12130
+ "loss": 1.3984,
12131
+ "step": 16940
12132
+ },
12133
+ {
12134
+ "epoch": 0.42,
12135
+ "grad_norm": 4.425031661987305,
12136
+ "learning_rate": 8.847457627118646e-06,
12137
+ "loss": 1.3202,
12138
+ "step": 16950
12139
+ },
12140
+ {
12141
+ "epoch": 0.42,
12142
+ "grad_norm": 7.565074920654297,
12143
+ "learning_rate": 8.840677966101695e-06,
12144
+ "loss": 1.4561,
12145
+ "step": 16960
12146
+ },
12147
+ {
12148
+ "epoch": 0.42,
12149
+ "grad_norm": 4.513836860656738,
12150
+ "learning_rate": 8.833898305084747e-06,
12151
+ "loss": 1.52,
12152
+ "step": 16970
12153
+ },
12154
+ {
12155
+ "epoch": 0.42,
12156
+ "grad_norm": 7.830351829528809,
12157
+ "learning_rate": 8.827118644067797e-06,
12158
+ "loss": 1.4321,
12159
+ "step": 16980
12160
+ },
12161
+ {
12162
+ "epoch": 0.42,
12163
+ "grad_norm": 4.269097328186035,
12164
+ "learning_rate": 8.820338983050848e-06,
12165
+ "loss": 1.4247,
12166
+ "step": 16990
12167
+ },
12168
+ {
12169
+ "epoch": 0.42,
12170
+ "grad_norm": 3.2125535011291504,
12171
+ "learning_rate": 8.8135593220339e-06,
12172
+ "loss": 1.1334,
12173
+ "step": 17000
12174
+ },
12175
+ {
12176
+ "epoch": 0.42,
12177
+ "eval_loss": 1.3314954042434692,
12178
+ "eval_runtime": 66.1036,
12179
+ "eval_samples_per_second": 15.128,
12180
+ "eval_steps_per_second": 15.128,
12181
+ "step": 17000
12182
+ },
12183
+ {
12184
+ "epoch": 0.43,
12185
+ "grad_norm": 3.5944836139678955,
12186
+ "learning_rate": 8.806779661016949e-06,
12187
+ "loss": 1.2363,
12188
+ "step": 17010
12189
+ },
12190
+ {
12191
+ "epoch": 0.43,
12192
+ "grad_norm": 3.680687189102173,
12193
+ "learning_rate": 8.8e-06,
12194
+ "loss": 1.2968,
12195
+ "step": 17020
12196
+ },
12197
+ {
12198
+ "epoch": 0.43,
12199
+ "grad_norm": 9.756001472473145,
12200
+ "learning_rate": 8.793220338983052e-06,
12201
+ "loss": 1.378,
12202
+ "step": 17030
12203
+ },
12204
+ {
12205
+ "epoch": 0.43,
12206
+ "grad_norm": 3.8217949867248535,
12207
+ "learning_rate": 8.786440677966103e-06,
12208
+ "loss": 1.1933,
12209
+ "step": 17040
12210
+ },
12211
+ {
12212
+ "epoch": 0.43,
12213
+ "grad_norm": 4.635326862335205,
12214
+ "learning_rate": 8.779661016949153e-06,
12215
+ "loss": 1.4635,
12216
+ "step": 17050
12217
+ },
12218
+ {
12219
+ "epoch": 0.43,
12220
+ "grad_norm": 12.349688529968262,
12221
+ "learning_rate": 8.772881355932204e-06,
12222
+ "loss": 1.1233,
12223
+ "step": 17060
12224
+ },
12225
+ {
12226
+ "epoch": 0.43,
12227
+ "grad_norm": 2.0350751876831055,
12228
+ "learning_rate": 8.766101694915254e-06,
12229
+ "loss": 1.5517,
12230
+ "step": 17070
12231
+ },
12232
+ {
12233
+ "epoch": 0.43,
12234
+ "grad_norm": 2.533076524734497,
12235
+ "learning_rate": 8.759322033898305e-06,
12236
+ "loss": 1.2468,
12237
+ "step": 17080
12238
+ },
12239
+ {
12240
+ "epoch": 0.43,
12241
+ "grad_norm": 5.4851813316345215,
12242
+ "learning_rate": 8.752542372881357e-06,
12243
+ "loss": 1.3057,
12244
+ "step": 17090
12245
+ },
12246
+ {
12247
+ "epoch": 0.43,
12248
+ "grad_norm": 5.9718499183654785,
12249
+ "learning_rate": 8.745762711864407e-06,
12250
+ "loss": 1.3685,
12251
+ "step": 17100
12252
+ },
12253
+ {
12254
+ "epoch": 0.43,
12255
+ "grad_norm": 6.759610176086426,
12256
+ "learning_rate": 8.738983050847458e-06,
12257
+ "loss": 1.2429,
12258
+ "step": 17110
12259
+ },
12260
+ {
12261
+ "epoch": 0.43,
12262
+ "grad_norm": 10.515771865844727,
12263
+ "learning_rate": 8.73220338983051e-06,
12264
+ "loss": 1.1885,
12265
+ "step": 17120
12266
+ },
12267
+ {
12268
+ "epoch": 0.43,
12269
+ "grad_norm": 5.439294815063477,
12270
+ "learning_rate": 8.72542372881356e-06,
12271
+ "loss": 1.4804,
12272
+ "step": 17130
12273
+ },
12274
+ {
12275
+ "epoch": 0.43,
12276
+ "grad_norm": 3.2483952045440674,
12277
+ "learning_rate": 8.71864406779661e-06,
12278
+ "loss": 1.2879,
12279
+ "step": 17140
12280
+ },
12281
+ {
12282
+ "epoch": 0.43,
12283
+ "grad_norm": 12.16537857055664,
12284
+ "learning_rate": 8.711864406779662e-06,
12285
+ "loss": 1.365,
12286
+ "step": 17150
12287
+ },
12288
+ {
12289
+ "epoch": 0.43,
12290
+ "grad_norm": 7.272519588470459,
12291
+ "learning_rate": 8.705084745762713e-06,
12292
+ "loss": 1.2356,
12293
+ "step": 17160
12294
+ },
12295
+ {
12296
+ "epoch": 0.43,
12297
+ "grad_norm": 5.10579252243042,
12298
+ "learning_rate": 8.698305084745765e-06,
12299
+ "loss": 1.2475,
12300
+ "step": 17170
12301
+ },
12302
+ {
12303
+ "epoch": 0.43,
12304
+ "grad_norm": 2.060873031616211,
12305
+ "learning_rate": 8.691525423728814e-06,
12306
+ "loss": 1.2728,
12307
+ "step": 17180
12308
+ },
12309
+ {
12310
+ "epoch": 0.43,
12311
+ "grad_norm": 4.275463581085205,
12312
+ "learning_rate": 8.684745762711864e-06,
12313
+ "loss": 1.3372,
12314
+ "step": 17190
12315
+ },
12316
+ {
12317
+ "epoch": 0.43,
12318
+ "grad_norm": 7.369024753570557,
12319
+ "learning_rate": 8.677966101694915e-06,
12320
+ "loss": 1.3319,
12321
+ "step": 17200
12322
+ },
12323
+ {
12324
+ "epoch": 0.43,
12325
+ "grad_norm": 4.748632431030273,
12326
+ "learning_rate": 8.671186440677967e-06,
12327
+ "loss": 1.3954,
12328
+ "step": 17210
12329
+ },
12330
+ {
12331
+ "epoch": 0.43,
12332
+ "grad_norm": 6.593578815460205,
12333
+ "learning_rate": 8.664406779661018e-06,
12334
+ "loss": 1.218,
12335
+ "step": 17220
12336
+ },
12337
+ {
12338
+ "epoch": 0.43,
12339
+ "grad_norm": 9.88740348815918,
12340
+ "learning_rate": 8.657627118644068e-06,
12341
+ "loss": 1.3813,
12342
+ "step": 17230
12343
+ },
12344
+ {
12345
+ "epoch": 0.43,
12346
+ "grad_norm": 5.244362831115723,
12347
+ "learning_rate": 8.65084745762712e-06,
12348
+ "loss": 1.3307,
12349
+ "step": 17240
12350
+ },
12351
+ {
12352
+ "epoch": 0.43,
12353
+ "grad_norm": 6.388607025146484,
12354
+ "learning_rate": 8.64406779661017e-06,
12355
+ "loss": 1.3975,
12356
+ "step": 17250
12357
+ },
12358
+ {
12359
+ "epoch": 0.43,
12360
+ "grad_norm": 5.940032005310059,
12361
+ "learning_rate": 8.637288135593222e-06,
12362
+ "loss": 1.452,
12363
+ "step": 17260
12364
+ },
12365
+ {
12366
+ "epoch": 0.43,
12367
+ "grad_norm": 1.8261200189590454,
12368
+ "learning_rate": 8.630508474576272e-06,
12369
+ "loss": 1.3114,
12370
+ "step": 17270
12371
+ },
12372
+ {
12373
+ "epoch": 0.43,
12374
+ "grad_norm": 4.878362655639648,
12375
+ "learning_rate": 8.623728813559322e-06,
12376
+ "loss": 1.3152,
12377
+ "step": 17280
12378
+ },
12379
+ {
12380
+ "epoch": 0.43,
12381
+ "grad_norm": 5.558933258056641,
12382
+ "learning_rate": 8.616949152542373e-06,
12383
+ "loss": 1.5567,
12384
+ "step": 17290
12385
+ },
12386
+ {
12387
+ "epoch": 0.43,
12388
+ "grad_norm": 2.346237897872925,
12389
+ "learning_rate": 8.610169491525424e-06,
12390
+ "loss": 1.4462,
12391
+ "step": 17300
12392
+ },
12393
+ {
12394
+ "epoch": 0.43,
12395
+ "grad_norm": 5.712830066680908,
12396
+ "learning_rate": 8.603389830508476e-06,
12397
+ "loss": 1.2658,
12398
+ "step": 17310
12399
+ },
12400
+ {
12401
+ "epoch": 0.43,
12402
+ "grad_norm": 6.277383804321289,
12403
+ "learning_rate": 8.596610169491526e-06,
12404
+ "loss": 1.3562,
12405
+ "step": 17320
12406
+ },
12407
+ {
12408
+ "epoch": 0.43,
12409
+ "grad_norm": 2.8792171478271484,
12410
+ "learning_rate": 8.589830508474577e-06,
12411
+ "loss": 1.2409,
12412
+ "step": 17330
12413
+ },
12414
+ {
12415
+ "epoch": 0.43,
12416
+ "grad_norm": 4.000193119049072,
12417
+ "learning_rate": 8.583050847457628e-06,
12418
+ "loss": 1.3503,
12419
+ "step": 17340
12420
+ },
12421
+ {
12422
+ "epoch": 0.43,
12423
+ "grad_norm": 6.255504131317139,
12424
+ "learning_rate": 8.57627118644068e-06,
12425
+ "loss": 1.3411,
12426
+ "step": 17350
12427
+ },
12428
+ {
12429
+ "epoch": 0.43,
12430
+ "grad_norm": 5.366970539093018,
12431
+ "learning_rate": 8.56949152542373e-06,
12432
+ "loss": 1.2918,
12433
+ "step": 17360
12434
+ },
12435
+ {
12436
+ "epoch": 0.43,
12437
+ "grad_norm": 1.9505482912063599,
12438
+ "learning_rate": 8.56271186440678e-06,
12439
+ "loss": 1.3139,
12440
+ "step": 17370
12441
+ },
12442
+ {
12443
+ "epoch": 0.43,
12444
+ "grad_norm": 8.022522926330566,
12445
+ "learning_rate": 8.55593220338983e-06,
12446
+ "loss": 1.515,
12447
+ "step": 17380
12448
+ },
12449
+ {
12450
+ "epoch": 0.43,
12451
+ "grad_norm": 5.483621120452881,
12452
+ "learning_rate": 8.549152542372882e-06,
12453
+ "loss": 1.4464,
12454
+ "step": 17390
12455
+ },
12456
+ {
12457
+ "epoch": 0.43,
12458
+ "grad_norm": 5.349776268005371,
12459
+ "learning_rate": 8.542372881355933e-06,
12460
+ "loss": 1.5864,
12461
+ "step": 17400
12462
+ },
12463
+ {
12464
+ "epoch": 0.44,
12465
+ "grad_norm": 3.4136993885040283,
12466
+ "learning_rate": 8.535593220338983e-06,
12467
+ "loss": 1.2964,
12468
+ "step": 17410
12469
+ },
12470
+ {
12471
+ "epoch": 0.44,
12472
+ "grad_norm": 4.270326614379883,
12473
+ "learning_rate": 8.528813559322034e-06,
12474
+ "loss": 1.3736,
12475
+ "step": 17420
12476
+ },
12477
+ {
12478
+ "epoch": 0.44,
12479
+ "grad_norm": 2.886993169784546,
12480
+ "learning_rate": 8.522033898305086e-06,
12481
+ "loss": 1.3513,
12482
+ "step": 17430
12483
+ },
12484
+ {
12485
+ "epoch": 0.44,
12486
+ "grad_norm": 4.42819881439209,
12487
+ "learning_rate": 8.515254237288136e-06,
12488
+ "loss": 1.4226,
12489
+ "step": 17440
12490
+ },
12491
+ {
12492
+ "epoch": 0.44,
12493
+ "grad_norm": 5.844710350036621,
12494
+ "learning_rate": 8.508474576271187e-06,
12495
+ "loss": 1.3135,
12496
+ "step": 17450
12497
+ },
12498
+ {
12499
+ "epoch": 0.44,
12500
+ "grad_norm": 2.4220869541168213,
12501
+ "learning_rate": 8.501694915254238e-06,
12502
+ "loss": 1.165,
12503
+ "step": 17460
12504
+ },
12505
+ {
12506
+ "epoch": 0.44,
12507
+ "grad_norm": 4.976444244384766,
12508
+ "learning_rate": 8.49491525423729e-06,
12509
+ "loss": 1.1964,
12510
+ "step": 17470
12511
+ },
12512
+ {
12513
+ "epoch": 0.44,
12514
+ "grad_norm": 12.55490779876709,
12515
+ "learning_rate": 8.48813559322034e-06,
12516
+ "loss": 1.5891,
12517
+ "step": 17480
12518
+ },
12519
+ {
12520
+ "epoch": 0.44,
12521
+ "grad_norm": 4.102578163146973,
12522
+ "learning_rate": 8.481355932203391e-06,
12523
+ "loss": 1.445,
12524
+ "step": 17490
12525
+ },
12526
+ {
12527
+ "epoch": 0.44,
12528
+ "grad_norm": 2.6767678260803223,
12529
+ "learning_rate": 8.47457627118644e-06,
12530
+ "loss": 1.536,
12531
+ "step": 17500
12532
+ },
12533
+ {
12534
+ "epoch": 0.44,
12535
+ "eval_loss": 1.388473391532898,
12536
+ "eval_runtime": 66.17,
12537
+ "eval_samples_per_second": 15.113,
12538
+ "eval_steps_per_second": 15.113,
12539
+ "step": 17500
12540
  }
12541
  ],
12542
  "logging_steps": 10,
 
12544
  "num_input_tokens_seen": 0,
12545
  "num_train_epochs": 1,
12546
  "save_steps": 2500,
12547
+ "total_flos": 2.8178720489472e+17,
12548
  "train_batch_size": 1,
12549
  "trial_name": null,
12550
  "trial_params": null