jdannem6 commited on
Commit
42a607a
1 Parent(s): e605084

Uploaded checkpoint-30000

Browse files
Files changed (5) hide show
  1. adapter_model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +1793 -3
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:51ef8a0ba7bad5db9a03c998b7e0f0b9be8a11cac8ec229721c1db943b49949b
3
  size 119975656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e59f876dca35a1a4ac533ffe67b1c4f369b647cb09902d9a650f2f1c93e62485
3
  size 119975656
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1a5eef17387e7ce5efb991a9df23b4cb28f5da1437f9349935c3f248a0519d26
3
  size 240145026
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99a798e8a221a75b00994a335c48f10dffb4f1c3d6378f32b272cb149311abb
3
  size 240145026
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42ed1734c5823abfe806343a4de18dcccd1e9ad5af5349e08097c7bde2aa7437
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc4e81d8710419a2e714a6936530b076f37f0580ef4ada57c8cd6f905915e300
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bae572518ab53ddc674f52a5ef01613875bea64a8d9c53d4b7d4a9aedc712f19
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b36093e06845c6146f3175c64f0e8bdb441d4f7fc67a6962ed0b80b6725daf1
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.9729424715042114,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240421-183352/checkpoint-25000",
4
- "epoch": 0.6366956652601481,
5
  "eval_steps": 500,
6
- "global_step": 27500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -19697,6 +19697,1796 @@
19697
  "eval_samples_per_second": 15.013,
19698
  "eval_steps_per_second": 15.013,
19699
  "step": 27500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19700
  }
19701
  ],
19702
  "logging_steps": 10,
@@ -19704,7 +21494,7 @@
19704
  "num_input_tokens_seen": 0,
19705
  "num_train_epochs": 1,
19706
  "save_steps": 2500,
19707
- "total_flos": 4.4280846483456e+17,
19708
  "train_batch_size": 1,
19709
  "trial_name": null,
19710
  "trial_params": null
 
1
  {
2
  "best_metric": 0.9729424715042114,
3
  "best_model_checkpoint": "runs/deepseek_lora_20240421-183352/checkpoint-25000",
4
+ "epoch": 0.694577089374707,
5
  "eval_steps": 500,
6
+ "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
19697
  "eval_samples_per_second": 15.013,
19698
  "eval_steps_per_second": 15.013,
19699
  "step": 27500
19700
+ },
19701
+ {
19702
+ "epoch": 0.64,
19703
+ "grad_norm": 8.995759963989258,
19704
+ "learning_rate": 1.6881355932203391e-06,
19705
+ "loss": 1.1534,
19706
+ "step": 27510
19707
+ },
19708
+ {
19709
+ "epoch": 0.64,
19710
+ "grad_norm": 4.333745002746582,
19711
+ "learning_rate": 1.6813559322033901e-06,
19712
+ "loss": 1.3447,
19713
+ "step": 27520
19714
+ },
19715
+ {
19716
+ "epoch": 0.64,
19717
+ "grad_norm": 9.009721755981445,
19718
+ "learning_rate": 1.6745762711864409e-06,
19719
+ "loss": 1.102,
19720
+ "step": 27530
19721
+ },
19722
+ {
19723
+ "epoch": 0.64,
19724
+ "grad_norm": 1.8660717010498047,
19725
+ "learning_rate": 1.6677966101694916e-06,
19726
+ "loss": 1.3257,
19727
+ "step": 27540
19728
+ },
19729
+ {
19730
+ "epoch": 0.64,
19731
+ "grad_norm": 8.743804931640625,
19732
+ "learning_rate": 1.6610169491525424e-06,
19733
+ "loss": 1.3621,
19734
+ "step": 27550
19735
+ },
19736
+ {
19737
+ "epoch": 0.64,
19738
+ "grad_norm": 4.872500419616699,
19739
+ "learning_rate": 1.6542372881355934e-06,
19740
+ "loss": 1.348,
19741
+ "step": 27560
19742
+ },
19743
+ {
19744
+ "epoch": 0.64,
19745
+ "grad_norm": 6.630684852600098,
19746
+ "learning_rate": 1.6474576271186444e-06,
19747
+ "loss": 1.1863,
19748
+ "step": 27570
19749
+ },
19750
+ {
19751
+ "epoch": 0.64,
19752
+ "grad_norm": 3.588062286376953,
19753
+ "learning_rate": 1.640677966101695e-06,
19754
+ "loss": 1.1248,
19755
+ "step": 27580
19756
+ },
19757
+ {
19758
+ "epoch": 0.64,
19759
+ "grad_norm": 9.038145065307617,
19760
+ "learning_rate": 1.6338983050847459e-06,
19761
+ "loss": 1.2381,
19762
+ "step": 27590
19763
+ },
19764
+ {
19765
+ "epoch": 0.64,
19766
+ "grad_norm": 2.703775644302368,
19767
+ "learning_rate": 1.6271186440677967e-06,
19768
+ "loss": 1.2388,
19769
+ "step": 27600
19770
+ },
19771
+ {
19772
+ "epoch": 0.64,
19773
+ "grad_norm": 6.438141345977783,
19774
+ "learning_rate": 1.6203389830508476e-06,
19775
+ "loss": 1.4026,
19776
+ "step": 27610
19777
+ },
19778
+ {
19779
+ "epoch": 0.64,
19780
+ "grad_norm": 23.630895614624023,
19781
+ "learning_rate": 1.6135593220338986e-06,
19782
+ "loss": 1.2498,
19783
+ "step": 27620
19784
+ },
19785
+ {
19786
+ "epoch": 0.64,
19787
+ "grad_norm": 3.4313204288482666,
19788
+ "learning_rate": 1.6067796610169492e-06,
19789
+ "loss": 1.275,
19790
+ "step": 27630
19791
+ },
19792
+ {
19793
+ "epoch": 0.64,
19794
+ "grad_norm": 9.49851131439209,
19795
+ "learning_rate": 1.6000000000000001e-06,
19796
+ "loss": 1.2096,
19797
+ "step": 27640
19798
+ },
19799
+ {
19800
+ "epoch": 0.64,
19801
+ "grad_norm": 3.8425252437591553,
19802
+ "learning_rate": 1.593220338983051e-06,
19803
+ "loss": 1.225,
19804
+ "step": 27650
19805
+ },
19806
+ {
19807
+ "epoch": 0.64,
19808
+ "grad_norm": 4.10614538192749,
19809
+ "learning_rate": 1.5864406779661019e-06,
19810
+ "loss": 1.182,
19811
+ "step": 27660
19812
+ },
19813
+ {
19814
+ "epoch": 0.64,
19815
+ "grad_norm": 6.613473892211914,
19816
+ "learning_rate": 1.5796610169491526e-06,
19817
+ "loss": 1.316,
19818
+ "step": 27670
19819
+ },
19820
+ {
19821
+ "epoch": 0.64,
19822
+ "grad_norm": 3.006357431411743,
19823
+ "learning_rate": 1.5728813559322034e-06,
19824
+ "loss": 0.991,
19825
+ "step": 27680
19826
+ },
19827
+ {
19828
+ "epoch": 0.64,
19829
+ "grad_norm": 6.98958158493042,
19830
+ "learning_rate": 1.5661016949152544e-06,
19831
+ "loss": 1.2518,
19832
+ "step": 27690
19833
+ },
19834
+ {
19835
+ "epoch": 0.64,
19836
+ "grad_norm": 18.53824806213379,
19837
+ "learning_rate": 1.5593220338983054e-06,
19838
+ "loss": 1.2037,
19839
+ "step": 27700
19840
+ },
19841
+ {
19842
+ "epoch": 0.64,
19843
+ "grad_norm": 9.595442771911621,
19844
+ "learning_rate": 1.552542372881356e-06,
19845
+ "loss": 1.1781,
19846
+ "step": 27710
19847
+ },
19848
+ {
19849
+ "epoch": 0.64,
19850
+ "grad_norm": 8.525481224060059,
19851
+ "learning_rate": 1.545762711864407e-06,
19852
+ "loss": 1.0589,
19853
+ "step": 27720
19854
+ },
19855
+ {
19856
+ "epoch": 0.64,
19857
+ "grad_norm": 7.7480010986328125,
19858
+ "learning_rate": 1.5389830508474577e-06,
19859
+ "loss": 1.2321,
19860
+ "step": 27730
19861
+ },
19862
+ {
19863
+ "epoch": 0.64,
19864
+ "grad_norm": 4.403713703155518,
19865
+ "learning_rate": 1.5322033898305086e-06,
19866
+ "loss": 1.2394,
19867
+ "step": 27740
19868
+ },
19869
+ {
19870
+ "epoch": 0.64,
19871
+ "grad_norm": 8.966909408569336,
19872
+ "learning_rate": 1.5254237288135596e-06,
19873
+ "loss": 1.1507,
19874
+ "step": 27750
19875
+ },
19876
+ {
19877
+ "epoch": 0.64,
19878
+ "grad_norm": 3.3880155086517334,
19879
+ "learning_rate": 1.5186440677966102e-06,
19880
+ "loss": 1.3728,
19881
+ "step": 27760
19882
+ },
19883
+ {
19884
+ "epoch": 0.64,
19885
+ "grad_norm": 5.387197971343994,
19886
+ "learning_rate": 1.5118644067796611e-06,
19887
+ "loss": 1.3463,
19888
+ "step": 27770
19889
+ },
19890
+ {
19891
+ "epoch": 0.64,
19892
+ "grad_norm": 9.160344123840332,
19893
+ "learning_rate": 1.505084745762712e-06,
19894
+ "loss": 1.2807,
19895
+ "step": 27780
19896
+ },
19897
+ {
19898
+ "epoch": 0.64,
19899
+ "grad_norm": 9.067780494689941,
19900
+ "learning_rate": 1.4983050847457629e-06,
19901
+ "loss": 1.2772,
19902
+ "step": 27790
19903
+ },
19904
+ {
19905
+ "epoch": 0.64,
19906
+ "grad_norm": 11.345379829406738,
19907
+ "learning_rate": 1.4915254237288139e-06,
19908
+ "loss": 1.2251,
19909
+ "step": 27800
19910
+ },
19911
+ {
19912
+ "epoch": 0.64,
19913
+ "grad_norm": 1.177361249923706,
19914
+ "learning_rate": 1.4847457627118644e-06,
19915
+ "loss": 1.2393,
19916
+ "step": 27810
19917
+ },
19918
+ {
19919
+ "epoch": 0.64,
19920
+ "grad_norm": 10.291409492492676,
19921
+ "learning_rate": 1.4779661016949154e-06,
19922
+ "loss": 1.1844,
19923
+ "step": 27820
19924
+ },
19925
+ {
19926
+ "epoch": 0.64,
19927
+ "grad_norm": 3.2554609775543213,
19928
+ "learning_rate": 1.4711864406779664e-06,
19929
+ "loss": 1.4098,
19930
+ "step": 27830
19931
+ },
19932
+ {
19933
+ "epoch": 0.64,
19934
+ "grad_norm": 10.152715682983398,
19935
+ "learning_rate": 1.464406779661017e-06,
19936
+ "loss": 1.1902,
19937
+ "step": 27840
19938
+ },
19939
+ {
19940
+ "epoch": 0.64,
19941
+ "grad_norm": 7.060266494750977,
19942
+ "learning_rate": 1.457627118644068e-06,
19943
+ "loss": 1.255,
19944
+ "step": 27850
19945
+ },
19946
+ {
19947
+ "epoch": 0.65,
19948
+ "grad_norm": 1.495127558708191,
19949
+ "learning_rate": 1.4508474576271187e-06,
19950
+ "loss": 1.2665,
19951
+ "step": 27860
19952
+ },
19953
+ {
19954
+ "epoch": 0.65,
19955
+ "grad_norm": 4.548370838165283,
19956
+ "learning_rate": 1.4440677966101696e-06,
19957
+ "loss": 1.301,
19958
+ "step": 27870
19959
+ },
19960
+ {
19961
+ "epoch": 0.65,
19962
+ "grad_norm": 7.4621734619140625,
19963
+ "learning_rate": 1.4372881355932206e-06,
19964
+ "loss": 1.2617,
19965
+ "step": 27880
19966
+ },
19967
+ {
19968
+ "epoch": 0.65,
19969
+ "grad_norm": 7.851536273956299,
19970
+ "learning_rate": 1.4305084745762712e-06,
19971
+ "loss": 1.2416,
19972
+ "step": 27890
19973
+ },
19974
+ {
19975
+ "epoch": 0.65,
19976
+ "grad_norm": 8.992350578308105,
19977
+ "learning_rate": 1.4237288135593222e-06,
19978
+ "loss": 1.1459,
19979
+ "step": 27900
19980
+ },
19981
+ {
19982
+ "epoch": 0.65,
19983
+ "grad_norm": 4.336245059967041,
19984
+ "learning_rate": 1.416949152542373e-06,
19985
+ "loss": 1.4332,
19986
+ "step": 27910
19987
+ },
19988
+ {
19989
+ "epoch": 0.65,
19990
+ "grad_norm": 11.527148246765137,
19991
+ "learning_rate": 1.410169491525424e-06,
19992
+ "loss": 1.1412,
19993
+ "step": 27920
19994
+ },
19995
+ {
19996
+ "epoch": 0.65,
19997
+ "grad_norm": 3.6670548915863037,
19998
+ "learning_rate": 1.4033898305084749e-06,
19999
+ "loss": 1.3735,
20000
+ "step": 27930
20001
+ },
20002
+ {
20003
+ "epoch": 0.65,
20004
+ "grad_norm": 10.903753280639648,
20005
+ "learning_rate": 1.3966101694915254e-06,
20006
+ "loss": 1.2655,
20007
+ "step": 27940
20008
+ },
20009
+ {
20010
+ "epoch": 0.65,
20011
+ "grad_norm": 17.927316665649414,
20012
+ "learning_rate": 1.3898305084745764e-06,
20013
+ "loss": 1.2136,
20014
+ "step": 27950
20015
+ },
20016
+ {
20017
+ "epoch": 0.65,
20018
+ "grad_norm": 8.938633918762207,
20019
+ "learning_rate": 1.3830508474576274e-06,
20020
+ "loss": 1.4995,
20021
+ "step": 27960
20022
+ },
20023
+ {
20024
+ "epoch": 0.65,
20025
+ "grad_norm": 8.094979286193848,
20026
+ "learning_rate": 1.376271186440678e-06,
20027
+ "loss": 1.2659,
20028
+ "step": 27970
20029
+ },
20030
+ {
20031
+ "epoch": 0.65,
20032
+ "grad_norm": 9.596714973449707,
20033
+ "learning_rate": 1.369491525423729e-06,
20034
+ "loss": 1.1559,
20035
+ "step": 27980
20036
+ },
20037
+ {
20038
+ "epoch": 0.65,
20039
+ "grad_norm": 10.698532104492188,
20040
+ "learning_rate": 1.3627118644067797e-06,
20041
+ "loss": 1.3288,
20042
+ "step": 27990
20043
+ },
20044
+ {
20045
+ "epoch": 0.65,
20046
+ "grad_norm": 2.6723885536193848,
20047
+ "learning_rate": 1.3559322033898307e-06,
20048
+ "loss": 1.2338,
20049
+ "step": 28000
20050
+ },
20051
+ {
20052
+ "epoch": 0.65,
20053
+ "eval_loss": 1.0027884244918823,
20054
+ "eval_runtime": 67.3345,
20055
+ "eval_samples_per_second": 14.851,
20056
+ "eval_steps_per_second": 14.851,
20057
+ "step": 28000
20058
+ },
20059
+ {
20060
+ "epoch": 0.65,
20061
+ "grad_norm": 7.515227794647217,
20062
+ "learning_rate": 1.3491525423728816e-06,
20063
+ "loss": 1.365,
20064
+ "step": 28010
20065
+ },
20066
+ {
20067
+ "epoch": 0.65,
20068
+ "grad_norm": 3.3241326808929443,
20069
+ "learning_rate": 1.3423728813559322e-06,
20070
+ "loss": 1.2401,
20071
+ "step": 28020
20072
+ },
20073
+ {
20074
+ "epoch": 0.65,
20075
+ "grad_norm": 8.155590057373047,
20076
+ "learning_rate": 1.3355932203389832e-06,
20077
+ "loss": 1.3003,
20078
+ "step": 28030
20079
+ },
20080
+ {
20081
+ "epoch": 0.65,
20082
+ "grad_norm": 2.6986613273620605,
20083
+ "learning_rate": 1.328813559322034e-06,
20084
+ "loss": 1.3645,
20085
+ "step": 28040
20086
+ },
20087
+ {
20088
+ "epoch": 0.65,
20089
+ "grad_norm": 3.558117151260376,
20090
+ "learning_rate": 1.322033898305085e-06,
20091
+ "loss": 1.2466,
20092
+ "step": 28050
20093
+ },
20094
+ {
20095
+ "epoch": 0.65,
20096
+ "grad_norm": 2.6389589309692383,
20097
+ "learning_rate": 1.3152542372881359e-06,
20098
+ "loss": 1.4448,
20099
+ "step": 28060
20100
+ },
20101
+ {
20102
+ "epoch": 0.65,
20103
+ "grad_norm": 4.357456684112549,
20104
+ "learning_rate": 1.3084745762711864e-06,
20105
+ "loss": 1.0606,
20106
+ "step": 28070
20107
+ },
20108
+ {
20109
+ "epoch": 0.65,
20110
+ "grad_norm": 3.8627960681915283,
20111
+ "learning_rate": 1.3016949152542374e-06,
20112
+ "loss": 1.2686,
20113
+ "step": 28080
20114
+ },
20115
+ {
20116
+ "epoch": 0.65,
20117
+ "grad_norm": 6.192730903625488,
20118
+ "learning_rate": 1.2949152542372884e-06,
20119
+ "loss": 1.4787,
20120
+ "step": 28090
20121
+ },
20122
+ {
20123
+ "epoch": 0.65,
20124
+ "grad_norm": 4.1315131187438965,
20125
+ "learning_rate": 1.288135593220339e-06,
20126
+ "loss": 1.2422,
20127
+ "step": 28100
20128
+ },
20129
+ {
20130
+ "epoch": 0.65,
20131
+ "grad_norm": 13.30528450012207,
20132
+ "learning_rate": 1.28135593220339e-06,
20133
+ "loss": 1.2519,
20134
+ "step": 28110
20135
+ },
20136
+ {
20137
+ "epoch": 0.65,
20138
+ "grad_norm": 6.925759315490723,
20139
+ "learning_rate": 1.2745762711864407e-06,
20140
+ "loss": 1.388,
20141
+ "step": 28120
20142
+ },
20143
+ {
20144
+ "epoch": 0.65,
20145
+ "grad_norm": 5.491391181945801,
20146
+ "learning_rate": 1.2677966101694917e-06,
20147
+ "loss": 1.2848,
20148
+ "step": 28130
20149
+ },
20150
+ {
20151
+ "epoch": 0.65,
20152
+ "grad_norm": 6.054429054260254,
20153
+ "learning_rate": 1.2610169491525426e-06,
20154
+ "loss": 1.1429,
20155
+ "step": 28140
20156
+ },
20157
+ {
20158
+ "epoch": 0.65,
20159
+ "grad_norm": 13.854817390441895,
20160
+ "learning_rate": 1.2542372881355932e-06,
20161
+ "loss": 1.207,
20162
+ "step": 28150
20163
+ },
20164
+ {
20165
+ "epoch": 0.65,
20166
+ "grad_norm": 10.493091583251953,
20167
+ "learning_rate": 1.2474576271186442e-06,
20168
+ "loss": 1.0886,
20169
+ "step": 28160
20170
+ },
20171
+ {
20172
+ "epoch": 0.65,
20173
+ "grad_norm": 2.1718876361846924,
20174
+ "learning_rate": 1.240677966101695e-06,
20175
+ "loss": 1.2488,
20176
+ "step": 28170
20177
+ },
20178
+ {
20179
+ "epoch": 0.65,
20180
+ "grad_norm": 2.642390727996826,
20181
+ "learning_rate": 1.233898305084746e-06,
20182
+ "loss": 1.1109,
20183
+ "step": 28180
20184
+ },
20185
+ {
20186
+ "epoch": 0.65,
20187
+ "grad_norm": 7.8057332038879395,
20188
+ "learning_rate": 1.2271186440677967e-06,
20189
+ "loss": 1.121,
20190
+ "step": 28190
20191
+ },
20192
+ {
20193
+ "epoch": 0.65,
20194
+ "grad_norm": 4.232807636260986,
20195
+ "learning_rate": 1.2203389830508477e-06,
20196
+ "loss": 1.193,
20197
+ "step": 28200
20198
+ },
20199
+ {
20200
+ "epoch": 0.65,
20201
+ "grad_norm": 7.1039862632751465,
20202
+ "learning_rate": 1.2135593220338984e-06,
20203
+ "loss": 1.142,
20204
+ "step": 28210
20205
+ },
20206
+ {
20207
+ "epoch": 0.65,
20208
+ "grad_norm": 4.6590752601623535,
20209
+ "learning_rate": 1.2067796610169492e-06,
20210
+ "loss": 1.2522,
20211
+ "step": 28220
20212
+ },
20213
+ {
20214
+ "epoch": 0.65,
20215
+ "grad_norm": 6.117308139801025,
20216
+ "learning_rate": 1.2000000000000002e-06,
20217
+ "loss": 1.3453,
20218
+ "step": 28230
20219
+ },
20220
+ {
20221
+ "epoch": 0.65,
20222
+ "grad_norm": 6.451774597167969,
20223
+ "learning_rate": 1.193220338983051e-06,
20224
+ "loss": 1.297,
20225
+ "step": 28240
20226
+ },
20227
+ {
20228
+ "epoch": 0.65,
20229
+ "grad_norm": 4.05746603012085,
20230
+ "learning_rate": 1.186440677966102e-06,
20231
+ "loss": 1.2008,
20232
+ "step": 28250
20233
+ },
20234
+ {
20235
+ "epoch": 0.65,
20236
+ "grad_norm": 12.00547981262207,
20237
+ "learning_rate": 1.1796610169491527e-06,
20238
+ "loss": 1.1584,
20239
+ "step": 28260
20240
+ },
20241
+ {
20242
+ "epoch": 0.65,
20243
+ "grad_norm": 5.004361152648926,
20244
+ "learning_rate": 1.1728813559322034e-06,
20245
+ "loss": 1.1504,
20246
+ "step": 28270
20247
+ },
20248
+ {
20249
+ "epoch": 0.65,
20250
+ "grad_norm": 5.276272296905518,
20251
+ "learning_rate": 1.1661016949152542e-06,
20252
+ "loss": 1.1542,
20253
+ "step": 28280
20254
+ },
20255
+ {
20256
+ "epoch": 0.65,
20257
+ "grad_norm": 6.736107349395752,
20258
+ "learning_rate": 1.1593220338983052e-06,
20259
+ "loss": 1.2236,
20260
+ "step": 28290
20261
+ },
20262
+ {
20263
+ "epoch": 0.66,
20264
+ "grad_norm": 3.8124780654907227,
20265
+ "learning_rate": 1.152542372881356e-06,
20266
+ "loss": 1.2236,
20267
+ "step": 28300
20268
+ },
20269
+ {
20270
+ "epoch": 0.66,
20271
+ "grad_norm": 7.842329025268555,
20272
+ "learning_rate": 1.145762711864407e-06,
20273
+ "loss": 1.3622,
20274
+ "step": 28310
20275
+ },
20276
+ {
20277
+ "epoch": 0.66,
20278
+ "grad_norm": 6.599586486816406,
20279
+ "learning_rate": 1.1389830508474577e-06,
20280
+ "loss": 1.1218,
20281
+ "step": 28320
20282
+ },
20283
+ {
20284
+ "epoch": 0.66,
20285
+ "grad_norm": 4.077427387237549,
20286
+ "learning_rate": 1.1322033898305087e-06,
20287
+ "loss": 1.2038,
20288
+ "step": 28330
20289
+ },
20290
+ {
20291
+ "epoch": 0.66,
20292
+ "grad_norm": 8.3840970993042,
20293
+ "learning_rate": 1.1254237288135594e-06,
20294
+ "loss": 1.3661,
20295
+ "step": 28340
20296
+ },
20297
+ {
20298
+ "epoch": 0.66,
20299
+ "grad_norm": 8.212132453918457,
20300
+ "learning_rate": 1.1186440677966102e-06,
20301
+ "loss": 1.1966,
20302
+ "step": 28350
20303
+ },
20304
+ {
20305
+ "epoch": 0.66,
20306
+ "grad_norm": 7.084817886352539,
20307
+ "learning_rate": 1.1118644067796612e-06,
20308
+ "loss": 1.2721,
20309
+ "step": 28360
20310
+ },
20311
+ {
20312
+ "epoch": 0.66,
20313
+ "grad_norm": 9.409709930419922,
20314
+ "learning_rate": 1.105084745762712e-06,
20315
+ "loss": 1.2941,
20316
+ "step": 28370
20317
+ },
20318
+ {
20319
+ "epoch": 0.66,
20320
+ "grad_norm": 4.141493320465088,
20321
+ "learning_rate": 1.098305084745763e-06,
20322
+ "loss": 1.4355,
20323
+ "step": 28380
20324
+ },
20325
+ {
20326
+ "epoch": 0.66,
20327
+ "grad_norm": 8.694832801818848,
20328
+ "learning_rate": 1.0915254237288137e-06,
20329
+ "loss": 1.2153,
20330
+ "step": 28390
20331
+ },
20332
+ {
20333
+ "epoch": 0.66,
20334
+ "grad_norm": 12.472442626953125,
20335
+ "learning_rate": 1.0847457627118644e-06,
20336
+ "loss": 1.1019,
20337
+ "step": 28400
20338
+ },
20339
+ {
20340
+ "epoch": 0.66,
20341
+ "grad_norm": 6.5215163230896,
20342
+ "learning_rate": 1.0779661016949152e-06,
20343
+ "loss": 1.0885,
20344
+ "step": 28410
20345
+ },
20346
+ {
20347
+ "epoch": 0.66,
20348
+ "grad_norm": 5.509222030639648,
20349
+ "learning_rate": 1.0711864406779662e-06,
20350
+ "loss": 1.2676,
20351
+ "step": 28420
20352
+ },
20353
+ {
20354
+ "epoch": 0.66,
20355
+ "grad_norm": 9.21153736114502,
20356
+ "learning_rate": 1.064406779661017e-06,
20357
+ "loss": 1.3133,
20358
+ "step": 28430
20359
+ },
20360
+ {
20361
+ "epoch": 0.66,
20362
+ "grad_norm": 5.743494987487793,
20363
+ "learning_rate": 1.057627118644068e-06,
20364
+ "loss": 1.292,
20365
+ "step": 28440
20366
+ },
20367
+ {
20368
+ "epoch": 0.66,
20369
+ "grad_norm": 8.526168823242188,
20370
+ "learning_rate": 1.0508474576271187e-06,
20371
+ "loss": 1.0585,
20372
+ "step": 28450
20373
+ },
20374
+ {
20375
+ "epoch": 0.66,
20376
+ "grad_norm": 3.5986156463623047,
20377
+ "learning_rate": 1.0440677966101697e-06,
20378
+ "loss": 1.1234,
20379
+ "step": 28460
20380
+ },
20381
+ {
20382
+ "epoch": 0.66,
20383
+ "grad_norm": 13.26816463470459,
20384
+ "learning_rate": 1.0372881355932204e-06,
20385
+ "loss": 1.1792,
20386
+ "step": 28470
20387
+ },
20388
+ {
20389
+ "epoch": 0.66,
20390
+ "grad_norm": 3.407280445098877,
20391
+ "learning_rate": 1.0305084745762712e-06,
20392
+ "loss": 1.1652,
20393
+ "step": 28480
20394
+ },
20395
+ {
20396
+ "epoch": 0.66,
20397
+ "grad_norm": 8.368290901184082,
20398
+ "learning_rate": 1.0237288135593222e-06,
20399
+ "loss": 1.3459,
20400
+ "step": 28490
20401
+ },
20402
+ {
20403
+ "epoch": 0.66,
20404
+ "grad_norm": 8.060779571533203,
20405
+ "learning_rate": 1.016949152542373e-06,
20406
+ "loss": 1.3085,
20407
+ "step": 28500
20408
+ },
20409
+ {
20410
+ "epoch": 0.66,
20411
+ "eval_loss": 1.0383493900299072,
20412
+ "eval_runtime": 68.471,
20413
+ "eval_samples_per_second": 14.605,
20414
+ "eval_steps_per_second": 14.605,
20415
+ "step": 28500
20416
+ },
20417
+ {
20418
+ "epoch": 0.66,
20419
+ "grad_norm": 2.879181385040283,
20420
+ "learning_rate": 1.010169491525424e-06,
20421
+ "loss": 1.2482,
20422
+ "step": 28510
20423
+ },
20424
+ {
20425
+ "epoch": 0.66,
20426
+ "grad_norm": 10.214177131652832,
20427
+ "learning_rate": 1.0033898305084747e-06,
20428
+ "loss": 1.2781,
20429
+ "step": 28520
20430
+ },
20431
+ {
20432
+ "epoch": 0.66,
20433
+ "grad_norm": 17.13050079345703,
20434
+ "learning_rate": 9.966101694915254e-07,
20435
+ "loss": 1.0837,
20436
+ "step": 28530
20437
+ },
20438
+ {
20439
+ "epoch": 0.66,
20440
+ "grad_norm": 15.841142654418945,
20441
+ "learning_rate": 9.898305084745762e-07,
20442
+ "loss": 1.2962,
20443
+ "step": 28540
20444
+ },
20445
+ {
20446
+ "epoch": 0.66,
20447
+ "grad_norm": 7.079030990600586,
20448
+ "learning_rate": 9.830508474576272e-07,
20449
+ "loss": 0.9777,
20450
+ "step": 28550
20451
+ },
20452
+ {
20453
+ "epoch": 0.66,
20454
+ "grad_norm": 3.9758317470550537,
20455
+ "learning_rate": 9.762711864406782e-07,
20456
+ "loss": 1.1584,
20457
+ "step": 28560
20458
+ },
20459
+ {
20460
+ "epoch": 0.66,
20461
+ "grad_norm": 4.3860392570495605,
20462
+ "learning_rate": 9.69491525423729e-07,
20463
+ "loss": 1.1247,
20464
+ "step": 28570
20465
+ },
20466
+ {
20467
+ "epoch": 0.66,
20468
+ "grad_norm": 3.759638786315918,
20469
+ "learning_rate": 9.627118644067797e-07,
20470
+ "loss": 1.2665,
20471
+ "step": 28580
20472
+ },
20473
+ {
20474
+ "epoch": 0.66,
20475
+ "grad_norm": 16.677642822265625,
20476
+ "learning_rate": 9.559322033898307e-07,
20477
+ "loss": 1.4559,
20478
+ "step": 28590
20479
+ },
20480
+ {
20481
+ "epoch": 0.66,
20482
+ "grad_norm": 7.994900703430176,
20483
+ "learning_rate": 9.491525423728814e-07,
20484
+ "loss": 1.0231,
20485
+ "step": 28600
20486
+ },
20487
+ {
20488
+ "epoch": 0.66,
20489
+ "grad_norm": 5.5230841636657715,
20490
+ "learning_rate": 9.423728813559323e-07,
20491
+ "loss": 1.1959,
20492
+ "step": 28610
20493
+ },
20494
+ {
20495
+ "epoch": 0.66,
20496
+ "grad_norm": 1.7889764308929443,
20497
+ "learning_rate": 9.355932203389831e-07,
20498
+ "loss": 1.3042,
20499
+ "step": 28620
20500
+ },
20501
+ {
20502
+ "epoch": 0.66,
20503
+ "grad_norm": 1.4875215291976929,
20504
+ "learning_rate": 9.28813559322034e-07,
20505
+ "loss": 1.1124,
20506
+ "step": 28630
20507
+ },
20508
+ {
20509
+ "epoch": 0.66,
20510
+ "grad_norm": 4.865635395050049,
20511
+ "learning_rate": 9.220338983050848e-07,
20512
+ "loss": 1.2639,
20513
+ "step": 28640
20514
+ },
20515
+ {
20516
+ "epoch": 0.66,
20517
+ "grad_norm": 3.1794333457946777,
20518
+ "learning_rate": 9.152542372881357e-07,
20519
+ "loss": 1.2244,
20520
+ "step": 28650
20521
+ },
20522
+ {
20523
+ "epoch": 0.66,
20524
+ "grad_norm": 9.40485954284668,
20525
+ "learning_rate": 9.084745762711864e-07,
20526
+ "loss": 1.244,
20527
+ "step": 28660
20528
+ },
20529
+ {
20530
+ "epoch": 0.66,
20531
+ "grad_norm": 8.51891803741455,
20532
+ "learning_rate": 9.016949152542373e-07,
20533
+ "loss": 1.2456,
20534
+ "step": 28670
20535
+ },
20536
+ {
20537
+ "epoch": 0.66,
20538
+ "grad_norm": 1.864579200744629,
20539
+ "learning_rate": 8.949152542372883e-07,
20540
+ "loss": 1.1113,
20541
+ "step": 28680
20542
+ },
20543
+ {
20544
+ "epoch": 0.66,
20545
+ "grad_norm": 12.539873123168945,
20546
+ "learning_rate": 8.881355932203391e-07,
20547
+ "loss": 1.1996,
20548
+ "step": 28690
20549
+ },
20550
+ {
20551
+ "epoch": 0.66,
20552
+ "grad_norm": 6.042193412780762,
20553
+ "learning_rate": 8.813559322033899e-07,
20554
+ "loss": 1.2614,
20555
+ "step": 28700
20556
+ },
20557
+ {
20558
+ "epoch": 0.66,
20559
+ "grad_norm": 5.701644420623779,
20560
+ "learning_rate": 8.745762711864407e-07,
20561
+ "loss": 1.1356,
20562
+ "step": 28710
20563
+ },
20564
+ {
20565
+ "epoch": 0.66,
20566
+ "grad_norm": 1.5281639099121094,
20567
+ "learning_rate": 8.677966101694917e-07,
20568
+ "loss": 1.244,
20569
+ "step": 28720
20570
+ },
20571
+ {
20572
+ "epoch": 0.67,
20573
+ "grad_norm": 11.545940399169922,
20574
+ "learning_rate": 8.610169491525424e-07,
20575
+ "loss": 1.2447,
20576
+ "step": 28730
20577
+ },
20578
+ {
20579
+ "epoch": 0.67,
20580
+ "grad_norm": 3.2043371200561523,
20581
+ "learning_rate": 8.542372881355933e-07,
20582
+ "loss": 1.2653,
20583
+ "step": 28740
20584
+ },
20585
+ {
20586
+ "epoch": 0.67,
20587
+ "grad_norm": 9.815803527832031,
20588
+ "learning_rate": 8.474576271186441e-07,
20589
+ "loss": 1.2609,
20590
+ "step": 28750
20591
+ },
20592
+ {
20593
+ "epoch": 0.67,
20594
+ "grad_norm": 2.5789248943328857,
20595
+ "learning_rate": 8.406779661016951e-07,
20596
+ "loss": 1.3839,
20597
+ "step": 28760
20598
+ },
20599
+ {
20600
+ "epoch": 0.67,
20601
+ "grad_norm": 6.396823883056641,
20602
+ "learning_rate": 8.338983050847458e-07,
20603
+ "loss": 1.2267,
20604
+ "step": 28770
20605
+ },
20606
+ {
20607
+ "epoch": 0.67,
20608
+ "grad_norm": 4.092123985290527,
20609
+ "learning_rate": 8.271186440677967e-07,
20610
+ "loss": 1.16,
20611
+ "step": 28780
20612
+ },
20613
+ {
20614
+ "epoch": 0.67,
20615
+ "grad_norm": 1.5519795417785645,
20616
+ "learning_rate": 8.203389830508475e-07,
20617
+ "loss": 1.1852,
20618
+ "step": 28790
20619
+ },
20620
+ {
20621
+ "epoch": 0.67,
20622
+ "grad_norm": 10.254398345947266,
20623
+ "learning_rate": 8.135593220338983e-07,
20624
+ "loss": 1.3109,
20625
+ "step": 28800
20626
+ },
20627
+ {
20628
+ "epoch": 0.67,
20629
+ "grad_norm": 5.28373384475708,
20630
+ "learning_rate": 8.067796610169493e-07,
20631
+ "loss": 1.2634,
20632
+ "step": 28810
20633
+ },
20634
+ {
20635
+ "epoch": 0.67,
20636
+ "grad_norm": 5.046334266662598,
20637
+ "learning_rate": 8.000000000000001e-07,
20638
+ "loss": 1.3029,
20639
+ "step": 28820
20640
+ },
20641
+ {
20642
+ "epoch": 0.67,
20643
+ "grad_norm": 7.71566104888916,
20644
+ "learning_rate": 7.932203389830509e-07,
20645
+ "loss": 1.1061,
20646
+ "step": 28830
20647
+ },
20648
+ {
20649
+ "epoch": 0.67,
20650
+ "grad_norm": 2.412494659423828,
20651
+ "learning_rate": 7.864406779661017e-07,
20652
+ "loss": 1.4086,
20653
+ "step": 28840
20654
+ },
20655
+ {
20656
+ "epoch": 0.67,
20657
+ "grad_norm": 6.458393096923828,
20658
+ "learning_rate": 7.796610169491527e-07,
20659
+ "loss": 1.238,
20660
+ "step": 28850
20661
+ },
20662
+ {
20663
+ "epoch": 0.67,
20664
+ "grad_norm": 4.167256832122803,
20665
+ "learning_rate": 7.728813559322034e-07,
20666
+ "loss": 1.24,
20667
+ "step": 28860
20668
+ },
20669
+ {
20670
+ "epoch": 0.67,
20671
+ "grad_norm": 7.2174177169799805,
20672
+ "learning_rate": 7.661016949152543e-07,
20673
+ "loss": 0.9788,
20674
+ "step": 28870
20675
+ },
20676
+ {
20677
+ "epoch": 0.67,
20678
+ "grad_norm": 8.519877433776855,
20679
+ "learning_rate": 7.593220338983051e-07,
20680
+ "loss": 1.3748,
20681
+ "step": 28880
20682
+ },
20683
+ {
20684
+ "epoch": 0.67,
20685
+ "grad_norm": 5.170273303985596,
20686
+ "learning_rate": 7.52542372881356e-07,
20687
+ "loss": 1.2798,
20688
+ "step": 28890
20689
+ },
20690
+ {
20691
+ "epoch": 0.67,
20692
+ "grad_norm": 9.433271408081055,
20693
+ "learning_rate": 7.457627118644069e-07,
20694
+ "loss": 1.2496,
20695
+ "step": 28900
20696
+ },
20697
+ {
20698
+ "epoch": 0.67,
20699
+ "grad_norm": 3.0811305046081543,
20700
+ "learning_rate": 7.389830508474577e-07,
20701
+ "loss": 1.3467,
20702
+ "step": 28910
20703
+ },
20704
+ {
20705
+ "epoch": 0.67,
20706
+ "grad_norm": 3.777050495147705,
20707
+ "learning_rate": 7.322033898305085e-07,
20708
+ "loss": 1.3679,
20709
+ "step": 28920
20710
+ },
20711
+ {
20712
+ "epoch": 0.67,
20713
+ "grad_norm": 2.2685747146606445,
20714
+ "learning_rate": 7.254237288135593e-07,
20715
+ "loss": 1.3076,
20716
+ "step": 28930
20717
+ },
20718
+ {
20719
+ "epoch": 0.67,
20720
+ "grad_norm": 6.493512153625488,
20721
+ "learning_rate": 7.186440677966103e-07,
20722
+ "loss": 1.1433,
20723
+ "step": 28940
20724
+ },
20725
+ {
20726
+ "epoch": 0.67,
20727
+ "grad_norm": 3.230098009109497,
20728
+ "learning_rate": 7.118644067796611e-07,
20729
+ "loss": 1.3191,
20730
+ "step": 28950
20731
+ },
20732
+ {
20733
+ "epoch": 0.67,
20734
+ "grad_norm": 1.8938180208206177,
20735
+ "learning_rate": 7.05084745762712e-07,
20736
+ "loss": 1.138,
20737
+ "step": 28960
20738
+ },
20739
+ {
20740
+ "epoch": 0.67,
20741
+ "grad_norm": 7.385470867156982,
20742
+ "learning_rate": 6.983050847457627e-07,
20743
+ "loss": 1.1831,
20744
+ "step": 28970
20745
+ },
20746
+ {
20747
+ "epoch": 0.67,
20748
+ "grad_norm": 4.667689800262451,
20749
+ "learning_rate": 6.915254237288137e-07,
20750
+ "loss": 1.3282,
20751
+ "step": 28980
20752
+ },
20753
+ {
20754
+ "epoch": 0.67,
20755
+ "grad_norm": 7.002562999725342,
20756
+ "learning_rate": 6.847457627118645e-07,
20757
+ "loss": 1.3384,
20758
+ "step": 28990
20759
+ },
20760
+ {
20761
+ "epoch": 0.67,
20762
+ "grad_norm": 7.729028701782227,
20763
+ "learning_rate": 6.779661016949153e-07,
20764
+ "loss": 1.2527,
20765
+ "step": 29000
20766
+ },
20767
+ {
20768
+ "epoch": 0.67,
20769
+ "eval_loss": 1.036794900894165,
20770
+ "eval_runtime": 67.0053,
20771
+ "eval_samples_per_second": 14.924,
20772
+ "eval_steps_per_second": 14.924,
20773
+ "step": 29000
20774
+ },
20775
+ {
20776
+ "epoch": 0.67,
20777
+ "grad_norm": 5.627978801727295,
20778
+ "learning_rate": 6.711864406779661e-07,
20779
+ "loss": 1.2501,
20780
+ "step": 29010
20781
+ },
20782
+ {
20783
+ "epoch": 0.67,
20784
+ "grad_norm": 18.444440841674805,
20785
+ "learning_rate": 6.64406779661017e-07,
20786
+ "loss": 1.0848,
20787
+ "step": 29020
20788
+ },
20789
+ {
20790
+ "epoch": 0.67,
20791
+ "grad_norm": 5.111781120300293,
20792
+ "learning_rate": 6.576271186440679e-07,
20793
+ "loss": 1.1205,
20794
+ "step": 29030
20795
+ },
20796
+ {
20797
+ "epoch": 0.67,
20798
+ "grad_norm": 10.785350799560547,
20799
+ "learning_rate": 6.508474576271187e-07,
20800
+ "loss": 1.0386,
20801
+ "step": 29040
20802
+ },
20803
+ {
20804
+ "epoch": 0.67,
20805
+ "grad_norm": 5.114529609680176,
20806
+ "learning_rate": 6.440677966101695e-07,
20807
+ "loss": 1.2195,
20808
+ "step": 29050
20809
+ },
20810
+ {
20811
+ "epoch": 0.67,
20812
+ "grad_norm": 9.179306983947754,
20813
+ "learning_rate": 6.372881355932203e-07,
20814
+ "loss": 1.2742,
20815
+ "step": 29060
20816
+ },
20817
+ {
20818
+ "epoch": 0.67,
20819
+ "grad_norm": 3.526697874069214,
20820
+ "learning_rate": 6.305084745762713e-07,
20821
+ "loss": 1.2442,
20822
+ "step": 29070
20823
+ },
20824
+ {
20825
+ "epoch": 0.67,
20826
+ "grad_norm": 2.499582052230835,
20827
+ "learning_rate": 6.237288135593221e-07,
20828
+ "loss": 1.1093,
20829
+ "step": 29080
20830
+ },
20831
+ {
20832
+ "epoch": 0.67,
20833
+ "grad_norm": 7.2823638916015625,
20834
+ "learning_rate": 6.16949152542373e-07,
20835
+ "loss": 1.3122,
20836
+ "step": 29090
20837
+ },
20838
+ {
20839
+ "epoch": 0.67,
20840
+ "grad_norm": 4.526163578033447,
20841
+ "learning_rate": 6.101694915254238e-07,
20842
+ "loss": 1.3623,
20843
+ "step": 29100
20844
+ },
20845
+ {
20846
+ "epoch": 0.67,
20847
+ "grad_norm": 3.685267686843872,
20848
+ "learning_rate": 6.033898305084746e-07,
20849
+ "loss": 1.0836,
20850
+ "step": 29110
20851
+ },
20852
+ {
20853
+ "epoch": 0.67,
20854
+ "grad_norm": 5.807446479797363,
20855
+ "learning_rate": 5.966101694915255e-07,
20856
+ "loss": 1.4172,
20857
+ "step": 29120
20858
+ },
20859
+ {
20860
+ "epoch": 0.67,
20861
+ "grad_norm": 7.100170135498047,
20862
+ "learning_rate": 5.898305084745763e-07,
20863
+ "loss": 1.424,
20864
+ "step": 29130
20865
+ },
20866
+ {
20867
+ "epoch": 0.67,
20868
+ "grad_norm": 2.293673276901245,
20869
+ "learning_rate": 5.830508474576271e-07,
20870
+ "loss": 1.2277,
20871
+ "step": 29140
20872
+ },
20873
+ {
20874
+ "epoch": 0.67,
20875
+ "grad_norm": 5.1463165283203125,
20876
+ "learning_rate": 5.76271186440678e-07,
20877
+ "loss": 1.0603,
20878
+ "step": 29150
20879
+ },
20880
+ {
20881
+ "epoch": 0.68,
20882
+ "grad_norm": 2.3667409420013428,
20883
+ "learning_rate": 5.694915254237288e-07,
20884
+ "loss": 1.235,
20885
+ "step": 29160
20886
+ },
20887
+ {
20888
+ "epoch": 0.68,
20889
+ "grad_norm": 12.005767822265625,
20890
+ "learning_rate": 5.627118644067797e-07,
20891
+ "loss": 1.2028,
20892
+ "step": 29170
20893
+ },
20894
+ {
20895
+ "epoch": 0.68,
20896
+ "grad_norm": 6.225417613983154,
20897
+ "learning_rate": 5.559322033898306e-07,
20898
+ "loss": 1.0662,
20899
+ "step": 29180
20900
+ },
20901
+ {
20902
+ "epoch": 0.68,
20903
+ "grad_norm": 3.9254140853881836,
20904
+ "learning_rate": 5.491525423728815e-07,
20905
+ "loss": 1.209,
20906
+ "step": 29190
20907
+ },
20908
+ {
20909
+ "epoch": 0.68,
20910
+ "grad_norm": 2.4858005046844482,
20911
+ "learning_rate": 5.423728813559322e-07,
20912
+ "loss": 1.1661,
20913
+ "step": 29200
20914
+ },
20915
+ {
20916
+ "epoch": 0.68,
20917
+ "grad_norm": 5.9373884201049805,
20918
+ "learning_rate": 5.355932203389831e-07,
20919
+ "loss": 1.1282,
20920
+ "step": 29210
20921
+ },
20922
+ {
20923
+ "epoch": 0.68,
20924
+ "grad_norm": 6.60464334487915,
20925
+ "learning_rate": 5.28813559322034e-07,
20926
+ "loss": 1.3383,
20927
+ "step": 29220
20928
+ },
20929
+ {
20930
+ "epoch": 0.68,
20931
+ "grad_norm": 4.6242828369140625,
20932
+ "learning_rate": 5.220338983050848e-07,
20933
+ "loss": 1.3385,
20934
+ "step": 29230
20935
+ },
20936
+ {
20937
+ "epoch": 0.68,
20938
+ "grad_norm": 7.414794445037842,
20939
+ "learning_rate": 5.152542372881356e-07,
20940
+ "loss": 1.2059,
20941
+ "step": 29240
20942
+ },
20943
+ {
20944
+ "epoch": 0.68,
20945
+ "grad_norm": 3.5053508281707764,
20946
+ "learning_rate": 5.084745762711865e-07,
20947
+ "loss": 1.2981,
20948
+ "step": 29250
20949
+ },
20950
+ {
20951
+ "epoch": 0.68,
20952
+ "grad_norm": 5.528652191162109,
20953
+ "learning_rate": 5.016949152542373e-07,
20954
+ "loss": 1.2069,
20955
+ "step": 29260
20956
+ },
20957
+ {
20958
+ "epoch": 0.68,
20959
+ "grad_norm": 7.826475143432617,
20960
+ "learning_rate": 4.949152542372881e-07,
20961
+ "loss": 1.286,
20962
+ "step": 29270
20963
+ },
20964
+ {
20965
+ "epoch": 0.68,
20966
+ "grad_norm": 2.1104512214660645,
20967
+ "learning_rate": 4.881355932203391e-07,
20968
+ "loss": 1.0453,
20969
+ "step": 29280
20970
+ },
20971
+ {
20972
+ "epoch": 0.68,
20973
+ "grad_norm": 3.8154537677764893,
20974
+ "learning_rate": 4.813559322033898e-07,
20975
+ "loss": 1.265,
20976
+ "step": 29290
20977
+ },
20978
+ {
20979
+ "epoch": 0.68,
20980
+ "grad_norm": 8.516002655029297,
20981
+ "learning_rate": 4.745762711864407e-07,
20982
+ "loss": 1.2142,
20983
+ "step": 29300
20984
+ },
20985
+ {
20986
+ "epoch": 0.68,
20987
+ "grad_norm": 3.6356046199798584,
20988
+ "learning_rate": 4.6779661016949154e-07,
20989
+ "loss": 1.3932,
20990
+ "step": 29310
20991
+ },
20992
+ {
20993
+ "epoch": 0.68,
20994
+ "grad_norm": 5.065585613250732,
20995
+ "learning_rate": 4.610169491525424e-07,
20996
+ "loss": 1.1825,
20997
+ "step": 29320
20998
+ },
20999
+ {
21000
+ "epoch": 0.68,
21001
+ "grad_norm": 3.2396178245544434,
21002
+ "learning_rate": 4.542372881355932e-07,
21003
+ "loss": 1.269,
21004
+ "step": 29330
21005
+ },
21006
+ {
21007
+ "epoch": 0.68,
21008
+ "grad_norm": 3.066288471221924,
21009
+ "learning_rate": 4.4745762711864415e-07,
21010
+ "loss": 1.1986,
21011
+ "step": 29340
21012
+ },
21013
+ {
21014
+ "epoch": 0.68,
21015
+ "grad_norm": 8.957977294921875,
21016
+ "learning_rate": 4.4067796610169497e-07,
21017
+ "loss": 1.2721,
21018
+ "step": 29350
21019
+ },
21020
+ {
21021
+ "epoch": 0.68,
21022
+ "grad_norm": 6.242004871368408,
21023
+ "learning_rate": 4.3389830508474584e-07,
21024
+ "loss": 1.1724,
21025
+ "step": 29360
21026
+ },
21027
+ {
21028
+ "epoch": 0.68,
21029
+ "grad_norm": 10.986865997314453,
21030
+ "learning_rate": 4.2711864406779666e-07,
21031
+ "loss": 1.3121,
21032
+ "step": 29370
21033
+ },
21034
+ {
21035
+ "epoch": 0.68,
21036
+ "grad_norm": 7.176580905914307,
21037
+ "learning_rate": 4.2033898305084753e-07,
21038
+ "loss": 1.0941,
21039
+ "step": 29380
21040
+ },
21041
+ {
21042
+ "epoch": 0.68,
21043
+ "grad_norm": 8.999109268188477,
21044
+ "learning_rate": 4.1355932203389835e-07,
21045
+ "loss": 1.2917,
21046
+ "step": 29390
21047
+ },
21048
+ {
21049
+ "epoch": 0.68,
21050
+ "grad_norm": 3.9739575386047363,
21051
+ "learning_rate": 4.0677966101694916e-07,
21052
+ "loss": 1.1372,
21053
+ "step": 29400
21054
+ },
21055
+ {
21056
+ "epoch": 0.68,
21057
+ "grad_norm": 7.752053737640381,
21058
+ "learning_rate": 4.0000000000000003e-07,
21059
+ "loss": 1.229,
21060
+ "step": 29410
21061
+ },
21062
+ {
21063
+ "epoch": 0.68,
21064
+ "grad_norm": 15.617894172668457,
21065
+ "learning_rate": 3.9322033898305085e-07,
21066
+ "loss": 1.2178,
21067
+ "step": 29420
21068
+ },
21069
+ {
21070
+ "epoch": 0.68,
21071
+ "grad_norm": 5.103687763214111,
21072
+ "learning_rate": 3.864406779661017e-07,
21073
+ "loss": 1.1923,
21074
+ "step": 29430
21075
+ },
21076
+ {
21077
+ "epoch": 0.68,
21078
+ "grad_norm": 13.210460662841797,
21079
+ "learning_rate": 3.7966101694915254e-07,
21080
+ "loss": 1.3562,
21081
+ "step": 29440
21082
+ },
21083
+ {
21084
+ "epoch": 0.68,
21085
+ "grad_norm": 14.239701271057129,
21086
+ "learning_rate": 3.7288135593220347e-07,
21087
+ "loss": 1.2422,
21088
+ "step": 29450
21089
+ },
21090
+ {
21091
+ "epoch": 0.68,
21092
+ "grad_norm": 15.124138832092285,
21093
+ "learning_rate": 3.6610169491525423e-07,
21094
+ "loss": 1.3159,
21095
+ "step": 29460
21096
+ },
21097
+ {
21098
+ "epoch": 0.68,
21099
+ "grad_norm": 10.07345199584961,
21100
+ "learning_rate": 3.5932203389830516e-07,
21101
+ "loss": 1.0358,
21102
+ "step": 29470
21103
+ },
21104
+ {
21105
+ "epoch": 0.68,
21106
+ "grad_norm": 8.566622734069824,
21107
+ "learning_rate": 3.52542372881356e-07,
21108
+ "loss": 1.0562,
21109
+ "step": 29480
21110
+ },
21111
+ {
21112
+ "epoch": 0.68,
21113
+ "grad_norm": 13.473173141479492,
21114
+ "learning_rate": 3.4576271186440684e-07,
21115
+ "loss": 1.1538,
21116
+ "step": 29490
21117
+ },
21118
+ {
21119
+ "epoch": 0.68,
21120
+ "grad_norm": 3.2546353340148926,
21121
+ "learning_rate": 3.3898305084745766e-07,
21122
+ "loss": 1.3963,
21123
+ "step": 29500
21124
+ },
21125
+ {
21126
+ "epoch": 0.68,
21127
+ "eval_loss": 0.9776638150215149,
21128
+ "eval_runtime": 67.0335,
21129
+ "eval_samples_per_second": 14.918,
21130
+ "eval_steps_per_second": 14.918,
21131
+ "step": 29500
21132
+ },
21133
+ {
21134
+ "epoch": 0.68,
21135
+ "grad_norm": 8.819803237915039,
21136
+ "learning_rate": 3.322033898305085e-07,
21137
+ "loss": 1.3515,
21138
+ "step": 29510
21139
+ },
21140
+ {
21141
+ "epoch": 0.68,
21142
+ "grad_norm": 26.38185691833496,
21143
+ "learning_rate": 3.2542372881355935e-07,
21144
+ "loss": 1.1068,
21145
+ "step": 29520
21146
+ },
21147
+ {
21148
+ "epoch": 0.68,
21149
+ "grad_norm": 5.794632434844971,
21150
+ "learning_rate": 3.1864406779661017e-07,
21151
+ "loss": 1.2475,
21152
+ "step": 29530
21153
+ },
21154
+ {
21155
+ "epoch": 0.68,
21156
+ "grad_norm": 9.786145210266113,
21157
+ "learning_rate": 3.1186440677966104e-07,
21158
+ "loss": 1.2613,
21159
+ "step": 29540
21160
+ },
21161
+ {
21162
+ "epoch": 0.68,
21163
+ "grad_norm": 8.388969421386719,
21164
+ "learning_rate": 3.050847457627119e-07,
21165
+ "loss": 1.4074,
21166
+ "step": 29550
21167
+ },
21168
+ {
21169
+ "epoch": 0.68,
21170
+ "grad_norm": 3.0043601989746094,
21171
+ "learning_rate": 2.9830508474576273e-07,
21172
+ "loss": 1.4106,
21173
+ "step": 29560
21174
+ },
21175
+ {
21176
+ "epoch": 0.68,
21177
+ "grad_norm": 12.958208084106445,
21178
+ "learning_rate": 2.9152542372881355e-07,
21179
+ "loss": 1.2264,
21180
+ "step": 29570
21181
+ },
21182
+ {
21183
+ "epoch": 0.68,
21184
+ "grad_norm": 6.8235063552856445,
21185
+ "learning_rate": 2.847457627118644e-07,
21186
+ "loss": 1.1727,
21187
+ "step": 29580
21188
+ },
21189
+ {
21190
+ "epoch": 0.69,
21191
+ "grad_norm": 4.284834384918213,
21192
+ "learning_rate": 2.779661016949153e-07,
21193
+ "loss": 1.2897,
21194
+ "step": 29590
21195
+ },
21196
+ {
21197
+ "epoch": 0.69,
21198
+ "grad_norm": 3.338212251663208,
21199
+ "learning_rate": 2.711864406779661e-07,
21200
+ "loss": 1.2449,
21201
+ "step": 29600
21202
+ },
21203
+ {
21204
+ "epoch": 0.69,
21205
+ "grad_norm": 23.881393432617188,
21206
+ "learning_rate": 2.64406779661017e-07,
21207
+ "loss": 1.2986,
21208
+ "step": 29610
21209
+ },
21210
+ {
21211
+ "epoch": 0.69,
21212
+ "grad_norm": 9.715893745422363,
21213
+ "learning_rate": 2.576271186440678e-07,
21214
+ "loss": 1.2566,
21215
+ "step": 29620
21216
+ },
21217
+ {
21218
+ "epoch": 0.69,
21219
+ "grad_norm": 2.027327060699463,
21220
+ "learning_rate": 2.5084745762711867e-07,
21221
+ "loss": 1.2739,
21222
+ "step": 29630
21223
+ },
21224
+ {
21225
+ "epoch": 0.69,
21226
+ "grad_norm": 6.084362030029297,
21227
+ "learning_rate": 2.4406779661016954e-07,
21228
+ "loss": 1.1787,
21229
+ "step": 29640
21230
+ },
21231
+ {
21232
+ "epoch": 0.69,
21233
+ "grad_norm": 9.463132858276367,
21234
+ "learning_rate": 2.3728813559322036e-07,
21235
+ "loss": 1.1894,
21236
+ "step": 29650
21237
+ },
21238
+ {
21239
+ "epoch": 0.69,
21240
+ "grad_norm": 12.049840927124023,
21241
+ "learning_rate": 2.305084745762712e-07,
21242
+ "loss": 1.2903,
21243
+ "step": 29660
21244
+ },
21245
+ {
21246
+ "epoch": 0.69,
21247
+ "grad_norm": 4.152366638183594,
21248
+ "learning_rate": 2.2372881355932207e-07,
21249
+ "loss": 1.096,
21250
+ "step": 29670
21251
+ },
21252
+ {
21253
+ "epoch": 0.69,
21254
+ "grad_norm": 9.684791564941406,
21255
+ "learning_rate": 2.1694915254237292e-07,
21256
+ "loss": 1.2017,
21257
+ "step": 29680
21258
+ },
21259
+ {
21260
+ "epoch": 0.69,
21261
+ "grad_norm": 7.867280960083008,
21262
+ "learning_rate": 2.1016949152542376e-07,
21263
+ "loss": 1.1625,
21264
+ "step": 29690
21265
+ },
21266
+ {
21267
+ "epoch": 0.69,
21268
+ "grad_norm": 1.4374995231628418,
21269
+ "learning_rate": 2.0338983050847458e-07,
21270
+ "loss": 1.4949,
21271
+ "step": 29700
21272
+ },
21273
+ {
21274
+ "epoch": 0.69,
21275
+ "grad_norm": 4.6095428466796875,
21276
+ "learning_rate": 1.9661016949152543e-07,
21277
+ "loss": 1.3103,
21278
+ "step": 29710
21279
+ },
21280
+ {
21281
+ "epoch": 0.69,
21282
+ "grad_norm": 5.289913177490234,
21283
+ "learning_rate": 1.8983050847457627e-07,
21284
+ "loss": 1.3265,
21285
+ "step": 29720
21286
+ },
21287
+ {
21288
+ "epoch": 0.69,
21289
+ "grad_norm": 9.2072172164917,
21290
+ "learning_rate": 1.8305084745762712e-07,
21291
+ "loss": 1.1148,
21292
+ "step": 29730
21293
+ },
21294
+ {
21295
+ "epoch": 0.69,
21296
+ "grad_norm": 4.103106498718262,
21297
+ "learning_rate": 1.76271186440678e-07,
21298
+ "loss": 1.3842,
21299
+ "step": 29740
21300
+ },
21301
+ {
21302
+ "epoch": 0.69,
21303
+ "grad_norm": 15.820337295532227,
21304
+ "learning_rate": 1.6949152542372883e-07,
21305
+ "loss": 1.2755,
21306
+ "step": 29750
21307
+ },
21308
+ {
21309
+ "epoch": 0.69,
21310
+ "grad_norm": 4.733695030212402,
21311
+ "learning_rate": 1.6271186440677968e-07,
21312
+ "loss": 1.462,
21313
+ "step": 29760
21314
+ },
21315
+ {
21316
+ "epoch": 0.69,
21317
+ "grad_norm": 8.206982612609863,
21318
+ "learning_rate": 1.5593220338983052e-07,
21319
+ "loss": 1.235,
21320
+ "step": 29770
21321
+ },
21322
+ {
21323
+ "epoch": 0.69,
21324
+ "grad_norm": 7.3087663650512695,
21325
+ "learning_rate": 1.4915254237288137e-07,
21326
+ "loss": 1.1845,
21327
+ "step": 29780
21328
+ },
21329
+ {
21330
+ "epoch": 0.69,
21331
+ "grad_norm": 15.122031211853027,
21332
+ "learning_rate": 1.423728813559322e-07,
21333
+ "loss": 1.1609,
21334
+ "step": 29790
21335
+ },
21336
+ {
21337
+ "epoch": 0.69,
21338
+ "grad_norm": 5.786062717437744,
21339
+ "learning_rate": 1.3559322033898305e-07,
21340
+ "loss": 1.0769,
21341
+ "step": 29800
21342
+ },
21343
+ {
21344
+ "epoch": 0.69,
21345
+ "grad_norm": 1.0411622524261475,
21346
+ "learning_rate": 1.288135593220339e-07,
21347
+ "loss": 1.2727,
21348
+ "step": 29810
21349
+ },
21350
+ {
21351
+ "epoch": 0.69,
21352
+ "grad_norm": 3.1188580989837646,
21353
+ "learning_rate": 1.2203389830508477e-07,
21354
+ "loss": 1.2504,
21355
+ "step": 29820
21356
+ },
21357
+ {
21358
+ "epoch": 0.69,
21359
+ "grad_norm": 1.6318976879119873,
21360
+ "learning_rate": 1.152542372881356e-07,
21361
+ "loss": 1.2839,
21362
+ "step": 29830
21363
+ },
21364
+ {
21365
+ "epoch": 0.69,
21366
+ "grad_norm": 7.041867733001709,
21367
+ "learning_rate": 1.0847457627118646e-07,
21368
+ "loss": 1.0513,
21369
+ "step": 29840
21370
+ },
21371
+ {
21372
+ "epoch": 0.69,
21373
+ "grad_norm": 12.029023170471191,
21374
+ "learning_rate": 1.0169491525423729e-07,
21375
+ "loss": 1.132,
21376
+ "step": 29850
21377
+ },
21378
+ {
21379
+ "epoch": 0.69,
21380
+ "grad_norm": 5.014979839324951,
21381
+ "learning_rate": 9.491525423728814e-08,
21382
+ "loss": 1.2018,
21383
+ "step": 29860
21384
+ },
21385
+ {
21386
+ "epoch": 0.69,
21387
+ "grad_norm": 2.5008723735809326,
21388
+ "learning_rate": 8.8135593220339e-08,
21389
+ "loss": 1.2404,
21390
+ "step": 29870
21391
+ },
21392
+ {
21393
+ "epoch": 0.69,
21394
+ "grad_norm": 10.014387130737305,
21395
+ "learning_rate": 8.135593220338984e-08,
21396
+ "loss": 1.2046,
21397
+ "step": 29880
21398
+ },
21399
+ {
21400
+ "epoch": 0.69,
21401
+ "grad_norm": 12.765832901000977,
21402
+ "learning_rate": 7.457627118644068e-08,
21403
+ "loss": 1.0579,
21404
+ "step": 29890
21405
+ },
21406
+ {
21407
+ "epoch": 0.69,
21408
+ "grad_norm": 4.352108001708984,
21409
+ "learning_rate": 6.779661016949153e-08,
21410
+ "loss": 1.3139,
21411
+ "step": 29900
21412
+ },
21413
+ {
21414
+ "epoch": 0.69,
21415
+ "grad_norm": 10.630574226379395,
21416
+ "learning_rate": 6.101694915254239e-08,
21417
+ "loss": 1.3174,
21418
+ "step": 29910
21419
+ },
21420
+ {
21421
+ "epoch": 0.69,
21422
+ "grad_norm": 11.868256568908691,
21423
+ "learning_rate": 5.423728813559323e-08,
21424
+ "loss": 0.9875,
21425
+ "step": 29920
21426
+ },
21427
+ {
21428
+ "epoch": 0.69,
21429
+ "grad_norm": 3.4866442680358887,
21430
+ "learning_rate": 4.745762711864407e-08,
21431
+ "loss": 1.3073,
21432
+ "step": 29930
21433
+ },
21434
+ {
21435
+ "epoch": 0.69,
21436
+ "grad_norm": 9.97262191772461,
21437
+ "learning_rate": 4.067796610169492e-08,
21438
+ "loss": 1.276,
21439
+ "step": 29940
21440
+ },
21441
+ {
21442
+ "epoch": 0.69,
21443
+ "grad_norm": 3.1585495471954346,
21444
+ "learning_rate": 3.3898305084745764e-08,
21445
+ "loss": 1.3528,
21446
+ "step": 29950
21447
+ },
21448
+ {
21449
+ "epoch": 0.69,
21450
+ "grad_norm": 14.819723129272461,
21451
+ "learning_rate": 2.7118644067796615e-08,
21452
+ "loss": 1.1333,
21453
+ "step": 29960
21454
+ },
21455
+ {
21456
+ "epoch": 0.69,
21457
+ "grad_norm": 4.731696128845215,
21458
+ "learning_rate": 2.033898305084746e-08,
21459
+ "loss": 1.5296,
21460
+ "step": 29970
21461
+ },
21462
+ {
21463
+ "epoch": 0.69,
21464
+ "grad_norm": 9.690890312194824,
21465
+ "learning_rate": 1.3559322033898307e-08,
21466
+ "loss": 1.2169,
21467
+ "step": 29980
21468
+ },
21469
+ {
21470
+ "epoch": 0.69,
21471
+ "grad_norm": 3.7671804428100586,
21472
+ "learning_rate": 6.779661016949154e-09,
21473
+ "loss": 1.304,
21474
+ "step": 29990
21475
+ },
21476
+ {
21477
+ "epoch": 0.69,
21478
+ "grad_norm": 13.029277801513672,
21479
+ "learning_rate": 0.0,
21480
+ "loss": 1.0771,
21481
+ "step": 30000
21482
+ },
21483
+ {
21484
+ "epoch": 0.69,
21485
+ "eval_loss": 1.0159568786621094,
21486
+ "eval_runtime": 67.116,
21487
+ "eval_samples_per_second": 14.9,
21488
+ "eval_steps_per_second": 14.9,
21489
+ "step": 30000
21490
  }
21491
  ],
21492
  "logging_steps": 10,
 
21494
  "num_input_tokens_seen": 0,
21495
  "num_train_epochs": 1,
21496
  "save_steps": 2500,
21497
+ "total_flos": 4.8306377981952e+17,
21498
  "train_batch_size": 1,
21499
  "trial_name": null,
21500
  "trial_params": null