🍻 cheers

Browse files

Files changed (6) hide show

README.md +6 -5
all_results.json +10 -10
eval_results.json +5 -5
runs/Mar23_12-27-13_65433f580760/events.out.tfevents.1711199781.65433f580760.3628.1 +3 -0
train_results.json +5 -5
trainer_state.json +1697 -231

README.md CHANGED Viewed

@@ -2,6 +2,7 @@
 license: apache-2.0
 base_model: google/vit-base-patch16-224-in21k
 tags:
 - generated_from_trainer
 datasets:
 - renovation
@@ -14,7 +15,7 @@ model-index:
       name: Image Classification
       type: image-classification
     dataset:
-      name: renovation
       type: renovation
       config: default
       split: validation
@@ -22,7 +23,7 @@ model-index:
     metrics:
     - name: Accuracy
       type: accuracy
-      value: 0.6950596252129472
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -30,10 +31,10 @@ should probably proofread and complete it, then remove this comment. -->
 # vit-base-beans-demo-v5
-This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the renovation dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.2470
-- Accuracy: 0.6951
 ## Model description

 license: apache-2.0
 base_model: google/vit-base-patch16-224-in21k
 tags:
+- image-classification
 - generated_from_trainer
 datasets:
 - renovation
       name: Image Classification
       type: image-classification
     dataset:
+      name: beans
       type: renovation
       config: default
       split: validation
     metrics:
     - name: Accuracy
       type: accuracy
+      value: 0.6695059625212947
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # vit-base-beans-demo-v5
+This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the beans dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.8460
+- Accuracy: 0.6695
 ## Model description

all_results.json CHANGED Viewed

@@ -1,13 +1,13 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 0.634703196347032,
-    "eval_loss": 0.929534375667572,
-    "eval_runtime": 8.1809,
-    "eval_samples_per_second": 26.77,
-    "eval_steps_per_second": 3.423,
-    "total_flos": 6.10974224738132e+17,
-    "train_loss": 0.25425288126233125,
-    "train_runtime": 387.3536,
-    "train_samples_per_second": 20.353,
-    "train_steps_per_second": 1.28
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.6695059625212947,
+    "eval_loss": 0.8459659218788147,
+    "eval_runtime": 36.9315,
+    "eval_samples_per_second": 31.789,
+    "eval_steps_per_second": 3.98,
+    "total_flos": 2.910419581971751e+18,
+    "train_loss": 0.4888155373286145,
+    "train_runtime": 2894.9609,
+    "train_samples_per_second": 12.973,
+    "train_steps_per_second": 0.811
 }

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
-    "eval_accuracy": 0.634703196347032,
-    "eval_loss": 0.929534375667572,
-    "eval_runtime": 8.1809,
-    "eval_samples_per_second": 26.77,
-    "eval_steps_per_second": 3.423
 }

 {
     "epoch": 4.0,
+    "eval_accuracy": 0.6695059625212947,
+    "eval_loss": 0.8459659218788147,
+    "eval_runtime": 36.9315,
+    "eval_samples_per_second": 31.789,
+    "eval_steps_per_second": 3.98
 }

runs/Mar23_12-27-13_65433f580760/events.out.tfevents.1711199781.65433f580760.3628.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:302b161eab291b644cbc90f82eeed5595548682cc98dcc111575d766a1cc0332
+size 411

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "epoch": 4.0,
-    "total_flos": 6.10974224738132e+17,
-    "train_loss": 0.25425288126233125,
-    "train_runtime": 387.3536,
-    "train_samples_per_second": 20.353,
-    "train_steps_per_second": 1.28
 }

 {
     "epoch": 4.0,
+    "total_flos": 2.910419581971751e+18,
+    "train_loss": 0.4888155373286145,
+    "train_runtime": 2894.9609,
+    "train_samples_per_second": 12.973,
+    "train_steps_per_second": 0.811
 }

trainer_state.json CHANGED Viewed

@@ -1,408 +1,1874 @@
 {
-  "best_metric": 0.929534375667572,
-  "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-100",
   "epoch": 4.0,
   "eval_steps": 100,
-  "global_step": 496,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.08,
-      "grad_norm": 1.9071108102798462,
-      "learning_rate": 0.00019596774193548388,
-      "loss": 0.7806,
       "step": 10
     },
     {
-      "epoch": 0.16,
-      "grad_norm": 2.2380499839782715,
-      "learning_rate": 0.00019193548387096775,
-      "loss": 0.7214,
       "step": 20
     },
     {
-      "epoch": 0.24,
-      "grad_norm": 1.4890930652618408,
-      "learning_rate": 0.00018790322580645164,
-      "loss": 0.6215,
       "step": 30
     },
     {
-      "epoch": 0.32,
-      "grad_norm": 3.2323720455169678,
-      "learning_rate": 0.00018387096774193548,
-      "loss": 0.6378,
       "step": 40
     },
     {
-      "epoch": 0.4,
-      "grad_norm": 2.838930606842041,
-      "learning_rate": 0.00017983870967741935,
-      "loss": 0.7502,
       "step": 50
     },
     {
-      "epoch": 0.48,
-      "grad_norm": 3.2034356594085693,
-      "learning_rate": 0.00017580645161290325,
-      "loss": 0.5904,
       "step": 60
     },
     {
-      "epoch": 0.56,
-      "grad_norm": 3.1891825199127197,
-      "learning_rate": 0.00017177419354838711,
-      "loss": 0.5718,
       "step": 70
     },
     {
-      "epoch": 0.65,
-      "grad_norm": 2.0921356678009033,
-      "learning_rate": 0.00016774193548387098,
-      "loss": 0.3783,
       "step": 80
     },
     {
-      "epoch": 0.73,
-      "grad_norm": 2.864804983139038,
-      "learning_rate": 0.00016370967741935485,
-      "loss": 0.6002,
       "step": 90
     },
     {
-      "epoch": 0.81,
-      "grad_norm": 3.1752126216888428,
-      "learning_rate": 0.00015967741935483872,
-      "loss": 0.6438,
       "step": 100
     },
     {
-      "epoch": 0.81,
-      "eval_accuracy": 0.634703196347032,
-      "eval_loss": 0.929534375667572,
-      "eval_runtime": 7.2962,
-      "eval_samples_per_second": 30.016,
-      "eval_steps_per_second": 3.838,
       "step": 100
     },
     {
-      "epoch": 0.89,
-      "grad_norm": 2.728193521499634,
-      "learning_rate": 0.0001556451612903226,
-      "loss": 0.5441,
       "step": 110
     },
     {
-      "epoch": 0.97,
-      "grad_norm": 2.140393018722534,
-      "learning_rate": 0.00015161290322580646,
-      "loss": 0.4403,
       "step": 120
     },
     {
-      "epoch": 1.05,
-      "grad_norm": 0.6765386462211609,
-      "learning_rate": 0.00014758064516129032,
-      "loss": 0.3251,
       "step": 130
     },
     {
-      "epoch": 1.13,
-      "grad_norm": 0.9497590661048889,
-      "learning_rate": 0.00014354838709677422,
-      "loss": 0.2046,
       "step": 140
     },
     {
-      "epoch": 1.21,
-      "grad_norm": 4.010074615478516,
-      "learning_rate": 0.00013991935483870967,
-      "loss": 0.3276,
       "step": 150
     },
     {
-      "epoch": 1.29,
-      "grad_norm": 3.7631189823150635,
-      "learning_rate": 0.00013588709677419357,
-      "loss": 0.2937,
       "step": 160
     },
     {
-      "epoch": 1.37,
-      "grad_norm": 0.5803029537200928,
-      "learning_rate": 0.00013185483870967743,
-      "loss": 0.1906,
       "step": 170
     },
     {
-      "epoch": 1.45,
-      "grad_norm": 5.088043212890625,
-      "learning_rate": 0.0001278225806451613,
-      "loss": 0.2207,
       "step": 180
     },
     {
-      "epoch": 1.53,
-      "grad_norm": 2.3816022872924805,
-      "learning_rate": 0.00012379032258064514,
-      "loss": 0.1919,
       "step": 190
     },
     {
-      "epoch": 1.61,
-      "grad_norm": 5.558553218841553,
-      "learning_rate": 0.00011975806451612903,
-      "loss": 0.3105,
       "step": 200
     },
     {
-      "epoch": 1.61,
-      "eval_accuracy": 0.6575342465753424,
-      "eval_loss": 0.9350173473358154,
-      "eval_runtime": 7.7793,
-      "eval_samples_per_second": 28.152,
-      "eval_steps_per_second": 3.599,
       "step": 200
     },
     {
-      "epoch": 1.69,
-      "grad_norm": 3.439823865890503,
-      "learning_rate": 0.00011572580645161291,
-      "loss": 0.3714,
       "step": 210
     },
     {
-      "epoch": 1.77,
-      "grad_norm": 2.6023850440979004,
-      "learning_rate": 0.00011169354838709678,
-      "loss": 0.2869,
       "step": 220
     },
     {
-      "epoch": 1.85,
-      "grad_norm": 3.2238519191741943,
-      "learning_rate": 0.00010766129032258066,
-      "loss": 0.4462,
       "step": 230
     },
     {
-      "epoch": 1.94,
-      "grad_norm": 1.0531260967254639,
-      "learning_rate": 0.00010362903225806453,
-      "loss": 0.3634,
       "step": 240
     },
     {
-      "epoch": 2.02,
-      "grad_norm": 0.5729889869689941,
-      "learning_rate": 9.95967741935484e-05,
-      "loss": 0.2624,
       "step": 250
     },
     {
-      "epoch": 2.1,
-      "grad_norm": 0.1924820989370346,
-      "learning_rate": 9.556451612903226e-05,
-      "loss": 0.0999,
       "step": 260
     },
     {
-      "epoch": 2.18,
-      "grad_norm": 0.39775505661964417,
-      "learning_rate": 9.153225806451613e-05,
-      "loss": 0.0938,
       "step": 270
     },
     {
-      "epoch": 2.26,
-      "grad_norm": 0.22179947793483734,
-      "learning_rate": 8.75e-05,
-      "loss": 0.1017,
       "step": 280
     },
     {
-      "epoch": 2.34,
-      "grad_norm": 1.6249357461929321,
-      "learning_rate": 8.346774193548388e-05,
-      "loss": 0.1745,
       "step": 290
     },
     {
-      "epoch": 2.42,
-      "grad_norm": 0.34801536798477173,
-      "learning_rate": 7.943548387096774e-05,
-      "loss": 0.0634,
       "step": 300
     },
     {
-      "epoch": 2.42,
-      "eval_accuracy": 0.6894977168949772,
-      "eval_loss": 1.0781886577606201,
-      "eval_runtime": 7.6715,
-      "eval_samples_per_second": 28.547,
-      "eval_steps_per_second": 3.65,
       "step": 300
     },
     {
-      "epoch": 2.5,
-      "grad_norm": 2.6541597843170166,
-      "learning_rate": 7.540322580645162e-05,
-      "loss": 0.0772,
       "step": 310
     },
     {
-      "epoch": 2.58,
-      "grad_norm": 0.1635380983352661,
-      "learning_rate": 7.137096774193549e-05,
-      "loss": 0.1042,
       "step": 320
     },
     {
-      "epoch": 2.66,
-      "grad_norm": 1.126976490020752,
-      "learning_rate": 6.733870967741935e-05,
-      "loss": 0.1643,
       "step": 330
     },
     {
-      "epoch": 2.74,
-      "grad_norm": 0.2140628844499588,
-      "learning_rate": 6.330645161290322e-05,
-      "loss": 0.0479,
       "step": 340
     },
     {
-      "epoch": 2.82,
-      "grad_norm": 0.14856065809726715,
-      "learning_rate": 5.9274193548387104e-05,
-      "loss": 0.0606,
       "step": 350
     },
     {
-      "epoch": 2.9,
-      "grad_norm": 1.9021470546722412,
-      "learning_rate": 5.5241935483870966e-05,
-      "loss": 0.0576,
       "step": 360
     },
     {
-      "epoch": 2.98,
-      "grad_norm": 0.488421767950058,
-      "learning_rate": 5.120967741935484e-05,
-      "loss": 0.1573,
       "step": 370
     },
     {
-      "epoch": 3.06,
-      "grad_norm": 0.27475953102111816,
-      "learning_rate": 4.7177419354838716e-05,
-      "loss": 0.0264,
       "step": 380
     },
     {
-      "epoch": 3.15,
-      "grad_norm": 0.08814023435115814,
-      "learning_rate": 4.3145161290322584e-05,
-      "loss": 0.0197,
       "step": 390
     },
     {
-      "epoch": 3.23,
-      "grad_norm": 0.10707065463066101,
-      "learning_rate": 3.911290322580645e-05,
-      "loss": 0.0257,
       "step": 400
     },
     {
-      "epoch": 3.23,
-      "eval_accuracy": 0.6986301369863014,
-      "eval_loss": 1.06435227394104,
-      "eval_runtime": 7.0971,
-      "eval_samples_per_second": 30.858,
-      "eval_steps_per_second": 3.945,
       "step": 400
     },
     {
-      "epoch": 3.31,
-      "grad_norm": 0.06996390968561172,
-      "learning_rate": 3.508064516129033e-05,
-      "loss": 0.0192,
       "step": 410
     },
     {
-      "epoch": 3.39,
-      "grad_norm": 1.358115315437317,
-      "learning_rate": 3.1048387096774195e-05,
-      "loss": 0.0431,
       "step": 420
     },
     {
-      "epoch": 3.47,
-      "grad_norm": 0.4962191581726074,
-      "learning_rate": 2.7016129032258064e-05,
-      "loss": 0.0573,
       "step": 430
     },
     {
-      "epoch": 3.55,
-      "grad_norm": 0.08283121138811111,
-      "learning_rate": 2.2983870967741935e-05,
-      "loss": 0.0216,
       "step": 440
     },
     {
-      "epoch": 3.63,
-      "grad_norm": 0.06285007297992706,
-      "learning_rate": 1.8951612903225807e-05,
-      "loss": 0.0169,
       "step": 450
     },
     {
-      "epoch": 3.71,
-      "grad_norm": 0.10198648273944855,
-      "learning_rate": 1.4919354838709679e-05,
-      "loss": 0.0188,
       "step": 460
     },
     {
-      "epoch": 3.79,
-      "grad_norm": 1.5539321899414062,
-      "learning_rate": 1.0887096774193549e-05,
-      "loss": 0.0227,
       "step": 470
     },
     {
-      "epoch": 3.87,
-      "grad_norm": 0.06271003931760788,
-      "learning_rate": 6.854838709677419e-06,
-      "loss": 0.0212,
       "step": 480
     },
     {
-      "epoch": 3.95,
-      "grad_norm": 0.1244824230670929,
-      "learning_rate": 2.82258064516129e-06,
-      "loss": 0.0183,
       "step": 490
     },
     {
       "epoch": 4.0,
-      "step": 496,
-      "total_flos": 6.10974224738132e+17,
-      "train_loss": 0.25425288126233125,
-      "train_runtime": 387.3536,
-      "train_samples_per_second": 20.353,
-      "train_steps_per_second": 1.28
     }
   ],
   "logging_steps": 10,
-  "max_steps": 496,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 100,
-  "total_flos": 6.10974224738132e+17,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 0.8459659218788147,
+  "best_model_checkpoint": "./vit-base-beans-demo-v5/checkpoint-900",
   "epoch": 4.0,
   "eval_steps": 100,
+  "global_step": 2348,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.02,
+      "grad_norm": 1.6497292518615723,
+      "learning_rate": 0.00019914821124361162,
+      "loss": 1.6003,
       "step": 10
     },
     {
+      "epoch": 0.03,
+      "grad_norm": 1.307145357131958,
+      "learning_rate": 0.00019829642248722317,
+      "loss": 1.2767,
       "step": 20
     },
     {
+      "epoch": 0.05,
+      "grad_norm": 2.9354941844940186,
+      "learning_rate": 0.00019744463373083478,
+      "loss": 1.2612,
       "step": 30
     },
     {
+      "epoch": 0.07,
+      "grad_norm": 1.3261815309524536,
+      "learning_rate": 0.00019659284497444633,
+      "loss": 1.2354,
       "step": 40
     },
     {
+      "epoch": 0.09,
+      "grad_norm": 1.5586915016174316,
+      "learning_rate": 0.00019574105621805794,
+      "loss": 1.0959,
       "step": 50
     },
     {
+      "epoch": 0.1,
+      "grad_norm": 1.490173578262329,
+      "learning_rate": 0.00019488926746166952,
+      "loss": 1.0438,
       "step": 60
     },
     {
+      "epoch": 0.12,
+      "grad_norm": 2.0831446647644043,
+      "learning_rate": 0.0001940374787052811,
+      "loss": 1.0841,
       "step": 70
     },
     {
+      "epoch": 0.14,
+      "grad_norm": 2.6207799911499023,
+      "learning_rate": 0.00019318568994889268,
+      "loss": 1.0983,
       "step": 80
     },
     {
+      "epoch": 0.15,
+      "grad_norm": 1.7383110523223877,
+      "learning_rate": 0.00019233390119250426,
+      "loss": 1.1775,
       "step": 90
     },
     {
+      "epoch": 0.17,
+      "grad_norm": 2.1954941749572754,
+      "learning_rate": 0.00019148211243611585,
+      "loss": 1.0616,
       "step": 100
     },
     {
+      "epoch": 0.17,
+      "eval_accuracy": 0.5817717206132879,
+      "eval_loss": 1.0267014503479004,
+      "eval_runtime": 39.3874,
+      "eval_samples_per_second": 29.806,
+      "eval_steps_per_second": 3.732,
       "step": 100
     },
     {
+      "epoch": 0.19,
+      "grad_norm": 1.597124695777893,
+      "learning_rate": 0.00019063032367972745,
+      "loss": 1.007,
       "step": 110
     },
     {
+      "epoch": 0.2,
+      "grad_norm": 1.289490818977356,
+      "learning_rate": 0.000189778534923339,
+      "loss": 1.0065,
       "step": 120
     },
     {
+      "epoch": 0.22,
+      "grad_norm": 1.7088607549667358,
+      "learning_rate": 0.00018892674616695061,
+      "loss": 1.0204,
       "step": 130
     },
     {
+      "epoch": 0.24,
+      "grad_norm": 2.730241537094116,
+      "learning_rate": 0.00018807495741056217,
+      "loss": 0.8969,
       "step": 140
     },
     {
+      "epoch": 0.26,
+      "grad_norm": 2.9691402912139893,
+      "learning_rate": 0.00018722316865417378,
+      "loss": 0.953,
       "step": 150
     },
     {
+      "epoch": 0.27,
+      "grad_norm": 2.2519712448120117,
+      "learning_rate": 0.00018637137989778536,
+      "loss": 0.9269,
       "step": 160
     },
     {
+      "epoch": 0.29,
+      "grad_norm": 1.8000602722167969,
+      "learning_rate": 0.00018551959114139694,
+      "loss": 1.1314,
       "step": 170
     },
     {
+      "epoch": 0.31,
+      "grad_norm": 1.5348334312438965,
+      "learning_rate": 0.00018466780238500855,
+      "loss": 0.9615,
       "step": 180
     },
     {
+      "epoch": 0.32,
+      "grad_norm": 1.599938988685608,
+      "learning_rate": 0.0001838160136286201,
+      "loss": 0.8033,
       "step": 190
     },
     {
+      "epoch": 0.34,
+      "grad_norm": 1.50412917137146,
+      "learning_rate": 0.0001829642248722317,
+      "loss": 0.9594,
       "step": 200
     },
     {
+      "epoch": 0.34,
+      "eval_accuracy": 0.6073253833049403,
+      "eval_loss": 0.9467767477035522,
+      "eval_runtime": 38.8829,
+      "eval_samples_per_second": 30.193,
+      "eval_steps_per_second": 3.781,
       "step": 200
     },
     {
+      "epoch": 0.36,
+      "grad_norm": 2.1896722316741943,
+      "learning_rate": 0.0001821124361158433,
+      "loss": 0.9217,
       "step": 210
     },
     {
+      "epoch": 0.37,
+      "grad_norm": 1.9687891006469727,
+      "learning_rate": 0.00018126064735945487,
+      "loss": 1.0296,
       "step": 220
     },
     {
+      "epoch": 0.39,
+      "grad_norm": 1.9628914594650269,
+      "learning_rate": 0.00018040885860306645,
+      "loss": 0.8122,
       "step": 230
     },
     {
+      "epoch": 0.41,
+      "grad_norm": 2.598545789718628,
+      "learning_rate": 0.00017955706984667803,
+      "loss": 0.8393,
       "step": 240
     },
     {
+      "epoch": 0.43,
+      "grad_norm": 2.2483532428741455,
+      "learning_rate": 0.0001787052810902896,
+      "loss": 0.9047,
       "step": 250
     },
     {
+      "epoch": 0.44,
+      "grad_norm": 2.1274337768554688,
+      "learning_rate": 0.0001778534923339012,
+      "loss": 0.91,
       "step": 260
     },
     {
+      "epoch": 0.46,
+      "grad_norm": 2.436018466949463,
+      "learning_rate": 0.00017700170357751277,
+      "loss": 1.0615,
       "step": 270
     },
     {
+      "epoch": 0.48,
+      "grad_norm": 2.069586992263794,
+      "learning_rate": 0.00017614991482112438,
+      "loss": 1.0799,
       "step": 280
     },
     {
+      "epoch": 0.49,
+      "grad_norm": 1.7266385555267334,
+      "learning_rate": 0.00017529812606473594,
+      "loss": 0.9465,
       "step": 290
     },
     {
+      "epoch": 0.51,
+      "grad_norm": 2.0491390228271484,
+      "learning_rate": 0.00017444633730834754,
+      "loss": 1.1785,
       "step": 300
     },
     {
+      "epoch": 0.51,
+      "eval_accuracy": 0.5868824531516184,
+      "eval_loss": 0.997596025466919,
+      "eval_runtime": 39.3421,
+      "eval_samples_per_second": 29.841,
+      "eval_steps_per_second": 3.736,
       "step": 300
     },
     {
+      "epoch": 0.53,
+      "grad_norm": 1.4697805643081665,
+      "learning_rate": 0.00017359454855195912,
+      "loss": 1.094,
       "step": 310
     },
     {
+      "epoch": 0.55,
+      "grad_norm": 2.369339942932129,
+      "learning_rate": 0.0001727427597955707,
+      "loss": 0.9398,
       "step": 320
     },
     {
+      "epoch": 0.56,
+      "grad_norm": 2.325148344039917,
+      "learning_rate": 0.00017189097103918229,
+      "loss": 0.9718,
       "step": 330
     },
     {
+      "epoch": 0.58,
+      "grad_norm": 1.9404678344726562,
+      "learning_rate": 0.00017103918228279387,
+      "loss": 0.9091,
       "step": 340
     },
     {
+      "epoch": 0.6,
+      "grad_norm": 2.4493370056152344,
+      "learning_rate": 0.00017018739352640547,
+      "loss": 0.9295,
       "step": 350
     },
     {
+      "epoch": 0.61,
+      "grad_norm": 1.6286579370498657,
+      "learning_rate": 0.00016933560477001706,
+      "loss": 1.1049,
       "step": 360
     },
     {
+      "epoch": 0.63,
+      "grad_norm": 3.559056043624878,
+      "learning_rate": 0.00016848381601362864,
+      "loss": 0.9566,
       "step": 370
     },
     {
+      "epoch": 0.65,
+      "grad_norm": 1.4250924587249756,
+      "learning_rate": 0.00016763202725724022,
+      "loss": 0.7772,
       "step": 380
     },
     {
+      "epoch": 0.66,
+      "grad_norm": 1.5668089389801025,
+      "learning_rate": 0.0001667802385008518,
+      "loss": 0.8869,
       "step": 390
     },
     {
+      "epoch": 0.68,
+      "grad_norm": 2.725231885910034,
+      "learning_rate": 0.00016592844974446338,
+      "loss": 0.865,
       "step": 400
     },
     {
+      "epoch": 0.68,
+      "eval_accuracy": 0.6388415672913118,
+      "eval_loss": 0.9287859201431274,
+      "eval_runtime": 38.5489,
+      "eval_samples_per_second": 30.455,
+      "eval_steps_per_second": 3.813,
       "step": 400
     },
     {
+      "epoch": 0.7,
+      "grad_norm": 2.6907713413238525,
+      "learning_rate": 0.00016507666098807496,
+      "loss": 0.899,
       "step": 410
     },
     {
+      "epoch": 0.72,
+      "grad_norm": 2.402860164642334,
+      "learning_rate": 0.00016422487223168654,
+      "loss": 0.9506,
       "step": 420
     },
     {
+      "epoch": 0.73,
+      "grad_norm": 2.749433994293213,
+      "learning_rate": 0.00016337308347529815,
+      "loss": 0.8529,
       "step": 430
     },
     {
+      "epoch": 0.75,
+      "grad_norm": 1.92979097366333,
+      "learning_rate": 0.0001625212947189097,
+      "loss": 0.8695,
       "step": 440
     },
     {
+      "epoch": 0.77,
+      "grad_norm": 2.793747901916504,
+      "learning_rate": 0.0001616695059625213,
+      "loss": 0.8614,
       "step": 450
     },
     {
+      "epoch": 0.78,
+      "grad_norm": 2.483780860900879,
+      "learning_rate": 0.0001608177172061329,
+      "loss": 0.9176,
       "step": 460
     },
     {
+      "epoch": 0.8,
+      "grad_norm": 1.7278929948806763,
+      "learning_rate": 0.00015996592844974447,
+      "loss": 0.9656,
       "step": 470
     },
     {
+      "epoch": 0.82,
+      "grad_norm": 2.649017810821533,
+      "learning_rate": 0.00015911413969335605,
+      "loss": 0.8653,
       "step": 480
     },
     {
+      "epoch": 0.83,
+      "grad_norm": 1.8457053899765015,
+      "learning_rate": 0.00015826235093696763,
+      "loss": 0.7707,
       "step": 490
     },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.824699640274048,
+      "learning_rate": 0.00015741056218057921,
+      "loss": 0.8494,
+      "step": 500
+    },
+    {
+      "epoch": 0.85,
+      "eval_accuracy": 0.651618398637138,
+      "eval_loss": 0.8572959303855896,
+      "eval_runtime": 38.0883,
+      "eval_samples_per_second": 30.823,
+      "eval_steps_per_second": 3.859,
+      "step": 500
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.9104124307632446,
+      "learning_rate": 0.0001565587734241908,
+      "loss": 0.8113,
+      "step": 510
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 2.2717394828796387,
+      "learning_rate": 0.0001557069846678024,
+      "loss": 0.9194,
+      "step": 520
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.891735553741455,
+      "learning_rate": 0.00015485519591141398,
+      "loss": 0.9337,
+      "step": 530
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 4.8229146003723145,
+      "learning_rate": 0.00015400340715502557,
+      "loss": 0.9033,
+      "step": 540
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 2.656970977783203,
+      "learning_rate": 0.00015315161839863715,
+      "loss": 0.8985,
+      "step": 550
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 2.2908411026000977,
+      "learning_rate": 0.00015229982964224873,
+      "loss": 0.8708,
+      "step": 560
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 2.141950845718384,
+      "learning_rate": 0.0001514480408858603,
+      "loss": 0.9298,
+      "step": 570
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 2.5572831630706787,
+      "learning_rate": 0.00015059625212947192,
+      "loss": 0.9101,
+      "step": 580
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 2.2453222274780273,
+      "learning_rate": 0.00014982964224872234,
+      "loss": 0.8034,
+      "step": 590
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 2.2874865531921387,
+      "learning_rate": 0.00014897785349233392,
+      "loss": 0.8151,
+      "step": 600
+    },
+    {
+      "epoch": 1.02,
+      "eval_accuracy": 0.6396933560477002,
+      "eval_loss": 0.87294602394104,
+      "eval_runtime": 38.7251,
+      "eval_samples_per_second": 30.316,
+      "eval_steps_per_second": 3.796,
+      "step": 600
+    },
+    {
+      "epoch": 1.04,
+      "grad_norm": 2.950303554534912,
+      "learning_rate": 0.0001481260647359455,
+      "loss": 0.7484,
+      "step": 610
+    },
+    {
+      "epoch": 1.06,
+      "grad_norm": 1.9773017168045044,
+      "learning_rate": 0.00014727427597955708,
+      "loss": 0.6572,
+      "step": 620
+    },
+    {
+      "epoch": 1.07,
+      "grad_norm": 2.9777700901031494,
+      "learning_rate": 0.00014642248722316866,
+      "loss": 0.6927,
+      "step": 630
+    },
+    {
+      "epoch": 1.09,
+      "grad_norm": 3.323662519454956,
+      "learning_rate": 0.00014557069846678024,
+      "loss": 0.5812,
+      "step": 640
+    },
+    {
+      "epoch": 1.11,
+      "grad_norm": 1.9647018909454346,
+      "learning_rate": 0.00014471890971039185,
+      "loss": 0.6166,
+      "step": 650
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 3.215794563293457,
+      "learning_rate": 0.0001438671209540034,
+      "loss": 0.602,
+      "step": 660
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 2.8758130073547363,
+      "learning_rate": 0.000143015332197615,
+      "loss": 0.5224,
+      "step": 670
+    },
+    {
+      "epoch": 1.16,
+      "grad_norm": 2.142829179763794,
+      "learning_rate": 0.00014216354344122656,
+      "loss": 0.5663,
+      "step": 680
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 6.860159397125244,
+      "learning_rate": 0.00014131175468483817,
+      "loss": 0.6479,
+      "step": 690
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 3.3176701068878174,
+      "learning_rate": 0.00014045996592844975,
+      "loss": 0.5787,
+      "step": 700
+    },
+    {
+      "epoch": 1.19,
+      "eval_accuracy": 0.6448040885860307,
+      "eval_loss": 0.9067147970199585,
+      "eval_runtime": 38.2427,
+      "eval_samples_per_second": 30.699,
+      "eval_steps_per_second": 3.844,
+      "step": 700
+    },
+    {
+      "epoch": 1.21,
+      "grad_norm": 2.322371482849121,
+      "learning_rate": 0.00013960817717206133,
+      "loss": 0.6849,
+      "step": 710
+    },
+    {
+      "epoch": 1.23,
+      "grad_norm": 1.875775933265686,
+      "learning_rate": 0.00013875638841567291,
+      "loss": 0.6399,
+      "step": 720
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 2.0012145042419434,
+      "learning_rate": 0.0001379045996592845,
+      "loss": 0.725,
+      "step": 730
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 2.5320353507995605,
+      "learning_rate": 0.00013705281090289608,
+      "loss": 0.5306,
+      "step": 740
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 2.29856538772583,
+      "learning_rate": 0.00013620102214650768,
+      "loss": 0.5731,
+      "step": 750
+    },
+    {
+      "epoch": 1.29,
+      "grad_norm": 1.8604925870895386,
+      "learning_rate": 0.00013534923339011926,
+      "loss": 0.6806,
+      "step": 760
+    },
+    {
+      "epoch": 1.31,
+      "grad_norm": 2.6868739128112793,
+      "learning_rate": 0.00013449744463373084,
+      "loss": 0.5944,
+      "step": 770
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 3.3680803775787354,
+      "learning_rate": 0.00013364565587734243,
+      "loss": 0.6412,
+      "step": 780
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 2.798149824142456,
+      "learning_rate": 0.000132793867120954,
+      "loss": 0.5235,
+      "step": 790
+    },
+    {
+      "epoch": 1.36,
+      "grad_norm": 2.4862072467803955,
+      "learning_rate": 0.00013194207836456561,
+      "loss": 0.7768,
+      "step": 800
+    },
+    {
+      "epoch": 1.36,
+      "eval_accuracy": 0.6533219761499148,
+      "eval_loss": 0.8995758295059204,
+      "eval_runtime": 38.4107,
+      "eval_samples_per_second": 30.564,
+      "eval_steps_per_second": 3.827,
+      "step": 800
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 3.291276216506958,
+      "learning_rate": 0.00013109028960817717,
+      "loss": 0.669,
+      "step": 810
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 2.814397096633911,
+      "learning_rate": 0.00013023850085178878,
+      "loss": 0.5539,
+      "step": 820
+    },
+    {
+      "epoch": 1.41,
+      "grad_norm": 2.5982093811035156,
+      "learning_rate": 0.00012938671209540033,
+      "loss": 0.6565,
+      "step": 830
+    },
+    {
+      "epoch": 1.43,
+      "grad_norm": 3.1191565990448,
+      "learning_rate": 0.00012853492333901194,
+      "loss": 0.533,
+      "step": 840
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 5.229197025299072,
+      "learning_rate": 0.00012768313458262352,
+      "loss": 0.6123,
+      "step": 850
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 2.259110689163208,
+      "learning_rate": 0.0001268313458262351,
+      "loss": 0.5183,
+      "step": 860
+    },
+    {
+      "epoch": 1.48,
+      "grad_norm": 3.099496364593506,
+      "learning_rate": 0.00012597955706984668,
+      "loss": 0.6911,
+      "step": 870
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.9909987449645996,
+      "learning_rate": 0.00012512776831345826,
+      "loss": 0.6671,
+      "step": 880
+    },
+    {
+      "epoch": 1.52,
+      "grad_norm": 3.1856462955474854,
+      "learning_rate": 0.00012427597955706984,
+      "loss": 0.6652,
+      "step": 890
+    },
+    {
+      "epoch": 1.53,
+      "grad_norm": 3.9080755710601807,
+      "learning_rate": 0.00012342419080068145,
+      "loss": 0.6098,
+      "step": 900
+    },
+    {
+      "epoch": 1.53,
+      "eval_accuracy": 0.6695059625212947,
+      "eval_loss": 0.8459659218788147,
+      "eval_runtime": 37.8733,
+      "eval_samples_per_second": 30.998,
+      "eval_steps_per_second": 3.881,
+      "step": 900
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 1.7587580680847168,
+      "learning_rate": 0.000122572402044293,
+      "loss": 0.7362,
+      "step": 910
+    },
+    {
+      "epoch": 1.57,
+      "grad_norm": 2.7327494621276855,
+      "learning_rate": 0.00012172061328790461,
+      "loss": 0.5863,
+      "step": 920
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 4.113401889801025,
+      "learning_rate": 0.0001208688245315162,
+      "loss": 0.8205,
+      "step": 930
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 4.598094940185547,
+      "learning_rate": 0.00012001703577512777,
+      "loss": 0.7198,
+      "step": 940
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 2.8792037963867188,
+      "learning_rate": 0.00011916524701873937,
+      "loss": 0.6532,
+      "step": 950
+    },
+    {
+      "epoch": 1.64,
+      "grad_norm": 2.949414014816284,
+      "learning_rate": 0.00011831345826235094,
+      "loss": 0.6783,
+      "step": 960
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 2.300352096557617,
+      "learning_rate": 0.00011746166950596253,
+      "loss": 0.69,
+      "step": 970
+    },
+    {
+      "epoch": 1.67,
+      "grad_norm": 2.5100274085998535,
+      "learning_rate": 0.00011660988074957411,
+      "loss": 0.7028,
+      "step": 980
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 2.372359275817871,
+      "learning_rate": 0.0001157580919931857,
+      "loss": 0.5673,
+      "step": 990
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 4.268792152404785,
+      "learning_rate": 0.00011490630323679727,
+      "loss": 0.6251,
+      "step": 1000
+    },
+    {
+      "epoch": 1.7,
+      "eval_accuracy": 0.6703577512776832,
+      "eval_loss": 0.8609783053398132,
+      "eval_runtime": 37.811,
+      "eval_samples_per_second": 31.049,
+      "eval_steps_per_second": 3.888,
+      "step": 1000
+    },
+    {
+      "epoch": 1.72,
+      "grad_norm": 3.081153154373169,
+      "learning_rate": 0.00011405451448040887,
+      "loss": 0.7021,
+      "step": 1010
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 2.9631364345550537,
+      "learning_rate": 0.0001132879045996593,
+      "loss": 0.5469,
+      "step": 1020
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 3.2896649837493896,
+      "learning_rate": 0.00011243611584327087,
+      "loss": 0.5593,
+      "step": 1030
+    },
+    {
+      "epoch": 1.77,
+      "grad_norm": 3.8375134468078613,
+      "learning_rate": 0.00011158432708688246,
+      "loss": 0.5499,
+      "step": 1040
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 1.5597748756408691,
+      "learning_rate": 0.00011073253833049404,
+      "loss": 0.5529,
+      "step": 1050
+    },
+    {
+      "epoch": 1.81,
+      "grad_norm": 4.54299783706665,
+      "learning_rate": 0.00010988074957410564,
+      "loss": 0.6211,
+      "step": 1060
+    },
+    {
+      "epoch": 1.82,
+      "grad_norm": 3.2734501361846924,
+      "learning_rate": 0.0001090289608177172,
+      "loss": 0.7002,
+      "step": 1070
+    },
+    {
+      "epoch": 1.84,
+      "grad_norm": 3.7582859992980957,
+      "learning_rate": 0.0001081771720613288,
+      "loss": 0.7465,
+      "step": 1080
+    },
+    {
+      "epoch": 1.86,
+      "grad_norm": 2.190544605255127,
+      "learning_rate": 0.00010732538330494038,
+      "loss": 0.6662,
+      "step": 1090
+    },
+    {
+      "epoch": 1.87,
+      "grad_norm": 1.7477951049804688,
+      "learning_rate": 0.00010647359454855197,
+      "loss": 0.7863,
+      "step": 1100
+    },
+    {
+      "epoch": 1.87,
+      "eval_accuracy": 0.6431005110732538,
+      "eval_loss": 0.8668282628059387,
+      "eval_runtime": 37.5178,
+      "eval_samples_per_second": 31.292,
+      "eval_steps_per_second": 3.918,
+      "step": 1100
+    },
+    {
+      "epoch": 1.89,
+      "grad_norm": 1.9970145225524902,
+      "learning_rate": 0.00010562180579216354,
+      "loss": 0.5988,
+      "step": 1110
+    },
+    {
+      "epoch": 1.91,
+      "grad_norm": 3.718055248260498,
+      "learning_rate": 0.00010477001703577514,
+      "loss": 0.5973,
+      "step": 1120
+    },
+    {
+      "epoch": 1.93,
+      "grad_norm": 1.6347967386245728,
+      "learning_rate": 0.0001039182282793867,
+      "loss": 0.5818,
+      "step": 1130
+    },
+    {
+      "epoch": 1.94,
+      "grad_norm": 2.3118577003479004,
+      "learning_rate": 0.0001030664395229983,
+      "loss": 0.5136,
+      "step": 1140
+    },
+    {
+      "epoch": 1.96,
+      "grad_norm": 2.806833267211914,
+      "learning_rate": 0.00010221465076660988,
+      "loss": 0.5353,
+      "step": 1150
+    },
+    {
+      "epoch": 1.98,
+      "grad_norm": 2.699890375137329,
+      "learning_rate": 0.00010136286201022147,
+      "loss": 0.5498,
+      "step": 1160
+    },
+    {
+      "epoch": 1.99,
+      "grad_norm": 2.3461856842041016,
+      "learning_rate": 0.00010051107325383304,
+      "loss": 0.7181,
+      "step": 1170
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 2.483959436416626,
+      "learning_rate": 9.965928449744463e-05,
+      "loss": 0.3872,
+      "step": 1180
+    },
+    {
+      "epoch": 2.03,
+      "grad_norm": 2.1393377780914307,
+      "learning_rate": 9.880749574105622e-05,
+      "loss": 0.292,
+      "step": 1190
+    },
+    {
+      "epoch": 2.04,
+      "grad_norm": 1.6828927993774414,
+      "learning_rate": 9.795570698466781e-05,
+      "loss": 0.2595,
+      "step": 1200
+    },
+    {
+      "epoch": 2.04,
+      "eval_accuracy": 0.6839863713798978,
+      "eval_loss": 0.8725138902664185,
+      "eval_runtime": 37.2408,
+      "eval_samples_per_second": 31.525,
+      "eval_steps_per_second": 3.947,
+      "step": 1200
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.973240852355957,
+      "learning_rate": 9.710391822827939e-05,
+      "loss": 0.2644,
+      "step": 1210
+    },
+    {
+      "epoch": 2.08,
+      "grad_norm": 2.932751417160034,
+      "learning_rate": 9.625212947189097e-05,
+      "loss": 0.2925,
+      "step": 1220
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 3.356760025024414,
+      "learning_rate": 9.540034071550255e-05,
+      "loss": 0.312,
+      "step": 1230
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 1.7125446796417236,
+      "learning_rate": 9.454855195911415e-05,
+      "loss": 0.2277,
+      "step": 1240
+    },
+    {
+      "epoch": 2.13,
+      "grad_norm": 1.714805006980896,
+      "learning_rate": 9.369676320272573e-05,
+      "loss": 0.3301,
+      "step": 1250
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 2.301734685897827,
+      "learning_rate": 9.284497444633732e-05,
+      "loss": 0.2668,
+      "step": 1260
+    },
+    {
+      "epoch": 2.16,
+      "grad_norm": 2.4843878746032715,
+      "learning_rate": 9.19931856899489e-05,
+      "loss": 0.2333,
+      "step": 1270
+    },
+    {
+      "epoch": 2.18,
+      "grad_norm": 2.9054977893829346,
+      "learning_rate": 9.114139693356048e-05,
+      "loss": 0.3492,
+      "step": 1280
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 4.664933681488037,
+      "learning_rate": 9.028960817717206e-05,
+      "loss": 0.3754,
+      "step": 1290
+    },
+    {
+      "epoch": 2.21,
+      "grad_norm": 2.1164679527282715,
+      "learning_rate": 8.943781942078366e-05,
+      "loss": 0.2735,
+      "step": 1300
+    },
+    {
+      "epoch": 2.21,
+      "eval_accuracy": 0.6746166950596252,
+      "eval_loss": 0.9306557178497314,
+      "eval_runtime": 37.0944,
+      "eval_samples_per_second": 31.649,
+      "eval_steps_per_second": 3.963,
+      "step": 1300
+    },
+    {
+      "epoch": 2.23,
+      "grad_norm": 4.541740894317627,
+      "learning_rate": 8.858603066439524e-05,
+      "loss": 0.3835,
+      "step": 1310
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 3.0828359127044678,
+      "learning_rate": 8.773424190800682e-05,
+      "loss": 0.3189,
+      "step": 1320
+    },
+    {
+      "epoch": 2.27,
+      "grad_norm": 2.398512363433838,
+      "learning_rate": 8.68824531516184e-05,
+      "loss": 0.29,
+      "step": 1330
+    },
+    {
+      "epoch": 2.28,
+      "grad_norm": 3.069840908050537,
+      "learning_rate": 8.603066439522998e-05,
+      "loss": 0.288,
+      "step": 1340
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 5.078506946563721,
+      "learning_rate": 8.517887563884158e-05,
+      "loss": 0.2772,
+      "step": 1350
+    },
+    {
+      "epoch": 2.32,
+      "grad_norm": 2.812199354171753,
+      "learning_rate": 8.432708688245316e-05,
+      "loss": 0.2951,
+      "step": 1360
+    },
+    {
+      "epoch": 2.33,
+      "grad_norm": 4.542017936706543,
+      "learning_rate": 8.347529812606474e-05,
+      "loss": 0.2142,
+      "step": 1370
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 3.7486696243286133,
+      "learning_rate": 8.262350936967632e-05,
+      "loss": 0.257,
+      "step": 1380
+    },
+    {
+      "epoch": 2.37,
+      "grad_norm": 3.5566983222961426,
+      "learning_rate": 8.17717206132879e-05,
+      "loss": 0.2816,
+      "step": 1390
+    },
+    {
+      "epoch": 2.39,
+      "grad_norm": 1.3465384244918823,
+      "learning_rate": 8.09199318568995e-05,
+      "loss": 0.2429,
+      "step": 1400
+    },
+    {
+      "epoch": 2.39,
+      "eval_accuracy": 0.6354344122657581,
+      "eval_loss": 1.0957823991775513,
+      "eval_runtime": 37.2033,
+      "eval_samples_per_second": 31.556,
+      "eval_steps_per_second": 3.951,
+      "step": 1400
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 2.310131788253784,
+      "learning_rate": 8.006814310051108e-05,
+      "loss": 0.306,
+      "step": 1410
+    },
+    {
+      "epoch": 2.42,
+      "grad_norm": 3.1297261714935303,
+      "learning_rate": 7.921635434412266e-05,
+      "loss": 0.3257,
+      "step": 1420
+    },
+    {
+      "epoch": 2.44,
+      "grad_norm": 1.8082480430603027,
+      "learning_rate": 7.836456558773425e-05,
+      "loss": 0.2001,
+      "step": 1430
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 1.7700148820877075,
+      "learning_rate": 7.751277683134583e-05,
+      "loss": 0.3476,
+      "step": 1440
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 4.247625350952148,
+      "learning_rate": 7.666098807495741e-05,
+      "loss": 0.2323,
+      "step": 1450
+    },
+    {
+      "epoch": 2.49,
+      "grad_norm": 4.059571743011475,
+      "learning_rate": 7.5809199318569e-05,
+      "loss": 0.3089,
+      "step": 1460
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 3.2417612075805664,
+      "learning_rate": 7.495741056218059e-05,
+      "loss": 0.1964,
+      "step": 1470
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 5.7817463874816895,
+      "learning_rate": 7.410562180579217e-05,
+      "loss": 0.3549,
+      "step": 1480
+    },
+    {
+      "epoch": 2.54,
+      "grad_norm": 5.440825939178467,
+      "learning_rate": 7.325383304940375e-05,
+      "loss": 0.3085,
+      "step": 1490
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 4.482067108154297,
+      "learning_rate": 7.240204429301533e-05,
+      "loss": 0.3224,
+      "step": 1500
+    },
+    {
+      "epoch": 2.56,
+      "eval_accuracy": 0.6686541737649063,
+      "eval_loss": 1.0305246114730835,
+      "eval_runtime": 37.1181,
+      "eval_samples_per_second": 31.629,
+      "eval_steps_per_second": 3.96,
+      "step": 1500
+    },
+    {
+      "epoch": 2.57,
+      "grad_norm": 2.1568057537078857,
+      "learning_rate": 7.155025553662692e-05,
+      "loss": 0.1612,
+      "step": 1510
+    },
+    {
+      "epoch": 2.59,
+      "grad_norm": 1.293427586555481,
+      "learning_rate": 7.06984667802385e-05,
+      "loss": 0.3217,
+      "step": 1520
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 4.301244258880615,
+      "learning_rate": 6.984667802385009e-05,
+      "loss": 0.2378,
+      "step": 1530
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.6040468215942383,
+      "learning_rate": 6.899488926746167e-05,
+      "loss": 0.2801,
+      "step": 1540
+    },
+    {
+      "epoch": 2.64,
+      "grad_norm": 0.7993047833442688,
+      "learning_rate": 6.814310051107326e-05,
+      "loss": 0.2637,
+      "step": 1550
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 4.865533828735352,
+      "learning_rate": 6.729131175468484e-05,
+      "loss": 0.3441,
+      "step": 1560
+    },
+    {
+      "epoch": 2.67,
+      "grad_norm": 1.7501546144485474,
+      "learning_rate": 6.643952299829642e-05,
+      "loss": 0.2523,
+      "step": 1570
+    },
+    {
+      "epoch": 2.69,
+      "grad_norm": 1.331475019454956,
+      "learning_rate": 6.5587734241908e-05,
+      "loss": 0.2127,
+      "step": 1580
+    },
+    {
+      "epoch": 2.71,
+      "grad_norm": 3.352147102355957,
+      "learning_rate": 6.473594548551958e-05,
+      "loss": 0.3432,
+      "step": 1590
+    },
+    {
+      "epoch": 2.73,
+      "grad_norm": 0.3470512330532074,
+      "learning_rate": 6.388415672913118e-05,
+      "loss": 0.1602,
+      "step": 1600
+    },
+    {
+      "epoch": 2.73,
+      "eval_accuracy": 0.6746166950596252,
+      "eval_loss": 1.0072139501571655,
+      "eval_runtime": 37.0019,
+      "eval_samples_per_second": 31.728,
+      "eval_steps_per_second": 3.973,
+      "step": 1600
+    },
+    {
+      "epoch": 2.74,
+      "grad_norm": 3.1594250202178955,
+      "learning_rate": 6.303236797274277e-05,
+      "loss": 0.1929,
+      "step": 1610
+    },
+    {
+      "epoch": 2.76,
+      "grad_norm": 4.477923393249512,
+      "learning_rate": 6.218057921635435e-05,
+      "loss": 0.2696,
+      "step": 1620
+    },
+    {
+      "epoch": 2.78,
+      "grad_norm": 3.042938232421875,
+      "learning_rate": 6.132879045996594e-05,
+      "loss": 0.2527,
+      "step": 1630
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.8534514904022217,
+      "learning_rate": 6.0477001703577516e-05,
+      "loss": 0.1727,
+      "step": 1640
+    },
+    {
+      "epoch": 2.81,
+      "grad_norm": 2.2307116985321045,
+      "learning_rate": 5.9625212947189104e-05,
+      "loss": 0.3178,
+      "step": 1650
+    },
+    {
+      "epoch": 2.83,
+      "grad_norm": 3.302003860473633,
+      "learning_rate": 5.8773424190800684e-05,
+      "loss": 0.2973,
+      "step": 1660
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 5.320656776428223,
+      "learning_rate": 5.792163543441227e-05,
+      "loss": 0.2715,
+      "step": 1670
+    },
+    {
+      "epoch": 2.86,
+      "grad_norm": 3.923163414001465,
+      "learning_rate": 5.706984667802385e-05,
+      "loss": 0.1991,
+      "step": 1680
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 7.479254245758057,
+      "learning_rate": 5.6218057921635434e-05,
+      "loss": 0.321,
+      "step": 1690
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 2.2710225582122803,
+      "learning_rate": 5.536626916524702e-05,
+      "loss": 0.2042,
+      "step": 1700
+    },
+    {
+      "epoch": 2.9,
+      "eval_accuracy": 0.6788756388415673,
+      "eval_loss": 1.0971218347549438,
+      "eval_runtime": 36.9173,
+      "eval_samples_per_second": 31.801,
+      "eval_steps_per_second": 3.982,
+      "step": 1700
+    },
+    {
+      "epoch": 2.91,
+      "grad_norm": 2.7610058784484863,
+      "learning_rate": 5.45144804088586e-05,
+      "loss": 0.3396,
+      "step": 1710
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 2.2475104331970215,
+      "learning_rate": 5.366269165247019e-05,
+      "loss": 0.266,
+      "step": 1720
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 4.55673885345459,
+      "learning_rate": 5.281090289608177e-05,
+      "loss": 0.341,
+      "step": 1730
+    },
+    {
+      "epoch": 2.96,
+      "grad_norm": 4.0248260498046875,
+      "learning_rate": 5.195911413969335e-05,
+      "loss": 0.2005,
+      "step": 1740
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 4.798257827758789,
+      "learning_rate": 5.110732538330494e-05,
+      "loss": 0.2615,
+      "step": 1750
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 3.2967402935028076,
+      "learning_rate": 5.025553662691652e-05,
+      "loss": 0.1966,
+      "step": 1760
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 5.774517059326172,
+      "learning_rate": 4.940374787052811e-05,
+      "loss": 0.1141,
+      "step": 1770
+    },
+    {
+      "epoch": 3.03,
+      "grad_norm": 1.7739803791046143,
+      "learning_rate": 4.8551959114139695e-05,
+      "loss": 0.0671,
+      "step": 1780
+    },
+    {
+      "epoch": 3.05,
+      "grad_norm": 0.8837150931358337,
+      "learning_rate": 4.7700170357751276e-05,
+      "loss": 0.0835,
+      "step": 1790
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 1.7833037376403809,
+      "learning_rate": 4.6848381601362864e-05,
+      "loss": 0.0604,
+      "step": 1800
+    },
+    {
+      "epoch": 3.07,
+      "eval_accuracy": 0.6916524701873935,
+      "eval_loss": 1.0816737413406372,
+      "eval_runtime": 36.8222,
+      "eval_samples_per_second": 31.883,
+      "eval_steps_per_second": 3.992,
+      "step": 1800
+    },
+    {
+      "epoch": 3.08,
+      "grad_norm": 0.34585830569267273,
+      "learning_rate": 4.599659284497445e-05,
+      "loss": 0.092,
+      "step": 1810
+    },
+    {
+      "epoch": 3.1,
+      "grad_norm": 0.7962571382522583,
+      "learning_rate": 4.514480408858603e-05,
+      "loss": 0.0587,
+      "step": 1820
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.16402888298034668,
+      "learning_rate": 4.429301533219762e-05,
+      "loss": 0.0547,
+      "step": 1830
+    },
+    {
+      "epoch": 3.13,
+      "grad_norm": 0.624047040939331,
+      "learning_rate": 4.34412265758092e-05,
+      "loss": 0.0954,
+      "step": 1840
+    },
+    {
+      "epoch": 3.15,
+      "grad_norm": 0.4253842532634735,
+      "learning_rate": 4.258943781942079e-05,
+      "loss": 0.0567,
+      "step": 1850
+    },
+    {
+      "epoch": 3.17,
+      "grad_norm": 0.1523701399564743,
+      "learning_rate": 4.173764906303237e-05,
+      "loss": 0.0413,
+      "step": 1860
+    },
+    {
+      "epoch": 3.19,
+      "grad_norm": 4.592818260192871,
+      "learning_rate": 4.088586030664395e-05,
+      "loss": 0.0968,
+      "step": 1870
+    },
+    {
+      "epoch": 3.2,
+      "grad_norm": 1.4066344499588013,
+      "learning_rate": 4.003407155025554e-05,
+      "loss": 0.1454,
+      "step": 1880
+    },
+    {
+      "epoch": 3.22,
+      "grad_norm": 2.1996095180511475,
+      "learning_rate": 3.9182282793867125e-05,
+      "loss": 0.1128,
+      "step": 1890
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 0.102027028799057,
+      "learning_rate": 3.8330494037478706e-05,
+      "loss": 0.0716,
+      "step": 1900
+    },
+    {
+      "epoch": 3.24,
+      "eval_accuracy": 0.692504258943782,
+      "eval_loss": 1.1307132244110107,
+      "eval_runtime": 37.0403,
+      "eval_samples_per_second": 31.695,
+      "eval_steps_per_second": 3.969,
+      "step": 1900
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 1.6857343912124634,
+      "learning_rate": 3.7478705281090294e-05,
+      "loss": 0.04,
+      "step": 1910
+    },
+    {
+      "epoch": 3.27,
+      "grad_norm": 1.2973403930664062,
+      "learning_rate": 3.6626916524701875e-05,
+      "loss": 0.0403,
+      "step": 1920
+    },
+    {
+      "epoch": 3.29,
+      "grad_norm": 0.41860514879226685,
+      "learning_rate": 3.577512776831346e-05,
+      "loss": 0.0642,
+      "step": 1930
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 0.5436795353889465,
+      "learning_rate": 3.492333901192504e-05,
+      "loss": 0.0836,
+      "step": 1940
+    },
+    {
+      "epoch": 3.32,
+      "grad_norm": 0.21996204555034637,
+      "learning_rate": 3.407155025553663e-05,
+      "loss": 0.0406,
+      "step": 1950
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 0.14845231175422668,
+      "learning_rate": 3.321976149914821e-05,
+      "loss": 0.0385,
+      "step": 1960
+    },
+    {
+      "epoch": 3.36,
+      "grad_norm": 3.531405448913574,
+      "learning_rate": 3.236797274275979e-05,
+      "loss": 0.0824,
+      "step": 1970
+    },
+    {
+      "epoch": 3.37,
+      "grad_norm": 0.07682117819786072,
+      "learning_rate": 3.151618398637139e-05,
+      "loss": 0.0717,
+      "step": 1980
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 0.07611515372991562,
+      "learning_rate": 3.066439522998297e-05,
+      "loss": 0.0572,
+      "step": 1990
+    },
+    {
+      "epoch": 3.41,
+      "grad_norm": 0.6266534328460693,
+      "learning_rate": 2.9812606473594552e-05,
+      "loss": 0.0822,
+      "step": 2000
+    },
+    {
+      "epoch": 3.41,
+      "eval_accuracy": 0.692504258943782,
+      "eval_loss": 1.1826940774917603,
+      "eval_runtime": 37.1369,
+      "eval_samples_per_second": 31.613,
+      "eval_steps_per_second": 3.958,
+      "step": 2000
+    },
+    {
+      "epoch": 3.42,
+      "grad_norm": 0.1280030608177185,
+      "learning_rate": 2.8960817717206136e-05,
+      "loss": 0.0244,
+      "step": 2010
+    },
+    {
+      "epoch": 3.44,
+      "grad_norm": 0.07406999170780182,
+      "learning_rate": 2.8109028960817717e-05,
+      "loss": 0.0574,
+      "step": 2020
+    },
+    {
+      "epoch": 3.46,
+      "grad_norm": 5.587332248687744,
+      "learning_rate": 2.72572402044293e-05,
+      "loss": 0.0352,
+      "step": 2030
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 2.2010979652404785,
+      "learning_rate": 2.6405451448040885e-05,
+      "loss": 0.0789,
+      "step": 2040
+    },
+    {
+      "epoch": 3.49,
+      "grad_norm": 2.9271368980407715,
+      "learning_rate": 2.555366269165247e-05,
+      "loss": 0.082,
+      "step": 2050
+    },
+    {
+      "epoch": 3.51,
+      "grad_norm": 0.05890679359436035,
+      "learning_rate": 2.4701873935264054e-05,
+      "loss": 0.0769,
+      "step": 2060
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 0.7043523192405701,
+      "learning_rate": 2.3850085178875638e-05,
+      "loss": 0.0819,
+      "step": 2070
+    },
+    {
+      "epoch": 3.54,
+      "grad_norm": 0.12047506123781204,
+      "learning_rate": 2.2998296422487226e-05,
+      "loss": 0.0195,
+      "step": 2080
+    },
+    {
+      "epoch": 3.56,
+      "grad_norm": 0.1116802990436554,
+      "learning_rate": 2.214650766609881e-05,
+      "loss": 0.0159,
+      "step": 2090
+    },
+    {
+      "epoch": 3.58,
+      "grad_norm": 0.09187493473291397,
+      "learning_rate": 2.1294718909710394e-05,
+      "loss": 0.0889,
+      "step": 2100
+    },
+    {
+      "epoch": 3.58,
+      "eval_accuracy": 0.6933560477001703,
+      "eval_loss": 1.2423571348190308,
+      "eval_runtime": 37.3059,
+      "eval_samples_per_second": 31.47,
+      "eval_steps_per_second": 3.94,
+      "step": 2100
+    },
+    {
+      "epoch": 3.59,
+      "grad_norm": 4.332376956939697,
+      "learning_rate": 2.0442930153321975e-05,
+      "loss": 0.0939,
+      "step": 2110
+    },
+    {
+      "epoch": 3.61,
+      "grad_norm": 0.13916102051734924,
+      "learning_rate": 1.9591141396933563e-05,
+      "loss": 0.0933,
+      "step": 2120
+    },
+    {
+      "epoch": 3.63,
+      "grad_norm": 7.690703392028809,
+      "learning_rate": 1.8739352640545147e-05,
+      "loss": 0.0496,
+      "step": 2130
+    },
+    {
+      "epoch": 3.65,
+      "grad_norm": 2.5700595378875732,
+      "learning_rate": 1.788756388415673e-05,
+      "loss": 0.0782,
+      "step": 2140
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 0.20934216678142548,
+      "learning_rate": 1.7035775127768315e-05,
+      "loss": 0.0606,
+      "step": 2150
+    },
+    {
+      "epoch": 3.68,
+      "grad_norm": 1.2959486246109009,
+      "learning_rate": 1.6183986371379896e-05,
+      "loss": 0.0601,
+      "step": 2160
+    },
+    {
+      "epoch": 3.7,
+      "grad_norm": 0.2652721405029297,
+      "learning_rate": 1.5332197614991484e-05,
+      "loss": 0.062,
+      "step": 2170
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 0.48360127210617065,
+      "learning_rate": 1.4480408858603068e-05,
+      "loss": 0.054,
+      "step": 2180
+    },
+    {
+      "epoch": 3.73,
+      "grad_norm": 3.1118693351745605,
+      "learning_rate": 1.362862010221465e-05,
+      "loss": 0.0989,
+      "step": 2190
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.9077383279800415,
+      "learning_rate": 1.2776831345826235e-05,
+      "loss": 0.0855,
+      "step": 2200
+    },
+    {
+      "epoch": 3.75,
+      "eval_accuracy": 0.6899488926746167,
+      "eval_loss": 1.2667156457901,
+      "eval_runtime": 36.8511,
+      "eval_samples_per_second": 31.858,
+      "eval_steps_per_second": 3.989,
+      "step": 2200
+    },
+    {
+      "epoch": 3.76,
+      "grad_norm": 0.13304296135902405,
+      "learning_rate": 1.1925042589437819e-05,
+      "loss": 0.0675,
+      "step": 2210
+    },
+    {
+      "epoch": 3.78,
+      "grad_norm": 1.3241567611694336,
+      "learning_rate": 1.1073253833049405e-05,
+      "loss": 0.0753,
+      "step": 2220
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.2818525731563568,
+      "learning_rate": 1.0221465076660987e-05,
+      "loss": 0.0998,
+      "step": 2230
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 7.136697292327881,
+      "learning_rate": 9.369676320272573e-06,
+      "loss": 0.0314,
+      "step": 2240
+    },
+    {
+      "epoch": 3.83,
+      "grad_norm": 1.372044324874878,
+      "learning_rate": 8.517887563884158e-06,
+      "loss": 0.0768,
+      "step": 2250
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 6.264348983764648,
+      "learning_rate": 7.666098807495742e-06,
+      "loss": 0.1516,
+      "step": 2260
+    },
+    {
+      "epoch": 3.87,
+      "grad_norm": 0.1342085599899292,
+      "learning_rate": 6.814310051107325e-06,
+      "loss": 0.0812,
+      "step": 2270
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 0.7664629220962524,
+      "learning_rate": 5.9625212947189095e-06,
+      "loss": 0.0474,
+      "step": 2280
+    },
+    {
+      "epoch": 3.9,
+      "grad_norm": 4.264090538024902,
+      "learning_rate": 5.110732538330494e-06,
+      "loss": 0.0903,
+      "step": 2290
+    },
+    {
+      "epoch": 3.92,
+      "grad_norm": 0.07316776365041733,
+      "learning_rate": 4.258943781942079e-06,
+      "loss": 0.0682,
+      "step": 2300
+    },
+    {
+      "epoch": 3.92,
+      "eval_accuracy": 0.6950596252129472,
+      "eval_loss": 1.2470241785049438,
+      "eval_runtime": 37.0027,
+      "eval_samples_per_second": 31.727,
+      "eval_steps_per_second": 3.973,
+      "step": 2300
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 1.477973222732544,
+      "learning_rate": 3.4071550255536626e-06,
+      "loss": 0.0587,
+      "step": 2310
+    },
+    {
+      "epoch": 3.95,
+      "grad_norm": 1.249779224395752,
+      "learning_rate": 2.555366269165247e-06,
+      "loss": 0.0546,
+      "step": 2320
+    },
+    {
+      "epoch": 3.97,
+      "grad_norm": 1.9763495922088623,
+      "learning_rate": 1.7035775127768313e-06,
+      "loss": 0.0539,
+      "step": 2330
+    },
+    {
+      "epoch": 3.99,
+      "grad_norm": 0.11824575811624527,
+      "learning_rate": 8.517887563884157e-07,
+      "loss": 0.0322,
+      "step": 2340
+    },
     {
       "epoch": 4.0,
+      "step": 2348,
+      "total_flos": 2.910419581971751e+18,
+      "train_loss": 0.4888155373286145,
+      "train_runtime": 2894.9609,
+      "train_samples_per_second": 12.973,
+      "train_steps_per_second": 0.811
     }
   ],
   "logging_steps": 10,
+  "max_steps": 2348,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 4,
   "save_steps": 100,
+  "total_flos": 2.910419581971751e+18,
   "train_batch_size": 16,
   "trial_name": null,
   "trial_params": null