End of training

Browse files

Files changed (11) hide show

README.md +112 -0
all_results.json +13 -0
config.json +53 -0
eval_results.json +8 -0
model.safetensors +3 -0
preprocessor_config.json +22 -0
runs/Mar07_03-18-48_606d6922676f/events.out.tfevents.1709781530.606d6922676f.1016.0 +3 -0
runs/Mar07_03-18-48_606d6922676f/events.out.tfevents.1709785268.606d6922676f.1016.1 +3 -0
train_results.json +8 -0
trainer_state.json +3280 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,112 @@

+---
+license: apache-2.0
+base_model: microsoft/resnet-50
+tags:
+- generated_from_trainer
+metrics:
+- accuracy
+model-index:
+- name: resnet-Alzheimer
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# resnet-Alzheimer
+This model is a fine-tuned version of [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) on an unknown dataset.
+It achieves the following results on the evaluation set (reported for the best checkpoint, `checkpoint-3600` at epoch 45, not for the final epoch):
+- Loss: 0.0932
+- Accuracy: 0.9795
+## Model description
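+A ResNet-50 image classifier (`ResNetForImageClassification`) fine-tuned for single-label classification over four classes: `Mild_Demented`, `Moderate_Demented`, `Non_Demented`, and `Very_Mild_Demented`.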
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.002
+- train_batch_size: 16
+- eval_batch_size: 16
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 64
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 50
+### Training results
+| Training Loss | Epoch | Step | Validation Loss | Accuracy |
+|:-------------:|:-----:|:----:|:---------------:|:--------:|
+| 1.0127        | 1.0   | 80   | 0.9888          | 0.5088   |
+| 0.9345        | 2.0   | 160  | 0.9422          | 0.5303   |
+| 0.8889        | 3.0   | 240  | 0.8724          | 0.5781   |
+| 0.8843        | 4.0   | 320  | 0.8536          | 0.5889   |
+| 0.8397        | 5.0   | 400  | 0.8354          | 0.6152   |
+| 0.8624        | 6.0   | 480  | 0.9221          | 0.5381   |
+| 0.7543        | 7.0   | 560  | 0.7568          | 0.6475   |
+| 0.6993        | 8.0   | 640  | 0.8830          | 0.6133   |
+| 0.7045        | 9.0   | 720  | 0.7373          | 0.6582   |
+| 0.6557        | 10.0  | 800  | 0.6076          | 0.7451   |
+| 0.5876        | 11.0  | 880  | 0.7281          | 0.6992   |
+| 0.5732        | 12.0  | 960  | 0.5769          | 0.7510   |
+| 0.4864        | 13.0  | 1040 | 0.4457          | 0.8311   |
+| 0.5175        | 14.0  | 1120 | 0.5278          | 0.7842   |
+| 0.4865        | 15.0  | 1200 | 0.4164          | 0.8379   |
+| 0.4049        | 16.0  | 1280 | 0.4204          | 0.8301   |
+| 0.4167        | 17.0  | 1360 | 0.4720          | 0.8281   |
+| 0.36          | 18.0  | 1440 | 0.4660          | 0.8164   |
+| 0.3195        | 19.0  | 1520 | 0.3064          | 0.8770   |
+| 0.3652        | 20.0  | 1600 | 0.2571          | 0.9121   |
+| 0.2794        | 21.0  | 1680 | 0.2450          | 0.9150   |
+| 0.2704        | 22.0  | 1760 | 0.2391          | 0.9033   |
+| 0.2612        | 23.0  | 1840 | 0.2352          | 0.9277   |
+| 0.2425        | 24.0  | 1920 | 0.4720          | 0.8281   |
+| 0.2567        | 25.0  | 2000 | 0.2296          | 0.9131   |
+| 0.2302        | 26.0  | 2080 | 0.3067          | 0.8945   |
+| 0.2358        | 27.0  | 2160 | 0.1776          | 0.9375   |
+| 0.2173        | 28.0  | 2240 | 0.1596          | 0.9492   |
+| 0.1798        | 29.0  | 2320 | 0.1548          | 0.9414   |
+| 0.197         | 30.0  | 2400 | 0.1740          | 0.9570   |
+| 0.1654        | 31.0  | 2480 | 0.1217          | 0.9668   |
+| 0.1896        | 32.0  | 2560 | 0.2552          | 0.9258   |
+| 0.1705        | 33.0  | 2640 | 0.1031          | 0.9727   |
+| 0.1689        | 34.0  | 2720 | 0.1011          | 0.9688   |
+| 0.1439        | 35.0  | 2800 | 0.1175          | 0.9648   |
+| 0.1606        | 36.0  | 2880 | 0.1805          | 0.9443   |
+| 0.1281        | 37.0  | 2960 | 0.1254          | 0.9678   |
+| 0.1518        | 38.0  | 3040 | 0.1184          | 0.9648   |
+| 0.1531        | 39.0  | 3120 | 0.0992          | 0.9736   |
+| 0.132         | 40.0  | 3200 | 0.0920          | 0.9775   |
+| 0.134         | 41.0  | 3280 | 0.1391          | 0.9639   |
+| 0.1413        | 42.0  | 3360 | 0.1122          | 0.9717   |
+| 0.1097        | 43.0  | 3440 | 0.1171          | 0.9678   |
+| 0.1167        | 44.0  | 3520 | 0.1054          | 0.9766   |
+| 0.1388        | 45.0  | 3600 | 0.0932          | 0.9795   |
+| 0.1221        | 46.0  | 3680 | 0.0946          | 0.9766   |
+| 0.1099        | 47.0  | 3760 | 0.1116          | 0.9756   |
+| 0.1041        | 48.0  | 3840 | 0.1126          | 0.9746   |
+| 0.1025        | 49.0  | 3920 | 0.1114          | 0.9756   |
+| 0.0887        | 50.0  | 4000 | 0.1056          | 0.9756   |
+### Framework versions
+- Transformers 4.38.2
+- Pytorch 2.1.0+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.2

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 50.0,
+    "eval_accuracy": 0.9794921875,
+    "eval_loss": 0.09323666244745255,
+    "eval_runtime": 4.9584,
+    "eval_samples_per_second": 206.517,
+    "eval_steps_per_second": 12.907,
+    "total_flos": 5.437210780237824e+18,
+    "train_loss": 0.3629864407479763,
+    "train_runtime": 3465.6999,
+    "train_samples_per_second": 73.867,
+    "train_steps_per_second": 1.154
+}

config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "_name_or_path": "microsoft/resnet-50",
+  "architectures": [
+    "ResNetForImageClassification"
+  ],
+  "depths": [
+    3,
+    4,
+    6,
+    3
+  ],
+  "downsample_in_bottleneck": false,
+  "downsample_in_first_stage": false,
+  "embedding_size": 64,
+  "hidden_act": "relu",
+  "hidden_sizes": [
+    256,
+    512,
+    1024,
+    2048
+  ],
+  "id2label": {
+    "0": "Mild_Demented",
+    "1": "Moderate_Demented",
+    "2": "Non_Demented",
+    "3": "Very_Mild_Demented"
+  },
+  "label2id": {
+    "Mild_Demented": "0",
+    "Moderate_Demented": "1",
+    "Non_Demented": "2",
+    "Very_Mild_Demented": "3"
+  },
+  "layer_type": "bottleneck",
+  "model_type": "resnet",
+  "num_channels": 3,
+  "out_features": [
+    "stage4"
+  ],
+  "out_indices": [
+    4
+  ],
+  "problem_type": "single_label_classification",
+  "stage_names": [
+    "stem",
+    "stage1",
+    "stage2",
+    "stage3",
+    "stage4"
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2"
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 50.0,
+    "eval_accuracy": 0.9794921875,
+    "eval_loss": 0.09323666244745255,
+    "eval_runtime": 4.9584,
+    "eval_samples_per_second": 206.517,
+    "eval_steps_per_second": 12.907
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:029a8ae5f126fd073b1ab5d8557516e85c13d2f89151fe30dd1b908285b38fa0
+size 94319344

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "crop_pct": 0.875,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "ConvNextImageProcessor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

runs/Mar07_03-18-48_606d6922676f/events.out.tfevents.1709781530.606d6922676f.1016.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5790612438d06e2ed0cceaee3ef25c441d28e09f2b9c07c8d451e313069f40b7
+size 105676

runs/Mar07_03-18-48_606d6922676f/events.out.tfevents.1709785268.606d6922676f.1016.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc2935e6bd8a6623dced1d3dbbb6b041793ca5563cab7ba253ded482f2c1c2cc
+size 411

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 50.0,
+    "total_flos": 5.437210780237824e+18,
+    "train_loss": 0.3629864407479763,
+    "train_runtime": 3465.6999,
+    "train_samples_per_second": 73.867,
+    "train_steps_per_second": 1.154
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3280 @@

+{
+  "best_metric": 0.9794921875,
+  "best_model_checkpoint": "resnet-Alzheimer/checkpoint-3600",
+  "epoch": 50.0,
+  "eval_steps": 500,
+  "global_step": 4000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.12,
+      "grad_norm": 2.1754038333892822,
+      "learning_rate": 5e-05,
+      "loss": 1.3812,
+      "step": 10
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 3.91097092628479,
+      "learning_rate": 0.0001,
+      "loss": 1.34,
+      "step": 20
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 4.134509563446045,
+      "learning_rate": 0.00015,
+      "loss": 1.2635,
+      "step": 30
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 6.377187252044678,
+      "learning_rate": 0.0002,
+      "loss": 1.1824,
+      "step": 40
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 7.713193416595459,
+      "learning_rate": 0.00025,
+      "loss": 1.1226,
+      "step": 50
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 3.569382429122925,
+      "learning_rate": 0.0003,
+      "loss": 1.0308,
+      "step": 60
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 5.650737285614014,
+      "learning_rate": 0.00035,
+      "loss": 1.0115,
+      "step": 70
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 5.970870494842529,
+      "learning_rate": 0.0004,
+      "loss": 1.0127,
+      "step": 80
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.5087890625,
+      "eval_loss": 0.9888483285903931,
+      "eval_runtime": 5.4153,
+      "eval_samples_per_second": 189.093,
+      "eval_steps_per_second": 11.818,
+      "step": 80
+    },
+    {
+      "epoch": 1.12,
+      "grad_norm": 2.92154860496521,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 1.0175,
+      "step": 90
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 4.131512641906738,
+      "learning_rate": 0.0005,
+      "loss": 0.918,
+      "step": 100
+    },
+    {
+      "epoch": 1.38,
+      "grad_norm": 2.9838943481445312,
+      "learning_rate": 0.00055,
+      "loss": 0.9613,
+      "step": 110
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.4230854511260986,
+      "learning_rate": 0.0006,
+      "loss": 0.9188,
+      "step": 120
+    },
+    {
+      "epoch": 1.62,
+      "grad_norm": 3.774385690689087,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 0.9245,
+      "step": 130
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 3.175440549850464,
+      "learning_rate": 0.0007,
+      "loss": 0.8713,
+      "step": 140
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 2.7985305786132812,
+      "learning_rate": 0.00075,
+      "loss": 0.9264,
+      "step": 150
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.5870747566223145,
+      "learning_rate": 0.0008,
+      "loss": 0.9345,
+      "step": 160
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.5302734375,
+      "eval_loss": 0.942151665687561,
+      "eval_runtime": 5.1954,
+      "eval_samples_per_second": 197.099,
+      "eval_steps_per_second": 12.319,
+      "step": 160
+    },
+    {
+      "epoch": 2.12,
+      "grad_norm": 1.3990691900253296,
+      "learning_rate": 0.00085,
+      "loss": 0.8851,
+      "step": 170
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 1.4066977500915527,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 0.9081,
+      "step": 180
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 1.5531185865402222,
+      "learning_rate": 0.00095,
+      "loss": 0.8888,
+      "step": 190
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 1.960204839706421,
+      "learning_rate": 0.001,
+      "loss": 0.9206,
+      "step": 200
+    },
+    {
+      "epoch": 2.62,
+      "grad_norm": 1.0218795537948608,
+      "learning_rate": 0.0010500000000000002,
+      "loss": 0.8735,
+      "step": 210
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 1.9216176271438599,
+      "learning_rate": 0.0011,
+      "loss": 0.924,
+      "step": 220
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 1.9017541408538818,
+      "learning_rate": 0.00115,
+      "loss": 0.9327,
+      "step": 230
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.556686282157898,
+      "learning_rate": 0.0012,
+      "loss": 0.8889,
+      "step": 240
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.578125,
+      "eval_loss": 0.8723889589309692,
+      "eval_runtime": 4.9225,
+      "eval_samples_per_second": 208.026,
+      "eval_steps_per_second": 13.002,
+      "step": 240
+    },
+    {
+      "epoch": 3.12,
+      "grad_norm": 0.9459726810455322,
+      "learning_rate": 0.00125,
+      "loss": 0.8643,
+      "step": 250
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.749912679195404,
+      "learning_rate": 0.0013000000000000002,
+      "loss": 0.8978,
+      "step": 260
+    },
+    {
+      "epoch": 3.38,
+      "grad_norm": 0.9228126406669617,
+      "learning_rate": 0.00135,
+      "loss": 0.8838,
+      "step": 270
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 1.0743939876556396,
+      "learning_rate": 0.0014,
+      "loss": 0.8868,
+      "step": 280
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 0.997053325176239,
+      "learning_rate": 0.00145,
+      "loss": 0.8632,
+      "step": 290
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.5891302227973938,
+      "learning_rate": 0.0015,
+      "loss": 0.8501,
+      "step": 300
+    },
+    {
+      "epoch": 3.88,
+      "grad_norm": 1.0819345712661743,
+      "learning_rate": 0.0015500000000000002,
+      "loss": 0.884,
+      "step": 310
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.5622245073318481,
+      "learning_rate": 0.0016,
+      "loss": 0.8843,
+      "step": 320
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.5888671875,
+      "eval_loss": 0.8535706996917725,
+      "eval_runtime": 6.5338,
+      "eval_samples_per_second": 156.725,
+      "eval_steps_per_second": 9.795,
+      "step": 320
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 0.7301604747772217,
+      "learning_rate": 0.00165,
+      "loss": 0.8241,
+      "step": 330
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 1.222732424736023,
+      "learning_rate": 0.0017,
+      "loss": 0.8929,
+      "step": 340
+    },
+    {
+      "epoch": 4.38,
+      "grad_norm": 0.8520879745483398,
+      "learning_rate": 0.00175,
+      "loss": 0.8726,
+      "step": 350
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.6151734590530396,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 0.8851,
+      "step": 360
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 0.6786526441574097,
+      "learning_rate": 0.00185,
+      "loss": 0.8544,
+      "step": 370
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 0.8025469779968262,
+      "learning_rate": 0.0019,
+      "loss": 0.8432,
+      "step": 380
+    },
+    {
+      "epoch": 4.88,
+      "grad_norm": 1.0158729553222656,
+      "learning_rate": 0.00195,
+      "loss": 0.8261,
+      "step": 390
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.7815405130386353,
+      "learning_rate": 0.002,
+      "loss": 0.8397,
+      "step": 400
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.615234375,
+      "eval_loss": 0.8353910446166992,
+      "eval_runtime": 5.3022,
+      "eval_samples_per_second": 193.129,
+      "eval_steps_per_second": 12.071,
+      "step": 400
+    },
+    {
+      "epoch": 5.12,
+      "grad_norm": 0.9535025954246521,
+      "learning_rate": 0.0019944444444444445,
+      "loss": 0.8636,
+      "step": 410
+    },
+    {
+      "epoch": 5.25,
+      "grad_norm": 0.43990448117256165,
+      "learning_rate": 0.001988888888888889,
+      "loss": 0.7931,
+      "step": 420
+    },
+    {
+      "epoch": 5.38,
+      "grad_norm": 0.6062633395195007,
+      "learning_rate": 0.0019833333333333335,
+      "loss": 0.8345,
+      "step": 430
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.6349042057991028,
+      "learning_rate": 0.001977777777777778,
+      "loss": 0.8593,
+      "step": 440
+    },
+    {
+      "epoch": 5.62,
+      "grad_norm": 0.6786915063858032,
+      "learning_rate": 0.0019722222222222224,
+      "loss": 0.8435,
+      "step": 450
+    },
+    {
+      "epoch": 5.75,
+      "grad_norm": 0.7090786695480347,
+      "learning_rate": 0.0019666666666666665,
+      "loss": 0.8008,
+      "step": 460
+    },
+    {
+      "epoch": 5.88,
+      "grad_norm": 0.6538481712341309,
+      "learning_rate": 0.001961111111111111,
+      "loss": 0.763,
+      "step": 470
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.4316461682319641,
+      "learning_rate": 0.0019555555555555554,
+      "loss": 0.8624,
+      "step": 480
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.5380859375,
+      "eval_loss": 0.9221189022064209,
+      "eval_runtime": 4.9173,
+      "eval_samples_per_second": 208.245,
+      "eval_steps_per_second": 13.015,
+      "step": 480
+    },
+    {
+      "epoch": 6.12,
+      "grad_norm": 0.5696819424629211,
+      "learning_rate": 0.00195,
+      "loss": 0.8065,
+      "step": 490
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.6260728240013123,
+      "learning_rate": 0.0019444444444444444,
+      "loss": 0.7873,
+      "step": 500
+    },
+    {
+      "epoch": 6.38,
+      "grad_norm": 0.6295855045318604,
+      "learning_rate": 0.0019388888888888889,
+      "loss": 0.7802,
+      "step": 510
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.6074417233467102,
+      "learning_rate": 0.0019333333333333333,
+      "loss": 0.7907,
+      "step": 520
+    },
+    {
+      "epoch": 6.62,
+      "grad_norm": 0.6099679470062256,
+      "learning_rate": 0.0019277777777777778,
+      "loss": 0.7391,
+      "step": 530
+    },
+    {
+      "epoch": 6.75,
+      "grad_norm": 0.9349565505981445,
+      "learning_rate": 0.0019222222222222223,
+      "loss": 0.7749,
+      "step": 540
+    },
+    {
+      "epoch": 6.88,
+      "grad_norm": 0.6923946142196655,
+      "learning_rate": 0.0019166666666666668,
+      "loss": 0.8232,
+      "step": 550
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.5967056751251221,
+      "learning_rate": 0.0019111111111111113,
+      "loss": 0.7543,
+      "step": 560
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.6474609375,
+      "eval_loss": 0.7568103671073914,
+      "eval_runtime": 5.3014,
+      "eval_samples_per_second": 193.156,
+      "eval_steps_per_second": 12.072,
+      "step": 560
+    },
+    {
+      "epoch": 7.12,
+      "grad_norm": 0.7925052642822266,
+      "learning_rate": 0.0019055555555555555,
+      "loss": 0.7086,
+      "step": 570
+    },
+    {
+      "epoch": 7.25,
+      "grad_norm": 0.715761125087738,
+      "learning_rate": 0.0019,
+      "loss": 0.7901,
+      "step": 580
+    },
+    {
+      "epoch": 7.38,
+      "grad_norm": 0.6602711081504822,
+      "learning_rate": 0.0018944444444444445,
+      "loss": 0.7375,
+      "step": 590
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.5104066729545593,
+      "learning_rate": 0.001888888888888889,
+      "loss": 0.7805,
+      "step": 600
+    },
+    {
+      "epoch": 7.62,
+      "grad_norm": 0.6333702802658081,
+      "learning_rate": 0.0018833333333333334,
+      "loss": 0.7017,
+      "step": 610
+    },
+    {
+      "epoch": 7.75,
+      "grad_norm": 0.5703239440917969,
+      "learning_rate": 0.001877777777777778,
+      "loss": 0.7086,
+      "step": 620
+    },
+    {
+      "epoch": 7.88,
+      "grad_norm": 0.8939486742019653,
+      "learning_rate": 0.0018722222222222222,
+      "loss": 0.7399,
+      "step": 630
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.6808524131774902,
+      "learning_rate": 0.0018666666666666666,
+      "loss": 0.6993,
+      "step": 640
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.61328125,
+      "eval_loss": 0.8830391764640808,
+      "eval_runtime": 4.9073,
+      "eval_samples_per_second": 208.667,
+      "eval_steps_per_second": 13.042,
+      "step": 640
+    },
+    {
+      "epoch": 8.12,
+      "grad_norm": 0.7670312523841858,
+      "learning_rate": 0.0018611111111111111,
+      "loss": 0.7304,
+      "step": 650
+    },
+    {
+      "epoch": 8.25,
+      "grad_norm": 0.518883466720581,
+      "learning_rate": 0.0018555555555555556,
+      "loss": 0.6759,
+      "step": 660
+    },
+    {
+      "epoch": 8.38,
+      "grad_norm": 0.6331384778022766,
+      "learning_rate": 0.00185,
+      "loss": 0.7323,
+      "step": 670
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 0.5934571027755737,
+      "learning_rate": 0.0018444444444444446,
+      "loss": 0.7109,
+      "step": 680
+    },
+    {
+      "epoch": 8.62,
+      "grad_norm": 0.5555841326713562,
+      "learning_rate": 0.0018388888888888888,
+      "loss": 0.7361,
+      "step": 690
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.45028582215309143,
+      "learning_rate": 0.0018333333333333333,
+      "loss": 0.7209,
+      "step": 700
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 0.4313984811306,
+      "learning_rate": 0.0018277777777777778,
+      "loss": 0.692,
+      "step": 710
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.6221916675567627,
+      "learning_rate": 0.0018222222222222223,
+      "loss": 0.7045,
+      "step": 720
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.658203125,
+      "eval_loss": 0.7372878789901733,
+      "eval_runtime": 4.9218,
+      "eval_samples_per_second": 208.053,
+      "eval_steps_per_second": 13.003,
+      "step": 720
+    },
+    {
+      "epoch": 9.12,
+      "grad_norm": 0.9794626832008362,
+      "learning_rate": 0.0018166666666666667,
+      "loss": 0.641,
+      "step": 730
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 0.8530990481376648,
+      "learning_rate": 0.0018111111111111112,
+      "loss": 0.6616,
+      "step": 740
+    },
+    {
+      "epoch": 9.38,
+      "grad_norm": 0.5696712136268616,
+      "learning_rate": 0.0018055555555555557,
+      "loss": 0.6685,
+      "step": 750
+    },
+    {
+      "epoch": 9.5,
+      "grad_norm": 0.6695945858955383,
+      "learning_rate": 0.0018000000000000002,
+      "loss": 0.6079,
+      "step": 760
+    },
+    {
+      "epoch": 9.62,
+      "grad_norm": 0.9470874667167664,
+      "learning_rate": 0.0017944444444444446,
+      "loss": 0.6362,
+      "step": 770
+    },
+    {
+      "epoch": 9.75,
+      "grad_norm": 1.0435755252838135,
+      "learning_rate": 0.001788888888888889,
+      "loss": 0.7036,
+      "step": 780
+    },
+    {
+      "epoch": 9.88,
+      "grad_norm": 0.4934737980365753,
+      "learning_rate": 0.0017833333333333334,
+      "loss": 0.6955,
+      "step": 790
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.49625110626220703,
+      "learning_rate": 0.0017777777777777776,
+      "loss": 0.6557,
+      "step": 800
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.7451171875,
+      "eval_loss": 0.6075544357299805,
+      "eval_runtime": 5.2806,
+      "eval_samples_per_second": 193.918,
+      "eval_steps_per_second": 12.12,
+      "step": 800
+    },
+    {
+      "epoch": 10.12,
+      "grad_norm": 0.49739229679107666,
+      "learning_rate": 0.0017722222222222221,
+      "loss": 0.605,
+      "step": 810
+    },
+    {
+      "epoch": 10.25,
+      "grad_norm": 0.6317277550697327,
+      "learning_rate": 0.0017666666666666666,
+      "loss": 0.5332,
+      "step": 820
+    },
+    {
+      "epoch": 10.38,
+      "grad_norm": 0.756879985332489,
+      "learning_rate": 0.001761111111111111,
+      "loss": 0.5619,
+      "step": 830
+    },
+    {
+      "epoch": 10.5,
+      "grad_norm": 0.6143298149108887,
+      "learning_rate": 0.0017555555555555556,
+      "loss": 0.601,
+      "step": 840
+    },
+    {
+      "epoch": 10.62,
+      "grad_norm": 0.7249147891998291,
+      "learning_rate": 0.00175,
+      "loss": 0.5935,
+      "step": 850
+    },
+    {
+      "epoch": 10.75,
+      "grad_norm": 0.4532654285430908,
+      "learning_rate": 0.0017444444444444445,
+      "loss": 0.5988,
+      "step": 860
+    },
+    {
+      "epoch": 10.88,
+      "grad_norm": 0.5738415718078613,
+      "learning_rate": 0.001738888888888889,
+      "loss": 0.6634,
+      "step": 870
+    },
+    {
+      "epoch": 11.0,
+      "grad_norm": 0.5514868497848511,
+      "learning_rate": 0.0017333333333333335,
+      "loss": 0.5876,
+      "step": 880
+    },
+    {
+      "epoch": 11.0,
+      "eval_accuracy": 0.69921875,
+      "eval_loss": 0.7281272411346436,
+      "eval_runtime": 4.8994,
+      "eval_samples_per_second": 209.004,
+      "eval_steps_per_second": 13.063,
+      "step": 880
+    },
+    {
+      "epoch": 11.12,
+      "grad_norm": 0.7158863544464111,
+      "learning_rate": 0.001727777777777778,
+      "loss": 0.606,
+      "step": 890
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.7355363368988037,
+      "learning_rate": 0.0017222222222222224,
+      "loss": 0.5923,
+      "step": 900
+    },
+    {
+      "epoch": 11.38,
+      "grad_norm": 0.7794367671012878,
+      "learning_rate": 0.0017166666666666667,
+      "loss": 0.5935,
+      "step": 910
+    },
+    {
+      "epoch": 11.5,
+      "grad_norm": 0.9755826592445374,
+      "learning_rate": 0.0017111111111111112,
+      "loss": 0.644,
+      "step": 920
+    },
+    {
+      "epoch": 11.62,
+      "grad_norm": 0.6257722973823547,
+      "learning_rate": 0.0017055555555555554,
+      "loss": 0.617,
+      "step": 930
+    },
+    {
+      "epoch": 11.75,
+      "grad_norm": 0.8550503253936768,
+      "learning_rate": 0.0017,
+      "loss": 0.5854,
+      "step": 940
+    },
+    {
+      "epoch": 11.88,
+      "grad_norm": 0.7347137928009033,
+      "learning_rate": 0.0016944444444444444,
+      "loss": 0.6358,
+      "step": 950
+    },
+    {
+      "epoch": 12.0,
+      "grad_norm": 0.7867416739463806,
+      "learning_rate": 0.0016888888888888889,
+      "loss": 0.5732,
+      "step": 960
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.7509765625,
+      "eval_loss": 0.5769097208976746,
+      "eval_runtime": 5.275,
+      "eval_samples_per_second": 194.122,
+      "eval_steps_per_second": 12.133,
+      "step": 960
+    },
+    {
+      "epoch": 12.12,
+      "grad_norm": 0.6022630333900452,
+      "learning_rate": 0.0016833333333333333,
+      "loss": 0.5643,
+      "step": 970
+    },
+    {
+      "epoch": 12.25,
+      "grad_norm": 0.599958062171936,
+      "learning_rate": 0.0016777777777777778,
+      "loss": 0.5438,
+      "step": 980
+    },
+    {
+      "epoch": 12.38,
+      "grad_norm": 0.6484814286231995,
+      "learning_rate": 0.0016722222222222223,
+      "loss": 0.5208,
+      "step": 990
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.8167735934257507,
+      "learning_rate": 0.0016666666666666668,
+      "loss": 0.5369,
+      "step": 1000
+    },
+    {
+      "epoch": 12.62,
+      "grad_norm": 0.49088793992996216,
+      "learning_rate": 0.0016611111111111113,
+      "loss": 0.4803,
+      "step": 1010
+    },
+    {
+      "epoch": 12.75,
+      "grad_norm": 0.6817615628242493,
+      "learning_rate": 0.0016555555555555555,
+      "loss": 0.5102,
+      "step": 1020
+    },
+    {
+      "epoch": 12.88,
+      "grad_norm": 0.8656439781188965,
+      "learning_rate": 0.00165,
+      "loss": 0.5287,
+      "step": 1030
+    },
+    {
+      "epoch": 13.0,
+      "grad_norm": 0.5195401310920715,
+      "learning_rate": 0.0016444444444444445,
+      "loss": 0.4864,
+      "step": 1040
+    },
+    {
+      "epoch": 13.0,
+      "eval_accuracy": 0.8310546875,
+      "eval_loss": 0.445728600025177,
+      "eval_runtime": 4.9054,
+      "eval_samples_per_second": 208.751,
+      "eval_steps_per_second": 13.047,
+      "step": 1040
+    },
+    {
+      "epoch": 13.12,
+      "grad_norm": 0.7564366459846497,
+      "learning_rate": 0.001638888888888889,
+      "loss": 0.4715,
+      "step": 1050
+    },
+    {
+      "epoch": 13.25,
+      "grad_norm": 0.6976212859153748,
+      "learning_rate": 0.0016333333333333334,
+      "loss": 0.471,
+      "step": 1060
+    },
+    {
+      "epoch": 13.38,
+      "grad_norm": 0.7652568817138672,
+      "learning_rate": 0.001627777777777778,
+      "loss": 0.4821,
+      "step": 1070
+    },
+    {
+      "epoch": 13.5,
+      "grad_norm": 0.7834269404411316,
+      "learning_rate": 0.0016222222222222222,
+      "loss": 0.5091,
+      "step": 1080
+    },
+    {
+      "epoch": 13.62,
+      "grad_norm": 0.8186032176017761,
+      "learning_rate": 0.0016166666666666666,
+      "loss": 0.4611,
+      "step": 1090
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.7720199227333069,
+      "learning_rate": 0.0016111111111111111,
+      "loss": 0.5397,
+      "step": 1100
+    },
+    {
+      "epoch": 13.88,
+      "grad_norm": 0.6797453165054321,
+      "learning_rate": 0.0016055555555555556,
+      "loss": 0.5135,
+      "step": 1110
+    },
+    {
+      "epoch": 14.0,
+      "grad_norm": 0.726184606552124,
+      "learning_rate": 0.0016,
+      "loss": 0.5175,
+      "step": 1120
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.7841796875,
+      "eval_loss": 0.5278125405311584,
+      "eval_runtime": 4.8906,
+      "eval_samples_per_second": 209.383,
+      "eval_steps_per_second": 13.086,
+      "step": 1120
+    },
+    {
+      "epoch": 14.12,
+      "grad_norm": 0.6777172088623047,
+      "learning_rate": 0.0015944444444444446,
+      "loss": 0.4831,
+      "step": 1130
+    },
+    {
+      "epoch": 14.25,
+      "grad_norm": 0.6228752732276917,
+      "learning_rate": 0.0015888888888888888,
+      "loss": 0.4657,
+      "step": 1140
+    },
+    {
+      "epoch": 14.38,
+      "grad_norm": 0.7296370267868042,
+      "learning_rate": 0.0015833333333333333,
+      "loss": 0.5084,
+      "step": 1150
+    },
+    {
+      "epoch": 14.5,
+      "grad_norm": 0.7809439897537231,
+      "learning_rate": 0.0015777777777777778,
+      "loss": 0.4749,
+      "step": 1160
+    },
+    {
+      "epoch": 14.62,
+      "grad_norm": 0.4627506136894226,
+      "learning_rate": 0.0015722222222222223,
+      "loss": 0.4157,
+      "step": 1170
+    },
+    {
+      "epoch": 14.75,
+      "grad_norm": 0.465811163187027,
+      "learning_rate": 0.0015666666666666667,
+      "loss": 0.4192,
+      "step": 1180
+    },
+    {
+      "epoch": 14.88,
+      "grad_norm": 0.636384129524231,
+      "learning_rate": 0.0015611111111111112,
+      "loss": 0.4627,
+      "step": 1190
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.8339561223983765,
+      "learning_rate": 0.0015555555555555557,
+      "loss": 0.4865,
+      "step": 1200
+    },
+    {
+      "epoch": 15.0,
+      "eval_accuracy": 0.837890625,
+      "eval_loss": 0.4163576364517212,
+      "eval_runtime": 5.3394,
+      "eval_samples_per_second": 191.782,
+      "eval_steps_per_second": 11.986,
+      "step": 1200
+    },
+    {
+      "epoch": 15.12,
+      "grad_norm": 0.5218497514724731,
+      "learning_rate": 0.0015500000000000002,
+      "loss": 0.4253,
+      "step": 1210
+    },
+    {
+      "epoch": 15.25,
+      "grad_norm": 0.6273193359375,
+      "learning_rate": 0.0015444444444444446,
+      "loss": 0.4474,
+      "step": 1220
+    },
+    {
+      "epoch": 15.38,
+      "grad_norm": 0.6019622087478638,
+      "learning_rate": 0.001538888888888889,
+      "loss": 0.4008,
+      "step": 1230
+    },
+    {
+      "epoch": 15.5,
+      "grad_norm": 0.7020573616027832,
+      "learning_rate": 0.0015333333333333334,
+      "loss": 0.3768,
+      "step": 1240
+    },
+    {
+      "epoch": 15.62,
+      "grad_norm": 0.577691376209259,
+      "learning_rate": 0.0015277777777777776,
+      "loss": 0.4108,
+      "step": 1250
+    },
+    {
+      "epoch": 15.75,
+      "grad_norm": 0.8489026427268982,
+      "learning_rate": 0.0015222222222222221,
+      "loss": 0.3994,
+      "step": 1260
+    },
+    {
+      "epoch": 15.88,
+      "grad_norm": 0.42233309149742126,
+      "learning_rate": 0.0015166666666666666,
+      "loss": 0.4292,
+      "step": 1270
+    },
+    {
+      "epoch": 16.0,
+      "grad_norm": 0.48867735266685486,
+      "learning_rate": 0.001511111111111111,
+      "loss": 0.4049,
+      "step": 1280
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.830078125,
+      "eval_loss": 0.4204105734825134,
+      "eval_runtime": 4.8855,
+      "eval_samples_per_second": 209.602,
+      "eval_steps_per_second": 13.1,
+      "step": 1280
+    },
+    {
+      "epoch": 16.12,
+      "grad_norm": 0.6492818593978882,
+      "learning_rate": 0.0015055555555555556,
+      "loss": 0.3885,
+      "step": 1290
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.4546281397342682,
+      "learning_rate": 0.0015,
+      "loss": 0.4096,
+      "step": 1300
+    },
+    {
+      "epoch": 16.38,
+      "grad_norm": 0.6827344298362732,
+      "learning_rate": 0.0014944444444444445,
+      "loss": 0.3618,
+      "step": 1310
+    },
+    {
+      "epoch": 16.5,
+      "grad_norm": 0.454326868057251,
+      "learning_rate": 0.001488888888888889,
+      "loss": 0.3863,
+      "step": 1320
+    },
+    {
+      "epoch": 16.62,
+      "grad_norm": 0.6911420226097107,
+      "learning_rate": 0.0014833333333333335,
+      "loss": 0.4264,
+      "step": 1330
+    },
+    {
+      "epoch": 16.75,
+      "grad_norm": 0.6122339367866516,
+      "learning_rate": 0.001477777777777778,
+      "loss": 0.4205,
+      "step": 1340
+    },
+    {
+      "epoch": 16.88,
+      "grad_norm": 0.5123728513717651,
+      "learning_rate": 0.0014722222222222224,
+      "loss": 0.4419,
+      "step": 1350
+    },
+    {
+      "epoch": 17.0,
+      "grad_norm": 1.0908498764038086,
+      "learning_rate": 0.0014666666666666667,
+      "loss": 0.4167,
+      "step": 1360
+    },
+    {
+      "epoch": 17.0,
+      "eval_accuracy": 0.828125,
+      "eval_loss": 0.47203314304351807,
+      "eval_runtime": 5.3495,
+      "eval_samples_per_second": 191.421,
+      "eval_steps_per_second": 11.964,
+      "step": 1360
+    },
+    {
+      "epoch": 17.12,
+      "grad_norm": 0.42975571751594543,
+      "learning_rate": 0.0014611111111111112,
+      "loss": 0.4006,
+      "step": 1370
+    },
+    {
+      "epoch": 17.25,
+      "grad_norm": 0.6392154693603516,
+      "learning_rate": 0.0014555555555555554,
+      "loss": 0.3581,
+      "step": 1380
+    },
+    {
+      "epoch": 17.38,
+      "grad_norm": 0.6548070907592773,
+      "learning_rate": 0.00145,
+      "loss": 0.3672,
+      "step": 1390
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.6939528584480286,
+      "learning_rate": 0.0014444444444444444,
+      "loss": 0.3514,
+      "step": 1400
+    },
+    {
+      "epoch": 17.62,
+      "grad_norm": 0.6098494529724121,
+      "learning_rate": 0.0014388888888888889,
+      "loss": 0.3835,
+      "step": 1410
+    },
+    {
+      "epoch": 17.75,
+      "grad_norm": 0.5356572866439819,
+      "learning_rate": 0.0014333333333333333,
+      "loss": 0.3326,
+      "step": 1420
+    },
+    {
+      "epoch": 17.88,
+      "grad_norm": 0.6472760438919067,
+      "learning_rate": 0.0014277777777777778,
+      "loss": 0.3829,
+      "step": 1430
+    },
+    {
+      "epoch": 18.0,
+      "grad_norm": 0.67198646068573,
+      "learning_rate": 0.0014222222222222223,
+      "loss": 0.36,
+      "step": 1440
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.81640625,
+      "eval_loss": 0.4660454988479614,
+      "eval_runtime": 4.9124,
+      "eval_samples_per_second": 208.451,
+      "eval_steps_per_second": 13.028,
+      "step": 1440
+    },
+    {
+      "epoch": 18.12,
+      "grad_norm": 0.4594449996948242,
+      "learning_rate": 0.0014166666666666668,
+      "loss": 0.3549,
+      "step": 1450
+    },
+    {
+      "epoch": 18.25,
+      "grad_norm": 0.4456086754798889,
+      "learning_rate": 0.0014111111111111112,
+      "loss": 0.2899,
+      "step": 1460
+    },
+    {
+      "epoch": 18.38,
+      "grad_norm": 0.724087119102478,
+      "learning_rate": 0.0014055555555555555,
+      "loss": 0.2976,
+      "step": 1470
+    },
+    {
+      "epoch": 18.5,
+      "grad_norm": 0.8099024891853333,
+      "learning_rate": 0.0014,
+      "loss": 0.3706,
+      "step": 1480
+    },
+    {
+      "epoch": 18.62,
+      "grad_norm": 0.6271733641624451,
+      "learning_rate": 0.0013944444444444445,
+      "loss": 0.3591,
+      "step": 1490
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.5864254236221313,
+      "learning_rate": 0.001388888888888889,
+      "loss": 0.3184,
+      "step": 1500
+    },
+    {
+      "epoch": 18.88,
+      "grad_norm": 0.4915286898612976,
+      "learning_rate": 0.0013833333333333334,
+      "loss": 0.301,
+      "step": 1510
+    },
+    {
+      "epoch": 19.0,
+      "grad_norm": 0.6932692527770996,
+      "learning_rate": 0.001377777777777778,
+      "loss": 0.3195,
+      "step": 1520
+    },
+    {
+      "epoch": 19.0,
+      "eval_accuracy": 0.876953125,
+      "eval_loss": 0.306354820728302,
+      "eval_runtime": 5.2563,
+      "eval_samples_per_second": 194.815,
+      "eval_steps_per_second": 12.176,
+      "step": 1520
+    },
+    {
+      "epoch": 19.12,
+      "grad_norm": 0.5778792500495911,
+      "learning_rate": 0.0013722222222222222,
+      "loss": 0.3493,
+      "step": 1530
+    },
+    {
+      "epoch": 19.25,
+      "grad_norm": 0.951936662197113,
+      "learning_rate": 0.0013666666666666666,
+      "loss": 0.3305,
+      "step": 1540
+    },
+    {
+      "epoch": 19.38,
+      "grad_norm": 0.6778426170349121,
+      "learning_rate": 0.0013611111111111111,
+      "loss": 0.32,
+      "step": 1550
+    },
+    {
+      "epoch": 19.5,
+      "grad_norm": 0.6356533765792847,
+      "learning_rate": 0.0013555555555555556,
+      "loss": 0.2889,
+      "step": 1560
+    },
+    {
+      "epoch": 19.62,
+      "grad_norm": 0.6476128697395325,
+      "learning_rate": 0.00135,
+      "loss": 0.2907,
+      "step": 1570
+    },
+    {
+      "epoch": 19.75,
+      "grad_norm": 0.4664938151836395,
+      "learning_rate": 0.0013444444444444445,
+      "loss": 0.3261,
+      "step": 1580
+    },
+    {
+      "epoch": 19.88,
+      "grad_norm": 1.06290602684021,
+      "learning_rate": 0.0013388888888888888,
+      "loss": 0.3365,
+      "step": 1590
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.5365467667579651,
+      "learning_rate": 0.0013333333333333333,
+      "loss": 0.3652,
+      "step": 1600
+    },
+    {
+      "epoch": 20.0,
+      "eval_accuracy": 0.912109375,
+      "eval_loss": 0.25709766149520874,
+      "eval_runtime": 4.9952,
+      "eval_samples_per_second": 204.995,
+      "eval_steps_per_second": 12.812,
+      "step": 1600
+    },
+    {
+      "epoch": 20.12,
+      "grad_norm": 0.5051919221878052,
+      "learning_rate": 0.0013277777777777778,
+      "loss": 0.3147,
+      "step": 1610
+    },
+    {
+      "epoch": 20.25,
+      "grad_norm": 0.5098996162414551,
+      "learning_rate": 0.0013222222222222222,
+      "loss": 0.3085,
+      "step": 1620
+    },
+    {
+      "epoch": 20.38,
+      "grad_norm": 0.5585361123085022,
+      "learning_rate": 0.0013166666666666667,
+      "loss": 0.3679,
+      "step": 1630
+    },
+    {
+      "epoch": 20.5,
+      "grad_norm": 0.38560378551483154,
+      "learning_rate": 0.0013111111111111112,
+      "loss": 0.2987,
+      "step": 1640
+    },
+    {
+      "epoch": 20.62,
+      "grad_norm": 0.3209057152271271,
+      "learning_rate": 0.0013055555555555557,
+      "loss": 0.2792,
+      "step": 1650
+    },
+    {
+      "epoch": 20.75,
+      "grad_norm": 0.6471489667892456,
+      "learning_rate": 0.0013000000000000002,
+      "loss": 0.2755,
+      "step": 1660
+    },
+    {
+      "epoch": 20.88,
+      "grad_norm": 0.8814804553985596,
+      "learning_rate": 0.0012944444444444446,
+      "loss": 0.2993,
+      "step": 1670
+    },
+    {
+      "epoch": 21.0,
+      "grad_norm": 0.5392754673957825,
+      "learning_rate": 0.001288888888888889,
+      "loss": 0.2794,
+      "step": 1680
+    },
+    {
+      "epoch": 21.0,
+      "eval_accuracy": 0.9150390625,
+      "eval_loss": 0.24504294991493225,
+      "eval_runtime": 4.8909,
+      "eval_samples_per_second": 209.37,
+      "eval_steps_per_second": 13.086,
+      "step": 1680
+    },
+    {
+      "epoch": 21.12,
+      "grad_norm": 0.6234158873558044,
+      "learning_rate": 0.0012833333333333334,
+      "loss": 0.2926,
+      "step": 1690
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.4284802973270416,
+      "learning_rate": 0.0012777777777777776,
+      "loss": 0.2803,
+      "step": 1700
+    },
+    {
+      "epoch": 21.38,
+      "grad_norm": 0.688140869140625,
+      "learning_rate": 0.0012722222222222221,
+      "loss": 0.2799,
+      "step": 1710
+    },
+    {
+      "epoch": 21.5,
+      "grad_norm": 0.8576880097389221,
+      "learning_rate": 0.0012666666666666666,
+      "loss": 0.2868,
+      "step": 1720
+    },
+    {
+      "epoch": 21.62,
+      "grad_norm": 0.6299762725830078,
+      "learning_rate": 0.001261111111111111,
+      "loss": 0.2971,
+      "step": 1730
+    },
+    {
+      "epoch": 21.75,
+      "grad_norm": 0.7093678116798401,
+      "learning_rate": 0.0012555555555555555,
+      "loss": 0.2905,
+      "step": 1740
+    },
+    {
+      "epoch": 21.88,
+      "grad_norm": 0.4271737039089203,
+      "learning_rate": 0.00125,
+      "loss": 0.3336,
+      "step": 1750
+    },
+    {
+      "epoch": 22.0,
+      "grad_norm": 0.6771571040153503,
+      "learning_rate": 0.0012444444444444445,
+      "loss": 0.2704,
+      "step": 1760
+    },
+    {
+      "epoch": 22.0,
+      "eval_accuracy": 0.9033203125,
+      "eval_loss": 0.23907524347305298,
+      "eval_runtime": 5.3054,
+      "eval_samples_per_second": 193.012,
+      "eval_steps_per_second": 12.063,
+      "step": 1760
+    },
+    {
+      "epoch": 22.12,
+      "grad_norm": 0.44859397411346436,
+      "learning_rate": 0.001238888888888889,
+      "loss": 0.28,
+      "step": 1770
+    },
+    {
+      "epoch": 22.25,
+      "grad_norm": 0.5617765784263611,
+      "learning_rate": 0.0012333333333333335,
+      "loss": 0.3093,
+      "step": 1780
+    },
+    {
+      "epoch": 22.38,
+      "grad_norm": 0.6634913682937622,
+      "learning_rate": 0.001227777777777778,
+      "loss": 0.2417,
+      "step": 1790
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.670782744884491,
+      "learning_rate": 0.0012222222222222224,
+      "loss": 0.2932,
+      "step": 1800
+    },
+    {
+      "epoch": 22.62,
+      "grad_norm": 0.6564796566963196,
+      "learning_rate": 0.0012166666666666667,
+      "loss": 0.3042,
+      "step": 1810
+    },
+    {
+      "epoch": 22.75,
+      "grad_norm": 0.34089842438697815,
+      "learning_rate": 0.0012111111111111112,
+      "loss": 0.2925,
+      "step": 1820
+    },
+    {
+      "epoch": 22.88,
+      "grad_norm": 0.5612368583679199,
+      "learning_rate": 0.0012055555555555554,
+      "loss": 0.2559,
+      "step": 1830
+    },
+    {
+      "epoch": 23.0,
+      "grad_norm": 0.624458909034729,
+      "learning_rate": 0.0012,
+      "loss": 0.2612,
+      "step": 1840
+    },
+    {
+      "epoch": 23.0,
+      "eval_accuracy": 0.927734375,
+      "eval_loss": 0.23524078726768494,
+      "eval_runtime": 4.902,
+      "eval_samples_per_second": 208.893,
+      "eval_steps_per_second": 13.056,
+      "step": 1840
+    },
+    {
+      "epoch": 23.12,
+      "grad_norm": 0.6820557117462158,
+      "learning_rate": 0.0011944444444444444,
+      "loss": 0.2282,
+      "step": 1850
+    },
+    {
+      "epoch": 23.25,
+      "grad_norm": 0.5979276895523071,
+      "learning_rate": 0.0011888888888888889,
+      "loss": 0.2569,
+      "step": 1860
+    },
+    {
+      "epoch": 23.38,
+      "grad_norm": 0.5427021384239197,
+      "learning_rate": 0.0011833333333333333,
+      "loss": 0.2724,
+      "step": 1870
+    },
+    {
+      "epoch": 23.5,
+      "grad_norm": 0.4382477104663849,
+      "learning_rate": 0.0011777777777777778,
+      "loss": 0.2616,
+      "step": 1880
+    },
+    {
+      "epoch": 23.62,
+      "grad_norm": 0.6240445375442505,
+      "learning_rate": 0.0011722222222222223,
+      "loss": 0.2636,
+      "step": 1890
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.7440346479415894,
+      "learning_rate": 0.0011666666666666668,
+      "loss": 0.2913,
+      "step": 1900
+    },
+    {
+      "epoch": 23.88,
+      "grad_norm": 0.4682701826095581,
+      "learning_rate": 0.0011611111111111112,
+      "loss": 0.2499,
+      "step": 1910
+    },
+    {
+      "epoch": 24.0,
+      "grad_norm": 0.5112751722335815,
+      "learning_rate": 0.0011555555555555555,
+      "loss": 0.2425,
+      "step": 1920
+    },
+    {
+      "epoch": 24.0,
+      "eval_accuracy": 0.828125,
+      "eval_loss": 0.4720377027988434,
+      "eval_runtime": 5.3156,
+      "eval_samples_per_second": 192.639,
+      "eval_steps_per_second": 12.04,
+      "step": 1920
+    },
+    {
+      "epoch": 24.12,
+      "grad_norm": 0.765444278717041,
+      "learning_rate": 0.00115,
+      "loss": 0.2736,
+      "step": 1930
+    },
+    {
+      "epoch": 24.25,
+      "grad_norm": 0.380066841840744,
+      "learning_rate": 0.0011444444444444445,
+      "loss": 0.2357,
+      "step": 1940
+    },
+    {
+      "epoch": 24.38,
+      "grad_norm": 0.43320003151893616,
+      "learning_rate": 0.001138888888888889,
+      "loss": 0.2518,
+      "step": 1950
+    },
+    {
+      "epoch": 24.5,
+      "grad_norm": 0.5003307461738586,
+      "learning_rate": 0.0011333333333333334,
+      "loss": 0.2898,
+      "step": 1960
+    },
+    {
+      "epoch": 24.62,
+      "grad_norm": 0.41153478622436523,
+      "learning_rate": 0.001127777777777778,
+      "loss": 0.2209,
+      "step": 1970
+    },
+    {
+      "epoch": 24.75,
+      "grad_norm": 0.41805940866470337,
+      "learning_rate": 0.0011222222222222222,
+      "loss": 0.235,
+      "step": 1980
+    },
+    {
+      "epoch": 24.88,
+      "grad_norm": 0.5226410627365112,
+      "learning_rate": 0.0011166666666666666,
+      "loss": 0.2349,
+      "step": 1990
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.3767559826374054,
+      "learning_rate": 0.0011111111111111111,
+      "loss": 0.2567,
+      "step": 2000
+    },
+    {
+      "epoch": 25.0,
+      "eval_accuracy": 0.9130859375,
+      "eval_loss": 0.22960150241851807,
+      "eval_runtime": 4.887,
+      "eval_samples_per_second": 209.535,
+      "eval_steps_per_second": 13.096,
+      "step": 2000
+    },
+    {
+      "epoch": 25.12,
+      "grad_norm": 0.6860052943229675,
+      "learning_rate": 0.0011055555555555556,
+      "loss": 0.2426,
+      "step": 2010
+    },
+    {
+      "epoch": 25.25,
+      "grad_norm": 0.3876688778400421,
+      "learning_rate": 0.0011,
+      "loss": 0.2243,
+      "step": 2020
+    },
+    {
+      "epoch": 25.38,
+      "grad_norm": 0.3251183032989502,
+      "learning_rate": 0.0010944444444444445,
+      "loss": 0.234,
+      "step": 2030
+    },
+    {
+      "epoch": 25.5,
+      "grad_norm": 0.5538493990898132,
+      "learning_rate": 0.0010888888888888888,
+      "loss": 0.2547,
+      "step": 2040
+    },
+    {
+      "epoch": 25.62,
+      "grad_norm": 0.6539644598960876,
+      "learning_rate": 0.0010833333333333333,
+      "loss": 0.2382,
+      "step": 2050
+    },
+    {
+      "epoch": 25.75,
+      "grad_norm": 0.6687932014465332,
+      "learning_rate": 0.0010777777777777778,
+      "loss": 0.2254,
+      "step": 2060
+    },
+    {
+      "epoch": 25.88,
+      "grad_norm": 0.6210919618606567,
+      "learning_rate": 0.0010722222222222222,
+      "loss": 0.2356,
+      "step": 2070
+    },
+    {
+      "epoch": 26.0,
+      "grad_norm": 0.5525135397911072,
+      "learning_rate": 0.0010666666666666667,
+      "loss": 0.2302,
+      "step": 2080
+    },
+    {
+      "epoch": 26.0,
+      "eval_accuracy": 0.89453125,
+      "eval_loss": 0.30673664808273315,
+      "eval_runtime": 4.9576,
+      "eval_samples_per_second": 206.552,
+      "eval_steps_per_second": 12.909,
+      "step": 2080
+    },
+    {
+      "epoch": 26.12,
+      "grad_norm": 0.5014208555221558,
+      "learning_rate": 0.0010611111111111112,
+      "loss": 0.2403,
+      "step": 2090
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.6093131303787231,
+      "learning_rate": 0.0010555555555555557,
+      "loss": 0.2356,
+      "step": 2100
+    },
+    {
+      "epoch": 26.38,
+      "grad_norm": 0.3627248704433441,
+      "learning_rate": 0.0010500000000000002,
+      "loss": 0.2509,
+      "step": 2110
+    },
+    {
+      "epoch": 26.5,
+      "grad_norm": 0.4119124114513397,
+      "learning_rate": 0.0010444444444444446,
+      "loss": 0.1915,
+      "step": 2120
+    },
+    {
+      "epoch": 26.62,
+      "grad_norm": 0.5565811395645142,
+      "learning_rate": 0.0010388888888888889,
+      "loss": 0.191,
+      "step": 2130
+    },
+    {
+      "epoch": 26.75,
+      "grad_norm": 0.44097578525543213,
+      "learning_rate": 0.0010333333333333334,
+      "loss": 0.2353,
+      "step": 2140
+    },
+    {
+      "epoch": 26.88,
+      "grad_norm": 0.4542636275291443,
+      "learning_rate": 0.0010277777777777776,
+      "loss": 0.2144,
+      "step": 2150
+    },
+    {
+      "epoch": 27.0,
+      "grad_norm": 0.4763772785663605,
+      "learning_rate": 0.0010222222222222221,
+      "loss": 0.2358,
+      "step": 2160
+    },
+    {
+      "epoch": 27.0,
+      "eval_accuracy": 0.9375,
+      "eval_loss": 0.17758239805698395,
+      "eval_runtime": 5.3185,
+      "eval_samples_per_second": 192.534,
+      "eval_steps_per_second": 12.033,
+      "step": 2160
+    },
+    {
+      "epoch": 27.12,
+      "grad_norm": 0.7219308614730835,
+      "learning_rate": 0.0010166666666666666,
+      "loss": 0.238,
+      "step": 2170
+    },
+    {
+      "epoch": 27.25,
+      "grad_norm": 0.7707520127296448,
+      "learning_rate": 0.001011111111111111,
+      "loss": 0.1863,
+      "step": 2180
+    },
+    {
+      "epoch": 27.38,
+      "grad_norm": 0.6878935098648071,
+      "learning_rate": 0.0010055555555555555,
+      "loss": 0.2493,
+      "step": 2190
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.5451861619949341,
+      "learning_rate": 0.001,
+      "loss": 0.2374,
+      "step": 2200
+    },
+    {
+      "epoch": 27.62,
+      "grad_norm": 0.39642319083213806,
+      "learning_rate": 0.0009944444444444445,
+      "loss": 0.2382,
+      "step": 2210
+    },
+    {
+      "epoch": 27.75,
+      "grad_norm": 0.4122956097126007,
+      "learning_rate": 0.000988888888888889,
+      "loss": 0.2176,
+      "step": 2220
+    },
+    {
+      "epoch": 27.88,
+      "grad_norm": 0.6155421733856201,
+      "learning_rate": 0.0009833333333333332,
+      "loss": 0.2128,
+      "step": 2230
+    },
+    {
+      "epoch": 28.0,
+      "grad_norm": 0.7283052206039429,
+      "learning_rate": 0.0009777777777777777,
+      "loss": 0.2173,
+      "step": 2240
+    },
+    {
+      "epoch": 28.0,
+      "eval_accuracy": 0.94921875,
+      "eval_loss": 0.15962785482406616,
+      "eval_runtime": 4.9115,
+      "eval_samples_per_second": 208.489,
+      "eval_steps_per_second": 13.031,
+      "step": 2240
+    },
+    {
+      "epoch": 28.12,
+      "grad_norm": 0.39027243852615356,
+      "learning_rate": 0.0009722222222222222,
+      "loss": 0.1979,
+      "step": 2250
+    },
+    {
+      "epoch": 28.25,
+      "grad_norm": 0.5258718729019165,
+      "learning_rate": 0.0009666666666666667,
+      "loss": 0.1447,
+      "step": 2260
+    },
+    {
+      "epoch": 28.38,
+      "grad_norm": 0.6615960001945496,
+      "learning_rate": 0.0009611111111111112,
+      "loss": 0.2403,
+      "step": 2270
+    },
+    {
+      "epoch": 28.5,
+      "grad_norm": 0.4044310748577118,
+      "learning_rate": 0.0009555555555555556,
+      "loss": 0.1981,
+      "step": 2280
+    },
+    {
+      "epoch": 28.62,
+      "grad_norm": 0.2666930556297302,
+      "learning_rate": 0.00095,
+      "loss": 0.2108,
+      "step": 2290
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.5612334609031677,
+      "learning_rate": 0.0009444444444444445,
+      "loss": 0.1783,
+      "step": 2300
+    },
+    {
+      "epoch": 28.88,
+      "grad_norm": 0.48420026898384094,
+      "learning_rate": 0.000938888888888889,
+      "loss": 0.1848,
+      "step": 2310
+    },
+    {
+      "epoch": 29.0,
+      "grad_norm": 0.5850337743759155,
+      "learning_rate": 0.0009333333333333333,
+      "loss": 0.1798,
+      "step": 2320
+    },
+    {
+      "epoch": 29.0,
+      "eval_accuracy": 0.94140625,
+      "eval_loss": 0.1548241674900055,
+      "eval_runtime": 5.32,
+      "eval_samples_per_second": 192.483,
+      "eval_steps_per_second": 12.03,
+      "step": 2320
+    },
+    {
+      "epoch": 29.12,
+      "grad_norm": 0.5059901475906372,
+      "learning_rate": 0.0009277777777777778,
+      "loss": 0.1954,
+      "step": 2330
+    },
+    {
+      "epoch": 29.25,
+      "grad_norm": 0.22623513638973236,
+      "learning_rate": 0.0009222222222222223,
+      "loss": 0.1604,
+      "step": 2340
+    },
+    {
+      "epoch": 29.38,
+      "grad_norm": 0.2330830693244934,
+      "learning_rate": 0.0009166666666666666,
+      "loss": 0.2125,
+      "step": 2350
+    },
+    {
+      "epoch": 29.5,
+      "grad_norm": 0.4784901440143585,
+      "learning_rate": 0.0009111111111111111,
+      "loss": 0.1823,
+      "step": 2360
+    },
+    {
+      "epoch": 29.62,
+      "grad_norm": 0.6156973242759705,
+      "learning_rate": 0.0009055555555555556,
+      "loss": 0.2289,
+      "step": 2370
+    },
+    {
+      "epoch": 29.75,
+      "grad_norm": 0.4373360872268677,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 0.2127,
+      "step": 2380
+    },
+    {
+      "epoch": 29.88,
+      "grad_norm": 0.501115083694458,
+      "learning_rate": 0.0008944444444444445,
+      "loss": 0.2359,
+      "step": 2390
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.411662757396698,
+      "learning_rate": 0.0008888888888888888,
+      "loss": 0.197,
+      "step": 2400
+    },
+    {
+      "epoch": 30.0,
+      "eval_accuracy": 0.95703125,
+      "eval_loss": 0.17402663826942444,
+      "eval_runtime": 4.919,
+      "eval_samples_per_second": 208.172,
+      "eval_steps_per_second": 13.011,
+      "step": 2400
+    },
+    {
+      "epoch": 30.12,
+      "grad_norm": 0.45976510643959045,
+      "learning_rate": 0.0008833333333333333,
+      "loss": 0.1608,
+      "step": 2410
+    },
+    {
+      "epoch": 30.25,
+      "grad_norm": 0.3243074417114258,
+      "learning_rate": 0.0008777777777777778,
+      "loss": 0.1742,
+      "step": 2420
+    },
+    {
+      "epoch": 30.38,
+      "grad_norm": 0.5205725431442261,
+      "learning_rate": 0.0008722222222222223,
+      "loss": 0.1718,
+      "step": 2430
+    },
+    {
+      "epoch": 30.5,
+      "grad_norm": 0.3976719081401825,
+      "learning_rate": 0.0008666666666666667,
+      "loss": 0.2247,
+      "step": 2440
+    },
+    {
+      "epoch": 30.62,
+      "grad_norm": 0.2859196662902832,
+      "learning_rate": 0.0008611111111111112,
+      "loss": 0.1884,
+      "step": 2450
+    },
+    {
+      "epoch": 30.75,
+      "grad_norm": 0.5310297012329102,
+      "learning_rate": 0.0008555555555555556,
+      "loss": 0.1672,
+      "step": 2460
+    },
+    {
+      "epoch": 30.88,
+      "grad_norm": 0.5172590613365173,
+      "learning_rate": 0.00085,
+      "loss": 0.1828,
+      "step": 2470
+    },
+    {
+      "epoch": 31.0,
+      "grad_norm": 0.6098745465278625,
+      "learning_rate": 0.0008444444444444444,
+      "loss": 0.1654,
+      "step": 2480
+    },
+    {
+      "epoch": 31.0,
+      "eval_accuracy": 0.966796875,
+      "eval_loss": 0.12167137861251831,
+      "eval_runtime": 4.9956,
+      "eval_samples_per_second": 204.98,
+      "eval_steps_per_second": 12.811,
+      "step": 2480
+    },
+    {
+      "epoch": 31.12,
+      "grad_norm": 0.3343498706817627,
+      "learning_rate": 0.0008388888888888889,
+      "loss": 0.1784,
+      "step": 2490
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.3938640058040619,
+      "learning_rate": 0.0008333333333333334,
+      "loss": 0.1697,
+      "step": 2500
+    },
+    {
+      "epoch": 31.38,
+      "grad_norm": 0.41868484020233154,
+      "learning_rate": 0.0008277777777777778,
+      "loss": 0.2263,
+      "step": 2510
+    },
+    {
+      "epoch": 31.5,
+      "grad_norm": 0.4363801181316376,
+      "learning_rate": 0.0008222222222222222,
+      "loss": 0.1762,
+      "step": 2520
+    },
+    {
+      "epoch": 31.62,
+      "grad_norm": 0.5088948607444763,
+      "learning_rate": 0.0008166666666666667,
+      "loss": 0.1711,
+      "step": 2530
+    },
+    {
+      "epoch": 31.75,
+      "grad_norm": 0.5423977375030518,
+      "learning_rate": 0.0008111111111111111,
+      "loss": 0.1675,
+      "step": 2540
+    },
+    {
+      "epoch": 31.88,
+      "grad_norm": 0.431382954120636,
+      "learning_rate": 0.0008055555555555556,
+      "loss": 0.2216,
+      "step": 2550
+    },
+    {
+      "epoch": 32.0,
+      "grad_norm": 0.4037337303161621,
+      "learning_rate": 0.0008,
+      "loss": 0.1896,
+      "step": 2560
+    },
+    {
+      "epoch": 32.0,
+      "eval_accuracy": 0.92578125,
+      "eval_loss": 0.2552070617675781,
+      "eval_runtime": 5.2019,
+      "eval_samples_per_second": 196.853,
+      "eval_steps_per_second": 12.303,
+      "step": 2560
+    },
+    {
+      "epoch": 32.12,
+      "grad_norm": 0.6025939583778381,
+      "learning_rate": 0.0007944444444444444,
+      "loss": 0.1926,
+      "step": 2570
+    },
+    {
+      "epoch": 32.25,
+      "grad_norm": 0.7205588221549988,
+      "learning_rate": 0.0007888888888888889,
+      "loss": 0.1755,
+      "step": 2580
+    },
+    {
+      "epoch": 32.38,
+      "grad_norm": 0.3841509222984314,
+      "learning_rate": 0.0007833333333333334,
+      "loss": 0.1696,
+      "step": 2590
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.5659075975418091,
+      "learning_rate": 0.0007777777777777778,
+      "loss": 0.133,
+      "step": 2600
+    },
+    {
+      "epoch": 32.62,
+      "grad_norm": 0.7011501789093018,
+      "learning_rate": 0.0007722222222222223,
+      "loss": 0.2069,
+      "step": 2610
+    },
+    {
+      "epoch": 32.75,
+      "grad_norm": 0.5933576822280884,
+      "learning_rate": 0.0007666666666666667,
+      "loss": 0.1799,
+      "step": 2620
+    },
+    {
+      "epoch": 32.88,
+      "grad_norm": 0.636463463306427,
+      "learning_rate": 0.0007611111111111111,
+      "loss": 0.1884,
+      "step": 2630
+    },
+    {
+      "epoch": 33.0,
+      "grad_norm": 0.36000609397888184,
+      "learning_rate": 0.0007555555555555555,
+      "loss": 0.1705,
+      "step": 2640
+    },
+    {
+      "epoch": 33.0,
+      "eval_accuracy": 0.97265625,
+      "eval_loss": 0.10305143892765045,
+      "eval_runtime": 4.8746,
+      "eval_samples_per_second": 210.07,
+      "eval_steps_per_second": 13.129,
+      "step": 2640
+    },
+    {
+      "epoch": 33.12,
+      "grad_norm": 0.25941601395606995,
+      "learning_rate": 0.00075,
+      "loss": 0.143,
+      "step": 2650
+    },
+    {
+      "epoch": 33.25,
+      "grad_norm": 0.6486319899559021,
+      "learning_rate": 0.0007444444444444445,
+      "loss": 0.1819,
+      "step": 2660
+    },
+    {
+      "epoch": 33.38,
+      "grad_norm": 0.34492290019989014,
+      "learning_rate": 0.000738888888888889,
+      "loss": 0.1877,
+      "step": 2670
+    },
+    {
+      "epoch": 33.5,
+      "grad_norm": 0.5475990176200867,
+      "learning_rate": 0.0007333333333333333,
+      "loss": 0.1586,
+      "step": 2680
+    },
+    {
+      "epoch": 33.62,
+      "grad_norm": 0.231631800532341,
+      "learning_rate": 0.0007277777777777777,
+      "loss": 0.145,
+      "step": 2690
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.6208530068397522,
+      "learning_rate": 0.0007222222222222222,
+      "loss": 0.2015,
+      "step": 2700
+    },
+    {
+      "epoch": 33.88,
+      "grad_norm": 0.7229673862457275,
+      "learning_rate": 0.0007166666666666667,
+      "loss": 0.1814,
+      "step": 2710
+    },
+    {
+      "epoch": 34.0,
+      "grad_norm": 0.38056522607803345,
+      "learning_rate": 0.0007111111111111111,
+      "loss": 0.1689,
+      "step": 2720
+    },
+    {
+      "epoch": 34.0,
+      "eval_accuracy": 0.96875,
+      "eval_loss": 0.10111749172210693,
+      "eval_runtime": 5.2922,
+      "eval_samples_per_second": 193.491,
+      "eval_steps_per_second": 12.093,
+      "step": 2720
+    },
+    {
+      "epoch": 34.12,
+      "grad_norm": 0.5405479669570923,
+      "learning_rate": 0.0007055555555555556,
+      "loss": 0.16,
+      "step": 2730
+    },
+    {
+      "epoch": 34.25,
+      "grad_norm": 0.5781314373016357,
+      "learning_rate": 0.0007,
+      "loss": 0.1598,
+      "step": 2740
+    },
+    {
+      "epoch": 34.38,
+      "grad_norm": 0.33385559916496277,
+      "learning_rate": 0.0006944444444444445,
+      "loss": 0.1747,
+      "step": 2750
+    },
+    {
+      "epoch": 34.5,
+      "grad_norm": 0.36587977409362793,
+      "learning_rate": 0.000688888888888889,
+      "loss": 0.1376,
+      "step": 2760
+    },
+    {
+      "epoch": 34.62,
+      "grad_norm": 0.3459375202655792,
+      "learning_rate": 0.0006833333333333333,
+      "loss": 0.1297,
+      "step": 2770
+    },
+    {
+      "epoch": 34.75,
+      "grad_norm": 0.5182803273200989,
+      "learning_rate": 0.0006777777777777778,
+      "loss": 0.1747,
+      "step": 2780
+    },
+    {
+      "epoch": 34.88,
+      "grad_norm": 0.39014366269111633,
+      "learning_rate": 0.0006722222222222223,
+      "loss": 0.169,
+      "step": 2790
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.4516375660896301,
+      "learning_rate": 0.0006666666666666666,
+      "loss": 0.1439,
+      "step": 2800
+    },
+    {
+      "epoch": 35.0,
+      "eval_accuracy": 0.96484375,
+      "eval_loss": 0.11748197674751282,
+      "eval_runtime": 4.909,
+      "eval_samples_per_second": 208.595,
+      "eval_steps_per_second": 13.037,
+      "step": 2800
+    },
+    {
+      "epoch": 35.12,
+      "grad_norm": 0.47782474756240845,
+      "learning_rate": 0.0006611111111111111,
+      "loss": 0.1417,
+      "step": 2810
+    },
+    {
+      "epoch": 35.25,
+      "grad_norm": 0.11640643328428268,
+      "learning_rate": 0.0006555555555555556,
+      "loss": 0.1226,
+      "step": 2820
+    },
+    {
+      "epoch": 35.38,
+      "grad_norm": 0.4363173544406891,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 0.14,
+      "step": 2830
+    },
+    {
+      "epoch": 35.5,
+      "grad_norm": 0.6676026582717896,
+      "learning_rate": 0.0006444444444444444,
+      "loss": 0.1548,
+      "step": 2840
+    },
+    {
+      "epoch": 35.62,
+      "grad_norm": 0.4940982162952423,
+      "learning_rate": 0.0006388888888888888,
+      "loss": 0.1554,
+      "step": 2850
+    },
+    {
+      "epoch": 35.75,
+      "grad_norm": 0.6478282809257507,
+      "learning_rate": 0.0006333333333333333,
+      "loss": 0.1641,
+      "step": 2860
+    },
+    {
+      "epoch": 35.88,
+      "grad_norm": 0.6007707715034485,
+      "learning_rate": 0.0006277777777777778,
+      "loss": 0.1484,
+      "step": 2870
+    },
+    {
+      "epoch": 36.0,
+      "grad_norm": 0.4945576786994934,
+      "learning_rate": 0.0006222222222222223,
+      "loss": 0.1606,
+      "step": 2880
+    },
+    {
+      "epoch": 36.0,
+      "eval_accuracy": 0.9443359375,
+      "eval_loss": 0.18046385049819946,
+      "eval_runtime": 5.0626,
+      "eval_samples_per_second": 202.266,
+      "eval_steps_per_second": 12.642,
+      "step": 2880
+    },
+    {
+      "epoch": 36.12,
+      "grad_norm": 0.4033058285713196,
+      "learning_rate": 0.0006166666666666667,
+      "loss": 0.1372,
+      "step": 2890
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.30507412552833557,
+      "learning_rate": 0.0006111111111111112,
+      "loss": 0.1549,
+      "step": 2900
+    },
+    {
+      "epoch": 36.38,
+      "grad_norm": 0.3899296820163727,
+      "learning_rate": 0.0006055555555555556,
+      "loss": 0.1667,
+      "step": 2910
+    },
+    {
+      "epoch": 36.5,
+      "grad_norm": 0.44058963656425476,
+      "learning_rate": 0.0006,
+      "loss": 0.1712,
+      "step": 2920
+    },
+    {
+      "epoch": 36.62,
+      "grad_norm": 0.4805178642272949,
+      "learning_rate": 0.0005944444444444444,
+      "loss": 0.1805,
+      "step": 2930
+    },
+    {
+      "epoch": 36.75,
+      "grad_norm": 0.37880581617355347,
+      "learning_rate": 0.0005888888888888889,
+      "loss": 0.1411,
+      "step": 2940
+    },
+    {
+      "epoch": 36.88,
+      "grad_norm": 0.4263412654399872,
+      "learning_rate": 0.0005833333333333334,
+      "loss": 0.1714,
+      "step": 2950
+    },
+    {
+      "epoch": 37.0,
+      "grad_norm": 0.2723836898803711,
+      "learning_rate": 0.0005777777777777778,
+      "loss": 0.1281,
+      "step": 2960
+    },
+    {
+      "epoch": 37.0,
+      "eval_accuracy": 0.9677734375,
+      "eval_loss": 0.1253870278596878,
+      "eval_runtime": 5.1275,
+      "eval_samples_per_second": 199.709,
+      "eval_steps_per_second": 12.482,
+      "step": 2960
+    },
+    {
+      "epoch": 37.12,
+      "grad_norm": 0.4946765601634979,
+      "learning_rate": 0.0005722222222222222,
+      "loss": 0.1059,
+      "step": 2970
+    },
+    {
+      "epoch": 37.25,
+      "grad_norm": 0.4709372818470001,
+      "learning_rate": 0.0005666666666666667,
+      "loss": 0.1321,
+      "step": 2980
+    },
+    {
+      "epoch": 37.38,
+      "grad_norm": 0.36459285020828247,
+      "learning_rate": 0.0005611111111111111,
+      "loss": 0.1351,
+      "step": 2990
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.4145031273365021,
+      "learning_rate": 0.0005555555555555556,
+      "loss": 0.1545,
+      "step": 3000
+    },
+    {
+      "epoch": 37.62,
+      "grad_norm": 0.5457221865653992,
+      "learning_rate": 0.00055,
+      "loss": 0.1424,
+      "step": 3010
+    },
+    {
+      "epoch": 37.75,
+      "grad_norm": 0.5123695731163025,
+      "learning_rate": 0.0005444444444444444,
+      "loss": 0.1508,
+      "step": 3020
+    },
+    {
+      "epoch": 37.88,
+      "grad_norm": 0.29368171095848083,
+      "learning_rate": 0.0005388888888888889,
+      "loss": 0.1438,
+      "step": 3030
+    },
+    {
+      "epoch": 38.0,
+      "grad_norm": 0.6859858632087708,
+      "learning_rate": 0.0005333333333333334,
+      "loss": 0.1518,
+      "step": 3040
+    },
+    {
+      "epoch": 38.0,
+      "eval_accuracy": 0.96484375,
+      "eval_loss": 0.11837992072105408,
+      "eval_runtime": 4.9042,
+      "eval_samples_per_second": 208.8,
+      "eval_steps_per_second": 13.05,
+      "step": 3040
+    },
+    {
+      "epoch": 38.12,
+      "grad_norm": 0.3859548270702362,
+      "learning_rate": 0.0005277777777777778,
+      "loss": 0.1455,
+      "step": 3050
+    },
+    {
+      "epoch": 38.25,
+      "grad_norm": 0.21001270413398743,
+      "learning_rate": 0.0005222222222222223,
+      "loss": 0.13,
+      "step": 3060
+    },
+    {
+      "epoch": 38.38,
+      "grad_norm": 0.4814240038394928,
+      "learning_rate": 0.0005166666666666667,
+      "loss": 0.1291,
+      "step": 3070
+    },
+    {
+      "epoch": 38.5,
+      "grad_norm": 0.4478558301925659,
+      "learning_rate": 0.0005111111111111111,
+      "loss": 0.1293,
+      "step": 3080
+    },
+    {
+      "epoch": 38.62,
+      "grad_norm": 0.4811321496963501,
+      "learning_rate": 0.0005055555555555555,
+      "loss": 0.1231,
+      "step": 3090
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.2841961085796356,
+      "learning_rate": 0.0005,
+      "loss": 0.166,
+      "step": 3100
+    },
+    {
+      "epoch": 38.88,
+      "grad_norm": 0.5479158759117126,
+      "learning_rate": 0.0004944444444444445,
+      "loss": 0.1044,
+      "step": 3110
+    },
+    {
+      "epoch": 39.0,
+      "grad_norm": 0.37449321150779724,
+      "learning_rate": 0.0004888888888888889,
+      "loss": 0.1531,
+      "step": 3120
+    },
+    {
+      "epoch": 39.0,
+      "eval_accuracy": 0.9736328125,
+      "eval_loss": 0.09921471774578094,
+      "eval_runtime": 5.3451,
+      "eval_samples_per_second": 191.577,
+      "eval_steps_per_second": 11.974,
+      "step": 3120
+    },
+    {
+      "epoch": 39.12,
+      "grad_norm": 0.5961503386497498,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 0.1321,
+      "step": 3130
+    },
+    {
+      "epoch": 39.25,
+      "grad_norm": 0.3140615224838257,
+      "learning_rate": 0.0004777777777777778,
+      "loss": 0.1192,
+      "step": 3140
+    },
+    {
+      "epoch": 39.38,
+      "grad_norm": 0.8949409127235413,
+      "learning_rate": 0.00047222222222222224,
+      "loss": 0.122,
+      "step": 3150
+    },
+    {
+      "epoch": 39.5,
+      "grad_norm": 0.21187840402126312,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 0.1341,
+      "step": 3160
+    },
+    {
+      "epoch": 39.62,
+      "grad_norm": 0.6364386081695557,
+      "learning_rate": 0.00046111111111111114,
+      "loss": 0.1327,
+      "step": 3170
+    },
+    {
+      "epoch": 39.75,
+      "grad_norm": 0.2257820963859558,
+      "learning_rate": 0.00045555555555555556,
+      "loss": 0.1101,
+      "step": 3180
+    },
+    {
+      "epoch": 39.88,
+      "grad_norm": 0.373692125082016,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 0.1293,
+      "step": 3190
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.48990318179130554,
+      "learning_rate": 0.0004444444444444444,
+      "loss": 0.132,
+      "step": 3200
+    },
+    {
+      "epoch": 40.0,
+      "eval_accuracy": 0.9775390625,
+      "eval_loss": 0.09202806651592255,
+      "eval_runtime": 4.9155,
+      "eval_samples_per_second": 208.319,
+      "eval_steps_per_second": 13.02,
+      "step": 3200
+    },
+    {
+      "epoch": 40.12,
+      "grad_norm": 0.571524441242218,
+      "learning_rate": 0.0004388888888888889,
+      "loss": 0.1167,
+      "step": 3210
+    },
+    {
+      "epoch": 40.25,
+      "grad_norm": 0.5896998643875122,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 0.151,
+      "step": 3220
+    },
+    {
+      "epoch": 40.38,
+      "grad_norm": 0.44366732239723206,
+      "learning_rate": 0.0004277777777777778,
+      "loss": 0.1422,
+      "step": 3230
+    },
+    {
+      "epoch": 40.5,
+      "grad_norm": 0.314609169960022,
+      "learning_rate": 0.0004222222222222222,
+      "loss": 0.1253,
+      "step": 3240
+    },
+    {
+      "epoch": 40.62,
+      "grad_norm": 0.3513747453689575,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 0.119,
+      "step": 3250
+    },
+    {
+      "epoch": 40.75,
+      "grad_norm": 0.3717803359031677,
+      "learning_rate": 0.0004111111111111111,
+      "loss": 0.1156,
+      "step": 3260
+    },
+    {
+      "epoch": 40.88,
+      "grad_norm": 0.22342754900455475,
+      "learning_rate": 0.00040555555555555554,
+      "loss": 0.1372,
+      "step": 3270
+    },
+    {
+      "epoch": 41.0,
+      "grad_norm": 0.41738444566726685,
+      "learning_rate": 0.0004,
+      "loss": 0.134,
+      "step": 3280
+    },
+    {
+      "epoch": 41.0,
+      "eval_accuracy": 0.9638671875,
+      "eval_loss": 0.13908132910728455,
+      "eval_runtime": 5.2165,
+      "eval_samples_per_second": 196.299,
+      "eval_steps_per_second": 12.269,
+      "step": 3280
+    },
+    {
+      "epoch": 41.12,
+      "grad_norm": 0.41814348101615906,
+      "learning_rate": 0.00039444444444444444,
+      "loss": 0.1279,
+      "step": 3290
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.9678131937980652,
+      "learning_rate": 0.0003888888888888889,
+      "loss": 0.1398,
+      "step": 3300
+    },
+    {
+      "epoch": 41.38,
+      "grad_norm": 0.6725767850875854,
+      "learning_rate": 0.00038333333333333334,
+      "loss": 0.1492,
+      "step": 3310
+    },
+    {
+      "epoch": 41.5,
+      "grad_norm": 0.31534790992736816,
+      "learning_rate": 0.00037777777777777777,
+      "loss": 0.1119,
+      "step": 3320
+    },
+    {
+      "epoch": 41.62,
+      "grad_norm": 0.632583737373352,
+      "learning_rate": 0.00037222222222222225,
+      "loss": 0.1131,
+      "step": 3330
+    },
+    {
+      "epoch": 41.75,
+      "grad_norm": 0.6746741533279419,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 0.1351,
+      "step": 3340
+    },
+    {
+      "epoch": 41.88,
+      "grad_norm": 0.3400849997997284,
+      "learning_rate": 0.0003611111111111111,
+      "loss": 0.0815,
+      "step": 3350
+    },
+    {
+      "epoch": 42.0,
+      "grad_norm": 0.5605281591415405,
+      "learning_rate": 0.00035555555555555557,
+      "loss": 0.1413,
+      "step": 3360
+    },
+    {
+      "epoch": 42.0,
+      "eval_accuracy": 0.9716796875,
+      "eval_loss": 0.11220287531614304,
+      "eval_runtime": 5.0927,
+      "eval_samples_per_second": 201.072,
+      "eval_steps_per_second": 12.567,
+      "step": 3360
+    },
+    {
+      "epoch": 42.12,
+      "grad_norm": 0.5148097276687622,
+      "learning_rate": 0.00035,
+      "loss": 0.125,
+      "step": 3370
+    },
+    {
+      "epoch": 42.25,
+      "grad_norm": 0.38650012016296387,
+      "learning_rate": 0.0003444444444444445,
+      "loss": 0.1209,
+      "step": 3380
+    },
+    {
+      "epoch": 42.38,
+      "grad_norm": 0.3292187750339508,
+      "learning_rate": 0.0003388888888888889,
+      "loss": 0.1236,
+      "step": 3390
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.20681746304035187,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 0.0973,
+      "step": 3400
+    },
+    {
+      "epoch": 42.62,
+      "grad_norm": 0.33743348717689514,
+      "learning_rate": 0.0003277777777777778,
+      "loss": 0.1208,
+      "step": 3410
+    },
+    {
+      "epoch": 42.75,
+      "grad_norm": 0.34158453345298767,
+      "learning_rate": 0.0003222222222222222,
+      "loss": 0.11,
+      "step": 3420
+    },
+    {
+      "epoch": 42.88,
+      "grad_norm": 0.5730062127113342,
+      "learning_rate": 0.00031666666666666665,
+      "loss": 0.1292,
+      "step": 3430
+    },
+    {
+      "epoch": 43.0,
+      "grad_norm": 0.44954267144203186,
+      "learning_rate": 0.0003111111111111111,
+      "loss": 0.1097,
+      "step": 3440
+    },
+    {
+      "epoch": 43.0,
+      "eval_accuracy": 0.9677734375,
+      "eval_loss": 0.11706902086734772,
+      "eval_runtime": 4.9202,
+      "eval_samples_per_second": 208.12,
+      "eval_steps_per_second": 13.007,
+      "step": 3440
+    },
+    {
+      "epoch": 43.12,
+      "grad_norm": 0.25731635093688965,
+      "learning_rate": 0.0003055555555555556,
+      "loss": 0.1161,
+      "step": 3450
+    },
+    {
+      "epoch": 43.25,
+      "grad_norm": 0.5329569578170776,
+      "learning_rate": 0.0003,
+      "loss": 0.1507,
+      "step": 3460
+    },
+    {
+      "epoch": 43.38,
+      "grad_norm": 0.3034692704677582,
+      "learning_rate": 0.00029444444444444445,
+      "loss": 0.1447,
+      "step": 3470
+    },
+    {
+      "epoch": 43.5,
+      "grad_norm": 0.5483482480049133,
+      "learning_rate": 0.0002888888888888889,
+      "loss": 0.1323,
+      "step": 3480
+    },
+    {
+      "epoch": 43.62,
+      "grad_norm": 0.279697984457016,
+      "learning_rate": 0.00028333333333333335,
+      "loss": 0.1,
+      "step": 3490
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.5593113303184509,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 0.1169,
+      "step": 3500
+    },
+    {
+      "epoch": 43.88,
+      "grad_norm": 0.621919572353363,
+      "learning_rate": 0.0002722222222222222,
+      "loss": 0.1119,
+      "step": 3510
+    },
+    {
+      "epoch": 44.0,
+      "grad_norm": 0.37898024916648865,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 0.1167,
+      "step": 3520
+    },
+    {
+      "epoch": 44.0,
+      "eval_accuracy": 0.9765625,
+      "eval_loss": 0.10542036592960358,
+      "eval_runtime": 5.3473,
+      "eval_samples_per_second": 191.5,
+      "eval_steps_per_second": 11.969,
+      "step": 3520
+    },
+    {
+      "epoch": 44.12,
+      "grad_norm": 0.40025296807289124,
+      "learning_rate": 0.00026111111111111116,
+      "loss": 0.1107,
+      "step": 3530
+    },
+    {
+      "epoch": 44.25,
+      "grad_norm": 0.19010861217975616,
+      "learning_rate": 0.00025555555555555553,
+      "loss": 0.1008,
+      "step": 3540
+    },
+    {
+      "epoch": 44.38,
+      "grad_norm": 0.33224934339523315,
+      "learning_rate": 0.00025,
+      "loss": 0.1355,
+      "step": 3550
+    },
+    {
+      "epoch": 44.5,
+      "grad_norm": 0.4298325181007385,
+      "learning_rate": 0.00024444444444444443,
+      "loss": 0.106,
+      "step": 3560
+    },
+    {
+      "epoch": 44.62,
+      "grad_norm": 0.4320330023765564,
+      "learning_rate": 0.0002388888888888889,
+      "loss": 0.1053,
+      "step": 3570
+    },
+    {
+      "epoch": 44.75,
+      "grad_norm": 0.1121302917599678,
+      "learning_rate": 0.00023333333333333333,
+      "loss": 0.0845,
+      "step": 3580
+    },
+    {
+      "epoch": 44.88,
+      "grad_norm": 0.3021819293498993,
+      "learning_rate": 0.00022777777777777778,
+      "loss": 0.1222,
+      "step": 3590
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.7353653311729431,
+      "learning_rate": 0.0002222222222222222,
+      "loss": 0.1388,
+      "step": 3600
+    },
+    {
+      "epoch": 45.0,
+      "eval_accuracy": 0.9794921875,
+      "eval_loss": 0.09323666244745255,
+      "eval_runtime": 4.9368,
+      "eval_samples_per_second": 207.422,
+      "eval_steps_per_second": 12.964,
+      "step": 3600
+    },
+    {
+      "epoch": 45.12,
+      "grad_norm": 0.5964930057525635,
+      "learning_rate": 0.00021666666666666668,
+      "loss": 0.1201,
+      "step": 3610
+    },
+    {
+      "epoch": 45.25,
+      "grad_norm": 0.17329342663288116,
+      "learning_rate": 0.0002111111111111111,
+      "loss": 0.0905,
+      "step": 3620
+    },
+    {
+      "epoch": 45.38,
+      "grad_norm": 0.5378609299659729,
+      "learning_rate": 0.00020555555555555556,
+      "loss": 0.0981,
+      "step": 3630
+    },
+    {
+      "epoch": 45.5,
+      "grad_norm": 0.3457593619823456,
+      "learning_rate": 0.0002,
+      "loss": 0.1116,
+      "step": 3640
+    },
+    {
+      "epoch": 45.62,
+      "grad_norm": 0.5954685211181641,
+      "learning_rate": 0.00019444444444444446,
+      "loss": 0.1037,
+      "step": 3650
+    },
+    {
+      "epoch": 45.75,
+      "grad_norm": 0.1786712259054184,
+      "learning_rate": 0.00018888888888888888,
+      "loss": 0.0978,
+      "step": 3660
+    },
+    {
+      "epoch": 45.88,
+      "grad_norm": 0.25224894285202026,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.1089,
+      "step": 3670
+    },
+    {
+      "epoch": 46.0,
+      "grad_norm": 0.33607247471809387,
+      "learning_rate": 0.00017777777777777779,
+      "loss": 0.1221,
+      "step": 3680
+    },
+    {
+      "epoch": 46.0,
+      "eval_accuracy": 0.9765625,
+      "eval_loss": 0.09462323784828186,
+      "eval_runtime": 5.2287,
+      "eval_samples_per_second": 195.844,
+      "eval_steps_per_second": 12.24,
+      "step": 3680
+    },
+    {
+      "epoch": 46.12,
+      "grad_norm": 0.34634700417518616,
+      "learning_rate": 0.00017222222222222224,
+      "loss": 0.1243,
+      "step": 3690
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.5061681866645813,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.1115,
+      "step": 3700
+    },
+    {
+      "epoch": 46.38,
+      "grad_norm": 0.2837713658809662,
+      "learning_rate": 0.0001611111111111111,
+      "loss": 0.1008,
+      "step": 3710
+    },
+    {
+      "epoch": 46.5,
+      "grad_norm": 0.2688066363334656,
+      "learning_rate": 0.00015555555555555556,
+      "loss": 0.1058,
+      "step": 3720
+    },
+    {
+      "epoch": 46.62,
+      "grad_norm": 0.32675421237945557,
+      "learning_rate": 0.00015,
+      "loss": 0.0897,
+      "step": 3730
+    },
+    {
+      "epoch": 46.75,
+      "grad_norm": 0.6959260702133179,
+      "learning_rate": 0.00014444444444444444,
+      "loss": 0.1182,
+      "step": 3740
+    },
+    {
+      "epoch": 46.88,
+      "grad_norm": 0.3018099069595337,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 0.1013,
+      "step": 3750
+    },
+    {
+      "epoch": 47.0,
+      "grad_norm": 0.6018778085708618,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.1099,
+      "step": 3760
+    },
+    {
+      "epoch": 47.0,
+      "eval_accuracy": 0.9755859375,
+      "eval_loss": 0.1115545928478241,
+      "eval_runtime": 5.0799,
+      "eval_samples_per_second": 201.581,
+      "eval_steps_per_second": 12.599,
+      "step": 3760
+    },
+    {
+      "epoch": 47.12,
+      "grad_norm": 0.42199546098709106,
+      "learning_rate": 0.00012777777777777776,
+      "loss": 0.1073,
+      "step": 3770
+    },
+    {
+      "epoch": 47.25,
+      "grad_norm": 0.6451756358146667,
+      "learning_rate": 0.00012222222222222221,
+      "loss": 0.099,
+      "step": 3780
+    },
+    {
+      "epoch": 47.38,
+      "grad_norm": 0.4935210943222046,
+      "learning_rate": 0.00011666666666666667,
+      "loss": 0.1077,
+      "step": 3790
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.2563684582710266,
+      "learning_rate": 0.0001111111111111111,
+      "loss": 0.0907,
+      "step": 3800
+    },
+    {
+      "epoch": 47.62,
+      "grad_norm": 0.3351310193538666,
+      "learning_rate": 0.00010555555555555555,
+      "loss": 0.1059,
+      "step": 3810
+    },
+    {
+      "epoch": 47.75,
+      "grad_norm": 0.39526107907295227,
+      "learning_rate": 0.0001,
+      "loss": 0.0868,
+      "step": 3820
+    },
+    {
+      "epoch": 47.88,
+      "grad_norm": 0.4634101390838623,
+      "learning_rate": 9.444444444444444e-05,
+      "loss": 0.1098,
+      "step": 3830
+    },
+    {
+      "epoch": 48.0,
+      "grad_norm": 0.5983624458312988,
+      "learning_rate": 8.888888888888889e-05,
+      "loss": 0.1041,
+      "step": 3840
+    },
+    {
+      "epoch": 48.0,
+      "eval_accuracy": 0.974609375,
+      "eval_loss": 0.11264081299304962,
+      "eval_runtime": 4.9279,
+      "eval_samples_per_second": 207.795,
+      "eval_steps_per_second": 12.987,
+      "step": 3840
+    },
+    {
+      "epoch": 48.12,
+      "grad_norm": 0.4093017578125,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 0.1134,
+      "step": 3850
+    },
+    {
+      "epoch": 48.25,
+      "grad_norm": 0.6668171286582947,
+      "learning_rate": 7.777777777777778e-05,
+      "loss": 0.0948,
+      "step": 3860
+    },
+    {
+      "epoch": 48.38,
+      "grad_norm": 0.24066688120365143,
+      "learning_rate": 7.222222222222222e-05,
+      "loss": 0.0958,
+      "step": 3870
+    },
+    {
+      "epoch": 48.5,
+      "grad_norm": 0.2770562469959259,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.1021,
+      "step": 3880
+    },
+    {
+      "epoch": 48.62,
+      "grad_norm": 0.45978790521621704,
+      "learning_rate": 6.111111111111111e-05,
+      "loss": 0.1084,
+      "step": 3890
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.594672441482544,
+      "learning_rate": 5.555555555555555e-05,
+      "loss": 0.1373,
+      "step": 3900
+    },
+    {
+      "epoch": 48.88,
+      "grad_norm": 0.8167428374290466,
+      "learning_rate": 5e-05,
+      "loss": 0.1038,
+      "step": 3910
+    },
+    {
+      "epoch": 49.0,
+      "grad_norm": 0.2987329661846161,
+      "learning_rate": 4.4444444444444447e-05,
+      "loss": 0.1025,
+      "step": 3920
+    },
+    {
+      "epoch": 49.0,
+      "eval_accuracy": 0.9755859375,
+      "eval_loss": 0.11138872057199478,
+      "eval_runtime": 5.3184,
+      "eval_samples_per_second": 192.54,
+      "eval_steps_per_second": 12.034,
+      "step": 3920
+    },
+    {
+      "epoch": 49.12,
+      "grad_norm": 0.3884102404117584,
+      "learning_rate": 3.888888888888889e-05,
+      "loss": 0.1018,
+      "step": 3930
+    },
+    {
+      "epoch": 49.25,
+      "grad_norm": 0.2661769688129425,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 0.1011,
+      "step": 3940
+    },
+    {
+      "epoch": 49.38,
+      "grad_norm": 0.40820014476776123,
+      "learning_rate": 2.7777777777777776e-05,
+      "loss": 0.1488,
+      "step": 3950
+    },
+    {
+      "epoch": 49.5,
+      "grad_norm": 0.46163231134414673,
+      "learning_rate": 2.2222222222222223e-05,
+      "loss": 0.1258,
+      "step": 3960
+    },
+    {
+      "epoch": 49.62,
+      "grad_norm": 0.4315054416656494,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.1018,
+      "step": 3970
+    },
+    {
+      "epoch": 49.75,
+      "grad_norm": 0.2365369200706482,
+      "learning_rate": 1.1111111111111112e-05,
+      "loss": 0.0977,
+      "step": 3980
+    },
+    {
+      "epoch": 49.88,
+      "grad_norm": 0.4910149574279785,
+      "learning_rate": 5.555555555555556e-06,
+      "loss": 0.1122,
+      "step": 3990
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.2623092234134674,
+      "learning_rate": 0.0,
+      "loss": 0.0887,
+      "step": 4000
+    },
+    {
+      "epoch": 50.0,
+      "eval_accuracy": 0.9755859375,
+      "eval_loss": 0.10555899888277054,
+      "eval_runtime": 4.9229,
+      "eval_samples_per_second": 208.005,
+      "eval_steps_per_second": 13.0,
+      "step": 4000
+    },
+    {
+      "epoch": 50.0,
+      "step": 4000,
+      "total_flos": 5.437210780237824e+18,
+      "train_loss": 0.3629864407479763,
+      "train_runtime": 3465.6999,
+      "train_samples_per_second": 73.867,
+      "train_steps_per_second": 1.154
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 500,
+  "total_flos": 5.437210780237824e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ac0417085b4a2358ca8e0bc3a128e1354e847ed5e45152747b6de1925eaf689
+size 4920