Model save

Browse files

Files changed (6) hide show

README.md +69 -0
adapter_model.safetensors +1 -1
all_results.json +9 -0
runs/Sep29_15-26-06_5bc1778c8f32/events.out.tfevents.1727623620.5bc1778c8f32.108129.0 +2 -2
train_results.json +9 -0
trainer_state.json +1730 -0

README.md ADDED Viewed

	@@ -0,0 +1,69 @@

+---
+base_model: google/gemma-7b
+datasets:
+- generator
+library_name: peft
+license: gemma
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: gemma7b-gpt4o_100k_coding-lora
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gemma7b-gpt4o_100k_coding-lora
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.2800
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 16
+- total_eval_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.5901        | 0.9996 | 1195 | 1.2800          |
+### Framework versions
+- PEFT 0.13.0
+- Transformers 4.45.1
+- Pytorch 2.4.1+cu121
+- Datasets 3.0.1
+- Tokenizers 0.20.0

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e793cf8aec16b755de8479104f9d17e6ec21e8652b9ed3797bda5af1db41bf3
 size 12859872

 version https://git-lfs.github.com/spec/v1
+oid sha256:39878badd29124b975bb8df5bfaf7a1627e690d82b3797f228aa8ea1241bf1f7
 size 12859872

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.999581764951903,
+    "total_flos": 9.109418934146171e+17,
+    "train_loss": 1.440422640086218,
+    "train_runtime": 6570.4384,
+    "train_samples": 116368,
+    "train_samples_per_second": 2.911,
+    "train_steps_per_second": 0.182
+}

runs/Sep29_15-26-06_5bc1778c8f32/events.out.tfevents.1727623620.5bc1778c8f32.108129.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27e9d298c48f11f7eb3d4b54a363544742c42d84951227f52cc9fa8b523a3c18
-size 53221

 version https://git-lfs.github.com/spec/v1
+oid sha256:fd12868a08c79bd23240846e1ea4c9c082f8de4033a5a7cbd2eb6e428aa3f56e
+size 57855

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.999581764951903,
+    "total_flos": 9.109418934146171e+17,
+    "train_loss": 1.440422640086218,
+    "train_runtime": 6570.4384,
+    "train_samples": 116368,
+    "train_samples_per_second": 2.911,
+    "train_steps_per_second": 0.182
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1730 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.999581764951903,
+  "eval_steps": 500,
+  "global_step": 1195,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000836470096194061,
+      "grad_norm": 109.33856201171875,
+      "learning_rate": 2.4999999999999998e-06,
+      "loss": 24.0328,
+      "step": 1
+    },
+    {
+      "epoch": 0.004182350480970306,
+      "grad_norm": 97.91401672363281,
+      "learning_rate": 1.2499999999999999e-05,
+      "loss": 22.4374,
+      "step": 5
+    },
+    {
+      "epoch": 0.008364700961940611,
+      "grad_norm": 36.984405517578125,
+      "learning_rate": 2.4999999999999998e-05,
+      "loss": 20.9539,
+      "step": 10
+    },
+    {
+      "epoch": 0.012547051442910916,
+      "grad_norm": 20.562854766845703,
+      "learning_rate": 3.75e-05,
+      "loss": 18.0157,
+      "step": 15
+    },
+    {
+      "epoch": 0.016729401923881223,
+      "grad_norm": 7.916600227355957,
+      "learning_rate": 4.9999999999999996e-05,
+      "loss": 15.4241,
+      "step": 20
+    },
+    {
+      "epoch": 0.020911752404851526,
+      "grad_norm": 6.871030807495117,
+      "learning_rate": 6.25e-05,
+      "loss": 14.6159,
+      "step": 25
+    },
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 6.986784934997559,
+      "learning_rate": 7.5e-05,
+      "loss": 13.6058,
+      "step": 30
+    },
+    {
+      "epoch": 0.029276453366792136,
+      "grad_norm": 3.074172019958496,
+      "learning_rate": 8.75e-05,
+      "loss": 12.7264,
+      "step": 35
+    },
+    {
+      "epoch": 0.033458803847762446,
+      "grad_norm": 2.548049211502075,
+      "learning_rate": 9.999999999999999e-05,
+      "loss": 12.7985,
+      "step": 40
+    },
+    {
+      "epoch": 0.037641154328732745,
+      "grad_norm": 3.184255361557007,
+      "learning_rate": 0.0001125,
+      "loss": 12.2921,
+      "step": 45
+    },
+    {
+      "epoch": 0.04182350480970305,
+      "grad_norm": 4.834357738494873,
+      "learning_rate": 0.000125,
+      "loss": 11.7192,
+      "step": 50
+    },
+    {
+      "epoch": 0.04600585529067336,
+      "grad_norm": 8.06936264038086,
+      "learning_rate": 0.00013749999999999998,
+      "loss": 11.0815,
+      "step": 55
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 12.874434471130371,
+      "learning_rate": 0.00015,
+      "loss": 9.8728,
+      "step": 60
+    },
+    {
+      "epoch": 0.05437055625261397,
+      "grad_norm": 18.794525146484375,
+      "learning_rate": 0.00016249999999999997,
+      "loss": 7.7488,
+      "step": 65
+    },
+    {
+      "epoch": 0.05855290673358427,
+      "grad_norm": 21.42744255065918,
+      "learning_rate": 0.000175,
+      "loss": 4.8002,
+      "step": 70
+    },
+    {
+      "epoch": 0.06273525721455459,
+      "grad_norm": 7.021483898162842,
+      "learning_rate": 0.00018749999999999998,
+      "loss": 2.2667,
+      "step": 75
+    },
+    {
+      "epoch": 0.06691760769552489,
+      "grad_norm": 4.35729455947876,
+      "learning_rate": 0.00019999999999999998,
+      "loss": 1.7236,
+      "step": 80
+    },
+    {
+      "epoch": 0.07109995817649518,
+      "grad_norm": 2.531404972076416,
+      "learning_rate": 0.0002125,
+      "loss": 1.4388,
+      "step": 85
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 1.7126120328903198,
+      "learning_rate": 0.000225,
+      "loss": 1.2603,
+      "step": 90
+    },
+    {
+      "epoch": 0.0794646591384358,
+      "grad_norm": 1.713827133178711,
+      "learning_rate": 0.00023749999999999997,
+      "loss": 1.1149,
+      "step": 95
+    },
+    {
+      "epoch": 0.0836470096194061,
+      "grad_norm": 0.6418541073799133,
+      "learning_rate": 0.00025,
+      "loss": 1.0645,
+      "step": 100
+    },
+    {
+      "epoch": 0.08782936010037641,
+      "grad_norm": 0.9687772989273071,
+      "learning_rate": 0.0002625,
+      "loss": 1.0047,
+      "step": 105
+    },
+    {
+      "epoch": 0.09201171058134672,
+      "grad_norm": 0.8204954266548157,
+      "learning_rate": 0.00027499999999999996,
+      "loss": 0.986,
+      "step": 110
+    },
+    {
+      "epoch": 0.09619406106231702,
+      "grad_norm": 0.5046463012695312,
+      "learning_rate": 0.0002875,
+      "loss": 0.9229,
+      "step": 115
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 1.1709442138671875,
+      "learning_rate": 0.0003,
+      "loss": 0.9344,
+      "step": 120
+    },
+    {
+      "epoch": 0.10455876202425764,
+      "grad_norm": 0.9831328392028809,
+      "learning_rate": 0.0002999839868651235,
+      "loss": 0.8812,
+      "step": 125
+    },
+    {
+      "epoch": 0.10874111250522794,
+      "grad_norm": 1.4019944667816162,
+      "learning_rate": 0.0002999359508794339,
+      "loss": 0.8814,
+      "step": 130
+    },
+    {
+      "epoch": 0.11292346298619825,
+      "grad_norm": 0.78233802318573,
+      "learning_rate": 0.00029985590229902073,
+      "loss": 0.8701,
+      "step": 135
+    },
+    {
+      "epoch": 0.11710581346716854,
+      "grad_norm": 1.4517076015472412,
+      "learning_rate": 0.0002997438582149335,
+      "loss": 0.8753,
+      "step": 140
+    },
+    {
+      "epoch": 0.12128816394813885,
+      "grad_norm": 2.320331335067749,
+      "learning_rate": 0.0002995998425495327,
+      "loss": 0.8464,
+      "step": 145
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 1.0486135482788086,
+      "learning_rate": 0.000299423886051382,
+      "loss": 0.8498,
+      "step": 150
+    },
+    {
+      "epoch": 0.12965286491007946,
+      "grad_norm": 0.9131810069084167,
+      "learning_rate": 0.0002992160262886831,
+      "loss": 0.8468,
+      "step": 155
+    },
+    {
+      "epoch": 0.13383521539104978,
+      "grad_norm": 1.4284504652023315,
+      "learning_rate": 0.0002989763076412549,
+      "loss": 0.8088,
+      "step": 160
+    },
+    {
+      "epoch": 0.13801756587202008,
+      "grad_norm": 0.6106438040733337,
+      "learning_rate": 0.000298704781291058,
+      "loss": 0.8215,
+      "step": 165
+    },
+    {
+      "epoch": 0.14219991635299037,
+      "grad_norm": 0.5358127951622009,
+      "learning_rate": 0.0002984015052112665,
+      "loss": 0.8201,
+      "step": 170
+    },
+    {
+      "epoch": 0.1463822668339607,
+      "grad_norm": 1.5017443895339966,
+      "learning_rate": 0.0002980665441538907,
+      "loss": 0.7957,
+      "step": 175
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 1.1214942932128906,
+      "learning_rate": 0.00029769996963595184,
+      "loss": 0.8083,
+      "step": 180
+    },
+    {
+      "epoch": 0.1547469677959013,
+      "grad_norm": 2.1036202907562256,
+      "learning_rate": 0.0002973018599242125,
+      "loss": 0.7929,
+      "step": 185
+    },
+    {
+      "epoch": 0.1589293182768716,
+      "grad_norm": 1.0557819604873657,
+      "learning_rate": 0.0002968723000184662,
+      "loss": 0.7868,
+      "step": 190
+    },
+    {
+      "epoch": 0.16311166875784192,
+      "grad_norm": 0.9558168649673462,
+      "learning_rate": 0.00029641138163338907,
+      "loss": 0.7812,
+      "step": 195
+    },
+    {
+      "epoch": 0.1672940192388122,
+      "grad_norm": 0.771851122379303,
+      "learning_rate": 0.0002959192031789579,
+      "loss": 0.7846,
+      "step": 200
+    },
+    {
+      "epoch": 0.17147636971978253,
+      "grad_norm": 1.288865089416504,
+      "learning_rate": 0.0002953958697394391,
+      "loss": 0.777,
+      "step": 205
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 2.001302480697632,
+      "learning_rate": 0.000294841493050952,
+      "loss": 0.7797,
+      "step": 210
+    },
+    {
+      "epoch": 0.17984107068172314,
+      "grad_norm": 0.7828574776649475,
+      "learning_rate": 0.0002942561914776124,
+      "loss": 0.7815,
+      "step": 215
+    },
+    {
+      "epoch": 0.18402342116269343,
+      "grad_norm": 1.4854490756988525,
+      "learning_rate": 0.00029364008998626086,
+      "loss": 0.7608,
+      "step": 220
+    },
+    {
+      "epoch": 0.18820577164366373,
+      "grad_norm": 1.1406800746917725,
+      "learning_rate": 0.00029299332011978107,
+      "loss": 0.747,
+      "step": 225
+    },
+    {
+      "epoch": 0.19238812212463405,
+      "grad_norm": 1.7346460819244385,
+      "learning_rate": 0.00029231601996901433,
+      "loss": 0.7555,
+      "step": 230
+    },
+    {
+      "epoch": 0.19657047260560434,
+      "grad_norm": 1.7754813432693481,
+      "learning_rate": 0.0002916083341432763,
+      "loss": 0.7626,
+      "step": 235
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 1.2126661539077759,
+      "learning_rate": 0.00029087041373948135,
+      "loss": 0.7237,
+      "step": 240
+    },
+    {
+      "epoch": 0.20493517356754495,
+      "grad_norm": 1.930538535118103,
+      "learning_rate": 0.00029010241630988217,
+      "loss": 0.7672,
+      "step": 245
+    },
+    {
+      "epoch": 0.20911752404851527,
+      "grad_norm": 2.1792216300964355,
+      "learning_rate": 0.0002893045058284311,
+      "loss": 0.7416,
+      "step": 250
+    },
+    {
+      "epoch": 0.21329987452948557,
+      "grad_norm": 1.416754961013794,
+      "learning_rate": 0.0002884768526557703,
+      "loss": 0.7196,
+      "step": 255
+    },
+    {
+      "epoch": 0.2174822250104559,
+      "grad_norm": 1.6103583574295044,
+      "learning_rate": 0.0002876196335028581,
+      "loss": 0.7397,
+      "step": 260
+    },
+    {
+      "epoch": 0.22166457549142618,
+      "grad_norm": 1.0755459070205688,
+      "learning_rate": 0.0002867330313932402,
+      "loss": 0.7644,
+      "step": 265
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.8303298354148865,
+      "learning_rate": 0.000285817235623972,
+      "loss": 0.7393,
+      "step": 270
+    },
+    {
+      "epoch": 0.2300292764533668,
+      "grad_norm": 1.4747998714447021,
+      "learning_rate": 0.00028487244172520246,
+      "loss": 0.7121,
+      "step": 275
+    },
+    {
+      "epoch": 0.23421162693433709,
+      "grad_norm": 2.582953929901123,
+      "learning_rate": 0.0002838988514184267,
+      "loss": 0.7361,
+      "step": 280
+    },
+    {
+      "epoch": 0.2383939774153074,
+      "grad_norm": 2.413325309753418,
+      "learning_rate": 0.0002828966725734167,
+      "loss": 0.74,
+      "step": 285
+    },
+    {
+      "epoch": 0.2425763278962777,
+      "grad_norm": 0.7637856006622314,
+      "learning_rate": 0.0002818661191638393,
+      "loss": 0.7096,
+      "step": 290
+    },
+    {
+      "epoch": 0.24675867837724802,
+      "grad_norm": 1.757056713104248,
+      "learning_rate": 0.0002808074112215711,
+      "loss": 0.7205,
+      "step": 295
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.8766753077507019,
+      "learning_rate": 0.0002797207747897198,
+      "loss": 0.7098,
+      "step": 300
+    },
+    {
+      "epoch": 0.2551233793391886,
+      "grad_norm": 1.449209213256836,
+      "learning_rate": 0.00027860644187436195,
+      "loss": 0.725,
+      "step": 305
+    },
+    {
+      "epoch": 0.2593057298201589,
+      "grad_norm": 0.6825206875801086,
+      "learning_rate": 0.0002774646503950078,
+      "loss": 0.6938,
+      "step": 310
+    },
+    {
+      "epoch": 0.26348808030112925,
+      "grad_norm": 1.119585394859314,
+      "learning_rate": 0.0002762956441338036,
+      "loss": 0.698,
+      "step": 315
+    },
+    {
+      "epoch": 0.26767043078209957,
+      "grad_norm": 0.9425824880599976,
+      "learning_rate": 0.0002750996726834817,
+      "loss": 0.7189,
+      "step": 320
+    },
+    {
+      "epoch": 0.27185278126306983,
+      "grad_norm": 0.5979897975921631,
+      "learning_rate": 0.0002738769913940706,
+      "loss": 0.7039,
+      "step": 325
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 1.8769757747650146,
+      "learning_rate": 0.00027262786131837573,
+      "loss": 0.7035,
+      "step": 330
+    },
+    {
+      "epoch": 0.2802174822250105,
+      "grad_norm": 1.1395800113677979,
+      "learning_rate": 0.0002713525491562421,
+      "loss": 0.6898,
+      "step": 335
+    },
+    {
+      "epoch": 0.28439983270598074,
+      "grad_norm": 1.0573526620864868,
+      "learning_rate": 0.0002700513271976119,
+      "loss": 0.7042,
+      "step": 340
+    },
+    {
+      "epoch": 0.28858218318695106,
+      "grad_norm": 0.5185459852218628,
+      "learning_rate": 0.0002687244732643881,
+      "loss": 0.6914,
+      "step": 345
+    },
+    {
+      "epoch": 0.2927645336679214,
+      "grad_norm": 2.7914602756500244,
+      "learning_rate": 0.0002673722706511174,
+      "loss": 0.7049,
+      "step": 350
+    },
+    {
+      "epoch": 0.2969468841488917,
+      "grad_norm": 3.0459792613983154,
+      "learning_rate": 0.000265995008064504,
+      "loss": 0.7148,
+      "step": 355
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 2.1906723976135254,
+      "learning_rate": 0.00026459297956176885,
+      "loss": 0.7074,
+      "step": 360
+    },
+    {
+      "epoch": 0.3053115851108323,
+      "grad_norm": 1.6257227659225464,
+      "learning_rate": 0.00026316648448786536,
+      "loss": 0.6985,
+      "step": 365
+    },
+    {
+      "epoch": 0.3094939355918026,
+      "grad_norm": 0.7152910828590393,
+      "learning_rate": 0.00026171582741156725,
+      "loss": 0.6875,
+      "step": 370
+    },
+    {
+      "epoch": 0.3136762860727729,
+      "grad_norm": 2.4449851512908936,
+      "learning_rate": 0.0002602413180604401,
+      "loss": 0.6787,
+      "step": 375
+    },
+    {
+      "epoch": 0.3178586365537432,
+      "grad_norm": 0.5180588960647583,
+      "learning_rate": 0.000258743271254712,
+      "loss": 0.6724,
+      "step": 380
+    },
+    {
+      "epoch": 0.3220409870347135,
+      "grad_norm": 1.5739381313323975,
+      "learning_rate": 0.00025722200684005715,
+      "loss": 0.7076,
+      "step": 385
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.8701817989349365,
+      "learning_rate": 0.00025567784961930546,
+      "loss": 0.6841,
+      "step": 390
+    },
+    {
+      "epoch": 0.3304056879966541,
+      "grad_norm": 1.474747896194458,
+      "learning_rate": 0.0002541111292830951,
+      "loss": 0.713,
+      "step": 395
+    },
+    {
+      "epoch": 0.3345880384776244,
+      "grad_norm": 1.8884798288345337,
+      "learning_rate": 0.00025252218033947993,
+      "loss": 0.6893,
+      "step": 400
+    },
+    {
+      "epoch": 0.33877038895859474,
+      "grad_norm": 0.8834472894668579,
+      "learning_rate": 0.00025091134204250997,
+      "loss": 0.6966,
+      "step": 405
+    },
+    {
+      "epoch": 0.34295273943956506,
+      "grad_norm": 0.6324520707130432,
+      "learning_rate": 0.00024927895831979745,
+      "loss": 0.6882,
+      "step": 410
+    },
+    {
+      "epoch": 0.3471350899205353,
+      "grad_norm": 2.353163480758667,
+      "learning_rate": 0.00024762537769908535,
+      "loss": 0.6829,
+      "step": 415
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 1.3682096004486084,
+      "learning_rate": 0.00024595095323383365,
+      "loss": 0.6912,
+      "step": 420
+    },
+    {
+      "epoch": 0.35549979088247596,
+      "grad_norm": 0.9962055087089539,
+      "learning_rate": 0.0002442560424278399,
+      "loss": 0.6857,
+      "step": 425
+    },
+    {
+      "epoch": 0.3596821413634463,
+      "grad_norm": 1.1282930374145508,
+      "learning_rate": 0.00024254100715890846,
+      "loss": 0.6696,
+      "step": 430
+    },
+    {
+      "epoch": 0.36386449184441655,
+      "grad_norm": 0.934388279914856,
+      "learning_rate": 0.00024080621360158717,
+      "loss": 0.6841,
+      "step": 435
+    },
+    {
+      "epoch": 0.36804684232538687,
+      "grad_norm": 1.4339077472686768,
+      "learning_rate": 0.00023905203214898558,
+      "loss": 0.6705,
+      "step": 440
+    },
+    {
+      "epoch": 0.3722291928063572,
+      "grad_norm": 1.0309265851974487,
+      "learning_rate": 0.00023727883733369292,
+      "loss": 0.6706,
+      "step": 445
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 1.9208811521530151,
+      "learning_rate": 0.00023548700774781242,
+      "loss": 0.6637,
+      "step": 450
+    },
+    {
+      "epoch": 0.3805938937682978,
+      "grad_norm": 1.0379974842071533,
+      "learning_rate": 0.00023367692596212858,
+      "loss": 0.68,
+      "step": 455
+    },
+    {
+      "epoch": 0.3847762442492681,
+      "grad_norm": 1.852662444114685,
+      "learning_rate": 0.00023184897844442495,
+      "loss": 0.6589,
+      "step": 460
+    },
+    {
+      "epoch": 0.3889585947302384,
+      "grad_norm": 1.1750479936599731,
+      "learning_rate": 0.00023000355547697027,
+      "loss": 0.6675,
+      "step": 465
+    },
+    {
+      "epoch": 0.3931409452112087,
+      "grad_norm": 1.6473002433776855,
+      "learning_rate": 0.00022814105107318952,
+      "loss": 0.6709,
+      "step": 470
+    },
+    {
+      "epoch": 0.397323295692179,
+      "grad_norm": 1.2356650829315186,
+      "learning_rate": 0.00022626186289353913,
+      "loss": 0.6652,
+      "step": 475
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 1.1605840921401978,
+      "learning_rate": 0.00022436639216060275,
+      "loss": 0.6698,
+      "step": 480
+    },
+    {
+      "epoch": 0.40568799665411964,
+      "grad_norm": 1.5935866832733154,
+      "learning_rate": 0.00022245504357342716,
+      "loss": 0.6688,
+      "step": 485
+    },
+    {
+      "epoch": 0.4098703471350899,
+      "grad_norm": 0.810558557510376,
+      "learning_rate": 0.00022052822522111522,
+      "loss": 0.6524,
+      "step": 490
+    },
+    {
+      "epoch": 0.41405269761606023,
+      "grad_norm": 0.7008018493652344,
+      "learning_rate": 0.00021858634849569576,
+      "loss": 0.6924,
+      "step": 495
+    },
+    {
+      "epoch": 0.41823504809703055,
+      "grad_norm": 1.7558863162994385,
+      "learning_rate": 0.0002166298280042877,
+      "loss": 0.6711,
+      "step": 500
+    },
+    {
+      "epoch": 0.4224173985780008,
+      "grad_norm": 1.573688268661499,
+      "learning_rate": 0.00021465908148057787,
+      "loss": 0.6674,
+      "step": 505
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 1.4761265516281128,
+      "learning_rate": 0.00021267452969563153,
+      "loss": 0.6706,
+      "step": 510
+    },
+    {
+      "epoch": 0.43078209953994145,
+      "grad_norm": 1.7749208211898804,
+      "learning_rate": 0.00021067659636805403,
+      "loss": 0.6469,
+      "step": 515
+    },
+    {
+      "epoch": 0.4349644500209118,
+      "grad_norm": 1.0164939165115356,
+      "learning_rate": 0.00020866570807352337,
+      "loss": 0.6764,
+      "step": 520
+    },
+    {
+      "epoch": 0.43914680050188204,
+      "grad_norm": 1.6237319707870483,
+      "learning_rate": 0.00020664229415371266,
+      "loss": 0.6694,
+      "step": 525
+    },
+    {
+      "epoch": 0.44332915098285236,
+      "grad_norm": 1.5586035251617432,
+      "learning_rate": 0.00020460678662462194,
+      "loss": 0.6562,
+      "step": 530
+    },
+    {
+      "epoch": 0.4475115014638227,
+      "grad_norm": 1.771645188331604,
+      "learning_rate": 0.0002025596200843394,
+      "loss": 0.6622,
+      "step": 535
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5951160788536072,
+      "learning_rate": 0.0002005012316202506,
+      "loss": 0.651,
+      "step": 540
+    },
+    {
+      "epoch": 0.45587620242576327,
+      "grad_norm": 0.793093740940094,
+      "learning_rate": 0.00019843206071571692,
+      "loss": 0.6634,
+      "step": 545
+    },
+    {
+      "epoch": 0.4600585529067336,
+      "grad_norm": 1.0352814197540283,
+      "learning_rate": 0.0001963525491562421,
+      "loss": 0.6636,
+      "step": 550
+    },
+    {
+      "epoch": 0.4642409033877039,
+      "grad_norm": 0.843008816242218,
+      "learning_rate": 0.00019426314093514717,
+      "loss": 0.6407,
+      "step": 555
+    },
+    {
+      "epoch": 0.46842325386867417,
+      "grad_norm": 1.7540709972381592,
+      "learning_rate": 0.00019216428215877425,
+      "loss": 0.638,
+      "step": 560
+    },
+    {
+      "epoch": 0.4726056043496445,
+      "grad_norm": 0.5828922390937805,
+      "learning_rate": 0.00019005642095123895,
+      "loss": 0.6625,
+      "step": 565
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.7700462937355042,
+      "learning_rate": 0.00018794000735875208,
+      "loss": 0.6428,
+      "step": 570
+    },
+    {
+      "epoch": 0.48097030531158513,
+      "grad_norm": 0.8344655632972717,
+      "learning_rate": 0.00018581549325353126,
+      "loss": 0.6553,
+      "step": 575
+    },
+    {
+      "epoch": 0.4851526557925554,
+      "grad_norm": 1.2676873207092285,
+      "learning_rate": 0.000183683332237322,
+      "loss": 0.6645,
+      "step": 580
+    },
+    {
+      "epoch": 0.4893350062735257,
+      "grad_norm": 1.4888837337493896,
+      "learning_rate": 0.00018154397954454993,
+      "loss": 0.6859,
+      "step": 585
+    },
+    {
+      "epoch": 0.49351735675449604,
+      "grad_norm": 0.7020601034164429,
+      "learning_rate": 0.00017939789194512472,
+      "loss": 0.6456,
+      "step": 590
+    },
+    {
+      "epoch": 0.49769970723546636,
+      "grad_norm": 1.1964813470840454,
+      "learning_rate": 0.00017724552764691545,
+      "loss": 0.6594,
+      "step": 595
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 1.1332772970199585,
+      "learning_rate": 0.00017508734619791966,
+      "loss": 0.6606,
+      "step": 600
+    },
+    {
+      "epoch": 0.506064408197407,
+      "grad_norm": 1.6122368574142456,
+      "learning_rate": 0.00017292380838814577,
+      "loss": 0.6468,
+      "step": 605
+    },
+    {
+      "epoch": 0.5102467586783772,
+      "grad_norm": 0.8950415849685669,
+      "learning_rate": 0.00017075537615123042,
+      "loss": 0.6615,
+      "step": 610
+    },
+    {
+      "epoch": 0.5144291091593476,
+      "grad_norm": 1.9138753414154053,
+      "learning_rate": 0.00016858251246581216,
+      "loss": 0.6683,
+      "step": 615
+    },
+    {
+      "epoch": 0.5186114596403179,
+      "grad_norm": 0.9320158362388611,
+      "learning_rate": 0.00016640568125668117,
+      "loss": 0.6734,
+      "step": 620
+    },
+    {
+      "epoch": 0.5227938101212881,
+      "grad_norm": 1.2331713438034058,
+      "learning_rate": 0.00016422534729572738,
+      "loss": 0.6582,
+      "step": 625
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 1.1182340383529663,
+      "learning_rate": 0.00016204197610270816,
+      "loss": 0.6533,
+      "step": 630
+    },
+    {
+      "epoch": 0.5311585110832288,
+      "grad_norm": 0.6500148773193359,
+      "learning_rate": 0.00015985603384585542,
+      "loss": 0.6396,
+      "step": 635
+    },
+    {
+      "epoch": 0.5353408615641991,
+      "grad_norm": 0.9531376361846924,
+      "learning_rate": 0.00015766798724234506,
+      "loss": 0.6337,
+      "step": 640
+    },
+    {
+      "epoch": 0.5395232120451694,
+      "grad_norm": 0.7729771733283997,
+      "learning_rate": 0.00015547830345864885,
+      "loss": 0.6498,
+      "step": 645
+    },
+    {
+      "epoch": 0.5437055625261397,
+      "grad_norm": 0.6831007599830627,
+      "learning_rate": 0.0001532874500107902,
+      "loss": 0.6404,
+      "step": 650
+    },
+    {
+      "epoch": 0.54788791300711,
+      "grad_norm": 1.2160038948059082,
+      "learning_rate": 0.00015109589466452594,
+      "loss": 0.658,
+      "step": 655
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 1.015773057937622,
+      "learning_rate": 0.00014890410533547404,
+      "loss": 0.6507,
+      "step": 660
+    },
+    {
+      "epoch": 0.5562526139690506,
+      "grad_norm": 1.208256721496582,
+      "learning_rate": 0.00014671254998920976,
+      "loss": 0.6399,
+      "step": 665
+    },
+    {
+      "epoch": 0.560434964450021,
+      "grad_norm": 0.7431871294975281,
+      "learning_rate": 0.00014452169654135115,
+      "loss": 0.6534,
+      "step": 670
+    },
+    {
+      "epoch": 0.5646173149309912,
+      "grad_norm": 0.6661595702171326,
+      "learning_rate": 0.00014233201275765494,
+      "loss": 0.6343,
+      "step": 675
+    },
+    {
+      "epoch": 0.5687996654119615,
+      "grad_norm": 1.2753201723098755,
+      "learning_rate": 0.00014014396615414458,
+      "loss": 0.6296,
+      "step": 680
+    },
+    {
+      "epoch": 0.5729820158929319,
+      "grad_norm": 1.4110949039459229,
+      "learning_rate": 0.00013795802389729184,
+      "loss": 0.6452,
+      "step": 685
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 1.4824358224868774,
+      "learning_rate": 0.00013577465270427262,
+      "loss": 0.6348,
+      "step": 690
+    },
+    {
+      "epoch": 0.5813467168548725,
+      "grad_norm": 1.8900264501571655,
+      "learning_rate": 0.00013359431874331886,
+      "loss": 0.6509,
+      "step": 695
+    },
+    {
+      "epoch": 0.5855290673358428,
+      "grad_norm": 1.652632236480713,
+      "learning_rate": 0.0001314174875341878,
+      "loss": 0.6206,
+      "step": 700
+    },
+    {
+      "epoch": 0.589711417816813,
+      "grad_norm": 1.1248772144317627,
+      "learning_rate": 0.00012924462384876953,
+      "loss": 0.6299,
+      "step": 705
+    },
+    {
+      "epoch": 0.5938937682977834,
+      "grad_norm": 0.7448098659515381,
+      "learning_rate": 0.00012707619161185423,
+      "loss": 0.6483,
+      "step": 710
+    },
+    {
+      "epoch": 0.5980761187787537,
+      "grad_norm": 0.6708864569664001,
+      "learning_rate": 0.00012491265380208032,
+      "loss": 0.6473,
+      "step": 715
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.8381022810935974,
+      "learning_rate": 0.00012275447235308453,
+      "loss": 0.6356,
+      "step": 720
+    },
+    {
+      "epoch": 0.6064408197406943,
+      "grad_norm": 1.3462333679199219,
+      "learning_rate": 0.00012060210805487529,
+      "loss": 0.6388,
+      "step": 725
+    },
+    {
+      "epoch": 0.6106231702216646,
+      "grad_norm": 1.2015129327774048,
+      "learning_rate": 0.00011845602045545008,
+      "loss": 0.6258,
+      "step": 730
+    },
+    {
+      "epoch": 0.6148055207026348,
+      "grad_norm": 0.7825962901115417,
+      "learning_rate": 0.00011631666776267803,
+      "loss": 0.6401,
+      "step": 735
+    },
+    {
+      "epoch": 0.6189878711836052,
+      "grad_norm": 0.9470372200012207,
+      "learning_rate": 0.00011418450674646868,
+      "loss": 0.6501,
+      "step": 740
+    },
+    {
+      "epoch": 0.6231702216645755,
+      "grad_norm": 0.9243600368499756,
+      "learning_rate": 0.00011205999264124786,
+      "loss": 0.6195,
+      "step": 745
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 1.931402325630188,
+      "learning_rate": 0.00010994357904876106,
+      "loss": 0.6264,
+      "step": 750
+    },
+    {
+      "epoch": 0.6315349226265161,
+      "grad_norm": 1.7754958868026733,
+      "learning_rate": 0.00010783571784122577,
+      "loss": 0.6351,
+      "step": 755
+    },
+    {
+      "epoch": 0.6357172731074864,
+      "grad_norm": 0.627479076385498,
+      "learning_rate": 0.00010573685906485282,
+      "loss": 0.6395,
+      "step": 760
+    },
+    {
+      "epoch": 0.6398996235884568,
+      "grad_norm": 2.4568421840667725,
+      "learning_rate": 0.0001036474508437579,
+      "loss": 0.6257,
+      "step": 765
+    },
+    {
+      "epoch": 0.644081974069427,
+      "grad_norm": 1.9016671180725098,
+      "learning_rate": 0.0001015679392842831,
+      "loss": 0.6446,
+      "step": 770
+    },
+    {
+      "epoch": 0.6482643245503973,
+      "grad_norm": 0.7173120975494385,
+      "learning_rate": 9.949876837974944e-05,
+      "loss": 0.6312,
+      "step": 775
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 1.9048292636871338,
+      "learning_rate": 9.744037991566058e-05,
+      "loss": 0.622,
+      "step": 780
+    },
+    {
+      "epoch": 0.6566290255123379,
+      "grad_norm": 1.2454332113265991,
+      "learning_rate": 9.5393213375378e-05,
+      "loss": 0.6219,
+      "step": 785
+    },
+    {
+      "epoch": 0.6608113759933082,
+      "grad_norm": 1.0283215045928955,
+      "learning_rate": 9.33577058462873e-05,
+      "loss": 0.6236,
+      "step": 790
+    },
+    {
+      "epoch": 0.6649937264742786,
+      "grad_norm": 1.8116226196289062,
+      "learning_rate": 9.133429192647661e-05,
+      "loss": 0.6244,
+      "step": 795
+    },
+    {
+      "epoch": 0.6691760769552488,
+      "grad_norm": 0.8519335389137268,
+      "learning_rate": 8.932340363194595e-05,
+      "loss": 0.6253,
+      "step": 800
+    },
+    {
+      "epoch": 0.6733584274362192,
+      "grad_norm": 1.004647135734558,
+      "learning_rate": 8.73254703043685e-05,
+      "loss": 0.6278,
+      "step": 805
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.945331871509552,
+      "learning_rate": 8.534091851942214e-05,
+      "loss": 0.6251,
+      "step": 810
+    },
+    {
+      "epoch": 0.6817231283981597,
+      "grad_norm": 0.4643709659576416,
+      "learning_rate": 8.337017199571235e-05,
+      "loss": 0.6298,
+      "step": 815
+    },
+    {
+      "epoch": 0.6859054788791301,
+      "grad_norm": 0.8933060169219971,
+      "learning_rate": 8.141365150430421e-05,
+      "loss": 0.6419,
+      "step": 820
+    },
+    {
+      "epoch": 0.6900878293601004,
+      "grad_norm": 2.5518875122070312,
+      "learning_rate": 7.947177477888472e-05,
+      "loss": 0.6424,
+      "step": 825
+    },
+    {
+      "epoch": 0.6942701798410706,
+      "grad_norm": 0.7830976247787476,
+      "learning_rate": 7.754495642657282e-05,
+      "loss": 0.6292,
+      "step": 830
+    },
+    {
+      "epoch": 0.698452530322041,
+      "grad_norm": 1.2565546035766602,
+      "learning_rate": 7.563360783939722e-05,
+      "loss": 0.6308,
+      "step": 835
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 1.588156819343567,
+      "learning_rate": 7.373813710646083e-05,
+      "loss": 0.6249,
+      "step": 840
+    },
+    {
+      "epoch": 0.7068172312839816,
+      "grad_norm": 0.6486766934394836,
+      "learning_rate": 7.185894892681048e-05,
+      "loss": 0.6308,
+      "step": 845
+    },
+    {
+      "epoch": 0.7109995817649519,
+      "grad_norm": 0.7454473972320557,
+      "learning_rate": 6.999644452302975e-05,
+      "loss": 0.6267,
+      "step": 850
+    },
+    {
+      "epoch": 0.7151819322459222,
+      "grad_norm": 0.6658061146736145,
+      "learning_rate": 6.815102155557501e-05,
+      "loss": 0.6162,
+      "step": 855
+    },
+    {
+      "epoch": 0.7193642827268926,
+      "grad_norm": 1.0052908658981323,
+      "learning_rate": 6.632307403787138e-05,
+      "loss": 0.644,
+      "step": 860
+    },
+    {
+      "epoch": 0.7235466332078628,
+      "grad_norm": 0.7472626566886902,
+      "learning_rate": 6.451299225218754e-05,
+      "loss": 0.616,
+      "step": 865
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.587893009185791,
+      "learning_rate": 6.27211626663071e-05,
+      "loss": 0.6318,
+      "step": 870
+    },
+    {
+      "epoch": 0.7319113341698035,
+      "grad_norm": 0.898607611656189,
+      "learning_rate": 6.0947967851014405e-05,
+      "loss": 0.6409,
+      "step": 875
+    },
+    {
+      "epoch": 0.7360936846507737,
+      "grad_norm": 0.7444003224372864,
+      "learning_rate": 5.919378639841281e-05,
+      "loss": 0.6214,
+      "step": 880
+    },
+    {
+      "epoch": 0.740276035131744,
+      "grad_norm": 1.199029564857483,
+      "learning_rate": 5.745899284109154e-05,
+      "loss": 0.6184,
+      "step": 885
+    },
+    {
+      "epoch": 0.7444583856127144,
+      "grad_norm": 1.2874826192855835,
+      "learning_rate": 5.57439575721601e-05,
+      "loss": 0.6233,
+      "step": 890
+    },
+    {
+      "epoch": 0.7486407360936846,
+      "grad_norm": 1.3848364353179932,
+      "learning_rate": 5.4049046766166335e-05,
+      "loss": 0.6043,
+      "step": 895
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.6463631987571716,
+      "learning_rate": 5.237462230091467e-05,
+      "loss": 0.6361,
+      "step": 900
+    },
+    {
+      "epoch": 0.7570054370556253,
+      "grad_norm": 1.0429089069366455,
+      "learning_rate": 5.07210416802025e-05,
+      "loss": 0.6206,
+      "step": 905
+    },
+    {
+      "epoch": 0.7611877875365956,
+      "grad_norm": 0.7253705263137817,
+      "learning_rate": 4.908865795748999e-05,
+      "loss": 0.6312,
+      "step": 910
+    },
+    {
+      "epoch": 0.7653701380175659,
+      "grad_norm": 0.498542457818985,
+      "learning_rate": 4.74778196605201e-05,
+      "loss": 0.6421,
+      "step": 915
+    },
+    {
+      "epoch": 0.7695524884985362,
+      "grad_norm": 0.6464426517486572,
+      "learning_rate": 4.58888707169049e-05,
+      "loss": 0.6047,
+      "step": 920
+    },
+    {
+      "epoch": 0.7737348389795065,
+      "grad_norm": 0.6658442616462708,
+      "learning_rate": 4.432215038069449e-05,
+      "loss": 0.623,
+      "step": 925
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.6936389207839966,
+      "learning_rate": 4.277799315994286e-05,
+      "loss": 0.6226,
+      "step": 930
+    },
+    {
+      "epoch": 0.7820995399414471,
+      "grad_norm": 0.5900949835777283,
+      "learning_rate": 4.125672874528797e-05,
+      "loss": 0.6314,
+      "step": 935
+    },
+    {
+      "epoch": 0.7862818904224174,
+      "grad_norm": 0.9340611100196838,
+      "learning_rate": 3.97586819395599e-05,
+      "loss": 0.6252,
+      "step": 940
+    },
+    {
+      "epoch": 0.7904642409033877,
+      "grad_norm": 1.272733211517334,
+      "learning_rate": 3.8284172588432716e-05,
+      "loss": 0.6236,
+      "step": 945
+    },
+    {
+      "epoch": 0.794646591384358,
+      "grad_norm": 0.7782835364341736,
+      "learning_rate": 3.6833515512134606e-05,
+      "loss": 0.6096,
+      "step": 950
+    },
+    {
+      "epoch": 0.7988289418653283,
+      "grad_norm": 0.6020464301109314,
+      "learning_rate": 3.540702043823113e-05,
+      "loss": 0.6124,
+      "step": 955
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.675359845161438,
+      "learning_rate": 3.4004991935496004e-05,
+      "loss": 0.5955,
+      "step": 960
+    },
+    {
+      "epoch": 0.8071936428272689,
+      "grad_norm": 0.5639395117759705,
+      "learning_rate": 3.262772934888265e-05,
+      "loss": 0.6069,
+      "step": 965
+    },
+    {
+      "epoch": 0.8113759933082393,
+      "grad_norm": 0.4711320698261261,
+      "learning_rate": 3.1275526735611896e-05,
+      "loss": 0.6102,
+      "step": 970
+    },
+    {
+      "epoch": 0.8155583437892095,
+      "grad_norm": 1.0711911916732788,
+      "learning_rate": 2.9948672802388135e-05,
+      "loss": 0.6391,
+      "step": 975
+    },
+    {
+      "epoch": 0.8197406942701798,
+      "grad_norm": 0.4709874987602234,
+      "learning_rate": 2.8647450843757897e-05,
+      "loss": 0.6186,
+      "step": 980
+    },
+    {
+      "epoch": 0.8239230447511502,
+      "grad_norm": 0.5402533411979675,
+      "learning_rate": 2.7372138681624244e-05,
+      "loss": 0.613,
+      "step": 985
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.6864106059074402,
+      "learning_rate": 2.6123008605929375e-05,
+      "loss": 0.6215,
+      "step": 990
+    },
+    {
+      "epoch": 0.8322877457130907,
+      "grad_norm": 0.5939123630523682,
+      "learning_rate": 2.4900327316518326e-05,
+      "loss": 0.6168,
+      "step": 995
+    },
+    {
+      "epoch": 0.8364700961940611,
+      "grad_norm": 0.7122395038604736,
+      "learning_rate": 2.3704355866196373e-05,
+      "loss": 0.6053,
+      "step": 1000
+    },
+    {
+      "epoch": 0.8406524466750314,
+      "grad_norm": 0.6920621395111084,
+      "learning_rate": 2.2535349604992153e-05,
+      "loss": 0.6097,
+      "step": 1005
+    },
+    {
+      "epoch": 0.8448347971560016,
+      "grad_norm": 1.7864691019058228,
+      "learning_rate": 2.1393558125638066e-05,
+      "loss": 0.6382,
+      "step": 1010
+    },
+    {
+      "epoch": 0.849017147636972,
+      "grad_norm": 0.7472600936889648,
+      "learning_rate": 2.027922521028018e-05,
+      "loss": 0.6159,
+      "step": 1015
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 1.722579836845398,
+      "learning_rate": 1.9192588778428842e-05,
+      "loss": 0.6011,
+      "step": 1020
+    },
+    {
+      "epoch": 0.8573818485989126,
+      "grad_norm": 0.5121240019798279,
+      "learning_rate": 1.813388083616068e-05,
+      "loss": 0.6031,
+      "step": 1025
+    },
+    {
+      "epoch": 0.8615641990798829,
+      "grad_norm": 0.5083288550376892,
+      "learning_rate": 1.7103327426583265e-05,
+      "loss": 0.5845,
+      "step": 1030
+    },
+    {
+      "epoch": 0.8657465495608532,
+      "grad_norm": 0.46453267335891724,
+      "learning_rate": 1.6101148581573274e-05,
+      "loss": 0.6031,
+      "step": 1035
+    },
+    {
+      "epoch": 0.8699289000418235,
+      "grad_norm": 0.8352246880531311,
+      "learning_rate": 1.5127558274797535e-05,
+      "loss": 0.6024,
+      "step": 1040
+    },
+    {
+      "epoch": 0.8741112505227938,
+      "grad_norm": 0.8040021061897278,
+      "learning_rate": 1.4182764376028006e-05,
+      "loss": 0.635,
+      "step": 1045
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.7152721881866455,
+      "learning_rate": 1.326696860675981e-05,
+      "loss": 0.6162,
+      "step": 1050
+    },
+    {
+      "epoch": 0.8824759514847345,
+      "grad_norm": 0.6833348274230957,
+      "learning_rate": 1.2380366497141886e-05,
+      "loss": 0.6217,
+      "step": 1055
+    },
+    {
+      "epoch": 0.8866583019657047,
+      "grad_norm": 0.6803585886955261,
+      "learning_rate": 1.1523147344229716e-05,
+      "loss": 0.6218,
+      "step": 1060
+    },
+    {
+      "epoch": 0.890840652446675,
+      "grad_norm": 0.9396490454673767,
+      "learning_rate": 1.069549417156887e-05,
+      "loss": 0.6176,
+      "step": 1065
+    },
+    {
+      "epoch": 0.8950230029276454,
+      "grad_norm": 0.7006051540374756,
+      "learning_rate": 9.89758369011781e-06,
+      "loss": 0.6123,
+      "step": 1070
+    },
+    {
+      "epoch": 0.8992053534086156,
+      "grad_norm": 0.5886880159378052,
+      "learning_rate": 9.129586260518634e-06,
+      "loss": 0.5923,
+      "step": 1075
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 0.5032349228858948,
+      "learning_rate": 8.391665856723655e-06,
+      "loss": 0.619,
+      "step": 1080
+    },
+    {
+      "epoch": 0.9075700543705563,
+      "grad_norm": 0.51763916015625,
+      "learning_rate": 7.683980030985654e-06,
+      "loss": 0.6039,
+      "step": 1085
+    },
+    {
+      "epoch": 0.9117524048515265,
+      "grad_norm": 0.588211715221405,
+      "learning_rate": 7.006679880218974e-06,
+      "loss": 0.6057,
+      "step": 1090
+    },
+    {
+      "epoch": 0.9159347553324969,
+      "grad_norm": 0.6631506681442261,
+      "learning_rate": 6.359910013739122e-06,
+      "loss": 0.6106,
+      "step": 1095
+    },
+    {
+      "epoch": 0.9201171058134672,
+      "grad_norm": 0.5523665547370911,
+      "learning_rate": 5.743808522387544e-06,
+      "loss": 0.6058,
+      "step": 1100
+    },
+    {
+      "epoch": 0.9242994562944374,
+      "grad_norm": 0.5903011560440063,
+      "learning_rate": 5.158506949047975e-06,
+      "loss": 0.6321,
+      "step": 1105
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.6151677966117859,
+      "learning_rate": 4.604130260560873e-06,
+      "loss": 0.6171,
+      "step": 1110
+    },
+    {
+      "epoch": 0.9326641572563781,
+      "grad_norm": 0.5324861407279968,
+      "learning_rate": 4.080796821042082e-06,
+      "loss": 0.6184,
+      "step": 1115
+    },
+    {
+      "epoch": 0.9368465077373483,
+      "grad_norm": 0.5442612171173096,
+      "learning_rate": 3.5886183666109405e-06,
+      "loss": 0.6069,
+      "step": 1120
+    },
+    {
+      "epoch": 0.9410288582183187,
+      "grad_norm": 0.4981847405433655,
+      "learning_rate": 3.1276999815337544e-06,
+      "loss": 0.62,
+      "step": 1125
+    },
+    {
+      "epoch": 0.945211208699289,
+      "grad_norm": 0.45795848965644836,
+      "learning_rate": 2.6981400757874584e-06,
+      "loss": 0.6027,
+      "step": 1130
+    },
+    {
+      "epoch": 0.9493935591802594,
+      "grad_norm": 0.78538978099823,
+      "learning_rate": 2.3000303640481386e-06,
+      "loss": 0.6084,
+      "step": 1135
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.5453292727470398,
+      "learning_rate": 1.9334558461092663e-06,
+      "loss": 0.6043,
+      "step": 1140
+    },
+    {
+      "epoch": 0.9577582601421999,
+      "grad_norm": 0.5924518704414368,
+      "learning_rate": 1.598494788733462e-06,
+      "loss": 0.6033,
+      "step": 1145
+    },
+    {
+      "epoch": 0.9619406106231703,
+      "grad_norm": 0.5484139323234558,
+      "learning_rate": 1.2952187089419642e-06,
+      "loss": 0.616,
+      "step": 1150
+    },
+    {
+      "epoch": 0.9661229611041405,
+      "grad_norm": 0.5363529324531555,
+      "learning_rate": 1.0236923587450263e-06,
+      "loss": 0.6196,
+      "step": 1155
+    },
+    {
+      "epoch": 0.9703053115851108,
+      "grad_norm": 0.6777392625808716,
+      "learning_rate": 7.839737113168931e-07,
+      "loss": 0.6102,
+      "step": 1160
+    },
+    {
+      "epoch": 0.9744876620660812,
+      "grad_norm": 0.5920884609222412,
+      "learning_rate": 5.761139486180178e-07,
+      "loss": 0.6075,
+      "step": 1165
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.4196658730506897,
+      "learning_rate": 4.0015745046725336e-07,
+      "loss": 0.6017,
+      "step": 1170
+    },
+    {
+      "epoch": 0.9828523630280217,
+      "grad_norm": 0.6661626100540161,
+      "learning_rate": 2.5614178506644934e-07,
+      "loss": 0.5787,
+      "step": 1175
+    },
+    {
+      "epoch": 0.9870347135089921,
+      "grad_norm": 0.6183582544326782,
+      "learning_rate": 1.4409770097926765e-07,
+      "loss": 0.6188,
+      "step": 1180
+    },
+    {
+      "epoch": 0.9912170639899623,
+      "grad_norm": 0.5524072051048279,
+      "learning_rate": 6.40491205661009e-08,
+      "loss": 0.6089,
+      "step": 1185
+    },
+    {
+      "epoch": 0.9953994144709327,
+      "grad_norm": 0.5690078735351562,
+      "learning_rate": 1.6013134876491362e-08,
+      "loss": 0.6079,
+      "step": 1190
+    },
+    {
+      "epoch": 0.999581764951903,
+      "grad_norm": 0.4828049838542938,
+      "learning_rate": 0.0,
+      "loss": 0.5901,
+      "step": 1195
+    },
+    {
+      "epoch": 0.999581764951903,
+      "eval_loss": 1.2799842357635498,
+      "eval_runtime": 0.8401,
+      "eval_samples_per_second": 5.951,
+      "eval_steps_per_second": 1.19,
+      "step": 1195
+    },
+    {
+      "epoch": 0.999581764951903,
+      "step": 1195,
+      "total_flos": 9.109418934146171e+17,
+      "train_loss": 1.440422640086218,
+      "train_runtime": 6570.4384,
+      "train_samples_per_second": 2.911,
+      "train_steps_per_second": 0.182
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 1195,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.109418934146171e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}