Model save

Browse files

Files changed (5) hide show

README.md +5 -9
all_results.json +7 -12
runs/Nov15_21-21-31_main-lora-mistral-alpaca-0-0/events.out.tfevents.1731724272.main-lora-mistral-alpaca-0-0.456.0 +2 -2
train_results.json +7 -7
trainer_state.json +134 -425

README.md CHANGED Viewed

@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 1.6643
 ## Model description
@@ -51,17 +51,13 @@ The following hyperparameters were used during training:
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
-- num_epochs: 5
 ### Training results
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 2.9056        | 0.9924 | 65   | 2.6113          |
-| 1.8271        | 2.0    | 131  | 1.8230          |
-| 1.7019        | 2.9924 | 196  | 1.7041          |
-| 1.7024        | 4.0    | 262  | 1.6962          |
-| 1.6463        | 4.9618 | 325  | 1.6643          |
 ### Framework versions

 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
+- Loss: 1.6616
 ## Model description
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.6542        | 1.0   | 140  | 1.6616          |
 ### Framework versions

all_results.json CHANGED Viewed

@@ -1,14 +1,9 @@
 {
-    "epoch": 4.961832061068702,
-    "eval_loss": 1.664337158203125,
-    "eval_runtime": 19.0212,
-    "eval_samples": 5201,
-    "eval_samples_per_second": 48.42,
-    "eval_steps_per_second": 0.789,
-    "total_flos": 9.909828121379471e+17,
-    "train_loss": 5.476599056537335,
-    "train_runtime": 4095.1846,
-    "train_samples": 46801,
-    "train_samples_per_second": 10.222,
-    "train_steps_per_second": 0.079
 }

 {
+    "epoch": 1.0,
+    "total_flos": 4.268849030789857e+17,
+    "train_loss": 5.964417205538068,
+    "train_runtime": 1743.5737,
+    "train_samples": 51241,
+    "train_samples_per_second": 10.269,
+    "train_steps_per_second": 0.08
 }

runs/Nov15_21-21-31_main-lora-mistral-alpaca-0-0/events.out.tfevents.1731724272.main-lora-mistral-alpaca-0-0.456.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:509cbd7f8d7f8e46314c0198f31a922c767fdf00a6a24d9aebc80e23c745dcf3
-size 12677

 version https://git-lfs.github.com/spec/v1
+oid sha256:e7d66dcd64dfa383a706ccb760e61ce343eda49a6adcba7ff73cabb9e80d7c6f
+size 13302

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 4.961832061068702,
-    "total_flos": 9.909828121379471e+17,
-    "train_loss": 5.476599056537335,
-    "train_runtime": 4095.1846,
-    "train_samples": 46801,
-    "train_samples_per_second": 10.222,
-    "train_steps_per_second": 0.079
 }

 {
+    "epoch": 1.0,
+    "total_flos": 4.268849030789857e+17,
+    "train_loss": 5.964417205538068,
+    "train_runtime": 1743.5737,
+    "train_samples": 51241,
+    "train_samples_per_second": 10.269,
+    "train_steps_per_second": 0.08
 }

trainer_state.json CHANGED Viewed

@@ -1,529 +1,238 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.961832061068702,
   "eval_steps": 500,
-  "global_step": 325,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.015267175572519083,
-      "grad_norm": 183.11753845214844,
-      "learning_rate": 6.060606060606061e-06,
-      "loss": 46.1063,
       "step": 1
     },
     {
-      "epoch": 0.07633587786259542,
-      "grad_norm": 136.03738403320312,
-      "learning_rate": 3.0303030303030306e-05,
-      "loss": 44.0302,
       "step": 5
     },
     {
-      "epoch": 0.15267175572519084,
-      "grad_norm": 69.2432632446289,
-      "learning_rate": 6.060606060606061e-05,
-      "loss": 38.4659,
       "step": 10
     },
     {
-      "epoch": 0.22900763358778625,
-      "grad_norm": 17.486797332763672,
-      "learning_rate": 9.090909090909092e-05,
-      "loss": 30.3029,
       "step": 15
     },
     {
-      "epoch": 0.3053435114503817,
-      "grad_norm": 13.530756950378418,
-      "learning_rate": 0.00012121212121212122,
-      "loss": 26.6709,
       "step": 20
     },
     {
-      "epoch": 0.3816793893129771,
-      "grad_norm": 7.521498680114746,
-      "learning_rate": 0.00015151515151515152,
-      "loss": 24.4319,
       "step": 25
     },
     {
-      "epoch": 0.4580152671755725,
-      "grad_norm": 5.912084102630615,
-      "learning_rate": 0.00018181818181818183,
-      "loss": 22.862,
       "step": 30
     },
     {
-      "epoch": 0.5343511450381679,
-      "grad_norm": 10.610209465026855,
-      "learning_rate": 0.00019997685019798912,
-      "loss": 21.5999,
       "step": 35
     },
     {
-      "epoch": 0.6106870229007634,
-      "grad_norm": 20.944725036621094,
-      "learning_rate": 0.0001997165380022878,
-      "loss": 19.4719,
       "step": 40
     },
     {
-      "epoch": 0.6870229007633588,
-      "grad_norm": 34.12383270263672,
-      "learning_rate": 0.000199167731989929,
-      "loss": 14.6832,
       "step": 45
     },
     {
-      "epoch": 0.7633587786259542,
-      "grad_norm": 42.86738204956055,
-      "learning_rate": 0.0001983320199330545,
-      "loss": 8.7569,
       "step": 50
     },
     {
-      "epoch": 0.8396946564885496,
-      "grad_norm": 12.474686622619629,
-      "learning_rate": 0.00019721181966290613,
-      "loss": 4.3457,
       "step": 55
     },
     {
-      "epoch": 0.916030534351145,
-      "grad_norm": 9.623456954956055,
-      "learning_rate": 0.00019581037207470382,
-      "loss": 3.4309,
       "step": 60
     },
     {
-      "epoch": 0.9923664122137404,
-      "grad_norm": 3.5216312408447266,
-      "learning_rate": 0.00019413173175128473,
-      "loss": 2.9056,
       "step": 65
     },
     {
-      "epoch": 0.9923664122137404,
-      "eval_loss": 2.611328125,
-      "eval_runtime": 19.2134,
-      "eval_samples_per_second": 47.935,
-      "eval_steps_per_second": 0.781,
-      "step": 65
-    },
-    {
-      "epoch": 1.0687022900763359,
-      "grad_norm": 2.9582359790802,
-      "learning_rate": 0.00019218075523263104,
-      "loss": 2.7809,
       "step": 70
     },
     {
-      "epoch": 1.1450381679389312,
-      "grad_norm": 2.319239616394043,
-      "learning_rate": 0.00018996308696522433,
-      "loss": 2.3224,
       "step": 75
     },
     {
-      "epoch": 1.2213740458015268,
-      "grad_norm": 1.3839267492294312,
-      "learning_rate": 0.00018748514297187648,
-      "loss": 2.2039,
       "step": 80
     },
     {
-      "epoch": 1.297709923664122,
-      "grad_norm": 0.5840837955474854,
-      "learning_rate": 0.00018475409228928312,
-      "loss": 2.1174,
       "step": 85
     },
     {
-      "epoch": 1.3740458015267176,
-      "grad_norm": 1.5493711233139038,
-      "learning_rate": 0.00018177783622700327,
-      "loss": 2.0565,
       "step": 90
     },
     {
-      "epoch": 1.450381679389313,
-      "grad_norm": 0.7415986657142639,
-      "learning_rate": 0.00017856498550787144,
-      "loss": 2.003,
       "step": 95
     },
     {
-      "epoch": 1.5267175572519083,
-      "grad_norm": 0.6342356204986572,
-      "learning_rate": 0.00017512483535597867,
-      "loss": 1.9686,
       "step": 100
     },
     {
-      "epoch": 1.6030534351145038,
-      "grad_norm": 1.0893248319625854,
-      "learning_rate": 0.00017146733860429612,
-      "loss": 1.9499,
       "step": 105
     },
     {
-      "epoch": 1.6793893129770994,
-      "grad_norm": 1.233128547668457,
-      "learning_rate": 0.0001676030768997445,
-      "loss": 1.9192,
       "step": 110
     },
     {
-      "epoch": 1.7557251908396947,
-      "grad_norm": 0.7829602360725403,
-      "learning_rate": 0.00016354323008901776,
-      "loss": 1.8934,
       "step": 115
     },
     {
-      "epoch": 1.83206106870229,
-      "grad_norm": 1.0393383502960205,
-      "learning_rate": 0.00015929954387373103,
-      "loss": 1.8579,
       "step": 120
     },
     {
-      "epoch": 1.9083969465648853,
-      "grad_norm": 2.433302879333496,
-      "learning_rate": 0.00015488429582847192,
-      "loss": 1.8576,
       "step": 125
     },
     {
-      "epoch": 1.984732824427481,
-      "grad_norm": 1.2537367343902588,
-      "learning_rate": 0.00015031025988006936,
-      "loss": 1.8271,
       "step": 130
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 1.8229883909225464,
-      "eval_runtime": 19.0953,
-      "eval_samples_per_second": 48.232,
-      "eval_steps_per_second": 0.786,
-      "step": 131
-    },
-    {
-      "epoch": 2.0610687022900764,
-      "grad_norm": 1.04417085647583,
-      "learning_rate": 0.00014559066935084588,
-      "loss": 1.975,
       "step": 135
     },
     {
-      "epoch": 2.1374045801526718,
-      "grad_norm": 0.9754623174667358,
-      "learning_rate": 0.00014073917867277557,
-      "loss": 1.7901,
-      "step": 140
-    },
-    {
-      "epoch": 2.213740458015267,
-      "grad_norm": 0.6031882762908936,
-      "learning_rate": 0.0001357698238833126,
-      "loss": 1.7584,
-      "step": 145
-    },
-    {
-      "epoch": 2.2900763358778624,
-      "grad_norm": 1.7654844522476196,
-      "learning_rate": 0.000130696982017182,
-      "loss": 1.7665,
-      "step": 150
-    },
-    {
-      "epoch": 2.366412213740458,
-      "grad_norm": 1.8184305429458618,
-      "learning_rate": 0.0001255353295116187,
-      "loss": 1.7496,
-      "step": 155
-    },
-    {
-      "epoch": 2.4427480916030535,
-      "grad_norm": 2.4291305541992188,
-      "learning_rate": 0.00012029979974539234,
-      "loss": 1.7389,
-      "step": 160
-    },
-    {
-      "epoch": 2.519083969465649,
-      "grad_norm": 0.7844381928443909,
-      "learning_rate": 0.00011500553983446527,
-      "loss": 1.7327,
-      "step": 165
-    },
-    {
-      "epoch": 2.595419847328244,
-      "grad_norm": 1.0221455097198486,
-      "learning_rate": 0.00010966786680927874,
-      "loss": 1.7365,
-      "step": 170
-    },
-    {
-      "epoch": 2.67175572519084,
-      "grad_norm": 1.1956524848937988,
-      "learning_rate": 0.00010430222330045304,
-      "loss": 1.7204,
-      "step": 175
-    },
-    {
-      "epoch": 2.7480916030534353,
-      "grad_norm": 0.7325518131256104,
-      "learning_rate": 9.892413286110886e-05,
-      "loss": 1.7177,
-      "step": 180
-    },
-    {
-      "epoch": 2.8244274809160306,
-      "grad_norm": 0.8538561463356018,
-      "learning_rate": 9.354915505506839e-05,
-      "loss": 1.7193,
-      "step": 185
-    },
-    {
-      "epoch": 2.900763358778626,
-      "grad_norm": 1.252325415611267,
-      "learning_rate": 8.81928404408726e-05,
-      "loss": 1.7058,
-      "step": 190
-    },
-    {
-      "epoch": 2.9770992366412212,
-      "grad_norm": 0.7734937071800232,
-      "learning_rate": 8.287068558185225e-05,
-      "loss": 1.7019,
-      "step": 195
-    },
-    {
-      "epoch": 2.9923664122137406,
-      "eval_loss": 1.7041354179382324,
-      "eval_runtime": 19.3108,
-      "eval_samples_per_second": 47.694,
-      "eval_steps_per_second": 0.777,
-      "step": 196
-    },
-    {
-      "epoch": 3.053435114503817,
-      "grad_norm": 0.6631619334220886,
-      "learning_rate": 7.759808821241406e-05,
-      "loss": 1.8697,
-      "step": 200
-    },
-    {
-      "epoch": 3.1297709923664123,
-      "grad_norm": 0.7187236547470093,
-      "learning_rate": 7.239030269025311e-05,
-      "loss": 1.7181,
-      "step": 205
-    },
-    {
-      "epoch": 3.2061068702290076,
-      "grad_norm": 0.5320985913276672,
-      "learning_rate": 6.726239586337408e-05,
-      "loss": 1.7351,
-      "step": 210
-    },
-    {
-      "epoch": 3.282442748091603,
-      "grad_norm": 0.43638336658477783,
-      "learning_rate": 6.22292034796035e-05,
-      "loss": 1.7156,
-      "step": 215
-    },
-    {
-      "epoch": 3.3587786259541983,
-      "grad_norm": 0.3966742753982544,
-      "learning_rate": 5.730528726470792e-05,
-      "loss": 1.7158,
-      "step": 220
-    },
-    {
-      "epoch": 3.435114503816794,
-      "grad_norm": 0.326159805059433,
-      "learning_rate": 5.2504892793295e-05,
-      "loss": 1.7055,
-      "step": 225
-    },
-    {
-      "epoch": 3.5114503816793894,
-      "grad_norm": 0.4766685664653778,
-      "learning_rate": 4.7841908274384616e-05,
-      "loss": 1.7006,
-      "step": 230
-    },
-    {
-      "epoch": 3.5877862595419847,
-      "grad_norm": 0.41363418102264404,
-      "learning_rate": 4.332982437088825e-05,
-      "loss": 1.7106,
-      "step": 235
-    },
-    {
-      "epoch": 3.66412213740458,
-      "grad_norm": 0.5006980299949646,
-      "learning_rate": 3.898169516924398e-05,
-      "loss": 1.6938,
-      "step": 240
-    },
-    {
-      "epoch": 3.7404580152671754,
-      "grad_norm": 0.4720315933227539,
-      "learning_rate": 3.4810100412128747e-05,
-      "loss": 1.6886,
-      "step": 245
-    },
-    {
-      "epoch": 3.816793893129771,
-      "grad_norm": 0.5057269334793091,
-      "learning_rate": 3.0827109103512643e-05,
-      "loss": 1.6912,
-      "step": 250
-    },
-    {
-      "epoch": 3.8931297709923665,
-      "grad_norm": 0.38378995656967163,
-      "learning_rate": 2.7044244591351232e-05,
-      "loss": 1.7001,
-      "step": 255
-    },
-    {
-      "epoch": 3.969465648854962,
-      "grad_norm": 0.3008043169975281,
-      "learning_rate": 2.3472451228937253e-05,
-      "loss": 1.7024,
-      "step": 260
-    },
-    {
-      "epoch": 4.0,
-      "eval_loss": 1.6962379217147827,
-      "eval_runtime": 18.9852,
-      "eval_samples_per_second": 48.512,
-      "eval_steps_per_second": 0.79,
-      "step": 262
-    },
-    {
-      "epoch": 4.0458015267175576,
-      "grad_norm": 0.9348434805870056,
-      "learning_rate": 2.0122062711363532e-05,
-      "loss": 1.8574,
-      "step": 265
-    },
-    {
-      "epoch": 4.122137404580153,
-      "grad_norm": 0.7455368638038635,
-      "learning_rate": 1.7002772178705716e-05,
-      "loss": 1.6594,
-      "step": 270
-    },
-    {
-      "epoch": 4.198473282442748,
-      "grad_norm": 0.5774383544921875,
-      "learning_rate": 1.4123604172419713e-05,
-      "loss": 1.6527,
-      "step": 275
-    },
-    {
-      "epoch": 4.2748091603053435,
-      "grad_norm": 0.5370898842811584,
-      "learning_rate": 1.149288852608743e-05,
-      "loss": 1.6587,
-      "step": 280
-    },
-    {
-      "epoch": 4.351145038167939,
-      "grad_norm": 0.7321135997772217,
-      "learning_rate": 9.118236266049707e-06,
-      "loss": 1.6676,
-      "step": 285
-    },
-    {
-      "epoch": 4.427480916030534,
-      "grad_norm": 0.5155964493751526,
-      "learning_rate": 7.0065175916482095e-06,
-      "loss": 1.6579,
-      "step": 290
-    },
-    {
-      "epoch": 4.5038167938931295,
-      "grad_norm": 0.6737932562828064,
-      "learning_rate": 5.163841998782837e-06,
-      "loss": 1.6508,
-      "step": 295
-    },
-    {
-      "epoch": 4.580152671755725,
-      "grad_norm": 0.9017395377159119,
-      "learning_rate": 3.595540604290437e-06,
-      "loss": 1.6375,
-      "step": 300
-    },
-    {
-      "epoch": 4.65648854961832,
-      "grad_norm": 0.5460083484649658,
-      "learning_rate": 2.30615072228183e-06,
-      "loss": 1.6522,
-      "step": 305
-    },
-    {
-      "epoch": 4.732824427480916,
-      "grad_norm": 0.5443113446235657,
-      "learning_rate": 1.2994027370611173e-06,
-      "loss": 1.648,
-      "step": 310
-    },
-    {
-      "epoch": 4.809160305343512,
-      "grad_norm": 0.6177972555160522,
-      "learning_rate": 5.782093106048159e-07,
-      "loss": 1.6559,
-      "step": 315
-    },
-    {
-      "epoch": 4.885496183206107,
-      "grad_norm": 0.4734289050102234,
-      "learning_rate": 1.446569558255395e-07,
-      "loss": 1.6443,
-      "step": 320
-    },
-    {
-      "epoch": 4.961832061068702,
-      "grad_norm": 0.6619871854782104,
       "learning_rate": 0.0,
-      "loss": 1.6463,
-      "step": 325
     },
     {
-      "epoch": 4.961832061068702,
-      "eval_loss": 1.664337158203125,
-      "eval_runtime": 18.9808,
-      "eval_samples_per_second": 48.523,
-      "eval_steps_per_second": 0.79,
-      "step": 325
     },
     {
-      "epoch": 4.961832061068702,
-      "step": 325,
-      "total_flos": 9.909828121379471e+17,
-      "train_loss": 5.476599056537335,
-      "train_runtime": 4095.1846,
-      "train_samples_per_second": 10.222,
-      "train_steps_per_second": 0.079
     }
   ],
   "logging_steps": 5,
-  "max_steps": 325,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -537,7 +246,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 9.909828121379471e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
+  "global_step": 140,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.007142857142857143,
+      "grad_norm": 106.10796356201172,
+      "learning_rate": 1.4285714285714285e-05,
+      "loss": 27.4831,
       "step": 1
     },
     {
+      "epoch": 0.03571428571428571,
+      "grad_norm": 46.88632583618164,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 26.3105,
       "step": 5
     },
     {
+      "epoch": 0.07142857142857142,
+      "grad_norm": 16.438461303710938,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 20.4923,
       "step": 10
     },
     {
+      "epoch": 0.10714285714285714,
+      "grad_norm": 9.703349113464355,
+      "learning_rate": 0.00019996891820008164,
+      "loss": 16.0475,
       "step": 15
     },
     {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 3.3252718448638916,
+      "learning_rate": 0.00019888308262251285,
+      "loss": 13.4483,
       "step": 20
     },
     {
+      "epoch": 0.17857142857142858,
+      "grad_norm": 3.251009941101074,
+      "learning_rate": 0.0001962624246950012,
+      "loss": 12.6172,
       "step": 25
     },
     {
+      "epoch": 0.21428571428571427,
+      "grad_norm": 5.38721227645874,
+      "learning_rate": 0.00019214762118704076,
+      "loss": 11.9827,
       "step": 30
     },
     {
+      "epoch": 0.25,
+      "grad_norm": 10.532784461975098,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 11.0145,
       "step": 35
     },
     {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 14.15513801574707,
+      "learning_rate": 0.00017971325072229226,
+      "loss": 9.3353,
       "step": 40
     },
     {
+      "epoch": 0.32142857142857145,
+      "grad_norm": 21.529788970947266,
+      "learning_rate": 0.00017158668492597186,
+      "loss": 7.229,
       "step": 45
     },
     {
+      "epoch": 0.35714285714285715,
+      "grad_norm": 15.934834480285645,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 4.7381,
       "step": 50
     },
     {
+      "epoch": 0.39285714285714285,
+      "grad_norm": 5.8390045166015625,
+      "learning_rate": 0.0001521435203379498,
+      "loss": 2.8963,
       "step": 55
     },
     {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 4.718578815460205,
+      "learning_rate": 0.00014112871031306119,
+      "loss": 2.4009,
       "step": 60
     },
     {
+      "epoch": 0.4642857142857143,
+      "grad_norm": 3.6623828411102295,
+      "learning_rate": 0.00012947551744109043,
+      "loss": 2.1721,
       "step": 65
     },
     {
+      "epoch": 0.5,
+      "grad_norm": 2.086202383041382,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 2.0155,
       "step": 70
     },
     {
+      "epoch": 0.5357142857142857,
+      "grad_norm": 1.1254757642745972,
+      "learning_rate": 0.00010498458856606972,
+      "loss": 1.9116,
       "step": 75
     },
     {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 1.9386184215545654,
+      "learning_rate": 9.252699064135758e-05,
+      "loss": 1.8129,
       "step": 80
     },
     {
+      "epoch": 0.6071428571428571,
+      "grad_norm": 1.5302088260650635,
+      "learning_rate": 8.018538568006027e-05,
+      "loss": 1.7831,
       "step": 85
     },
     {
+      "epoch": 0.6428571428571429,
+      "grad_norm": 1.059885859489441,
+      "learning_rate": 6.815133497483157e-05,
+      "loss": 1.7466,
       "step": 90
     },
     {
+      "epoch": 0.6785714285714286,
+      "grad_norm": 1.2457741498947144,
+      "learning_rate": 5.6611626088244194e-05,
+      "loss": 1.7392,
       "step": 95
     },
     {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 1.7660045623779297,
+      "learning_rate": 4.574537361342407e-05,
+      "loss": 1.6987,
       "step": 100
     },
     {
+      "epoch": 0.75,
+      "grad_norm": 0.7355481386184692,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 1.6811,
       "step": 105
     },
     {
+      "epoch": 0.7857142857142857,
+      "grad_norm": 0.7224046587944031,
+      "learning_rate": 2.669481281701739e-05,
+      "loss": 1.7008,
       "step": 110
     },
     {
+      "epoch": 0.8214285714285714,
+      "grad_norm": 0.7918136715888977,
+      "learning_rate": 1.880619942841435e-05,
+      "loss": 1.6759,
       "step": 115
     },
     {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 1.5361932516098022,
+      "learning_rate": 1.2177842662977135e-05,
+      "loss": 1.6748,
       "step": 120
     },
     {
+      "epoch": 0.8928571428571429,
+      "grad_norm": 1.0850142240524292,
+      "learning_rate": 6.9126251355795864e-06,
+      "loss": 1.6777,
       "step": 125
     },
     {
+      "epoch": 0.9285714285714286,
+      "grad_norm": 0.7962830662727356,
+      "learning_rate": 3.092271377092215e-06,
+      "loss": 1.6705,
       "step": 130
     },
     {
+      "epoch": 0.9642857142857143,
+      "grad_norm": 0.7234132885932922,
+      "learning_rate": 7.760793399827937e-07,
+      "loss": 1.6417,
       "step": 135
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 0.7538830637931824,
       "learning_rate": 0.0,
+      "loss": 1.6542,
+      "step": 140
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 1.6615785360336304,
+      "eval_runtime": 3.8187,
+      "eval_samples_per_second": 46.613,
+      "eval_steps_per_second": 0.786,
+      "step": 140
     },
     {
+      "epoch": 1.0,
+      "step": 140,
+      "total_flos": 4.268849030789857e+17,
+      "train_loss": 5.964417205538068,
+      "train_runtime": 1743.5737,
+      "train_samples_per_second": 10.269,
+      "train_steps_per_second": 0.08
     }
   ],
   "logging_steps": 5,
+  "max_steps": 140,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
   "save_steps": 100,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 4.268849030789857e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null