Commit 204530b by chansung
1 parent: 5215b73

Model save
README.md CHANGED
@@ -1,57 +1,73 @@
  ---
  base_model: google/gemma-7b
- library_name: transformers
- model_name: gemma7b-lora-alpaca-11-v1
  tags:
- - generated_from_trainer
  - trl
  - sft
- licence: license
  ---

- # Model Card for gemma7b-lora-alpaca-11-v1

- This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b).
- It has been trained using [TRL](https://github.com/huggingface/trl).

- ## Quick start

- ```python
- from transformers import pipeline

- question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
- generator = pipeline("text-generation", model="klcsp/gemma7b-lora-alpaca-11-v1", device="cuda")
- output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
- print(output["generated_text"])
- ```

- ## Training procedure

- This model was trained with SFT.

- ### Framework versions

- - TRL: 0.12.1
- - Transformers: 4.46.2
- - Pytorch: 2.3.1+cu121
- - Datasets: 3.1.0
- - Tokenizers: 0.20.3

- ## Citations

- Cite TRL as:
-
- ```bibtex
- @misc{vonwerra2022trl,
- title = {{TRL: Transformer Reinforcement Learning}},
- author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
- year = 2020,
- journal = {GitHub repository},
- publisher = {GitHub},
- howpublished = {\url{https://github.com/huggingface/trl}}
- }
- ```
  ---
+ library_name: peft
+ license: gemma
  base_model: google/gemma-7b
  tags:
  - trl
  - sft
+ - generated_from_trainer
+ datasets:
+ - generator
+ model-index:
+ - name: gemma7b-lora-alpaca-11-v1
+   results: []
  ---

+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->

+ # gemma7b-lora-alpaca-11-v1

+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.6643

+ ## Model description

+ More information needed

+ ## Intended uses & limitations

+ More information needed

+ ## Training and evaluation data

+ More information needed

+ ## Training procedure
+
+ ### Training hyperparameters

+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 2
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 64
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 5

+ ### Training results

+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 2.9056 | 0.9924 | 65 | 2.6113 |
+ | 1.8271 | 2.0 | 131 | 1.8230 |
+ | 1.7019 | 2.9924 | 196 | 1.7041 |
+ | 1.7024 | 4.0 | 262 | 1.6962 |
+ | 1.6463 | 4.9618 | 325 | 1.6643 |

+ ### Framework versions
+
+ - PEFT 0.13.2
+ - Transformers 4.46.2
+ - Pytorch 2.3.1+cu121
+ - Datasets 3.1.0
+ - Tokenizers 0.20.3
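The updated card's hyperparameter list implies the effective batch sizes it reports. A minimal sketch of that arithmetic, assuming the usual Trainer convention of per-device batch size × device count × gradient-accumulation steps (the function name `effective_batch_size` is illustrative, not from the repo):

```python
def effective_batch_size(per_device: int, num_devices: int, grad_accum: int = 1) -> int:
    """Total examples consumed per optimizer step across all devices."""
    return per_device * num_devices * grad_accum

# Values from the hyperparameter list above.
train_total = effective_batch_size(per_device=8, num_devices=8, grad_accum=2)
eval_total = effective_batch_size(per_device=8, num_devices=8)  # no accumulation at eval
print(train_total, eval_total)  # → 128 64
```

This matches the card's total_train_batch_size of 128 and total_eval_batch_size of 64.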
all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
- "epoch": 1.0,
- "eval_loss": 2.035097599029541,
- "eval_runtime": 20.9523,
- "eval_samples": 5201,
- "eval_samples_per_second": 43.957,
- "eval_steps_per_second": 0.955,
- "total_flos": 1.997211509414953e+17,
- "train_loss": 9.505996913400315,
- "train_runtime": 975.3365,
  "train_samples": 46801,
- "train_samples_per_second": 8.584,
- "train_steps_per_second": 0.134
  }

  {
+ "epoch": 4.961832061068702,
+ "total_flos": 9.909828121379471e+17,
+ "train_loss": 5.476599056537335,
+ "train_runtime": 4095.1846,
  "train_samples": 46801,
+ "train_samples_per_second": 10.222,
+ "train_steps_per_second": 0.079
  }
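The updated aggregates are internally consistent. A quick sanity-check sketch, assuming train_steps_per_second is simply the final global step count (325, per the trainer state) divided by train_runtime, rounded to three decimals:

```python
train_runtime = 4095.1846  # seconds, from the new all_results.json
global_steps = 325         # final global_step in the new trainer_state.json

steps_per_second = round(global_steps / train_runtime, 3)
print(steps_per_second)  # → 0.079, matching "train_steps_per_second"
```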
runs/Nov15_11-44-05_main-lora-gemma7b-alpaca-0-0/events.out.tfevents.1731689602.main-lora-gemma7b-alpaca-0-0.456.0 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8844cbc996531590a77871d77967251e6123852a5430c5f0dd09d09de470f966
- size 21563

  version https://git-lfs.github.com/spec/v1
+ oid sha256:260fbb5a0d7c1f1237912d519bec44b4033367356963a4f7b20a8fa648872c33
+ size 22188
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
- "epoch": 1.0,
- "total_flos": 1.997211509414953e+17,
- "train_loss": 9.505996913400315,
- "train_runtime": 975.3365,
  "train_samples": 46801,
- "train_samples_per_second": 8.584,
- "train_steps_per_second": 0.134
  }

  {
+ "epoch": 4.961832061068702,
+ "total_flos": 9.909828121379471e+17,
+ "train_loss": 5.476599056537335,
+ "train_runtime": 4095.1846,
  "train_samples": 46801,
+ "train_samples_per_second": 10.222,
+ "train_steps_per_second": 0.079
  }
trainer_state.json CHANGED
@@ -1,224 +1,529 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.0,
  "eval_steps": 500,
- "global_step": 131,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.007633587786259542,
- "grad_norm": 177.04722595214844,
- "learning_rate": 1.4285714285714285e-05,
- "loss": 47.6977,
  "step": 1
  },
  {
- "epoch": 0.03816793893129771,
- "grad_norm": 85.62853240966797,
- "learning_rate": 7.142857142857143e-05,
- "loss": 43.3882,
  "step": 5
  },
  {
- "epoch": 0.07633587786259542,
- "grad_norm": 22.000961303710938,
- "learning_rate": 0.00014285714285714287,
- "loss": 32.4726,
  "step": 10
  },
  {
- "epoch": 0.11450381679389313,
- "grad_norm": 12.63040828704834,
- "learning_rate": 0.00019996395276708856,
- "loss": 26.3526,
  "step": 15
  },
  {
- "epoch": 0.15267175572519084,
- "grad_norm": 6.1236572265625,
- "learning_rate": 0.00019870502626379127,
- "loss": 23.8025,
  "step": 20
  },
  {
- "epoch": 0.19083969465648856,
- "grad_norm": 9.36489200592041,
- "learning_rate": 0.00019566964208274254,
- "loss": 22.3186,
  "step": 25
  },
  {
- "epoch": 0.22900763358778625,
- "grad_norm": 17.107019424438477,
- "learning_rate": 0.0001909124299802724,
- "loss": 20.77,
  "step": 30
  },
  {
- "epoch": 0.26717557251908397,
- "grad_norm": 27.212358474731445,
- "learning_rate": 0.0001845190085543795,
- "loss": 17.8624,
  "step": 35
  },
  {
- "epoch": 0.3053435114503817,
- "grad_norm": 36.82498550415039,
- "learning_rate": 0.0001766044443118978,
- "loss": 12.7998,
  "step": 40
  },
  {
- "epoch": 0.3435114503816794,
- "grad_norm": 30.142446517944336,
- "learning_rate": 0.00016731118074275704,
- "loss": 7.393,
  "step": 45
  },
  {
- "epoch": 0.3816793893129771,
- "grad_norm": 15.062440872192383,
- "learning_rate": 0.00015680647467311557,
- "loss": 4.1547,
  "step": 50
  },
  {
- "epoch": 0.4198473282442748,
- "grad_norm": 9.832117080688477,
- "learning_rate": 0.00014527938603696376,
- "loss": 3.463,
  "step": 55
  },
  {
- "epoch": 0.4580152671755725,
- "grad_norm": 4.392879009246826,
- "learning_rate": 0.00013293737524320797,
- "loss": 2.8845,
  "step": 60
  },
  {
- "epoch": 0.4961832061068702,
- "grad_norm": 2.260551929473877,
- "learning_rate": 0.00012000256937760445,
- "loss": 2.5922,
  "step": 65
  },
  {
- "epoch": 0.5343511450381679,
- "grad_norm": 3.587684154510498,
- "learning_rate": 0.00010670776443910024,
- "loss": 2.3901,
  "step": 70
  },
  {
- "epoch": 0.5725190839694656,
- "grad_norm": 2.6524131298065186,
- "learning_rate": 9.329223556089975e-05,
- "loss": 2.3052,
  "step": 75
  },
  {
- "epoch": 0.6106870229007634,
- "grad_norm": 0.9529216885566711,
- "learning_rate": 7.999743062239557e-05,
- "loss": 2.2007,
  "step": 80
  },
  {
- "epoch": 0.648854961832061,
- "grad_norm": 1.3604825735092163,
- "learning_rate": 6.706262475679205e-05,
- "loss": 2.1535,
  "step": 85
  },
  {
- "epoch": 0.6870229007633588,
- "grad_norm": 1.0362510681152344,
- "learning_rate": 5.472061396303629e-05,
- "loss": 2.1222,
  "step": 90
  },
  {
- "epoch": 0.7251908396946565,
- "grad_norm": 1.1193585395812988,
- "learning_rate": 4.3193525326884435e-05,
- "loss": 2.0834,
  "step": 95
  },
  {
- "epoch": 0.7633587786259542,
- "grad_norm": 1.9800268411636353,
- "learning_rate": 3.268881925724297e-05,
- "loss": 2.0757,
  "step": 100
  },
  {
- "epoch": 0.8015267175572519,
- "grad_norm": 1.1855801343917847,
- "learning_rate": 2.339555568810221e-05,
- "loss": 2.0367,
  "step": 105
  },
  {
- "epoch": 0.8396946564885496,
- "grad_norm": 1.8571408987045288,
- "learning_rate": 1.5480991445620542e-05,
- "loss": 2.0068,
  "step": 110
  },
  {
- "epoch": 0.8778625954198473,
- "grad_norm": 0.9461548924446106,
- "learning_rate": 9.08757001972762e-06,
- "loss": 2.0445,
  "step": 115
  },
  {
- "epoch": 0.916030534351145,
- "grad_norm": 2.1497249603271484,
- "learning_rate": 4.3303579172574885e-06,
- "loss": 2.0147,
  "step": 120
  },
  {
- "epoch": 0.9541984732824428,
- "grad_norm": 1.3707141876220703,
- "learning_rate": 1.2949737362087156e-06,
- "loss": 2.0338,
  "step": 125
  },
  {
- "epoch": 0.9923664122137404,
- "grad_norm": 1.4225437641143799,
- "learning_rate": 3.60472329114625e-08,
- "loss": 2.0402,
  "step": 130
  },
  {
- "epoch": 1.0,
- "eval_loss": 2.035097599029541,
- "eval_runtime": 20.9668,
- "eval_samples_per_second": 43.927,
- "eval_steps_per_second": 0.954,
  "step": 131
  },
  {
- "epoch": 1.0,
- "step": 131,
- "total_flos": 1.997211509414953e+17,
- "train_loss": 9.505996913400315,
- "train_runtime": 975.3365,
- "train_samples_per_second": 8.584,
- "train_steps_per_second": 0.134
  }
  ],
  "logging_steps": 5,
- "max_steps": 131,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {
@@ -232,8 +537,8 @@
  "attributes": {}
  }
  },
- "total_flos": 1.997211509414953e+17,
- "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 4.961832061068702,
  "eval_steps": 500,
+ "global_step": 325,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.015267175572519083,
+ "grad_norm": 183.11753845214844,
+ "learning_rate": 6.060606060606061e-06,
+ "loss": 46.1063,
  "step": 1
  },
  {
+ "epoch": 0.07633587786259542,
+ "grad_norm": 136.03738403320312,
+ "learning_rate": 3.0303030303030306e-05,
+ "loss": 44.0302,
  "step": 5
  },
  {
+ "epoch": 0.15267175572519084,
+ "grad_norm": 69.2432632446289,
+ "learning_rate": 6.060606060606061e-05,
+ "loss": 38.4659,
  "step": 10
  },
  {
+ "epoch": 0.22900763358778625,
+ "grad_norm": 17.486797332763672,
+ "learning_rate": 9.090909090909092e-05,
+ "loss": 30.3029,
  "step": 15
  },
  {
+ "epoch": 0.3053435114503817,
+ "grad_norm": 13.530756950378418,
+ "learning_rate": 0.00012121212121212122,
+ "loss": 26.6709,
  "step": 20
  },
  {
+ "epoch": 0.3816793893129771,
+ "grad_norm": 7.521498680114746,
+ "learning_rate": 0.00015151515151515152,
+ "loss": 24.4319,
  "step": 25
  },
  {
+ "epoch": 0.4580152671755725,
+ "grad_norm": 5.912084102630615,
+ "learning_rate": 0.00018181818181818183,
+ "loss": 22.862,
  "step": 30
  },
  {
+ "epoch": 0.5343511450381679,
+ "grad_norm": 10.610209465026855,
+ "learning_rate": 0.00019997685019798912,
+ "loss": 21.5999,
  "step": 35
  },
  {
+ "epoch": 0.6106870229007634,
+ "grad_norm": 20.944725036621094,
+ "learning_rate": 0.0001997165380022878,
+ "loss": 19.4719,
  "step": 40
  },
  {
+ "epoch": 0.6870229007633588,
+ "grad_norm": 34.12383270263672,
+ "learning_rate": 0.000199167731989929,
+ "loss": 14.6832,
  "step": 45
  },
  {
+ "epoch": 0.7633587786259542,
+ "grad_norm": 42.86738204956055,
+ "learning_rate": 0.0001983320199330545,
+ "loss": 8.7569,
  "step": 50
  },
  {
+ "epoch": 0.8396946564885496,
+ "grad_norm": 12.474686622619629,
+ "learning_rate": 0.00019721181966290613,
+ "loss": 4.3457,
  "step": 55
  },
  {
+ "epoch": 0.916030534351145,
+ "grad_norm": 9.623456954956055,
+ "learning_rate": 0.00019581037207470382,
+ "loss": 3.4309,
  "step": 60
  },
  {
+ "epoch": 0.9923664122137404,
+ "grad_norm": 3.5216312408447266,
+ "learning_rate": 0.00019413173175128473,
+ "loss": 2.9056,
  "step": 65
  },
  {
+ "epoch": 0.9923664122137404,
+ "eval_loss": 2.611328125,
+ "eval_runtime": 19.2134,
+ "eval_samples_per_second": 47.935,
+ "eval_steps_per_second": 0.781,
+ "step": 65
+ },
+ {
+ "epoch": 1.0687022900763359,
+ "grad_norm": 2.9582359790802,
+ "learning_rate": 0.00019218075523263104,
+ "loss": 2.7809,
  "step": 70
  },
  {
+ "epoch": 1.1450381679389312,
+ "grad_norm": 2.319239616394043,
+ "learning_rate": 0.00018996308696522433,
+ "loss": 2.3224,
  "step": 75
  },
  {
+ "epoch": 1.2213740458015268,
+ "grad_norm": 1.3839267492294312,
+ "learning_rate": 0.00018748514297187648,
+ "loss": 2.2039,
  "step": 80
  },
  {
+ "epoch": 1.297709923664122,
+ "grad_norm": 0.5840837955474854,
+ "learning_rate": 0.00018475409228928312,
+ "loss": 2.1174,
  "step": 85
  },
  {
+ "epoch": 1.3740458015267176,
+ "grad_norm": 1.5493711233139038,
+ "learning_rate": 0.00018177783622700327,
+ "loss": 2.0565,
  "step": 90
  },
  {
+ "epoch": 1.450381679389313,
+ "grad_norm": 0.7415986657142639,
+ "learning_rate": 0.00017856498550787144,
+ "loss": 2.003,
  "step": 95
  },
  {
+ "epoch": 1.5267175572519083,
+ "grad_norm": 0.6342356204986572,
+ "learning_rate": 0.00017512483535597867,
+ "loss": 1.9686,
  "step": 100
  },
  {
+ "epoch": 1.6030534351145038,
+ "grad_norm": 1.0893248319625854,
+ "learning_rate": 0.00017146733860429612,
+ "loss": 1.9499,
  "step": 105
  },
  {
+ "epoch": 1.6793893129770994,
+ "grad_norm": 1.233128547668457,
+ "learning_rate": 0.0001676030768997445,
+ "loss": 1.9192,
  "step": 110
  },
  {
+ "epoch": 1.7557251908396947,
+ "grad_norm": 0.7829602360725403,
+ "learning_rate": 0.00016354323008901776,
+ "loss": 1.8934,
  "step": 115
  },
  {
+ "epoch": 1.83206106870229,
+ "grad_norm": 1.0393383502960205,
+ "learning_rate": 0.00015929954387373103,
+ "loss": 1.8579,
  "step": 120
  },
  {
+ "epoch": 1.9083969465648853,
+ "grad_norm": 2.433302879333496,
+ "learning_rate": 0.00015488429582847192,
+ "loss": 1.8576,
  "step": 125
  },
  {
+ "epoch": 1.984732824427481,
+ "grad_norm": 1.2537367343902588,
+ "learning_rate": 0.00015031025988006936,
+ "loss": 1.8271,
  "step": 130
  },
  {
+ "epoch": 2.0,
+ "eval_loss": 1.8229883909225464,
+ "eval_runtime": 19.0953,
+ "eval_samples_per_second": 48.232,
+ "eval_steps_per_second": 0.786,
  "step": 131
  },
  {
+ "epoch": 2.0610687022900764,
+ "grad_norm": 1.04417085647583,
+ "learning_rate": 0.00014559066935084588,
+ "loss": 1.975,
+ "step": 135
+ },
+ {
+ "epoch": 2.1374045801526718,
+ "grad_norm": 0.9754623174667358,
+ "learning_rate": 0.00014073917867277557,
+ "loss": 1.7901,
+ "step": 140
+ },
+ {
+ "epoch": 2.213740458015267,
+ "grad_norm": 0.6031882762908936,
+ "learning_rate": 0.0001357698238833126,
+ "loss": 1.7584,
+ "step": 145
+ },
+ {
+ "epoch": 2.2900763358778624,
+ "grad_norm": 1.7654844522476196,
+ "learning_rate": 0.000130696982017182,
+ "loss": 1.7665,
+ "step": 150
+ },
+ {
+ "epoch": 2.366412213740458,
+ "grad_norm": 1.8184305429458618,
+ "learning_rate": 0.0001255353295116187,
+ "loss": 1.7496,
+ "step": 155
+ },
+ {
+ "epoch": 2.4427480916030535,
+ "grad_norm": 2.4291305541992188,
+ "learning_rate": 0.00012029979974539234,
+ "loss": 1.7389,
+ "step": 160
+ },
+ {
+ "epoch": 2.519083969465649,
+ "grad_norm": 0.7844381928443909,
+ "learning_rate": 0.00011500553983446527,
+ "loss": 1.7327,
+ "step": 165
+ },
+ {
+ "epoch": 2.595419847328244,
+ "grad_norm": 1.0221455097198486,
+ "learning_rate": 0.00010966786680927874,
+ "loss": 1.7365,
+ "step": 170
+ },
+ {
+ "epoch": 2.67175572519084,
+ "grad_norm": 1.1956524848937988,
+ "learning_rate": 0.00010430222330045304,
+ "loss": 1.7204,
+ "step": 175
+ },
+ {
+ "epoch": 2.7480916030534353,
+ "grad_norm": 0.7325518131256104,
+ "learning_rate": 9.892413286110886e-05,
+ "loss": 1.7177,
+ "step": 180
+ },
+ {
+ "epoch": 2.8244274809160306,
+ "grad_norm": 0.8538561463356018,
+ "learning_rate": 9.354915505506839e-05,
+ "loss": 1.7193,
+ "step": 185
+ },
+ {
+ "epoch": 2.900763358778626,
+ "grad_norm": 1.252325415611267,
+ "learning_rate": 8.81928404408726e-05,
+ "loss": 1.7058,
+ "step": 190
+ },
+ {
+ "epoch": 2.9770992366412212,
+ "grad_norm": 0.7734937071800232,
+ "learning_rate": 8.287068558185225e-05,
+ "loss": 1.7019,
+ "step": 195
+ },
+ {
+ "epoch": 2.9923664122137406,
+ "eval_loss": 1.7041354179382324,
+ "eval_runtime": 19.3108,
+ "eval_samples_per_second": 47.694,
+ "eval_steps_per_second": 0.777,
+ "step": 196
+ },
+ {
+ "epoch": 3.053435114503817,
+ "grad_norm": 0.6631619334220886,
+ "learning_rate": 7.759808821241406e-05,
+ "loss": 1.8697,
+ "step": 200
+ },
+ {
+ "epoch": 3.1297709923664123,
+ "grad_norm": 0.7187236547470093,
+ "learning_rate": 7.239030269025311e-05,
+ "loss": 1.7181,
+ "step": 205
+ },
+ {
+ "epoch": 3.2061068702290076,
+ "grad_norm": 0.5320985913276672,
+ "learning_rate": 6.726239586337408e-05,
+ "loss": 1.7351,
+ "step": 210
+ },
+ {
+ "epoch": 3.282442748091603,
+ "grad_norm": 0.43638336658477783,
+ "learning_rate": 6.22292034796035e-05,
+ "loss": 1.7156,
+ "step": 215
+ },
+ {
+ "epoch": 3.3587786259541983,
+ "grad_norm": 0.3966742753982544,
+ "learning_rate": 5.730528726470792e-05,
+ "loss": 1.7158,
+ "step": 220
+ },
+ {
+ "epoch": 3.435114503816794,
+ "grad_norm": 0.326159805059433,
+ "learning_rate": 5.2504892793295e-05,
+ "loss": 1.7055,
+ "step": 225
+ },
+ {
+ "epoch": 3.5114503816793894,
+ "grad_norm": 0.4766685664653778,
+ "learning_rate": 4.7841908274384616e-05,
+ "loss": 1.7006,
+ "step": 230
+ },
+ {
+ "epoch": 3.5877862595419847,
+ "grad_norm": 0.41363418102264404,
+ "learning_rate": 4.332982437088825e-05,
+ "loss": 1.7106,
+ "step": 235
+ },
+ {
+ "epoch": 3.66412213740458,
+ "grad_norm": 0.5006980299949646,
+ "learning_rate": 3.898169516924398e-05,
+ "loss": 1.6938,
+ "step": 240
+ },
+ {
+ "epoch": 3.7404580152671754,
+ "grad_norm": 0.4720315933227539,
+ "learning_rate": 3.4810100412128747e-05,
+ "loss": 1.6886,
+ "step": 245
+ },
+ {
+ "epoch": 3.816793893129771,
+ "grad_norm": 0.5057269334793091,
+ "learning_rate": 3.0827109103512643e-05,
+ "loss": 1.6912,
+ "step": 250
+ },
+ {
+ "epoch": 3.8931297709923665,
+ "grad_norm": 0.38378995656967163,
+ "learning_rate": 2.7044244591351232e-05,
+ "loss": 1.7001,
+ "step": 255
+ },
+ {
+ "epoch": 3.969465648854962,
+ "grad_norm": 0.3008043169975281,
+ "learning_rate": 2.3472451228937253e-05,
+ "loss": 1.7024,
+ "step": 260
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 1.6962379217147827,
+ "eval_runtime": 18.9852,
+ "eval_samples_per_second": 48.512,
+ "eval_steps_per_second": 0.79,
+ "step": 262
+ },
+ {
+ "epoch": 4.0458015267175576,
+ "grad_norm": 0.9348434805870056,
+ "learning_rate": 2.0122062711363532e-05,
+ "loss": 1.8574,
+ "step": 265
+ },
+ {
+ "epoch": 4.122137404580153,
+ "grad_norm": 0.7455368638038635,
+ "learning_rate": 1.7002772178705716e-05,
+ "loss": 1.6594,
+ "step": 270
+ },
+ {
+ "epoch": 4.198473282442748,
+ "grad_norm": 0.5774383544921875,
+ "learning_rate": 1.4123604172419713e-05,
+ "loss": 1.6527,
+ "step": 275
+ },
+ {
+ "epoch": 4.2748091603053435,
+ "grad_norm": 0.5370898842811584,
+ "learning_rate": 1.149288852608743e-05,
+ "loss": 1.6587,
+ "step": 280
+ },
+ {
+ "epoch": 4.351145038167939,
+ "grad_norm": 0.7321135997772217,
+ "learning_rate": 9.118236266049707e-06,
+ "loss": 1.6676,
+ "step": 285
+ },
+ {
+ "epoch": 4.427480916030534,
+ "grad_norm": 0.5155964493751526,
+ "learning_rate": 7.0065175916482095e-06,
+ "loss": 1.6579,
+ "step": 290
+ },
+ {
+ "epoch": 4.5038167938931295,
+ "grad_norm": 0.6737932562828064,
+ "learning_rate": 5.163841998782837e-06,
+ "loss": 1.6508,
+ "step": 295
+ },
+ {
+ "epoch": 4.580152671755725,
+ "grad_norm": 0.9017395377159119,
+ "learning_rate": 3.595540604290437e-06,
+ "loss": 1.6375,
+ "step": 300
+ },
+ {
+ "epoch": 4.65648854961832,
+ "grad_norm": 0.5460083484649658,
+ "learning_rate": 2.30615072228183e-06,
+ "loss": 1.6522,
+ "step": 305
+ },
+ {
+ "epoch": 4.732824427480916,
+ "grad_norm": 0.5443113446235657,
+ "learning_rate": 1.2994027370611173e-06,
+ "loss": 1.648,
+ "step": 310
+ },
+ {
+ "epoch": 4.809160305343512,
+ "grad_norm": 0.6177972555160522,
+ "learning_rate": 5.782093106048159e-07,
+ "loss": 1.6559,
+ "step": 315
+ },
+ {
+ "epoch": 4.885496183206107,
+ "grad_norm": 0.4734289050102234,
+ "learning_rate": 1.446569558255395e-07,
+ "loss": 1.6443,
+ "step": 320
+ },
+ {
+ "epoch": 4.961832061068702,
+ "grad_norm": 0.6619871854782104,
+ "learning_rate": 0.0,
+ "loss": 1.6463,
+ "step": 325
+ },
+ {
+ "epoch": 4.961832061068702,
+ "eval_loss": 1.664337158203125,
+ "eval_runtime": 18.9808,
+ "eval_samples_per_second": 48.523,
+ "eval_steps_per_second": 0.79,
+ "step": 325
+ },
+ {
+ "epoch": 4.961832061068702,
+ "step": 325,
+ "total_flos": 9.909828121379471e+17,
+ "train_loss": 5.476599056537335,
+ "train_runtime": 4095.1846,
+ "train_samples_per_second": 10.222,
+ "train_steps_per_second": 0.079
  }
  ],
  "logging_steps": 5,
+ "max_steps": 325,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
  "TrainerControl": {
@@ -232,8 +537,8 @@
  "attributes": {}
  }
  },
+ "total_flos": 9.909828121379471e+17,
+ "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
  }
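The learning_rate trace in the new trainer state follows linear warmup and then cosine decay. A minimal sketch of that shape, assuming 33 warmup steps (warmup_ratio 0.1 of 325 steps, rounded up); this mirrors the form of a cosine-with-warmup schedule, not the library's actual implementation, and `lr_at` is an illustrative name:

```python
import math

def lr_at(step: int, peak: float = 2e-4, warmup: int = 33, total: int = 325) -> float:
    """Linear warmup to `peak`, then cosine decay to zero at `total`."""
    if step < warmup:
        return peak * step / warmup
    progress = (step - warmup) / (total - warmup)
    return peak * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(1))    # ≈ 6.06e-06, matching the first logged learning_rate
print(lr_at(325))  # 0.0, matching the last logged value
```

The intermediate logged values (e.g. 0.00019997685 at step 35) fall on the same curve.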