Model save

Browse files

Files changed (13) hide show

.gitattributes +1 -0
README.md +67 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
all_results.json +8 -0
runs/Apr25_16-17-37_COE-CS-sv003/events.out.tfevents.1714061941.COE-CS-sv003.466421.0 +3 -0
runs/Apr25_16-26-49_COE-CS-sv003/events.out.tfevents.1714062426.COE-CS-sv003.466694.0 +3 -0
special_tokens_map.json +28 -0
tokenizer.json +3 -0
tokenizer_config.json +70 -0
train_results.json +8 -0
trainer_state.json +990 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,67 @@

+---
+license: gemma
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: google/gemma-7b
+model-index:
+- name: zephyr-7b-gemma-sft-20p-2048
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# zephyr-7b-gemma-sft-20p-2048
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on an unspecified dataset (no dataset name was recorded when this card was generated).
+It achieves the following results on the evaluation set:
+- Loss: 1.2425
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 4
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 0.9395        | 1.0   | 675  | 1.2425          |
+### Framework versions
+- PEFT 0.7.1
+- Transformers 4.39.0.dev0
+- PyTorch 2.1.2
+- Datasets 2.14.6
+- Tokenizers 0.15.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-7b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 8,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 6,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d0f5a2c53b7ac5e956039e1cf5eeb1a747b2280358fffb9af9f1002df3bf56e
+size 37555048

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 1.0,
+    "train_loss": 1.3901304527565286,
+    "train_runtime": 7332.1213,
+    "train_samples": 21594,
+    "train_samples_per_second": 2.945,
+    "train_steps_per_second": 0.092
+}

runs/Apr25_16-17-37_COE-CS-sv003/events.out.tfevents.1714061941.COE-CS-sv003.466421.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edef0e3fc5e5a93597eb1e32cfe3a12c0b2a5c6331d8d21911a76330ee0ad1e2
+size 6466

runs/Apr25_16-26-49_COE-CS-sv003/events.out.tfevents.1714062426.COE-CS-sv003.466694.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48b041718b1dd279c1d1a4cdea90ef263664722bbf9e37dcb174fd98e4535cf2
+size 34027

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|im_start|>",
+  "eos_token": "<|im_end|>",
+  "pad_token": "<|im_end|>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:299f8e59a5c4a3b3941dbe1159a7079d69c7a8d5ca34322ace1be9140ae76cc0
+size 17477572

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,70 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>"
+  ],
+  "bos_token": "<|im_start|>",
+  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "legacy": null,
+  "model_max_length": 2048,
+  "pad_token": "<|im_end|>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 1.0,
+    "train_loss": 1.3901304527565286,
+    "train_runtime": 7332.1213,
+    "train_samples": 21594,
+    "train_samples_per_second": 2.945,
+    "train_steps_per_second": 0.092
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,990 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 675,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 73.53218812891343,
+      "learning_rate": 2.9411764705882355e-06,
+      "loss": 17.1887,
+      "step": 1
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 72.20908013900717,
+      "learning_rate": 1.4705882352941177e-05,
+      "loss": 16.7463,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 62.778264757967754,
+      "learning_rate": 2.9411764705882354e-05,
+      "loss": 15.7811,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 49.113837778155805,
+      "learning_rate": 4.411764705882353e-05,
+      "loss": 12.4438,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 32.240597800281805,
+      "learning_rate": 5.882352941176471e-05,
+      "loss": 7.8515,
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 15.86664975490431,
+      "learning_rate": 7.352941176470589e-05,
+      "loss": 5.0038,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 10.302430667844247,
+      "learning_rate": 8.823529411764706e-05,
+      "loss": 3.5358,
+      "step": 30
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 7.201438595032495,
+      "learning_rate": 0.00010294117647058823,
+      "loss": 2.3707,
+      "step": 35
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 4.623788563021874,
+      "learning_rate": 0.00011764705882352942,
+      "loss": 1.7962,
+      "step": 40
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.827534440198122,
+      "learning_rate": 0.0001323529411764706,
+      "loss": 1.5507,
+      "step": 45
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 2.4371715524744273,
+      "learning_rate": 0.00014705882352941178,
+      "loss": 1.4152,
+      "step": 50
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 4.6308985646582865,
+      "learning_rate": 0.00016176470588235295,
+      "loss": 1.3047,
+      "step": 55
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 2.6277992850244916,
+      "learning_rate": 0.00017647058823529413,
+      "loss": 1.1884,
+      "step": 60
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.810606688485043,
+      "learning_rate": 0.0001911764705882353,
+      "loss": 1.1846,
+      "step": 65
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.6607629181186598,
+      "learning_rate": 0.00019999464266898484,
+      "loss": 1.0985,
+      "step": 70
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.4899860561533123,
+      "learning_rate": 0.00019993437928712978,
+      "loss": 1.0778,
+      "step": 75
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.5873695514869177,
+      "learning_rate": 0.0001998071963486563,
+      "loss": 1.1472,
+      "step": 80
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.5725818281652244,
+      "learning_rate": 0.00019961317901970953,
+      "loss": 1.0508,
+      "step": 85
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.7033192337312772,
+      "learning_rate": 0.0001993524572210807,
+      "loss": 1.1184,
+      "step": 90
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.864811477430765,
+      "learning_rate": 0.00019902520554120772,
+      "loss": 1.0191,
+      "step": 95
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.9842754161579026,
+      "learning_rate": 0.00019863164311926433,
+      "loss": 1.0969,
+      "step": 100
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.8657659018311334,
+      "learning_rate": 0.00019817203349841738,
+      "loss": 1.0578,
+      "step": 105
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.4402254237747114,
+      "learning_rate": 0.00019764668444934854,
+      "loss": 1.0136,
+      "step": 110
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.985678949607785,
+      "learning_rate": 0.0001970559477641606,
+      "loss": 1.0014,
+      "step": 115
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.4715740807243745,
+      "learning_rate": 0.0001964002190208052,
+      "loss": 1.0444,
+      "step": 120
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 2.351331444939703,
+      "learning_rate": 0.00019567993731818984,
+      "loss": 1.0044,
+      "step": 125
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.7332542810251639,
+      "learning_rate": 0.00019489558498214196,
+      "loss": 0.9762,
+      "step": 130
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.3569747435542954,
+      "learning_rate": 0.00019404768724242666,
+      "loss": 1.0202,
+      "step": 135
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.6992316193002084,
+      "learning_rate": 0.00019313681188103457,
+      "loss": 1.0261,
+      "step": 140
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 2.269643225680648,
+      "learning_rate": 0.000192163568851975,
+      "loss": 1.0437,
+      "step": 145
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.7489937385172625,
+      "learning_rate": 0.00019112860987282958,
+      "loss": 0.9816,
+      "step": 150
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.4197466438048754,
+      "learning_rate": 0.0001900326279883392,
+      "loss": 0.9973,
+      "step": 155
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.5625301281965893,
+      "learning_rate": 0.00018887635710631716,
+      "loss": 1.0078,
+      "step": 160
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.341641051289465,
+      "learning_rate": 0.00018766057150619865,
+      "loss": 0.9759,
+      "step": 165
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.9218279347652023,
+      "learning_rate": 0.00018638608532055634,
+      "loss": 0.9497,
+      "step": 170
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.5814298979333805,
+      "learning_rate": 0.00018505375198992857,
+      "loss": 0.9593,
+      "step": 175
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.6133434657400032,
+      "learning_rate": 0.00018366446369132578,
+      "loss": 0.9657,
+      "step": 180
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.8656214764246037,
+      "learning_rate": 0.00018221915074079762,
+      "loss": 0.931,
+      "step": 185
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.475284079334899,
+      "learning_rate": 0.00018071878097046065,
+      "loss": 1.0032,
+      "step": 190
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.3417878211289882,
+      "learning_rate": 0.00017916435908040413,
+      "loss": 0.9502,
+      "step": 195
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.5671700546108844,
+      "learning_rate": 0.00017755692596590778,
+      "loss": 0.9655,
+      "step": 200
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.5637519254714574,
+      "learning_rate": 0.00017589755802042186,
+      "loss": 1.0083,
+      "step": 205
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.3335973670360752,
+      "learning_rate": 0.00017418736641477636,
+      "loss": 0.9257,
+      "step": 210
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 1.851933280343888,
+      "learning_rate": 0.0001724274963531022,
+      "loss": 0.9555,
+      "step": 215
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.597267903033207,
+      "learning_rate": 0.00017061912630596252,
+      "loss": 0.961,
+      "step": 220
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.2586072933244326,
+      "learning_rate": 0.00016876346722120747,
+      "loss": 0.9545,
+      "step": 225
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.91126837231012,
+      "learning_rate": 0.00016686176171308126,
+      "loss": 1.0021,
+      "step": 230
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.221506251787188,
+      "learning_rate": 0.0001649152832301241,
+      "loss": 0.9536,
+      "step": 235
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.6421046726120057,
+      "learning_rate": 0.00016292533520242662,
+      "loss": 0.989,
+      "step": 240
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.65537527957207,
+      "learning_rate": 0.00016089325016880736,
+      "loss": 0.9306,
+      "step": 245
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 1.551249778846203,
+      "learning_rate": 0.0001588203888844982,
+      "loss": 0.933,
+      "step": 250
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.4718905144882328,
+      "learning_rate": 0.00015670813940993502,
+      "loss": 0.9966,
+      "step": 255
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.6457075355627533,
+      "learning_rate": 0.00015455791618126404,
+      "loss": 0.9326,
+      "step": 260
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.4689616265296752,
+      "learning_rate": 0.00015237115906318563,
+      "loss": 0.9327,
+      "step": 265
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.5768108727267562,
+      "learning_rate": 0.0001501493323847707,
+      "loss": 0.8785,
+      "step": 270
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.5154414103112674,
+      "learning_rate": 0.00014789392395889468,
+      "loss": 0.9677,
+      "step": 275
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.7991573095908828,
+      "learning_rate": 0.00014560644408594602,
+      "loss": 0.9732,
+      "step": 280
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.331331813541862,
+      "learning_rate": 0.0001432884245424761,
+      "loss": 0.895,
+      "step": 285
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 1.3265879307262,
+      "learning_rate": 0.00014094141755546815,
+      "loss": 0.9495,
+      "step": 290
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.373983753071535,
+      "learning_rate": 0.00013856699476291176,
+      "loss": 0.9167,
+      "step": 295
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.3848598834980737,
+      "learning_rate": 0.000136166746161379,
+      "loss": 0.9327,
+      "step": 300
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 1.349182069364577,
+      "learning_rate": 0.00013374227904130724,
+      "loss": 0.9156,
+      "step": 305
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.231890373583718,
+      "learning_rate": 0.00013129521691070107,
+      "loss": 0.9024,
+      "step": 310
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.3184286986849199,
+      "learning_rate": 0.00012882719840797473,
+      "loss": 0.946,
+      "step": 315
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 1.2850837837862192,
+      "learning_rate": 0.0001263398762046623,
+      "loss": 0.9647,
+      "step": 320
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.0818663632355159,
+      "learning_rate": 0.00012383491589873123,
+      "loss": 0.8986,
+      "step": 325
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.31127052136108,
+      "learning_rate": 0.0001213139948992394,
+      "loss": 0.9347,
+      "step": 330
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.3354281558595502,
+      "learning_rate": 0.0001187788013030837,
+      "loss": 0.912,
+      "step": 335
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.3230181317972467,
+      "learning_rate": 0.00011623103276459086,
+      "loss": 0.9542,
+      "step": 340
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.3558452166577937,
+      "learning_rate": 0.00011367239535870913,
+      "loss": 0.9307,
+      "step": 345
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.2776477730191607,
+      "learning_rate": 0.00011110460243856052,
+      "loss": 0.842,
+      "step": 350
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.1931456337138724,
+      "learning_rate": 0.0001085293734881197,
+      "loss": 1.023,
+      "step": 355
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.4101263363441188,
+      "learning_rate": 0.00010594843297078737,
+      "loss": 0.8469,
+      "step": 360
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.4759810995782983,
+      "learning_rate": 0.00010336350917462925,
+      "loss": 0.9429,
+      "step": 365
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.3195800017049626,
+      "learning_rate": 0.00010077633305505403,
+      "loss": 0.9467,
+      "step": 370
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.4340773951366725,
+      "learning_rate": 9.818863707570475e-05,
+      "loss": 0.9234,
+      "step": 375
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.3906713677707887,
+      "learning_rate": 9.560215404834095e-05,
+      "loss": 0.8627,
+      "step": 380
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.2312829341126241,
+      "learning_rate": 9.30186159724869e-05,
+      "loss": 0.9707,
+      "step": 385
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 1.2296503762823234,
+      "learning_rate": 9.043975287562441e-05,
+      "loss": 0.8975,
+      "step": 390
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.428266488089383,
+      "learning_rate": 8.786729165470584e-05,
+      "loss": 0.9242,
+      "step": 395
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.4351708822396818,
+      "learning_rate": 8.530295491976337e-05,
+      "loss": 0.9613,
+      "step": 400
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.4023979907327273,
+      "learning_rate": 8.274845984038916e-05,
+      "loss": 0.9386,
+      "step": 405
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.5289715737644887,
+      "learning_rate": 8.020551699585842e-05,
+      "loss": 0.8882,
+      "step": 410
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 1.3427576459400747,
+      "learning_rate": 7.76758292296659e-05,
+      "loss": 0.9386,
+      "step": 415
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 1.6234983552618958,
+      "learning_rate": 7.516109050924201e-05,
+      "loss": 0.9497,
+      "step": 420
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.3757787363056968,
+      "learning_rate": 7.266298479161318e-05,
+      "loss": 0.9353,
+      "step": 425
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.0207515861639591,
+      "learning_rate": 7.01831848957653e-05,
+      "loss": 0.8773,
+      "step": 430
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.066825211996411,
+      "learning_rate": 6.772335138246548e-05,
+      "loss": 0.8815,
+      "step": 435
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.585875694236883,
+      "learning_rate": 6.528513144229255e-05,
+      "loss": 0.8624,
+      "step": 440
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 3.1374098679372833,
+      "learning_rate": 6.287015779262064e-05,
+      "loss": 0.8769,
+      "step": 445
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.4323569006609465,
+      "learning_rate": 6.048004758429451e-05,
+      "loss": 0.9578,
+      "step": 450
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.1745446431201967,
+      "learning_rate": 5.8116401318728667e-05,
+      "loss": 0.9778,
+      "step": 455
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 1.3489994553201705,
+      "learning_rate": 5.578080177615575e-05,
+      "loss": 0.9453,
+      "step": 460
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.4726017758093604,
+      "learning_rate": 5.3474812955741404e-05,
+      "loss": 0.9388,
+      "step": 465
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.4222376466782578,
+      "learning_rate": 5.119997902827584e-05,
+      "loss": 0.9389,
+      "step": 470
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 1.312463261970253,
+      "learning_rate": 4.895782330214291e-05,
+      "loss": 0.9978,
+      "step": 475
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 1.4811777334942215,
+      "learning_rate": 4.674984720325961e-05,
+      "loss": 0.9229,
+      "step": 480
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.187387116264144,
+      "learning_rate": 4.4577529269668874e-05,
+      "loss": 0.9319,
+      "step": 485
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.573886175317443,
+      "learning_rate": 4.244232416145839e-05,
+      "loss": 0.8582,
+      "step": 490
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 1.8423570100062419,
+      "learning_rate": 4.0345661686669745e-05,
+      "loss": 0.9875,
+      "step": 495
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.3959739056560523,
+      "learning_rate": 3.828894584384867e-05,
+      "loss": 0.9499,
+      "step": 500
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.2522644394201856,
+      "learning_rate": 3.62735538818787e-05,
+      "loss": 0.9465,
+      "step": 505
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.1092455291419923,
+      "learning_rate": 3.43008353777269e-05,
+      "loss": 0.8705,
+      "step": 510
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.3494618399232021,
+      "learning_rate": 3.237211133272004e-05,
+      "loss": 0.8925,
+      "step": 515
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.327141307012543,
+      "learning_rate": 3.0488673287955882e-05,
+      "loss": 0.8864,
+      "step": 520
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 1.4712560536545136,
+      "learning_rate": 2.8651782459442176e-05,
+      "loss": 0.9095,
+      "step": 525
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.0753618251829293,
+      "learning_rate": 2.686266889354211e-05,
+      "loss": 0.9094,
+      "step": 530
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.1938128239663364,
+      "learning_rate": 2.5122530643292275e-05,
+      "loss": 0.895,
+      "step": 535
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.2604734873430494,
+      "learning_rate": 2.3432532966144527e-05,
+      "loss": 0.9122,
+      "step": 540
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.1630833266614449,
+      "learning_rate": 2.1793807543668853e-05,
+      "loss": 0.8707,
+      "step": 545
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.4007959537230523,
+      "learning_rate": 2.0207451723739633e-05,
+      "loss": 0.9303,
+      "step": 550
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.1568429331812096,
+      "learning_rate": 1.8674527785713247e-05,
+      "loss": 0.918,
+      "step": 555
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 1.7514698026073916,
+      "learning_rate": 1.7196062229088604e-05,
+      "loss": 0.9194,
+      "step": 560
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 3.820001651436939,
+      "learning_rate": 1.577304508612717e-05,
+      "loss": 0.9079,
+      "step": 565
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 1.189776848938503,
+      "learning_rate": 1.4406429258892762e-05,
+      "loss": 0.8622,
+      "step": 570
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.2555397613395767,
+      "learning_rate": 1.3097129881154934e-05,
+      "loss": 0.963,
+      "step": 575
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 1.434313272835203,
+      "learning_rate": 1.1846023705583442e-05,
+      "loss": 0.903,
+      "step": 580
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.1102917022071244,
+      "learning_rate": 1.065394851664394e-05,
+      "loss": 0.8873,
+      "step": 585
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.346033677968864,
+      "learning_rate": 9.521702569588198e-06,
+      "loss": 0.9241,
+      "step": 590
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.352314708606863,
+      "learning_rate": 8.450044055914497e-06,
+      "loss": 0.844,
+      "step": 595
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 1.4077107378660827,
+      "learning_rate": 7.439690595656013e-06,
+      "loss": 0.9185,
+      "step": 600
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.2576697014152542,
+      "learning_rate": 6.4913187568374164e-06,
+      "loss": 0.9464,
+      "step": 605
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.185889063940016,
+      "learning_rate": 5.605563602421149e-06,
+      "loss": 0.9113,
+      "step": 610
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.289407769067274,
+      "learning_rate": 4.783018265047179e-06,
+      "loss": 0.9136,
+      "step": 615
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.1909907385752865,
+      "learning_rate": 4.024233549850509e-06,
+      "loss": 0.9004,
+      "step": 620
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.3163426883308564,
+      "learning_rate": 3.329717565622825e-06,
+      "loss": 0.9107,
+      "step": 625
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.0164303782764494,
+      "learning_rate": 2.699935384565111e-06,
+      "loss": 0.863,
+      "step": 630
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.2480074548201365,
+      "learning_rate": 2.1353087308590314e-06,
+      "loss": 0.9415,
+      "step": 635
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 1.2887306090020527,
+      "learning_rate": 1.6362156982656084e-06,
+      "loss": 0.9614,
+      "step": 640
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.4090113723084001,
+      "learning_rate": 1.2029904969404482e-06,
+      "loss": 0.9415,
+      "step": 645
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.2326893640043235,
+      "learning_rate": 8.359232296349162e-07,
+      "loss": 0.9132,
+      "step": 650
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 1.2349784200342029,
+      "learning_rate": 5.352596974332436e-07,
+      "loss": 0.9187,
+      "step": 655
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.3101498596239862,
+      "learning_rate": 3.0120123515540164e-07,
+      "loss": 0.9452,
+      "step": 660
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.281188961147188,
+      "learning_rate": 1.3390457653639222e-07,
+      "loss": 0.9203,
+      "step": 665
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.4762914419165216,
+      "learning_rate": 3.3481749271768726e-08,
+      "loss": 0.8818,
+      "step": 670
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.479271940054715,
+      "learning_rate": 0.0,
+      "loss": 0.9395,
+      "step": 675
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.2425206899642944,
+      "eval_runtime": 248.4678,
+      "eval_samples_per_second": 9.297,
+      "eval_steps_per_second": 0.584,
+      "step": 675
+    },
+    {
+      "epoch": 1.0,
+      "step": 675,
+      "total_flos": 1369955766894592.0,
+      "train_loss": 1.3901304527565286,
+      "train_runtime": 7332.1213,
+      "train_samples_per_second": 2.945,
+      "train_steps_per_second": 0.092
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 675,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "total_flos": 1369955766894592.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3b1d16005a217e0e428296578f2f031f3c28f06360053969da3ee953a75e35d
+size 6200