totally-not-an-llm committed
Commit ebe1b41
1 Parent(s): 544ce4b

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +150 -0
  2. adapter_config.json +34 -0
  3. adapter_model.bin +3 -0
  4. checkpoint-128/README.md +202 -0
  5. checkpoint-128/adapter_config.json +34 -0
  6. checkpoint-128/adapter_model.safetensors +3 -0
  7. checkpoint-128/optimizer.pt +3 -0
  8. checkpoint-128/rng_state.pth +3 -0
  9. checkpoint-128/scheduler.pt +3 -0
  10. checkpoint-128/special_tokens_map.json +24 -0
  11. checkpoint-128/tokenizer.model +3 -0
  12. checkpoint-128/tokenizer_config.json +44 -0
  13. checkpoint-128/trainer_state.json +1053 -0
  14. checkpoint-128/training_args.bin +3 -0
  15. checkpoint-32/README.md +202 -0
  16. checkpoint-32/adapter_config.json +34 -0
  17. checkpoint-32/adapter_model.safetensors +3 -0
  18. checkpoint-32/optimizer.pt +3 -0
  19. checkpoint-32/rng_state.pth +3 -0
  20. checkpoint-32/scheduler.pt +3 -0
  21. checkpoint-32/special_tokens_map.json +24 -0
  22. checkpoint-32/tokenizer.model +3 -0
  23. checkpoint-32/tokenizer_config.json +44 -0
  24. checkpoint-32/trainer_state.json +285 -0
  25. checkpoint-32/training_args.bin +3 -0
  26. checkpoint-64/README.md +202 -0
  27. checkpoint-64/adapter_config.json +34 -0
  28. checkpoint-64/adapter_model.safetensors +3 -0
  29. checkpoint-64/optimizer.pt +3 -0
  30. checkpoint-64/rng_state.pth +3 -0
  31. checkpoint-64/scheduler.pt +3 -0
  32. checkpoint-64/special_tokens_map.json +24 -0
  33. checkpoint-64/tokenizer.model +3 -0
  34. checkpoint-64/tokenizer_config.json +44 -0
  35. checkpoint-64/trainer_state.json +541 -0
  36. checkpoint-64/training_args.bin +3 -0
  37. checkpoint-96/README.md +202 -0
  38. checkpoint-96/adapter_config.json +34 -0
  39. checkpoint-96/adapter_model.safetensors +3 -0
  40. checkpoint-96/optimizer.pt +3 -0
  41. checkpoint-96/rng_state.pth +3 -0
  42. checkpoint-96/scheduler.pt +3 -0
  43. checkpoint-96/special_tokens_map.json +24 -0
  44. checkpoint-96/tokenizer.model +3 -0
  45. checkpoint-96/tokenizer_config.json +44 -0
  46. checkpoint-96/trainer_state.json +797 -0
  47. checkpoint-96/training_args.bin +3 -0
  48. config.json +43 -0
  49. runs/Apr30_02-51-48_663ec5cd7167/events.out.tfevents.1714445508.663ec5cd7167.5280.0 +3 -0
  50. special_tokens_map.json +24 -0
README.md CHANGED
@@ -1,3 +1,153 @@
  ---
  license: apache-2.0
+ library_name: peft
+ tags:
+ - generated_from_trainer
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ model-index:
+ - name: qlora-out
+   results: []
  ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.0`
+ ```yaml
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ model_type: LlamaForCausalLM
+ tokenizer_type: LlamaTokenizer
+
+ load_in_8bit: false
+ load_in_4bit: true
+ strict: false
+
+ datasets:
+   - path: totally-not-an-llm/ZorgonChat
+     type: alpaca
+ dataset_prepared_path:
+ val_set_size: 0.05
+ output_dir: ./qlora-out
+
+ adapter: qlora
+ lora_model_dir:
+
+ sequence_len: 4096
+ sample_packing: false
+ pad_to_sequence_len: true
+
+ lora_r: 32
+ lora_alpha: 16
+ lora_dropout: 0.05
+ lora_target_modules:
+ lora_target_linear: true
+ lora_fan_in_fan_out:
+
+ wandb_project:
+ wandb_entity:
+ wandb_watch:
+ wandb_name:
+ wandb_log_model:
+
+ gradient_accumulation_steps: 4
+ micro_batch_size: 2
+ num_epochs: 4
+ optimizer: paged_adamw_32bit
+ lr_scheduler: cosine
+ learning_rate: 0.0002
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ warmup_steps: 10
+ evals_per_epoch: 4
+ saves_per_epoch: 1
+ debug:
+ deepspeed:
+ weight_decay: 0.0
+ fsdp:
+ fsdp_config:
+ special_tokens:
+
+ ```
+
+ </details><br>
+
+ # qlora-out
+
+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the None dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 2.3466
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 2
+ - eval_batch_size: 2
+ - seed: 42
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 8
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 10
+ - num_epochs: 4
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 3.9295 | 0.03 | 1 | 3.9073 |
+ | 3.5364 | 0.25 | 8 | 3.6199 |
+ | 3.263 | 0.5 | 16 | 3.1821 |
+ | 2.798 | 0.75 | 24 | 2.8962 |
+ | 2.7787 | 1.0 | 32 | 2.6773 |
+ | 2.5959 | 1.25 | 40 | 2.5506 |
+ | 2.4793 | 1.5 | 48 | 2.4955 |
+ | 2.5221 | 1.75 | 56 | 2.4613 |
+ | 2.4384 | 2.0 | 64 | 2.4055 |
+ | 2.295 | 2.25 | 72 | 2.3923 |
+ | 2.3943 | 2.5 | 80 | 2.3862 |
+ | 2.2398 | 2.75 | 88 | 2.3605 |
+ | 2.2693 | 3.0 | 96 | 2.3526 |
+ | 2.425 | 3.25 | 104 | 2.3471 |
+ | 2.2857 | 3.5 | 112 | 2.3468 |
+ | 2.2448 | 3.75 | 120 | 2.3451 |
+ | 2.1836 | 4.0 | 128 | 2.3466 |
+
+
+ ### Framework versions
+
+ - PEFT 0.10.0
+ - Transformers 4.40.0.dev0
+ - Pytorch 2.1.2+cu118
+ - Datasets 2.15.0
+ - Tokenizers 0.15.0
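
For orientation (this note is not part of the commit): the files uploaded here form a PEFT/QLoRA adapter over the TinyLlama base named in the config. A minimal loading sketch, assuming the adapter files sit in a local `qlora-out` directory (the `output_dir` above) and standard `peft`/`transformers` APIs:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Base model named in adapter_config.json / the axolotl config above.
base = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16)
# "qlora-out" is an assumed local path holding adapter_config.json and the
# adapter weights from this commit; a Hub repo id would work the same way.
model = PeftModel.from_pretrained(model, "qlora-out")
tokenizer = AutoTokenizer.from_pretrained(base)

# The dataset was formatted with `type: alpaca`, so an alpaca-style prompt
# should match what the adapter saw in training.
prompt = "### Instruction:\nSay hello.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```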
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "v_proj",
+     "o_proj",
+     "down_proj",
+     "q_proj",
+     "k_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
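
For context on the numbers in that config (standard LoRA semantics, not something this commit itself states): with `r = 32` and `lora_alpha = 16`, each targeted projection receives a rank-32 update scaled by alpha over r,

$$
W' = W + \frac{\alpha}{r}\,BA = W + \frac{16}{32}\,BA,
\qquad B \in \mathbb{R}^{d_\mathrm{out} \times 32},\;
A \in \mathbb{R}^{32 \times d_\mathrm{in}},
$$

and because the run set `lora_target_linear: true`, `target_modules` lists all seven linear projections of each decoder layer (attention q/k/v/o plus the MLP gate/up/down).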
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3863c09db8a8ee37f6c92e820fd3918dba26368631f34bba226156413c6f68bd
+ size 50573978
checkpoint-128/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ library_name: peft
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.10.0
checkpoint-128/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "v_proj",
+     "o_proj",
+     "down_proj",
+     "q_proj",
+     "k_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
checkpoint-128/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:528e075df49b7bbd123228a3b95349c09b3e4aff529a6761814d25530514b3ec
+ size 50503848
checkpoint-128/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2662bde842785223ccfc20324eb816ad2974916f34f883ea7c17e0bfd3e93bd
+ size 202035450
checkpoint-128/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e46048a3692b100b154cf7c38df5e2ce5c28c9c56bbbf5e17fe2196b8374656f
+ size 14244
checkpoint-128/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:968a6dcbda34982ad43f2af2e04e5edf94e043c521201e71b1583695497d18e0
+ size 1064
checkpoint-128/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
checkpoint-128/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-128/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false,
+   "use_fast": true
+ }
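
A small sketch (again, not part of the commit) of what this tokenizer_config means in practice; `qlora-out` is an assumed local directory holding the committed tokenizer files:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("qlora-out")  # tokenizer.model + configs above
ids = tok("hello world")["input_ids"]

print(ids[0] == tok.bos_token_id)       # True: add_bos_token is true
print(ids[-1] == tok.eos_token_id)      # False: add_eos_token is false
print(tok.pad_token, tok.padding_side)  # "</s>" "right": padding reuses EOS
```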
checkpoint-128/trainer_state.json ADDED
@@ -0,0 +1,1053 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 4.0,
+   "eval_steps": 8,
+   "global_step": 128,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.03, "grad_norm": 0.5859375, "learning_rate": 2e-05, "loss": 3.9295, "step": 1},
+     {"epoch": 0.03, "eval_loss": 3.907318115234375, "eval_runtime": 1.2713, "eval_samples_per_second": 11.012, "eval_steps_per_second": 5.506, "step": 1},
+     {"epoch": 0.06, "grad_norm": 0.5078125, "learning_rate": 4e-05, "loss": 3.805, "step": 2},
+     {"epoch": 0.09, "grad_norm": 0.63671875, "learning_rate": 6e-05, "loss": 3.8521, "step": 3},
+     {"epoch": 0.12, "grad_norm": 0.609375, "learning_rate": 8e-05, "loss": 3.8947, "step": 4},
+     {"epoch": 0.16, "grad_norm": 0.546875, "learning_rate": 0.0001, "loss": 3.6494, "step": 5},
+     {"epoch": 0.19, "grad_norm": 0.50390625, "learning_rate": 0.00012, "loss": 3.6457, "step": 6},
+     {"epoch": 0.22, "grad_norm": 0.640625, "learning_rate": 0.00014, "loss": 3.967, "step": 7},
+     {"epoch": 0.25, "grad_norm": 0.494140625, "learning_rate": 0.00016, "loss": 3.5364, "step": 8},
+     {"epoch": 0.25, "eval_loss": 3.6198840141296387, "eval_runtime": 1.2681, "eval_samples_per_second": 11.04, "eval_steps_per_second": 5.52, "step": 8},
+     {"epoch": 0.28, "grad_norm": 0.51171875, "learning_rate": 0.00018, "loss": 3.5216, "step": 9},
+     {"epoch": 0.31, "grad_norm": 0.6171875, "learning_rate": 0.0002, "loss": 3.677, "step": 10},
+     {"epoch": 0.34, "grad_norm": 0.734375, "learning_rate": 0.00019996456111234527, "loss": 3.058, "step": 11},
+     {"epoch": 0.38, "grad_norm": 0.66796875, "learning_rate": 0.0001998582695676762, "loss": 3.1333, "step": 12},
+     {"epoch": 0.41, "grad_norm": 0.8359375, "learning_rate": 0.000199681200703075, "loss": 3.38, "step": 13},
+     {"epoch": 0.44, "grad_norm": 0.62109375, "learning_rate": 0.00019943348002101371, "loss": 3.1371, "step": 14},
+     {"epoch": 0.47, "grad_norm": 0.65625, "learning_rate": 0.00019911528310040074, "loss": 3.1479, "step": 15},
+     {"epoch": 0.5, "grad_norm": 0.6484375, "learning_rate": 0.00019872683547213446, "loss": 3.263, "step": 16},
+     {"epoch": 0.5, "eval_loss": 3.1820719242095947, "eval_runtime": 1.2692, "eval_samples_per_second": 11.031, "eval_steps_per_second": 5.515, "step": 16},
+     {"epoch": 0.53, "grad_norm": 0.53125, "learning_rate": 0.00019826841245925212, "loss": 2.9833, "step": 17},
+     {"epoch": 0.56, "grad_norm": 0.5703125, "learning_rate": 0.00019774033898178667, "loss": 3.0787, "step": 18},
+     {"epoch": 0.59, "grad_norm": 0.71484375, "learning_rate": 0.00019714298932647098, "loss": 3.4132, "step": 19},
+     {"epoch": 0.62, "grad_norm": 0.73046875, "learning_rate": 0.0001964767868814516, "loss": 2.7304, "step": 20},
+     {"epoch": 0.66, "grad_norm": 0.55078125, "learning_rate": 0.00019574220383620055, "loss": 3.0116, "step": 21},
+     {"epoch": 0.69, "grad_norm": 0.6171875, "learning_rate": 0.00019493976084683813, "loss": 2.9474, "step": 22},
+     {"epoch": 0.72, "grad_norm": 0.61328125, "learning_rate": 0.00019407002666710336, "loss": 2.9415, "step": 23},
+     {"epoch": 0.75, "grad_norm": 0.6328125, "learning_rate": 0.00019313361774523385, "loss": 2.798, "step": 24},
+     {"epoch": 0.75, "eval_loss": 2.896176815032959, "eval_runtime": 1.2765, "eval_samples_per_second": 10.967, "eval_steps_per_second": 5.484, "step": 24},
+     {"epoch": 0.78, "grad_norm": 0.8046875, "learning_rate": 0.00019213119778704128, "loss": 3.2157, "step": 25},
+     {"epoch": 0.81, "grad_norm": 0.7109375, "learning_rate": 0.00019106347728549135, "loss": 3.0666, "step": 26},
+     {"epoch": 0.84, "grad_norm": 0.59765625, "learning_rate": 0.00018993121301712193, "loss": 2.8219, "step": 27},
+     {"epoch": 0.88, "grad_norm": 0.75390625, "learning_rate": 0.00018873520750565718, "loss": 3.1164, "step": 28},
+     {"epoch": 0.91, "grad_norm": 0.67578125, "learning_rate": 0.00018747630845319612, "loss": 2.7154, "step": 29},
+     {"epoch": 0.94, "grad_norm": 0.625, "learning_rate": 0.0001861554081393806, "loss": 2.7395, "step": 30},
+     {"epoch": 0.97, "grad_norm": 0.8125, "learning_rate": 0.0001847734427889671, "loss": 2.8282, "step": 31},
+     {"epoch": 1.0, "grad_norm": 0.86328125, "learning_rate": 0.0001833313919082515, "loss": 2.7787, "step": 32},
+     {"epoch": 1.0, "eval_loss": 2.67726731300354, "eval_runtime": 1.2769, "eval_samples_per_second": 10.964, "eval_steps_per_second": 5.482, "step": 32},
+     {"epoch": 1.03, "grad_norm": 0.703125, "learning_rate": 0.0001818302775908169, "loss": 2.5957, "step": 33},
+     {"epoch": 1.06, "grad_norm": 0.76953125, "learning_rate": 0.00018027116379309638, "loss": 2.7011, "step": 34},
+     {"epoch": 1.09, "grad_norm": 0.84765625, "learning_rate": 0.00017865515558026428, "loss": 2.6043, "step": 35},
+     {"epoch": 1.12, "grad_norm": 0.984375, "learning_rate": 0.00017698339834299061, "loss": 2.8607, "step": 36},
+     {"epoch": 1.16, "grad_norm": 0.7578125, "learning_rate": 0.00017525707698561385, "loss": 2.5949, "step": 37},
+     {"epoch": 1.19, "grad_norm": 0.94921875, "learning_rate": 0.00017347741508630672, "loss": 2.7476, "step": 38},
+     {"epoch": 1.22, "grad_norm": 0.88671875, "learning_rate": 0.00017164567402983152, "loss": 2.7991, "step": 39},
+     {"epoch": 1.25, "grad_norm": 0.8671875, "learning_rate": 0.0001697631521134985, "loss": 2.5959, "step": 40},
+     {"epoch": 1.25, "eval_loss": 2.5505764484405518, "eval_runtime": 1.2761, "eval_samples_per_second": 10.971, "eval_steps_per_second": 5.486, "step": 40},
+     {"epoch": 1.28, "grad_norm": 0.984375, "learning_rate": 0.00016783118362696163, "loss": 2.5342, "step": 41},
+     {"epoch": 1.31, "grad_norm": 0.921875, "learning_rate": 0.00016585113790650388, "loss": 2.4969, "step": 42},
+     {"epoch": 1.34, "grad_norm": 1.125, "learning_rate": 0.00016382441836448202, "loss": 2.5723, "step": 43},
+     {"epoch": 1.38, "grad_norm": 1.015625, "learning_rate": 0.0001617524614946192, "loss": 2.6166, "step": 44},
+     {"epoch": 1.41, "grad_norm": 0.98828125, "learning_rate": 0.00015963673585385016, "loss": 2.4574, "step": 45},
+     {"epoch": 1.44, "grad_norm": 0.94921875, "learning_rate": 0.0001574787410214407, "loss": 2.5074, "step": 46},
+     {"epoch": 1.47, "grad_norm": 0.953125, "learning_rate": 0.00015528000653611935, "loss": 2.5394, "step": 47},
+     {"epoch": 1.5, "grad_norm": 0.828125, "learning_rate": 0.00015304209081197425, "loss": 2.4793, "step": 48},
+     {"epoch": 1.5, "eval_loss": 2.495466470718384, "eval_runtime": 1.2814, "eval_samples_per_second": 10.926, "eval_steps_per_second": 5.463, "step": 48},
+     {"epoch": 1.53, "grad_norm": 0.921875, "learning_rate": 0.000150766580033884, "loss": 2.2909, "step": 49},
+     {"epoch": 1.56, "grad_norm": 0.74609375, "learning_rate": 0.00014845508703326504, "loss": 2.3424, "step": 50},
+     {"epoch": 1.59, "grad_norm": 0.87890625, "learning_rate": 0.0001461092501449326, "loss": 2.506, "step": 51},
+     {"epoch": 1.62, "grad_norm": 0.7890625, "learning_rate": 0.00014373073204588556, "loss": 2.4829, "step": 52},
+     {"epoch": 1.66, "grad_norm": 0.984375, "learning_rate": 0.00014132121857683783, "loss": 2.481, "step": 53},
+     {"epoch": 1.69, "grad_norm": 0.9296875, "learning_rate": 0.00013888241754733208, "loss": 2.5512, "step": 54},
+     {"epoch": 1.72, "grad_norm": 0.7890625, "learning_rate": 0.00013641605752528224, "loss": 2.5405, "step": 55},
+     {"epoch": 1.75, "grad_norm": 0.95703125, "learning_rate": 0.00013392388661180303, "loss": 2.5221, "step": 56},
+     {"epoch": 1.75, "eval_loss": 2.461298704147339, "eval_runtime": 1.28, "eval_samples_per_second": 10.937, "eval_steps_per_second": 5.469, "step": 56},
+     {"epoch": 1.78, "grad_norm": 0.98046875, "learning_rate": 0.0001314076712021949, "loss": 2.5646, "step": 57},
+     {"epoch": 1.81, "grad_norm": 0.8125, "learning_rate": 0.0001288691947339621, "loss": 2.5079, "step": 58},
+     {"epoch": 1.84, "grad_norm": 0.8125, "learning_rate": 0.00012631025642275212, "loss": 2.4743, "step": 59},
+     {"epoch": 1.88, "grad_norm": 0.60546875, "learning_rate": 0.0001237326699871115, "loss": 2.3103, "step": 60},
+     {"epoch": 1.91, "grad_norm": 0.80859375, "learning_rate": 0.00012113826236296244, "loss": 2.4229, "step": 61},
+     {"epoch": 1.94, "grad_norm": 0.671875, "learning_rate": 0.00011852887240871145, "loss": 2.2709, "step": 62},
+     {"epoch": 1.97, "grad_norm": 0.8203125, "learning_rate": 0.00011590634960190721, "loss": 2.4868, "step": 63},
+     {"epoch": 2.0, "grad_norm": 0.62109375, "learning_rate": 0.00011327255272837221, "loss": 2.4384, "step": 64},
+     {"epoch": 2.0, "eval_loss": 2.4055111408233643, "eval_runtime": 1.2798, "eval_samples_per_second": 10.94, "eval_steps_per_second": 5.47, "step": 64},
+     {"epoch": 2.03, "grad_norm": 0.6328125, "learning_rate": 0.00011062934856473655, "loss": 2.3671, "step": 65},
+     {"epoch": 2.06, "grad_norm": 0.69140625, "learning_rate": 0.00010797861055530831, "loss": 2.3381, "step": 66},
+     {"epoch": 2.09, "grad_norm": 0.671875, "learning_rate": 0.00010532221748421787, "loss": 2.2134, "step": 67},
+     {"epoch": 2.12, "grad_norm": 0.7890625, "learning_rate": 0.00010266205214377748, "loss": 2.2687, "step": 68},
+     {"epoch": 2.16, "grad_norm": 0.75, "learning_rate": 0.0001, "loss": 2.4273, "step": 69},
+     {"epoch": 2.19, "grad_norm": 0.8671875, "learning_rate": 9.733794785622253e-05, "loss": 2.4439, "step": 70},
+     {"epoch": 2.22, "grad_norm": 0.953125, "learning_rate": 9.467778251578217e-05, "loss": 2.6631, "step": 71},
+     {"epoch": 2.25, "grad_norm": 0.78125, "learning_rate": 9.202138944469168e-05, "loss": 2.295, "step": 72},
+     {"epoch": 2.25, "eval_loss": 2.3922784328460693, "eval_runtime": 1.282, "eval_samples_per_second": 10.921, "eval_steps_per_second": 5.46, "step": 72},
+     {"epoch": 2.28, "grad_norm": 0.734375, "learning_rate": 8.937065143526347e-05, "loss": 2.4963, "step": 73},
+     {"epoch": 2.31, "grad_norm": 0.68359375, "learning_rate": 8.672744727162781e-05, "loss": 2.4274, "step": 74},
+     {"epoch": 2.34, "grad_norm": 0.9765625, "learning_rate": 8.409365039809281e-05, "loss": 2.4988, "step": 75},
+     {"epoch": 2.38, "grad_norm": 0.75390625, "learning_rate": 8.147112759128859e-05, "loss": 2.2886, "step": 76},
+     {"epoch": 2.41, "grad_norm": 0.80078125, "learning_rate": 7.886173763703757e-05, "loss": 2.1944, "step": 77},
+     {"epoch": 2.44, "grad_norm": 0.8828125, "learning_rate": 7.626733001288851e-05, "loss": 2.3283, "step": 78},
+     {"epoch": 2.47, "grad_norm": 0.82421875, "learning_rate": 7.368974357724789e-05, "loss": 2.3855, "step": 79},
+     {"epoch": 2.5, "grad_norm": 0.7890625, "learning_rate": 7.113080526603792e-05, "loss": 2.3943, "step": 80},
+     {"epoch": 2.5, "eval_loss": 2.386228084564209, "eval_runtime": 1.2835, "eval_samples_per_second": 10.908, "eval_steps_per_second": 5.454, "step": 80},
+     {"epoch": 2.53, "grad_norm": 0.94921875, "learning_rate": 6.859232879780515e-05, "loss": 2.2725, "step": 81},
+     {"epoch": 2.56, "grad_norm": 0.75, "learning_rate": 6.607611338819697e-05, "loss": 2.1989, "step": 82},
+     {"epoch": 2.59, "grad_norm": 0.703125, "learning_rate": 6.358394247471778e-05, "loss": 2.1219, "step": 83},
+     {"epoch": 2.62, "grad_norm": 1.0390625, "learning_rate": 6.111758245266794e-05, "loss": 2.3478, "step": 84},
+     {"epoch": 2.66, "grad_norm": 0.78125, "learning_rate": 5.867878142316221e-05, "loss": 2.4196, "step": 85},
+     {"epoch": 2.69, "grad_norm": 0.6796875, "learning_rate": 5.626926795411447e-05, "loss": 2.1882, "step": 86},
+     {"epoch": 2.72, "grad_norm": 0.8203125, "learning_rate": 5.38907498550674e-05, "loss": 2.4703, "step": 87},
+     {"epoch": 2.75, "grad_norm": 0.8671875, "learning_rate": 5.1544912966734994e-05, "loss": 2.2398, "step": 88},
+     {"epoch": 2.75, "eval_loss": 2.3605105876922607, "eval_runtime": 1.2887, "eval_samples_per_second": 10.864, "eval_steps_per_second": 5.432, "step": 88},
+     {"epoch": 2.78, "grad_norm": 0.81640625, "learning_rate": 4.9233419966116036e-05, "loss": 2.2939, "step": 89},
+     {"epoch": 2.81, "grad_norm": 1.0234375, "learning_rate": 4.695790918802576e-05, "loss": 2.2835, "step": 90},
+     {"epoch": 2.84, "grad_norm": 0.85546875, "learning_rate": 4.47199934638807e-05, "loss": 2.3145, "step": 91},
+     {"epoch": 2.88, "grad_norm": 0.5546875, "learning_rate": 4.252125897855932e-05, "loss": 2.2521, "step": 92},
+     {"epoch": 2.91, "grad_norm": 0.76953125, "learning_rate": 4.036326414614985e-05, "loss": 2.377, "step": 93},
+     {"epoch": 2.94, "grad_norm": 0.78125, "learning_rate": 3.824753850538082e-05, "loss": 2.4343, "step": 94},
+     {"epoch": 2.97, "grad_norm": 0.8046875, "learning_rate": 3.617558163551802e-05, "loss": 2.2051, "step": 95},
+     {"epoch": 3.0, "grad_norm": 0.88671875, "learning_rate": 3.414886209349615e-05, "loss": 2.2693, "step": 96},
+     {"epoch": 3.0, "eval_loss": 2.3525888919830322, "eval_runtime": 1.2821, "eval_samples_per_second": 10.919, "eval_steps_per_second": 5.46, "step": 96},
+     {"epoch": 3.03, "grad_norm": 0.74609375, "learning_rate": 3.216881637303839e-05, "loss": 2.5048, "step": 97},
+     {"epoch": 3.06, "grad_norm": 0.95703125, "learning_rate": 3.0236847886501542e-05, "loss": 2.3404, "step": 98},
+     {"epoch": 3.09, "grad_norm": 0.69140625, "learning_rate": 2.8354325970168484e-05, "loss": 2.3147, "step": 99},
+     {"epoch": 3.12, "grad_norm": 0.765625, "learning_rate": 2.6522584913693294e-05, "loss": 2.448, "step": 100},
+     {"epoch": 3.16, "grad_norm": 0.671875, "learning_rate": 2.4742923014386156e-05, "loss": 2.2212, "step": 101},
+     {"epoch": 3.19, "grad_norm": 0.625, "learning_rate": 2.301660165700936e-05, "loss": 2.3427, "step": 102},
+     {"epoch": 3.22, "grad_norm": 0.6484375, "learning_rate": 2.1344844419735755e-05, "loss": 2.1, "step": 103},
+     {"epoch": 3.25, "grad_norm": 0.73046875, "learning_rate": 1.9728836206903656e-05, "loss": 2.425, "step": 104},
+     {"epoch": 3.25, "eval_loss": 2.3470816612243652, "eval_runtime": 1.2874, "eval_samples_per_second": 10.875, "eval_steps_per_second": 5.437, "step": 104},
+     {"epoch": 3.28, "grad_norm": 0.734375, "learning_rate": 1.8169722409183097e-05, "loss": 2.2268, "step": 105},
+     {"epoch": 3.31, "grad_norm": 0.8359375, "learning_rate": 1.6668608091748495e-05, "loss": 2.3695, "step": 106},
+     {"epoch": 3.34, "grad_norm": 0.58203125, "learning_rate": 1.522655721103291e-05, "loss": 2.1864, "step": 107},
+     {"epoch": 3.38, "grad_norm": 0.5703125, "learning_rate": 1.3844591860619383e-05, "loss": 2.2613, "step": 108},
+     {"epoch": 3.41, "grad_norm": 0.7265625, "learning_rate": 1.2523691546803873e-05, "loss": 2.2771, "step": 109},
+     {"epoch": 3.44, "grad_norm": 0.59765625, "learning_rate": 1.1264792494342857e-05, "loss": 2.2056, "step": 110},
+     {"epoch": 3.47, "grad_norm": 0.6875, "learning_rate": 1.0068786982878087e-05, "loss": 2.4039, "step": 111},
+     {"epoch": 3.5, "grad_norm": 0.77734375, "learning_rate": 8.936522714508678e-06, "loss": 2.2857, "step": 112},
+     {"epoch": 3.5, "eval_loss": 2.3467748165130615, "eval_runtime": 1.2808, "eval_samples_per_second": 10.931, "eval_steps_per_second": 5.466, "step": 112},
+     {"epoch": 3.53, "grad_norm": 0.671875, "learning_rate": 7.868802212958703e-06, "loss": 2.257, "step": 113},
+     {"epoch": 3.56, "grad_norm": 1.21875, "learning_rate": 6.866382254766157e-06, "loss": 2.4278, "step": 114},
+     {"epoch": 3.59, "grad_norm": 0.859375, "learning_rate": 5.929973332896677e-06, "loss": 2.2232, "step": 115},
+     {"epoch": 3.62, "grad_norm": 0.66015625, "learning_rate": 5.060239153161872e-06, "loss": 2.225, "step": 116},
+     {"epoch": 3.66, "grad_norm": 0.6875, "learning_rate": 4.257796163799455e-06, "loss": 2.2133, "step": 117},
+     {"epoch": 3.69, "grad_norm": 0.8671875, "learning_rate": 3.5232131185484076e-06, "loss": 2.2191, "step": 118},
+     {"epoch": 3.72, "grad_norm": 0.77734375, "learning_rate": 2.857010673529015e-06, "loss": 2.0715, "step": 119},
+     {"epoch": 3.75, "grad_norm": 0.72265625, "learning_rate": 2.259661018213333e-06, "loss": 2.2448, "step": 120},
+     {"epoch": 3.75, "eval_loss": 2.3450927734375, "eval_runtime": 1.2817, "eval_samples_per_second": 10.923, "eval_steps_per_second": 5.461, "step": 120},
+     {"epoch": 3.78, "grad_norm": 0.62109375, "learning_rate": 1.7315875407479032e-06, "loss": 2.1337, "step": 121},
+     {"epoch": 3.81, "grad_norm": 0.765625, "learning_rate": 1.2731645278655445e-06, "loss": 2.4334, "step": 122},
+     {"epoch": 3.84, "grad_norm": 0.73828125, "learning_rate": 8.847168995992916e-07, "loss": 2.3989, "step": 123},
+     {"epoch": 3.88, "grad_norm": 0.71484375, "learning_rate": 5.665199789862907e-07, "loss": 2.3509, "step": 124},
+     {"epoch": 3.91, "grad_norm": 0.78125, "learning_rate": 3.1879929692498757e-07, "loss": 2.3366, "step": 125},
+     {"epoch": 3.94, "grad_norm": 0.625, "learning_rate": 1.4173043232380557e-07, "loss": 2.071, "step": 126},
+     {"epoch": 3.97, "grad_norm": 0.6015625, "learning_rate": 3.5438887654737355e-08, "loss": 2.2108, "step": 127},
+     {"epoch": 4.0, "grad_norm": 0.7578125, "learning_rate": 0.0, "loss": 2.1836, "step": 128},
+     {"epoch": 4.0, "eval_loss": 2.3465805053710938, "eval_runtime": 1.2864, "eval_samples_per_second": 10.883, "eval_steps_per_second": 5.441, "step": 128}
+   ],
+   "logging_steps": 1,
+   "max_steps": 128,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 4,
+   "save_steps": 32,
+   "total_flos": 2.6669324546605056e+16,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
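
The `learning_rate` column in this log can be reproduced from the declared schedule (10 linear warmup steps, then cosine decay over the remaining 118 of 128 steps). A small sketch, assuming the standard cosine-with-warmup rule used by `transformers`:

```python
import math

BASE_LR, WARMUP, MAX_STEPS = 2e-4, 10, 128  # from the config and log above

def lr_at(step: int) -> float:
    """Assumed schedule: linear warmup, then cosine decay to zero."""
    if step <= WARMUP:
        return BASE_LR * step / WARMUP          # 2e-05, 4e-05, ..., 0.0002
    progress = (step - WARMUP) / (MAX_STEPS - WARMUP)
    return BASE_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(11))   # ~0.00019996456..., matches step 11 above
print(lr_at(64))   # ~0.00011327255..., matches step 64
print(lr_at(128))  # 0.0, matches the final step
```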
checkpoint-128/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fca5508434f64a69e54fc8c46f68907814d91f48e751b7a0eeb4050e5ae3225
+ size 5816
checkpoint-32/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ library_name: peft
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.10.0
checkpoint-32/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "v_proj",
+     "o_proj",
+     "down_proj",
+     "q_proj",
+     "k_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
checkpoint-32/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec430d847c06078c87671f113c67363bf58b51f662fe0c2c9aac1aea9197c05e
+ size 50503848
checkpoint-32/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2dcfc1311f5586ce9bc6ab8405e3706e4dcb6ef0050771e2c9bbe97b1fc7874
+ size 202035450
checkpoint-32/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fba88830bbf3f3a231c5f7c643ca357bcc703c4a14675504974ab2e003369a61
+ size 14244
checkpoint-32/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73fa7fed0c0930926a3524b49ae450b78da12282a81cf57d30f2d1a4044247d8
+ size 1064
checkpoint-32/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "</s>",
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
checkpoint-32/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-32/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false,
+   "use_fast": true
+ }
checkpoint-32/trainer_state.json ADDED
@@ -0,0 +1,285 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 8,
+ "global_step": 32,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.5859375,
+ "learning_rate": 2e-05,
+ "loss": 3.9295,
+ "step": 1
+ },
+ {
+ "epoch": 0.03,
+ "eval_loss": 3.907318115234375,
+ "eval_runtime": 1.2713,
+ "eval_samples_per_second": 11.012,
+ "eval_steps_per_second": 5.506,
+ "step": 1
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.5078125,
+ "learning_rate": 4e-05,
+ "loss": 3.805,
+ "step": 2
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.63671875,
+ "learning_rate": 6e-05,
+ "loss": 3.8521,
+ "step": 3
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.609375,
+ "learning_rate": 8e-05,
+ "loss": 3.8947,
+ "step": 4
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.546875,
+ "learning_rate": 0.0001,
+ "loss": 3.6494,
+ "step": 5
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.50390625,
+ "learning_rate": 0.00012,
+ "loss": 3.6457,
+ "step": 6
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.640625,
+ "learning_rate": 0.00014,
+ "loss": 3.967,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.494140625,
+ "learning_rate": 0.00016,
+ "loss": 3.5364,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 3.6198840141296387,
+ "eval_runtime": 1.2681,
+ "eval_samples_per_second": 11.04,
+ "eval_steps_per_second": 5.52,
+ "step": 8
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.51171875,
+ "learning_rate": 0.00018,
+ "loss": 3.5216,
+ "step": 9
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.0002,
+ "loss": 3.677,
+ "step": 10
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.734375,
+ "learning_rate": 0.00019996456111234527,
+ "loss": 3.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.0001998582695676762,
+ "loss": 3.1333,
+ "step": 12
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.8359375,
+ "learning_rate": 0.000199681200703075,
+ "loss": 3.38,
+ "step": 13
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 0.62109375,
+ "learning_rate": 0.00019943348002101371,
+ "loss": 3.1371,
+ "step": 14
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 0.65625,
+ "learning_rate": 0.00019911528310040074,
+ "loss": 3.1479,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.6484375,
+ "learning_rate": 0.00019872683547213446,
+ "loss": 3.263,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 3.1820719242095947,
+ "eval_runtime": 1.2692,
+ "eval_samples_per_second": 11.031,
+ "eval_steps_per_second": 5.515,
+ "step": 16
+ },
+ {
+ "epoch": 0.53,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019826841245925212,
+ "loss": 2.9833,
+ "step": 17
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.00019774033898178667,
+ "loss": 3.0787,
+ "step": 18
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 0.71484375,
+ "learning_rate": 0.00019714298932647098,
+ "loss": 3.4132,
+ "step": 19
+ },
+ {
+ "epoch": 0.62,
+ "grad_norm": 0.73046875,
+ "learning_rate": 0.0001964767868814516,
+ "loss": 2.7304,
+ "step": 20
+ },
+ {
+ "epoch": 0.66,
+ "grad_norm": 0.55078125,
+ "learning_rate": 0.00019574220383620055,
+ "loss": 3.0116,
+ "step": 21
+ },
+ {
+ "epoch": 0.69,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.00019493976084683813,
+ "loss": 2.9474,
+ "step": 22
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.61328125,
+ "learning_rate": 0.00019407002666710336,
+ "loss": 2.9415,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.6328125,
+ "learning_rate": 0.00019313361774523385,
+ "loss": 2.798,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 2.896176815032959,
+ "eval_runtime": 1.2765,
+ "eval_samples_per_second": 10.967,
+ "eval_steps_per_second": 5.484,
+ "step": 24
+ },
+ {
+ "epoch": 0.78,
+ "grad_norm": 0.8046875,
+ "learning_rate": 0.00019213119778704128,
+ "loss": 3.2157,
+ "step": 25
+ },
+ {
+ "epoch": 0.81,
+ "grad_norm": 0.7109375,
+ "learning_rate": 0.00019106347728549135,
+ "loss": 3.0666,
+ "step": 26
+ },
+ {
+ "epoch": 0.84,
+ "grad_norm": 0.59765625,
+ "learning_rate": 0.00018993121301712193,
+ "loss": 2.8219,
+ "step": 27
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.75390625,
+ "learning_rate": 0.00018873520750565718,
+ "loss": 3.1164,
+ "step": 28
+ },
+ {
+ "epoch": 0.91,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00018747630845319612,
+ "loss": 2.7154,
+ "step": 29
+ },
+ {
+ "epoch": 0.94,
+ "grad_norm": 0.625,
+ "learning_rate": 0.0001861554081393806,
+ "loss": 2.7395,
+ "step": 30
+ },
+ {
+ "epoch": 0.97,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.0001847734427889671,
+ "loss": 2.8282,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.86328125,
+ "learning_rate": 0.0001833313919082515,
+ "loss": 2.7787,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 2.67726731300354,
+ "eval_runtime": 1.2769,
+ "eval_samples_per_second": 10.964,
+ "eval_steps_per_second": 5.482,
+ "step": 32
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 128,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 32,
+ "total_flos": 6667331136651264.0,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
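The learning-rate values logged above trace a linear-warmup-plus-cosine schedule: 2e-05 increments up to the 2e-4 peak at step 10, then a cosine decay over the remaining steps of the 128-step run. A minimal sketch (not a file in this commit; the warmup length and peak are read off the log itself, and the cosine form is an assumption consistent with the logged values):

```python
import math

PEAK_LR, WARMUP_STEPS, MAX_STEPS = 2e-4, 10, 128  # read off trainer_state.json

def lr_at(step: int) -> float:
    """Linear warmup to PEAK_LR, then cosine decay toward zero."""
    if step <= WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS  # 2e-05, 4e-05, ..., 2e-04
    progress = (step - WARMUP_STEPS) / (MAX_STEPS - WARMUP_STEPS)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(11))  # ~0.00019996456..., matching the logged step 11
print(lr_at(32))  # ~0.00018333...,   matching the logged step 32
```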
checkpoint-32/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fca5508434f64a69e54fc8c46f68907814d91f48e751b7a0eeb4050e5ae3225
+ size 5816
checkpoint-64/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ library_name: peft
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.10.0
checkpoint-64/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "v_proj",
+ "o_proj",
+ "down_proj",
+ "q_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
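For reference, the adapter_config.json above corresponds to a `peft.LoraConfig` along the following lines: rank-32 LoRA with alpha 16 and dropout 0.05 on every attention and MLP projection of the Llama block. A minimal sketch (not a file in this commit; every field value is taken directly from the JSON above):

```python
from peft import LoraConfig

# LoraConfig equivalent of the adapter_config.json shown above.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
)
```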
checkpoint-64/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:478e962adcf5cf2208e94455da60f82ca1058e1b9684714f9c16065ef2896b84
+ size 50503848
checkpoint-64/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f7fae12b68510716cf6c13d76435aadee62c6caf362f7880a7604a4378d4478
+ size 202035450
checkpoint-64/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:926ec53b6c5e5969c7a9dedbcd5e4c4ff81ecd6859afd62f93fd8684f6ac758d
+ size 14244
checkpoint-64/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fc2f121208237af7a1af0cb966f4edc3bb705d4fe4813c302cf1425b7f9228d4
+ size 1064
checkpoint-64/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-64/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-64/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+ }
checkpoint-64/trainer_state.json ADDED
@@ -0,0 +1,541 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 8,
+ "global_step": 64,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.5859375,
+ "learning_rate": 2e-05,
+ "loss": 3.9295,
+ "step": 1
+ },
+ {
+ "epoch": 0.03,
+ "eval_loss": 3.907318115234375,
+ "eval_runtime": 1.2713,
+ "eval_samples_per_second": 11.012,
+ "eval_steps_per_second": 5.506,
+ "step": 1
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.5078125,
+ "learning_rate": 4e-05,
+ "loss": 3.805,
+ "step": 2
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.63671875,
+ "learning_rate": 6e-05,
+ "loss": 3.8521,
+ "step": 3
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.609375,
+ "learning_rate": 8e-05,
+ "loss": 3.8947,
+ "step": 4
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.546875,
+ "learning_rate": 0.0001,
+ "loss": 3.6494,
+ "step": 5
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.50390625,
+ "learning_rate": 0.00012,
+ "loss": 3.6457,
+ "step": 6
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.640625,
+ "learning_rate": 0.00014,
+ "loss": 3.967,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.494140625,
+ "learning_rate": 0.00016,
+ "loss": 3.5364,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 3.6198840141296387,
+ "eval_runtime": 1.2681,
+ "eval_samples_per_second": 11.04,
+ "eval_steps_per_second": 5.52,
+ "step": 8
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.51171875,
+ "learning_rate": 0.00018,
+ "loss": 3.5216,
+ "step": 9
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.0002,
+ "loss": 3.677,
+ "step": 10
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.734375,
+ "learning_rate": 0.00019996456111234527,
+ "loss": 3.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.0001998582695676762,
+ "loss": 3.1333,
+ "step": 12
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.8359375,
+ "learning_rate": 0.000199681200703075,
+ "loss": 3.38,
+ "step": 13
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 0.62109375,
+ "learning_rate": 0.00019943348002101371,
+ "loss": 3.1371,
+ "step": 14
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 0.65625,
+ "learning_rate": 0.00019911528310040074,
+ "loss": 3.1479,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.6484375,
+ "learning_rate": 0.00019872683547213446,
+ "loss": 3.263,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 3.1820719242095947,
+ "eval_runtime": 1.2692,
+ "eval_samples_per_second": 11.031,
+ "eval_steps_per_second": 5.515,
+ "step": 16
+ },
+ {
+ "epoch": 0.53,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019826841245925212,
+ "loss": 2.9833,
+ "step": 17
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.00019774033898178667,
+ "loss": 3.0787,
+ "step": 18
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 0.71484375,
+ "learning_rate": 0.00019714298932647098,
+ "loss": 3.4132,
+ "step": 19
+ },
+ {
+ "epoch": 0.62,
+ "grad_norm": 0.73046875,
+ "learning_rate": 0.0001964767868814516,
+ "loss": 2.7304,
+ "step": 20
+ },
+ {
+ "epoch": 0.66,
+ "grad_norm": 0.55078125,
+ "learning_rate": 0.00019574220383620055,
+ "loss": 3.0116,
+ "step": 21
+ },
+ {
+ "epoch": 0.69,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.00019493976084683813,
+ "loss": 2.9474,
+ "step": 22
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.61328125,
+ "learning_rate": 0.00019407002666710336,
+ "loss": 2.9415,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.6328125,
+ "learning_rate": 0.00019313361774523385,
+ "loss": 2.798,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 2.896176815032959,
+ "eval_runtime": 1.2765,
+ "eval_samples_per_second": 10.967,
+ "eval_steps_per_second": 5.484,
+ "step": 24
+ },
+ {
+ "epoch": 0.78,
+ "grad_norm": 0.8046875,
+ "learning_rate": 0.00019213119778704128,
+ "loss": 3.2157,
+ "step": 25
+ },
+ {
+ "epoch": 0.81,
+ "grad_norm": 0.7109375,
+ "learning_rate": 0.00019106347728549135,
+ "loss": 3.0666,
+ "step": 26
+ },
+ {
+ "epoch": 0.84,
+ "grad_norm": 0.59765625,
+ "learning_rate": 0.00018993121301712193,
+ "loss": 2.8219,
+ "step": 27
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.75390625,
+ "learning_rate": 0.00018873520750565718,
+ "loss": 3.1164,
+ "step": 28
+ },
+ {
+ "epoch": 0.91,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00018747630845319612,
+ "loss": 2.7154,
+ "step": 29
+ },
+ {
+ "epoch": 0.94,
+ "grad_norm": 0.625,
+ "learning_rate": 0.0001861554081393806,
+ "loss": 2.7395,
+ "step": 30
+ },
+ {
+ "epoch": 0.97,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.0001847734427889671,
+ "loss": 2.8282,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.86328125,
+ "learning_rate": 0.0001833313919082515,
+ "loss": 2.7787,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 2.67726731300354,
+ "eval_runtime": 1.2769,
+ "eval_samples_per_second": 10.964,
+ "eval_steps_per_second": 5.482,
+ "step": 32
+ },
+ {
+ "epoch": 1.03,
+ "grad_norm": 0.703125,
+ "learning_rate": 0.0001818302775908169,
+ "loss": 2.5957,
+ "step": 33
+ },
+ {
+ "epoch": 1.06,
+ "grad_norm": 0.76953125,
+ "learning_rate": 0.00018027116379309638,
+ "loss": 2.7011,
+ "step": 34
+ },
+ {
+ "epoch": 1.09,
+ "grad_norm": 0.84765625,
+ "learning_rate": 0.00017865515558026428,
+ "loss": 2.6043,
+ "step": 35
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00017698339834299061,
+ "loss": 2.8607,
+ "step": 36
+ },
+ {
+ "epoch": 1.16,
+ "grad_norm": 0.7578125,
+ "learning_rate": 0.00017525707698561385,
+ "loss": 2.5949,
+ "step": 37
+ },
+ {
+ "epoch": 1.19,
+ "grad_norm": 0.94921875,
+ "learning_rate": 0.00017347741508630672,
+ "loss": 2.7476,
+ "step": 38
+ },
+ {
+ "epoch": 1.22,
+ "grad_norm": 0.88671875,
+ "learning_rate": 0.00017164567402983152,
+ "loss": 2.7991,
+ "step": 39
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.8671875,
+ "learning_rate": 0.0001697631521134985,
+ "loss": 2.5959,
+ "step": 40
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 2.5505764484405518,
+ "eval_runtime": 1.2761,
+ "eval_samples_per_second": 10.971,
+ "eval_steps_per_second": 5.486,
+ "step": 40
+ },
+ {
+ "epoch": 1.28,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00016783118362696163,
+ "loss": 2.5342,
+ "step": 41
+ },
+ {
+ "epoch": 1.31,
+ "grad_norm": 0.921875,
+ "learning_rate": 0.00016585113790650388,
+ "loss": 2.4969,
+ "step": 42
+ },
+ {
+ "epoch": 1.34,
+ "grad_norm": 1.125,
+ "learning_rate": 0.00016382441836448202,
+ "loss": 2.5723,
+ "step": 43
+ },
+ {
+ "epoch": 1.38,
+ "grad_norm": 1.015625,
+ "learning_rate": 0.0001617524614946192,
+ "loss": 2.6166,
+ "step": 44
+ },
+ {
+ "epoch": 1.41,
+ "grad_norm": 0.98828125,
+ "learning_rate": 0.00015963673585385016,
+ "loss": 2.4574,
+ "step": 45
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 0.94921875,
+ "learning_rate": 0.0001574787410214407,
+ "loss": 2.5074,
+ "step": 46
+ },
+ {
+ "epoch": 1.47,
+ "grad_norm": 0.953125,
+ "learning_rate": 0.00015528000653611935,
+ "loss": 2.5394,
+ "step": 47
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.828125,
+ "learning_rate": 0.00015304209081197425,
+ "loss": 2.4793,
+ "step": 48
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 2.495466470718384,
+ "eval_runtime": 1.2814,
+ "eval_samples_per_second": 10.926,
+ "eval_steps_per_second": 5.463,
+ "step": 48
+ },
+ {
+ "epoch": 1.53,
+ "grad_norm": 0.921875,
+ "learning_rate": 0.000150766580033884,
+ "loss": 2.2909,
+ "step": 49
+ },
+ {
+ "epoch": 1.56,
+ "grad_norm": 0.74609375,
+ "learning_rate": 0.00014845508703326504,
+ "loss": 2.3424,
+ "step": 50
+ },
+ {
+ "epoch": 1.59,
+ "grad_norm": 0.87890625,
+ "learning_rate": 0.0001461092501449326,
+ "loss": 2.506,
+ "step": 51
+ },
+ {
+ "epoch": 1.62,
+ "grad_norm": 0.7890625,
+ "learning_rate": 0.00014373073204588556,
+ "loss": 2.4829,
+ "step": 52
+ },
+ {
+ "epoch": 1.66,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00014132121857683783,
+ "loss": 2.481,
+ "step": 53
+ },
+ {
+ "epoch": 1.69,
+ "grad_norm": 0.9296875,
+ "learning_rate": 0.00013888241754733208,
+ "loss": 2.5512,
+ "step": 54
+ },
+ {
+ "epoch": 1.72,
+ "grad_norm": 0.7890625,
+ "learning_rate": 0.00013641605752528224,
+ "loss": 2.5405,
+ "step": 55
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.95703125,
+ "learning_rate": 0.00013392388661180303,
+ "loss": 2.5221,
+ "step": 56
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 2.461298704147339,
+ "eval_runtime": 1.28,
+ "eval_samples_per_second": 10.937,
+ "eval_steps_per_second": 5.469,
+ "step": 56
+ },
+ {
+ "epoch": 1.78,
+ "grad_norm": 0.98046875,
+ "learning_rate": 0.0001314076712021949,
+ "loss": 2.5646,
+ "step": 57
+ },
+ {
+ "epoch": 1.81,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.0001288691947339621,
+ "loss": 2.5079,
+ "step": 58
+ },
+ {
+ "epoch": 1.84,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.00012631025642275212,
+ "loss": 2.4743,
+ "step": 59
+ },
+ {
+ "epoch": 1.88,
+ "grad_norm": 0.60546875,
+ "learning_rate": 0.0001237326699871115,
+ "loss": 2.3103,
+ "step": 60
+ },
+ {
+ "epoch": 1.91,
+ "grad_norm": 0.80859375,
+ "learning_rate": 0.00012113826236296244,
+ "loss": 2.4229,
+ "step": 61
+ },
+ {
+ "epoch": 1.94,
+ "grad_norm": 0.671875,
+ "learning_rate": 0.00011852887240871145,
+ "loss": 2.2709,
+ "step": 62
+ },
+ {
+ "epoch": 1.97,
+ "grad_norm": 0.8203125,
+ "learning_rate": 0.00011590634960190721,
+ "loss": 2.4868,
+ "step": 63
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.62109375,
+ "learning_rate": 0.00011327255272837221,
+ "loss": 2.4384,
+ "step": 64
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 2.4055111408233643,
+ "eval_runtime": 1.2798,
+ "eval_samples_per_second": 10.94,
+ "eval_steps_per_second": 5.47,
+ "step": 64
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 128,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 32,
+ "total_flos": 1.3334662273302528e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
checkpoint-64/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fca5508434f64a69e54fc8c46f68907814d91f48e751b7a0eeb4050e5ae3225
+ size 5816
checkpoint-96/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ library_name: peft
+ base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.10.0
checkpoint-96/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "v_proj",
+ "o_proj",
+ "down_proj",
+ "q_proj",
+ "k_proj",
+ "gate_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-96/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7f91dafd56822f172630cecc2b2e1daa597fdbbad4c82963c27fc7d9324b236
+ size 50503848
checkpoint-96/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:201f8255e7b843a6dce3b0a0f20c92b3051e4c17064d38df3bcda3aea4710751
+ size 202035450
checkpoint-96/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cdd17d0d9f68ae884a6c41c42875839a612f5c867257621410c8670377c4664
+ size 14244
checkpoint-96/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b76c8662d73a5a13047466dda569fd16697d4a2a715b2ad8f3567f25cfa42c3
+ size 1064
checkpoint-96/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
checkpoint-96/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
checkpoint-96/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "</s>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false,
+ "use_fast": true
+ }
checkpoint-96/trainer_state.json ADDED
@@ -0,0 +1,797 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 8,
+ "global_step": 96,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.03,
+ "grad_norm": 0.5859375,
+ "learning_rate": 2e-05,
+ "loss": 3.9295,
+ "step": 1
+ },
+ {
+ "epoch": 0.03,
+ "eval_loss": 3.907318115234375,
+ "eval_runtime": 1.2713,
+ "eval_samples_per_second": 11.012,
+ "eval_steps_per_second": 5.506,
+ "step": 1
+ },
+ {
+ "epoch": 0.06,
+ "grad_norm": 0.5078125,
+ "learning_rate": 4e-05,
+ "loss": 3.805,
+ "step": 2
+ },
+ {
+ "epoch": 0.09,
+ "grad_norm": 0.63671875,
+ "learning_rate": 6e-05,
+ "loss": 3.8521,
+ "step": 3
+ },
+ {
+ "epoch": 0.12,
+ "grad_norm": 0.609375,
+ "learning_rate": 8e-05,
+ "loss": 3.8947,
+ "step": 4
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.546875,
+ "learning_rate": 0.0001,
+ "loss": 3.6494,
+ "step": 5
+ },
+ {
+ "epoch": 0.19,
+ "grad_norm": 0.50390625,
+ "learning_rate": 0.00012,
+ "loss": 3.6457,
+ "step": 6
+ },
+ {
+ "epoch": 0.22,
+ "grad_norm": 0.640625,
+ "learning_rate": 0.00014,
+ "loss": 3.967,
+ "step": 7
+ },
+ {
+ "epoch": 0.25,
+ "grad_norm": 0.494140625,
+ "learning_rate": 0.00016,
+ "loss": 3.5364,
+ "step": 8
+ },
+ {
+ "epoch": 0.25,
+ "eval_loss": 3.6198840141296387,
+ "eval_runtime": 1.2681,
+ "eval_samples_per_second": 11.04,
+ "eval_steps_per_second": 5.52,
+ "step": 8
+ },
+ {
+ "epoch": 0.28,
+ "grad_norm": 0.51171875,
+ "learning_rate": 0.00018,
+ "loss": 3.5216,
+ "step": 9
+ },
+ {
+ "epoch": 0.31,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.0002,
+ "loss": 3.677,
+ "step": 10
+ },
+ {
+ "epoch": 0.34,
+ "grad_norm": 0.734375,
+ "learning_rate": 0.00019996456111234527,
+ "loss": 3.058,
+ "step": 11
+ },
+ {
+ "epoch": 0.38,
+ "grad_norm": 0.66796875,
+ "learning_rate": 0.0001998582695676762,
+ "loss": 3.1333,
+ "step": 12
+ },
+ {
+ "epoch": 0.41,
+ "grad_norm": 0.8359375,
+ "learning_rate": 0.000199681200703075,
+ "loss": 3.38,
+ "step": 13
+ },
+ {
+ "epoch": 0.44,
+ "grad_norm": 0.62109375,
+ "learning_rate": 0.00019943348002101371,
+ "loss": 3.1371,
+ "step": 14
+ },
+ {
+ "epoch": 0.47,
+ "grad_norm": 0.65625,
+ "learning_rate": 0.00019911528310040074,
+ "loss": 3.1479,
+ "step": 15
+ },
+ {
+ "epoch": 0.5,
+ "grad_norm": 0.6484375,
+ "learning_rate": 0.00019872683547213446,
+ "loss": 3.263,
+ "step": 16
+ },
+ {
+ "epoch": 0.5,
+ "eval_loss": 3.1820719242095947,
+ "eval_runtime": 1.2692,
+ "eval_samples_per_second": 11.031,
+ "eval_steps_per_second": 5.515,
+ "step": 16
+ },
+ {
+ "epoch": 0.53,
+ "grad_norm": 0.53125,
+ "learning_rate": 0.00019826841245925212,
+ "loss": 2.9833,
+ "step": 17
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.5703125,
+ "learning_rate": 0.00019774033898178667,
+ "loss": 3.0787,
+ "step": 18
+ },
+ {
+ "epoch": 0.59,
+ "grad_norm": 0.71484375,
+ "learning_rate": 0.00019714298932647098,
+ "loss": 3.4132,
+ "step": 19
+ },
+ {
+ "epoch": 0.62,
+ "grad_norm": 0.73046875,
+ "learning_rate": 0.0001964767868814516,
+ "loss": 2.7304,
+ "step": 20
+ },
+ {
+ "epoch": 0.66,
+ "grad_norm": 0.55078125,
+ "learning_rate": 0.00019574220383620055,
+ "loss": 3.0116,
+ "step": 21
+ },
+ {
+ "epoch": 0.69,
+ "grad_norm": 0.6171875,
+ "learning_rate": 0.00019493976084683813,
+ "loss": 2.9474,
+ "step": 22
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.61328125,
+ "learning_rate": 0.00019407002666710336,
+ "loss": 2.9415,
+ "step": 23
+ },
+ {
+ "epoch": 0.75,
+ "grad_norm": 0.6328125,
+ "learning_rate": 0.00019313361774523385,
+ "loss": 2.798,
+ "step": 24
+ },
+ {
+ "epoch": 0.75,
+ "eval_loss": 2.896176815032959,
+ "eval_runtime": 1.2765,
+ "eval_samples_per_second": 10.967,
+ "eval_steps_per_second": 5.484,
+ "step": 24
+ },
+ {
+ "epoch": 0.78,
+ "grad_norm": 0.8046875,
+ "learning_rate": 0.00019213119778704128,
+ "loss": 3.2157,
+ "step": 25
+ },
+ {
+ "epoch": 0.81,
+ "grad_norm": 0.7109375,
+ "learning_rate": 0.00019106347728549135,
+ "loss": 3.0666,
+ "step": 26
+ },
+ {
+ "epoch": 0.84,
+ "grad_norm": 0.59765625,
+ "learning_rate": 0.00018993121301712193,
+ "loss": 2.8219,
+ "step": 27
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.75390625,
+ "learning_rate": 0.00018873520750565718,
+ "loss": 3.1164,
+ "step": 28
+ },
+ {
+ "epoch": 0.91,
+ "grad_norm": 0.67578125,
+ "learning_rate": 0.00018747630845319612,
+ "loss": 2.7154,
+ "step": 29
+ },
+ {
+ "epoch": 0.94,
+ "grad_norm": 0.625,
+ "learning_rate": 0.0001861554081393806,
+ "loss": 2.7395,
+ "step": 30
+ },
+ {
+ "epoch": 0.97,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.0001847734427889671,
+ "loss": 2.8282,
+ "step": 31
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.86328125,
+ "learning_rate": 0.0001833313919082515,
+ "loss": 2.7787,
+ "step": 32
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 2.67726731300354,
+ "eval_runtime": 1.2769,
+ "eval_samples_per_second": 10.964,
+ "eval_steps_per_second": 5.482,
+ "step": 32
+ },
+ {
+ "epoch": 1.03,
+ "grad_norm": 0.703125,
+ "learning_rate": 0.0001818302775908169,
+ "loss": 2.5957,
+ "step": 33
+ },
+ {
+ "epoch": 1.06,
+ "grad_norm": 0.76953125,
+ "learning_rate": 0.00018027116379309638,
+ "loss": 2.7011,
+ "step": 34
+ },
+ {
+ "epoch": 1.09,
+ "grad_norm": 0.84765625,
+ "learning_rate": 0.00017865515558026428,
+ "loss": 2.6043,
+ "step": 35
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00017698339834299061,
+ "loss": 2.8607,
+ "step": 36
+ },
+ {
+ "epoch": 1.16,
+ "grad_norm": 0.7578125,
+ "learning_rate": 0.00017525707698561385,
+ "loss": 2.5949,
+ "step": 37
+ },
+ {
+ "epoch": 1.19,
+ "grad_norm": 0.94921875,
+ "learning_rate": 0.00017347741508630672,
+ "loss": 2.7476,
+ "step": 38
+ },
+ {
+ "epoch": 1.22,
+ "grad_norm": 0.88671875,
+ "learning_rate": 0.00017164567402983152,
+ "loss": 2.7991,
+ "step": 39
+ },
+ {
+ "epoch": 1.25,
+ "grad_norm": 0.8671875,
+ "learning_rate": 0.0001697631521134985,
+ "loss": 2.5959,
+ "step": 40
+ },
+ {
+ "epoch": 1.25,
+ "eval_loss": 2.5505764484405518,
+ "eval_runtime": 1.2761,
+ "eval_samples_per_second": 10.971,
+ "eval_steps_per_second": 5.486,
+ "step": 40
+ },
+ {
+ "epoch": 1.28,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00016783118362696163,
+ "loss": 2.5342,
+ "step": 41
+ },
+ {
+ "epoch": 1.31,
+ "grad_norm": 0.921875,
+ "learning_rate": 0.00016585113790650388,
+ "loss": 2.4969,
+ "step": 42
+ },
+ {
+ "epoch": 1.34,
+ "grad_norm": 1.125,
+ "learning_rate": 0.00016382441836448202,
+ "loss": 2.5723,
+ "step": 43
+ },
+ {
+ "epoch": 1.38,
+ "grad_norm": 1.015625,
+ "learning_rate": 0.0001617524614946192,
+ "loss": 2.6166,
+ "step": 44
+ },
+ {
+ "epoch": 1.41,
+ "grad_norm": 0.98828125,
+ "learning_rate": 0.00015963673585385016,
+ "loss": 2.4574,
+ "step": 45
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 0.94921875,
+ "learning_rate": 0.0001574787410214407,
+ "loss": 2.5074,
+ "step": 46
+ },
+ {
+ "epoch": 1.47,
+ "grad_norm": 0.953125,
+ "learning_rate": 0.00015528000653611935,
+ "loss": 2.5394,
+ "step": 47
+ },
+ {
+ "epoch": 1.5,
+ "grad_norm": 0.828125,
+ "learning_rate": 0.00015304209081197425,
+ "loss": 2.4793,
+ "step": 48
+ },
+ {
+ "epoch": 1.5,
+ "eval_loss": 2.495466470718384,
+ "eval_runtime": 1.2814,
+ "eval_samples_per_second": 10.926,
+ "eval_steps_per_second": 5.463,
+ "step": 48
+ },
+ {
+ "epoch": 1.53,
+ "grad_norm": 0.921875,
+ "learning_rate": 0.000150766580033884,
+ "loss": 2.2909,
+ "step": 49
+ },
+ {
+ "epoch": 1.56,
+ "grad_norm": 0.74609375,
+ "learning_rate": 0.00014845508703326504,
+ "loss": 2.3424,
+ "step": 50
+ },
+ {
+ "epoch": 1.59,
+ "grad_norm": 0.87890625,
+ "learning_rate": 0.0001461092501449326,
+ "loss": 2.506,
+ "step": 51
+ },
+ {
+ "epoch": 1.62,
+ "grad_norm": 0.7890625,
+ "learning_rate": 0.00014373073204588556,
+ "loss": 2.4829,
+ "step": 52
+ },
+ {
+ "epoch": 1.66,
+ "grad_norm": 0.984375,
+ "learning_rate": 0.00014132121857683783,
+ "loss": 2.481,
+ "step": 53
+ },
+ {
+ "epoch": 1.69,
+ "grad_norm": 0.9296875,
+ "learning_rate": 0.00013888241754733208,
+ "loss": 2.5512,
+ "step": 54
+ },
+ {
+ "epoch": 1.72,
+ "grad_norm": 0.7890625,
+ "learning_rate": 0.00013641605752528224,
+ "loss": 2.5405,
+ "step": 55
+ },
+ {
+ "epoch": 1.75,
+ "grad_norm": 0.95703125,
+ "learning_rate": 0.00013392388661180303,
+ "loss": 2.5221,
+ "step": 56
+ },
+ {
+ "epoch": 1.75,
+ "eval_loss": 2.461298704147339,
+ "eval_runtime": 1.28,
+ "eval_samples_per_second": 10.937,
+ "eval_steps_per_second": 5.469,
+ "step": 56
+ },
+ {
+ "epoch": 1.78,
+ "grad_norm": 0.98046875,
+ "learning_rate": 0.0001314076712021949,
+ "loss": 2.5646,
+ "step": 57
+ },
+ {
+ "epoch": 1.81,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.0001288691947339621,
+ "loss": 2.5079,
+ "step": 58
+ },
+ {
+ "epoch": 1.84,
+ "grad_norm": 0.8125,
+ "learning_rate": 0.00012631025642275212,
+ "loss": 2.4743,
+ "step": 59
+ },
+ {
+ "epoch": 1.88,
+ "grad_norm": 0.60546875,
+ "learning_rate": 0.0001237326699871115,
+ "loss": 2.3103,
+ "step": 60
+ },
+ {
+ "epoch": 1.91,
+ "grad_norm": 0.80859375,
+ "learning_rate": 0.00012113826236296244,
+ "loss": 2.4229,
+ "step": 61
+ },
+ {
+ "epoch": 1.94,
+ "grad_norm": 0.671875,
+ "learning_rate": 0.00011852887240871145,
+ "loss": 2.2709,
+ "step": 62
+ },
+ {
+ "epoch": 1.97,
+ "grad_norm": 0.8203125,
+ "learning_rate": 0.00011590634960190721,
+ "loss": 2.4868,
+ "step": 63
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.62109375,
+ "learning_rate": 0.00011327255272837221,
+ "loss": 2.4384,
+ "step": 64
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 2.4055111408233643,
+ "eval_runtime": 1.2798,
+ "eval_samples_per_second": 10.94,
+ "eval_steps_per_second": 5.47,
+ "step": 64
+ },
+ {
+ "epoch": 2.03,
+ "grad_norm": 0.6328125,
+ "learning_rate": 0.00011062934856473655,
+ "loss": 2.3671,
+ "step": 65
+ },
+ {
+ "epoch": 2.06,
+ "grad_norm": 0.69140625,
+ "learning_rate": 0.00010797861055530831,
+ "loss": 2.3381,
+ "step": 66
+ },
+ {
+ "epoch": 2.09,
+ "grad_norm": 0.671875,
+ "learning_rate": 0.00010532221748421787,
+ "loss": 2.2134,
+ "step": 67
+ },
+ {
+ "epoch": 2.12,
+ "grad_norm": 0.7890625,
+ "learning_rate": 0.00010266205214377748,
+ "loss": 2.2687,
+ "step": 68
+ },
+ {
+ "epoch": 2.16,
+ "grad_norm": 0.75,
+ "learning_rate": 0.0001,
+ "loss": 2.4273,
+ "step": 69
+ },
+ {
+ "epoch": 2.19,
+ "grad_norm": 0.8671875,
+ "learning_rate": 9.733794785622253e-05,
+ "loss": 2.4439,
+ "step": 70
+ },
+ {
+ "epoch": 2.22,
+ "grad_norm": 0.953125,
+ "learning_rate": 9.467778251578217e-05,
+ "loss": 2.6631,
+ "step": 71
+ },
+ {
+ "epoch": 2.25,
+ "grad_norm": 0.78125,
+ "learning_rate": 9.202138944469168e-05,
+ "loss": 2.295,
+ "step": 72
+ },
+ {
+ "epoch": 2.25,
+ "eval_loss": 2.3922784328460693,
+ "eval_runtime": 1.282,
+ "eval_samples_per_second": 10.921,
+ "eval_steps_per_second": 5.46,
+ "step": 72
+ },
+ {
+ "epoch": 2.28,
+ "grad_norm": 0.734375,
+ "learning_rate": 8.937065143526347e-05,
+ "loss": 2.4963,
+ "step": 73
+ },
+ {
+ "epoch": 2.31,
+ "grad_norm": 0.68359375,
+ "learning_rate": 8.672744727162781e-05,
+ "loss": 2.4274,
+ "step": 74
+ },
+ {
+ "epoch": 2.34,
+ "grad_norm": 0.9765625,
+ "learning_rate": 8.409365039809281e-05,
+ "loss": 2.4988,
+ "step": 75
+ },
+ {
+ "epoch": 2.38,
+ "grad_norm": 0.75390625,
+ "learning_rate": 8.147112759128859e-05,
+ "loss": 2.2886,
+ "step": 76
+ },
+ {
+ "epoch": 2.41,
+ "grad_norm": 0.80078125,
+ "learning_rate": 7.886173763703757e-05,
+ "loss": 2.1944,
+ "step": 77
+ },
+ {
+ "epoch": 2.44,
+ "grad_norm": 0.8828125,
+ "learning_rate": 7.626733001288851e-05,
+ "loss": 2.3283,
+ "step": 78
+ },
+ {
+ "epoch": 2.47,
+ "grad_norm": 0.82421875,
+ "learning_rate": 7.368974357724789e-05,
+ "loss": 2.3855,
+ "step": 79
+ },
+ {
+ "epoch": 2.5,
+ "grad_norm": 0.7890625,
+ "learning_rate": 7.113080526603792e-05,
+ "loss": 2.3943,
+ "step": 80
+ },
+ {
+ "epoch": 2.5,
+ "eval_loss": 2.386228084564209,
+ "eval_runtime": 1.2835,
+ "eval_samples_per_second": 10.908,
+ "eval_steps_per_second": 5.454,
+ "step": 80
+ },
+ {
+ "epoch": 2.53,
+ "grad_norm": 0.94921875,
+ "learning_rate": 6.859232879780515e-05,
+ "loss": 2.2725,
+ "step": 81
+ },
+ {
+ "epoch": 2.56,
+ "grad_norm": 0.75,
+ "learning_rate": 6.607611338819697e-05,
+ "loss": 2.1989,
+ "step": 82
+ },
+ {
+ "epoch": 2.59,
+ "grad_norm": 0.703125,
+ "learning_rate": 6.358394247471778e-05,
+ "loss": 2.1219,
+ "step": 83
+ },
+ {
+ "epoch": 2.62,
+ "grad_norm": 1.0390625,
+ "learning_rate": 6.111758245266794e-05,
+ "loss": 2.3478,
+ "step": 84
+ },
+ {
+ "epoch": 2.66,
+ "grad_norm": 0.78125,
+ "learning_rate": 5.867878142316221e-05,
+ "loss": 2.4196,
+ "step": 85
+ },
+ {
+ "epoch": 2.69,
+ "grad_norm": 0.6796875,
+ "learning_rate": 5.626926795411447e-05,
+ "loss": 2.1882,
+ "step": 86
+ },
+ {
+ "epoch": 2.72,
+ "grad_norm": 0.8203125,
+ "learning_rate": 5.38907498550674e-05,
+ "loss": 2.4703,
+ "step": 87
+ },
+ {
+ "epoch": 2.75,
+ "grad_norm": 0.8671875,
+ "learning_rate": 5.1544912966734994e-05,
+ "loss": 2.2398,
+ "step": 88
+ },
+ {
+ "epoch": 2.75,
+ "eval_loss": 2.3605105876922607,
+ "eval_runtime": 1.2887,
+ "eval_samples_per_second": 10.864,
+ "eval_steps_per_second": 5.432,
+ "step": 88
+ },
+ {
+ "epoch": 2.78,
+ "grad_norm": 0.81640625,
+ "learning_rate": 4.9233419966116036e-05,
+ "loss": 2.2939,
+ "step": 89
+ },
+ {
+ "epoch": 2.81,
+ "grad_norm": 1.0234375,
+ "learning_rate": 4.695790918802576e-05,
+ "loss": 2.2835,
+ "step": 90
+ },
+ {
+ "epoch": 2.84,
+ "grad_norm": 0.85546875,
+ "learning_rate": 4.47199934638807e-05,
+ "loss": 2.3145,
+ "step": 91
+ },
+ {
+ "epoch": 2.88,
+ "grad_norm": 0.5546875,
+ "learning_rate": 4.252125897855932e-05,
+ "loss": 2.2521,
+ "step": 92
+ },
+ {
+ "epoch": 2.91,
+ "grad_norm": 0.76953125,
+ "learning_rate": 4.036326414614985e-05,
+ "loss": 2.377,
+ "step": 93
+ },
+ {
+ "epoch": 2.94,
+ "grad_norm": 0.78125,
+ "learning_rate": 3.824753850538082e-05,
+ "loss": 2.4343,
+ "step": 94
+ },
+ {
+ "epoch": 2.97,
+ "grad_norm": 0.8046875,
+ "learning_rate": 3.617558163551802e-05,
+ "loss": 2.2051,
+ "step": 95
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 0.88671875,
+ "learning_rate": 3.414886209349615e-05,
+ "loss": 2.2693,
+ "step": 96
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 2.3525888919830322,
+ "eval_runtime": 1.2821,
+ "eval_samples_per_second": 10.919,
+ "eval_steps_per_second": 5.46,
+ "step": 96
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 128,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 32,
+ "total_flos": 2.000199340995379e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
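The eval_loss values logged at the saved checkpoints (2.677 at step 32, 2.406 at step 64, 2.353 at step 96) are mean token cross-entropies, so exponentiating them gives validation perplexity. A small sketch of the conversion, using values copied from the logs above:

```python
import math

# eval_loss at the three saved checkpoints, from the trainer_state.json logs
eval_losses = {32: 2.67726731300354, 64: 2.4055111408233643, 96: 2.3525888919830322}

for step, loss in eval_losses.items():
    print(f"step {step}: eval_loss={loss:.4f}, perplexity={math.exp(loss):.2f}")
# step 32 -> ~14.55, step 64 -> ~11.08, step 96 -> ~10.51
```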
checkpoint-96/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fca5508434f64a69e54fc8c46f68907814d91f48e751b7a0eeb4050e5ae3225
+ size 5816
config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5632,
+ "max_position_embeddings": 4096,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 22,
+ "num_key_value_heads": 4,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": true,
+ "_load_in_8bit": false,
+ "bnb_4bit_compute_dtype": "bfloat16",
+ "bnb_4bit_quant_storage": "bfloat16",
+ "bnb_4bit_quant_type": "nf4",
+ "bnb_4bit_use_double_quant": true,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": true,
+ "load_in_8bit": false,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.40.0.dev0",
+ "use_cache": false,
+ "vocab_size": 32000
+ }
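The quantization_config block above records the bitsandbytes settings the base model was loaded with for QLoRA training. A hedged sketch of recreating that load in transformers (the config values come straight from config.json; the code itself is not part of this upload):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # "load_in_4bit": true
    bnb_4bit_quant_type="nf4",              # "bnb_4bit_quant_type": "nf4"
    bnb_4bit_use_double_quant=True,         # "bnb_4bit_use_double_quant": true
    bnb_4bit_compute_dtype=torch.bfloat16,  # "bnb_4bit_compute_dtype": "bfloat16"
)

base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    quantization_config=bnb_config,
    device_map="auto",
)
```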
runs/Apr30_02-51-48_663ec5cd7167/events.out.tfevents.1714445508.663ec5cd7167.5280.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3bac99fef0086cdfda4f010adba206b97074810136dff597980b02f3f4736af
+ size 37468
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "</s>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
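Putting the pieces together: the repo root holds the final LoRA adapter (adapter_config.json, adapter_model.bin) and the tokenizer files, so inference amounts to attaching the adapter to the base model. A minimal sketch, where "path/to/qlora-out" is a hypothetical local clone of this repo:

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

ADAPTER_DIR = "path/to/qlora-out"  # hypothetical local clone of this repo

base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)

# The tokenizer files uploaded here set pad_token to "</s>" and
# padding_side to "right", so loading from the same directory
# reproduces that setup.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR)
```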