TracyTank committed
Commit bdd5caf · verified · 1 Parent(s): 8be6b57

End of training
README.md ADDED
@@ -0,0 +1,178 @@
+ ---
+ library_name: peft
+ license: apache-2.0
+ base_model: JackFram/llama-160m
+ tags:
+ - axolotl
+ - generated_from_trainer
+ model-index:
+ - name: f9eb2fff-bae0-49b0-89c5-624fc71c75d0
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+ <details><summary>See axolotl config</summary>
+
+ axolotl version: `0.4.1`
+ ```yaml
+ adapter: lora
+ base_model: JackFram/llama-160m
+ bf16: auto
+ chat_template: llama3
+ cosine_min_lr_ratio: 0.1
+ data_processes: 16
+ dataset_prepared_path: null
+ datasets:
+ - data_files:
+   - a176eebac3e98bce_train_data.json
+   ds_type: json
+   format: custom
+   path: /workspace/input_data/a176eebac3e98bce_train_data.json
+   type:
+     field_input: "\uD310\uACB0\uC694\uC9C0"
+     field_instruction: "\uBC95\uC6D0\uBA85"
+     field_output: "\uD310\uACB0\uC720\uD615"
+     format: '{instruction} {input}'
+     no_input_format: '{instruction}'
+     system_format: '{system}'
+     system_prompt: ''
+ debug: null
+ deepspeed: null
+ device_map: '{'''':torch.cuda.current_device()}'
+ do_eval: true
+ early_stopping_patience: 1
+ eval_batch_size: 1
+ eval_sample_packing: false
+ eval_steps: 25
+ evaluation_strategy: steps
+ flash_attention: true
+ fp16: null
+ fsdp: null
+ fsdp_config: null
+ gradient_accumulation_steps: 64
+ gradient_checkpointing: true
+ group_by_length: true
+ hub_model_id: sn56b1/f9eb2fff-bae0-49b0-89c5-624fc71c75d0
+ hub_repo: stevemonite
+ hub_strategy: checkpoint
+ hub_token: null
+ learning_rate: 0.0003
+ load_in_4bit: false
+ load_in_8bit: false
+ local_rank: null
+ logging_steps: 1
+ lora_alpha: 32
+ lora_dropout: 0.05
+ lora_fan_in_fan_out: null
+ lora_model_dir: null
+ lora_r: 16
+ lora_target_linear: true
+ lora_target_modules:
+ - q_proj
+ - v_proj
+ lr_scheduler: cosine
+ max_grad_norm: 1.0
+ max_memory:
+   0: 70GiB
+ max_steps: 1200
+ micro_batch_size: 1
+ mlflow_experiment_name: /tmp/a176eebac3e98bce_train_data.json
+ model_type: AutoModelForCausalLM
+ num_epochs: 2
+ optim_args:
+   adam_beta1: 0.9
+   adam_beta2: 0.95
+   adam_epsilon: 1e-5
+ optimizer: adamw_torch
+ output_dir: miner_id_24
+ pad_to_sequence_len: true
+ resume_from_checkpoint: null
+ s2_attention: null
+ sample_packing: false
+ save_steps: 50
+ save_strategy: steps
+ sequence_len: 2048
+ special_tokens:
+   pad_token: </s>
+ strict: false
+ tf32: false
+ tokenizer_type: AutoTokenizer
+ torch_compile: false
+ train_on_inputs: false
+ trust_remote_code: true
+ val_set_size: 50
+ wandb_entity: sn56-miner
+ wandb_mode: disabled
+ wandb_name: f9eb2fff-bae0-49b0-89c5-624fc71c75d0
+ wandb_project: god
+ wandb_run: v0xl
+ wandb_runid: f9eb2fff-bae0-49b0-89c5-624fc71c75d0
+ warmup_ratio: 0.03
+ weight_decay: 0.01
+ xformers_attention: null
+
+ ```
+
+ </details><br>
+
+ # f9eb2fff-bae0-49b0-89c5-624fc71c75d0
+
+ This model is a fine-tuned version of [JackFram/llama-160m](https://huggingface.co/JackFram/llama-160m) on an unnamed dataset (the local JSON file referenced in the config above).
+ It achieves the following results on the evaluation set:
+ - Loss: 0.2604
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
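+ What is known from the config: each record of `a176eebac3e98bce_train_data.json` is mapped into a prompt via `format: '{instruction} {input}'`, with `no_input_format: '{instruction}'` as the fallback when the input field is empty. The Korean field names translate roughly to court name (`field_instruction`), ruling summary (`field_input`), and ruling type (`field_output`). A minimal sketch of that mapping (illustrative only; the record values below are hypothetical):
+
+ ```python
+ # Sketch of the configured instruction format, not axolotl's implementation.
+ record = {
+     "\uBC95\uC6D0\uBA85": "Seoul Central District Court",     # field_instruction (hypothetical value)
+     "\uD310\uACB0\uC694\uC9C0": "Summary of the ruling ...",  # field_input (hypothetical value)
+     "\uD310\uACB0\uC720\uD615": "civil judgment",             # field_output (hypothetical value)
+ }
+
+ instruction = record["\uBC95\uC6D0\uBA85"]
+ inp = record["\uD310\uACB0\uC694\uC9C0"]
+ prompt = f"{instruction} {inp}" if inp else instruction   # format vs. no_input_format
+ completion = record["\uD310\uACB0\uC720\uD615"]           # target; train_on_inputs: false masks the prompt
+ ```
+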
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (the effective batch sizes are derived in the note after this list):
+ - learning_rate: 0.0003
+ - train_batch_size: 1
+ - eval_batch_size: 1
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 4
+ - gradient_accumulation_steps: 64
+ - total_train_batch_size: 256
+ - total_eval_batch_size: 4
+ - optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-05 (the AdamW defaults of betas=(0.9, 0.999) and epsilon=1e-08 are overridden via optimizer_args)
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_steps: 20
+ - training_steps: 668
+
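+ The totals above follow from the distributed setup: total_train_batch_size = micro_batch_size × gradient_accumulation_steps × num_devices = 1 × 64 × 4 = 256, and total_eval_batch_size = eval_batch_size × num_devices = 1 × 4 = 4.
+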
+ ### Training results
+
+ | Training Loss | Epoch  | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 7.3791        | 0.0030 | 1    | 5.1000          |
+ | 0.8964        | 0.0749 | 25   | 1.0535          |
+ | 0.1023        | 0.1499 | 50   | 0.3284          |
+ | 0.0862        | 0.2248 | 75   | 0.1645          |
+ | 0.0421        | 0.2997 | 100  | 0.1421          |
+ | 0.0401        | 0.3747 | 125  | 0.1297          |
+ | 0.0459        | 0.4496 | 150  | 0.2604          |
+
+
+ ### Framework versions
+
+ - PEFT 0.13.2
+ - Transformers 4.46.0
+ - Pytorch 2.5.0+cu124
+ - Datasets 3.0.1
+ - Tokenizers 0.20.1
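+
+ ### How to load the adapter
+
+ A minimal loading sketch (it assumes the adapter weights are fetched from the `hub_model_id` in the config above; substitute a local checkpoint path when loading offline):
+
+ ```python
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ base = AutoModelForCausalLM.from_pretrained("JackFram/llama-160m")
+ tokenizer = AutoTokenizer.from_pretrained("JackFram/llama-160m")
+ # Apply the LoRA adapter from this repository on top of the base model.
+ model = PeftModel.from_pretrained(base, "sn56b1/f9eb2fff-bae0-49b0-89c5-624fc71c75d0")
+
+ inputs = tokenizer("example prompt", return_tensors="pt")
+ output = model.generate(**inputs, max_new_tokens=32)
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
+ ```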
adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "JackFram/llama-160m",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "o_proj",
+     "v_proj",
+     "k_proj",
+     "q_proj",
+     "down_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
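Note that the serialized adapter targets all seven linear projections, which appears to be what axolotl's `lora_target_linear: true` expands to; the narrower `lora_target_modules: [q_proj, v_proj]` list in the YAML is not what was saved, and this JSON is authoritative. As a sanity check, the file can be reloaded with PEFT (a sketch; assumes the JSON sits in the current directory):

```python
from peft import LoraConfig

config = LoraConfig.from_pretrained(".")  # directory containing adapter_config.json
print(config.r, config.lora_alpha)        # -> 16 32
print(sorted(config.target_modules))      # the seven projection modules above
```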
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:902e1bd91f6ca7c84eb4cc5f09bc41460c3b294412c37de08109f7a6d0074b1c
+ size 13626282
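Each binary artifact in this commit is stored as a Git LFS pointer like the one above: the repository tracks only the blob's sha256 and byte size. A downloaded copy can be checked against the pointer with a few lines (a sketch; assumes the file was fetched into the working directory):

```python
import hashlib
import os

path = "adapter_model.bin"
digest = hashlib.sha256(open(path, "rb").read()).hexdigest()
# Both values come from the LFS pointer above.
assert os.path.getsize(path) == 13626282
assert digest == "902e1bd91f6ca7c84eb4cc5f09bc41460c3b294412c37de08109f7a6d0074b1c"
```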
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18fc12de5654e0323293b5dd1ac28814fb994a9658fb252a535bc41258c43b46
+ size 13587864
config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "JackFram/llama-160m",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "head_dim": 64,
+   "hidden_act": "silu",
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "num_key_value_heads": 12,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.0",
+   "use_cache": false,
+   "vocab_size": 32000
+ }
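The config above implies roughly 162M parameters, consistent with the "160m" in the base model's name. A back-of-the-envelope count (a sketch; biases and RMSNorm weights omitted):

```python
# Rough parameter count implied by config.json.
h, layers, inter, vocab = 768, 12, 3072, 32000
attn = 4 * h * h            # q/k/v/o projections (num_key_value_heads == num_attention_heads, so full MHA)
mlp = 3 * h * inter         # gate/up/down projections (SiLU-gated MLP)
emb = 2 * vocab * h         # untied input embeddings + lm_head (tie_word_embeddings: false)
print(layers * (attn + mlp) + emb)  # -> 162,398,208, i.e. ~162M
```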
last-checkpoint/README.md ADDED
@@ -0,0 +1,202 @@
+ ---
+ base_model: JackFram/llama-160m
+ library_name: peft
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.13.2
last-checkpoint/adapter_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "JackFram/llama-160m",
+   "bias": "none",
+   "fan_in_fan_out": null,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "o_proj",
+     "v_proj",
+     "k_proj",
+     "q_proj",
+     "down_proj",
+     "gate_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
last-checkpoint/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba13959f6cd207bdf12c64122f952291693563848f8b8c1cc4a1ca3762addfe3
+ size 13587864
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:150eeb9db3c24cdacef19c4cb9526dee2bf01a997cfd73cd097231633997a73b
+ size 27273018
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da2fcad6cd81bb97d0061e263e1a1cdbb08b9ec3a021caeb1339fbf78256db76
+ size 15024
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f087b0a80be905d15bf4096528ebfb18cb89ccbf6b9eb78dac1d9a3823c111c8
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:def1ba2fc2803d327de014a2798b289b94491b993e12c279b943920162e3e860
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29f110bfb6c78dd11c3ef89591341a1beec9719bd9c484f380587fc8d405ee41
+ size 15024
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0ce7eb31f33da60fb4a65f0698e2d0c23b798eb451451afdf0be495be03e58f
+ size 1064
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,1148 @@
+ {
+   "best_metric": 0.14205443859100342,
+   "best_model_checkpoint": "miner_id_24/checkpoint-100",
+   "epoch": 0.44958553833185033,
+   "eval_steps": 25,
+   "global_step": 150,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0029972369222123355,
+       "grad_norm": 7.602510929107666,
+       "learning_rate": 1.4999999999999999e-05,
+       "loss": 7.3791,
+       "step": 1
+     },
+     {
+       "epoch": 0.0029972369222123355,
+       "eval_loss": 5.100011348724365,
+       "eval_runtime": 0.2905,
+       "eval_samples_per_second": 172.095,
+       "eval_steps_per_second": 44.745,
+       "step": 1
+     },
+     {
+       "epoch": 0.005994473844424671,
+       "grad_norm": 5.811465263366699,
+       "learning_rate": 2.9999999999999997e-05,
+       "loss": 5.2052,
+       "step": 2
+     },
+     {
+       "epoch": 0.008991710766637007,
+       "grad_norm": 6.1456217765808105,
+       "learning_rate": 4.4999999999999996e-05,
+       "loss": 5.0327,
+       "step": 3
+     },
+     {
+       "epoch": 0.011988947688849342,
+       "grad_norm": 5.857211112976074,
+       "learning_rate": 5.9999999999999995e-05,
+       "loss": 4.9457,
+       "step": 4
+     },
+     {
+       "epoch": 0.014986184611061678,
+       "grad_norm": 5.512299537658691,
+       "learning_rate": 7.5e-05,
+       "loss": 4.8063,
+       "step": 5
+     },
+     {
+       "epoch": 0.017983421533274015,
+       "grad_norm": 5.282471179962158,
+       "learning_rate": 8.999999999999999e-05,
+       "loss": 4.7355,
+       "step": 6
+     },
+     {
+       "epoch": 0.02098065845548635,
+       "grad_norm": 4.267989158630371,
+       "learning_rate": 0.00010499999999999999,
+       "loss": 4.4458,
+       "step": 7
+     },
+     {
+       "epoch": 0.023977895377698684,
+       "grad_norm": 3.799853801727295,
+       "learning_rate": 0.00011999999999999999,
+       "loss": 4.2302,
+       "step": 8
+     },
+     {
+       "epoch": 0.02697513229991102,
+       "grad_norm": 3.506157875061035,
+       "learning_rate": 0.000135,
+       "loss": 3.857,
+       "step": 9
+     },
+     {
+       "epoch": 0.029972369222123357,
+       "grad_norm": 3.8407812118530273,
+       "learning_rate": 0.00015,
+       "loss": 3.749,
+       "step": 10
+     },
+     {
+       "epoch": 0.03296960614433569,
+       "grad_norm": 4.470568656921387,
+       "learning_rate": 0.000165,
+       "loss": 3.4078,
+       "step": 11
+     },
+     {
+       "epoch": 0.03596684306654803,
+       "grad_norm": 4.822042465209961,
+       "learning_rate": 0.00017999999999999998,
+       "loss": 3.0358,
+       "step": 12
+     },
+     {
+       "epoch": 0.038964079988760364,
+       "grad_norm": 5.060558795928955,
+       "learning_rate": 0.000195,
+       "loss": 4.5688,
+       "step": 13
+     },
+     {
+       "epoch": 0.0419613169109727,
+       "grad_norm": 4.888822078704834,
+       "learning_rate": 0.00020999999999999998,
+       "loss": 3.6099,
+       "step": 14
+     },
+     {
+       "epoch": 0.04495855383318503,
+       "grad_norm": 3.4810433387756348,
+       "learning_rate": 0.000225,
+       "loss": 2.4291,
+       "step": 15
+     },
+     {
+       "epoch": 0.04795579075539737,
+       "grad_norm": 3.577241897583008,
+       "learning_rate": 0.00023999999999999998,
+       "loss": 2.1686,
+       "step": 16
+     },
+     {
+       "epoch": 0.0509530276776097,
+       "grad_norm": 3.595766544342041,
+       "learning_rate": 0.00025499999999999996,
+       "loss": 1.9973,
+       "step": 17
+     },
+     {
+       "epoch": 0.05395026459982204,
+       "grad_norm": 4.025913715362549,
+       "learning_rate": 0.00027,
+       "loss": 1.9723,
+       "step": 18
+     },
+     {
+       "epoch": 0.05694750152203437,
+       "grad_norm": 2.864335536956787,
+       "learning_rate": 0.000285,
+       "loss": 1.9312,
+       "step": 19
+     },
+     {
+       "epoch": 0.05994473844424671,
+       "grad_norm": 3.9801218509674072,
+       "learning_rate": 0.0003,
+       "loss": 1.8875,
+       "step": 20
+     },
+     {
+       "epoch": 0.06294197536645904,
+       "grad_norm": 3.2217376232147217,
+       "learning_rate": 0.00029999841345404617,
+       "loss": 1.5763,
+       "step": 21
+     },
+     {
+       "epoch": 0.06593921228867138,
+       "grad_norm": 19.193851470947266,
+       "learning_rate": 0.0002999936538534755,
+       "loss": 1.8067,
+       "step": 22
+     },
+     {
+       "epoch": 0.06893644921088371,
+       "grad_norm": 25.025583267211914,
+       "learning_rate": 0.0002999857213101595,
+       "loss": 1.4576,
+       "step": 23
+     },
+     {
+       "epoch": 0.07193368613309606,
+       "grad_norm": 4.855524063110352,
+       "learning_rate": 0.00029997461601054764,
+       "loss": 1.0614,
+       "step": 24
+     },
+     {
+       "epoch": 0.0749309230553084,
+       "grad_norm": 3.3954389095306396,
+       "learning_rate": 0.00029996033821566326,
+       "loss": 0.8964,
+       "step": 25
+     },
+     {
+       "epoch": 0.0749309230553084,
+       "eval_loss": 1.0534682273864746,
+       "eval_runtime": 0.2881,
+       "eval_samples_per_second": 173.557,
+       "eval_steps_per_second": 45.125,
+       "step": 25
+     },
+     {
+       "epoch": 0.07792815997752073,
+       "grad_norm": 12.105545997619629,
+       "learning_rate": 0.0002999428882610971,
+       "loss": 3.0311,
+       "step": 26
+     },
+     {
+       "epoch": 0.08092539689973306,
+       "grad_norm": 3.334498882293701,
+       "learning_rate": 0.00029992226655699945,
+       "loss": 1.2946,
+       "step": 27
+     },
+     {
+       "epoch": 0.0839226338219454,
+       "grad_norm": 2.8498733043670654,
+       "learning_rate": 0.00029989847358807104,
+       "loss": 1.1872,
+       "step": 28
+     },
+     {
+       "epoch": 0.08691987074415773,
+       "grad_norm": 2.367673873901367,
+       "learning_rate": 0.0002998715099135508,
+       "loss": 1.0292,
+       "step": 29
+     },
+     {
+       "epoch": 0.08991710766637007,
+       "grad_norm": 8.79304313659668,
+       "learning_rate": 0.00029984137616720325,
+       "loss": 1.2032,
+       "step": 30
+     },
+     {
+       "epoch": 0.0929143445885824,
+       "grad_norm": 8.142653465270996,
+       "learning_rate": 0.00029980807305730374,
+       "loss": 1.2389,
+       "step": 31
+     },
+     {
+       "epoch": 0.09591158151079474,
+       "grad_norm": 4.5038628578186035,
+       "learning_rate": 0.0002997716013666212,
+       "loss": 1.0933,
+       "step": 32
+     },
+     {
+       "epoch": 0.09890881843300707,
+       "grad_norm": 2.439879894256592,
+       "learning_rate": 0.0002997319619524003,
+       "loss": 0.922,
+       "step": 33
+     },
+     {
+       "epoch": 0.1019060553552194,
+       "grad_norm": 2.002732992172241,
+       "learning_rate": 0.0002996891557463412,
+       "loss": 0.8141,
+       "step": 34
+     },
+     {
+       "epoch": 0.10490329227743174,
+       "grad_norm": 1.9198921918869019,
+       "learning_rate": 0.00029964318375457725,
+       "loss": 0.5868,
+       "step": 35
+     },
+     {
+       "epoch": 0.10790052919964407,
+       "grad_norm": 1.1390972137451172,
+       "learning_rate": 0.00029959404705765186,
+       "loss": 0.4158,
+       "step": 36
+     },
+     {
+       "epoch": 0.11089776612185641,
+       "grad_norm": 0.9061455130577087,
+       "learning_rate": 0.00029954174681049296,
+       "loss": 0.3104,
+       "step": 37
+     },
+     {
+       "epoch": 0.11389500304406874,
+       "grad_norm": 3.6855151653289795,
+       "learning_rate": 0.0002994862842423856,
+       "loss": 1.3318,
+       "step": 38
+     },
+     {
+       "epoch": 0.11689223996628108,
+       "grad_norm": 2.992558717727661,
+       "learning_rate": 0.00029942766065694333,
+       "loss": 1.0331,
+       "step": 39
+     },
+     {
+       "epoch": 0.11988947688849343,
+       "grad_norm": 2.762493133544922,
+       "learning_rate": 0.00029936587743207736,
+       "loss": 0.7132,
+       "step": 40
+     },
+     {
+       "epoch": 0.12288671381070576,
+       "grad_norm": 2.2291383743286133,
+       "learning_rate": 0.00029930093601996446,
+       "loss": 0.5774,
+       "step": 41
+     },
+     {
+       "epoch": 0.12588395073291808,
+       "grad_norm": 1.1489959955215454,
+       "learning_rate": 0.0002992328379470125,
+       "loss": 0.5567,
+       "step": 42
+     },
+     {
+       "epoch": 0.12888118765513043,
+       "grad_norm": 6.168476104736328,
+       "learning_rate": 0.00029916158481382474,
+       "loss": 0.6131,
+       "step": 43
+     },
+     {
+       "epoch": 0.13187842457734275,
+       "grad_norm": 5.408211708068848,
+       "learning_rate": 0.0002990871782951623,
+       "loss": 0.6638,
+       "step": 44
+     },
+     {
+       "epoch": 0.1348756614995551,
+       "grad_norm": 1.5306873321533203,
+       "learning_rate": 0.0002990096201399045,
+       "loss": 0.5488,
+       "step": 45
+     },
+     {
+       "epoch": 0.13787289842176742,
+       "grad_norm": 1.8623676300048828,
+       "learning_rate": 0.00029892891217100817,
+       "loss": 0.4659,
+       "step": 46
+     },
+     {
+       "epoch": 0.14087013534397977,
+       "grad_norm": 1.6640721559524536,
+       "learning_rate": 0.0002988450562854644,
+       "loss": 0.4368,
+       "step": 47
+     },
+     {
+       "epoch": 0.14386737226619212,
+       "grad_norm": 0.3559802174568176,
+       "learning_rate": 0.0002987580544542541,
+       "loss": 0.0507,
+       "step": 48
+     },
+     {
+       "epoch": 0.14686460918840444,
+       "grad_norm": 0.4540441334247589,
+       "learning_rate": 0.0002986679087223018,
+       "loss": 0.087,
+       "step": 49
+     },
+     {
+       "epoch": 0.1498618461106168,
+       "grad_norm": 0.8076233267784119,
+       "learning_rate": 0.00029857462120842744,
+       "loss": 0.1023,
+       "step": 50
+     },
+     {
+       "epoch": 0.1498618461106168,
+       "eval_loss": 0.3283706307411194,
+       "eval_runtime": 0.2882,
+       "eval_samples_per_second": 173.489,
+       "eval_steps_per_second": 45.107,
+       "step": 50
+     },
+     {
+       "epoch": 0.1528590830328291,
+       "grad_norm": 4.228235721588135,
+       "learning_rate": 0.0002984781941052967,
+       "loss": 1.0819,
+       "step": 51
+     },
+     {
+       "epoch": 0.15585631995504146,
+       "grad_norm": 3.424924612045288,
+       "learning_rate": 0.0002983786296793692,
+       "loss": 0.5749,
+       "step": 52
+     },
+     {
+       "epoch": 0.15885355687725378,
+       "grad_norm": 3.9904696941375732,
+       "learning_rate": 0.00029827593027084546,
+       "loss": 0.5831,
+       "step": 53
+     },
+     {
+       "epoch": 0.16185079379946612,
+       "grad_norm": 2.694119930267334,
+       "learning_rate": 0.00029817009829361196,
+       "loss": 0.4457,
+       "step": 54
+     },
+     {
+       "epoch": 0.16484803072167845,
+       "grad_norm": 1.2387803792953491,
+       "learning_rate": 0.00029806113623518407,
+       "loss": 0.3431,
+       "step": 55
+     },
+     {
+       "epoch": 0.1678452676438908,
+       "grad_norm": 1.6821039915084839,
+       "learning_rate": 0.0002979490466566481,
+       "loss": 0.3905,
+       "step": 56
+     },
+     {
+       "epoch": 0.17084250456610311,
+       "grad_norm": 0.6684949994087219,
+       "learning_rate": 0.00029783383219260037,
+       "loss": 0.3056,
+       "step": 57
+     },
+     {
+       "epoch": 0.17383974148831546,
+       "grad_norm": 0.7967026233673096,
+       "learning_rate": 0.0002977154955510861,
+       "loss": 0.3333,
+       "step": 58
+     },
+     {
+       "epoch": 0.17683697841052778,
+       "grad_norm": 0.5187807083129883,
+       "learning_rate": 0.0002975940395135351,
+       "loss": 0.2867,
+       "step": 59
+     },
+     {
+       "epoch": 0.17983421533274013,
+       "grad_norm": 7.2272138595581055,
+       "learning_rate": 0.00029746946693469693,
+       "loss": 1.0056,
+       "step": 60
+     },
+     {
+       "epoch": 0.18283145225495245,
+       "grad_norm": 8.9348726272583,
+       "learning_rate": 0.00029734178074257325,
+       "loss": 0.7786,
+       "step": 61
+     },
+     {
+       "epoch": 0.1858286891771648,
+       "grad_norm": 8.19704532623291,
+       "learning_rate": 0.0002972109839383494,
+       "loss": 0.3928,
+       "step": 62
+     },
+     {
+       "epoch": 0.18882592609937715,
+       "grad_norm": 4.3927483558654785,
+       "learning_rate": 0.00029707707959632386,
+       "loss": 0.7528,
+       "step": 63
+     },
+     {
+       "epoch": 0.19182316302158947,
+       "grad_norm": 2.271043539047241,
+       "learning_rate": 0.0002969400708638358,
+       "loss": 0.4877,
+       "step": 64
+     },
+     {
+       "epoch": 0.19482039994380182,
+       "grad_norm": 1.3864595890045166,
+       "learning_rate": 0.000296799960961191,
+       "loss": 0.2625,
+       "step": 65
+     },
+     {
+       "epoch": 0.19781763686601414,
+       "grad_norm": 2.2087323665618896,
+       "learning_rate": 0.00029665675318158656,
+       "loss": 0.2945,
+       "step": 66
+     },
+     {
+       "epoch": 0.2008148737882265,
+       "grad_norm": 3.766403913497925,
+       "learning_rate": 0.00029651045089103316,
+       "loss": 0.3807,
+       "step": 67
+     },
+     {
+       "epoch": 0.2038121107104388,
+       "grad_norm": 2.598832368850708,
+       "learning_rate": 0.0002963610575282762,
+       "loss": 0.3149,
+       "step": 68
+     },
+     {
+       "epoch": 0.20680934763265116,
+       "grad_norm": 0.677237331867218,
+       "learning_rate": 0.0002962085766047146,
+       "loss": 0.346,
+       "step": 69
+     },
+     {
+       "epoch": 0.20980658455486348,
+       "grad_norm": 0.5142577886581421,
+       "learning_rate": 0.00029605301170431867,
+       "loss": 0.2855,
+       "step": 70
+     },
+     {
+       "epoch": 0.21280382147707583,
+       "grad_norm": 0.5518949031829834,
+       "learning_rate": 0.00029589436648354566,
+       "loss": 0.3163,
+       "step": 71
+     },
+     {
+       "epoch": 0.21580105839928815,
+       "grad_norm": 0.336823046207428,
+       "learning_rate": 0.00029573264467125377,
+       "loss": 0.16,
+       "step": 72
+     },
+     {
+       "epoch": 0.2187982953215005,
+       "grad_norm": 0.2474360167980194,
+       "learning_rate": 0.0002955678500686147,
+       "loss": 0.0297,
+       "step": 73
+     },
+     {
+       "epoch": 0.22179553224371282,
+       "grad_norm": 0.18458165228366852,
+       "learning_rate": 0.0002953999865490242,
+       "loss": 0.0609,
+       "step": 74
+     },
+     {
+       "epoch": 0.22479276916592517,
+       "grad_norm": 0.36120983958244324,
+       "learning_rate": 0.0002952290580580109,
+       "loss": 0.0862,
+       "step": 75
+     },
+     {
+       "epoch": 0.22479276916592517,
+       "eval_loss": 0.16452732682228088,
+       "eval_runtime": 0.2888,
+       "eval_samples_per_second": 173.11,
+       "eval_steps_per_second": 45.009,
+       "step": 75
+     },
+     {
+       "epoch": 0.2277900060881375,
+       "grad_norm": 1.3333512544631958,
+       "learning_rate": 0.0002950550686131438,
+       "loss": 0.6146,
+       "step": 76
+     },
+     {
+       "epoch": 0.23078724301034984,
+       "grad_norm": 1.2993559837341309,
+       "learning_rate": 0.00029487802230393777,
+       "loss": 0.2574,
+       "step": 77
+     },
+     {
+       "epoch": 0.23378447993256216,
+       "grad_norm": 1.2781016826629639,
+       "learning_rate": 0.00029469792329175725,
+       "loss": 0.2978,
+       "step": 78
+     },
+     {
+       "epoch": 0.2367817168547745,
+       "grad_norm": 5.145886421203613,
+       "learning_rate": 0.0002945147758097187,
+       "loss": 0.3251,
+       "step": 79
+     },
+     {
+       "epoch": 0.23977895377698685,
+       "grad_norm": 5.573575019836426,
+       "learning_rate": 0.00029432858416259097,
+       "loss": 0.3483,
+       "step": 80
+     },
+     {
+       "epoch": 0.24277619069919917,
+       "grad_norm": 2.6032469272613525,
+       "learning_rate": 0.0002941393527266941,
+       "loss": 0.306,
+       "step": 81
+     },
+     {
+       "epoch": 0.24577342762141152,
+       "grad_norm": 0.6271111965179443,
+       "learning_rate": 0.00029394708594979657,
+       "loss": 0.318,
+       "step": 82
+     },
+     {
+       "epoch": 0.24877066454362384,
+       "grad_norm": 0.5439050793647766,
+       "learning_rate": 0.0002937517883510106,
+       "loss": 0.2547,
+       "step": 83
+     },
+     {
+       "epoch": 0.25176790146583616,
+       "grad_norm": 0.5188155770301819,
+       "learning_rate": 0.0002935534645206861,
+       "loss": 0.2402,
+       "step": 84
+     },
+     {
+       "epoch": 0.25476513838804854,
+       "grad_norm": 1.3832889795303345,
+       "learning_rate": 0.00029335211912030247,
+       "loss": 0.147,
+       "step": 85
+     },
+     {
+       "epoch": 0.25776237531026086,
+       "grad_norm": 0.20522421598434448,
+       "learning_rate": 0.0002931477568823596,
+       "loss": 0.0365,
+       "step": 86
+     },
+     {
+       "epoch": 0.2607596122324732,
+       "grad_norm": 0.07317493855953217,
+       "learning_rate": 0.00029294038261026595,
+       "loss": 0.0178,
+       "step": 87
+     },
+     {
+       "epoch": 0.2637568491546855,
+       "grad_norm": 5.754029273986816,
+       "learning_rate": 0.0002927300011782263,
+       "loss": 0.5049,
+       "step": 88
+     },
+     {
+       "epoch": 0.2667540860768979,
+       "grad_norm": 1.9069617986679077,
+       "learning_rate": 0.0002925166175311266,
+       "loss": 0.297,
+       "step": 89
+     },
+     {
+       "epoch": 0.2697513229991102,
+       "grad_norm": 1.4803589582443237,
+       "learning_rate": 0.0002923002366844182,
+       "loss": 0.2419,
+       "step": 90
+     },
+     {
+       "epoch": 0.2727485599213225,
+       "grad_norm": 1.1726033687591553,
+       "learning_rate": 0.0002920808637239998,
+       "loss": 0.2449,
+       "step": 91
+     },
+     {
+       "epoch": 0.27574579684353484,
+       "grad_norm": 1.1483900547027588,
+       "learning_rate": 0.00029185850380609757,
+       "loss": 0.2845,
+       "step": 92
+     },
+     {
+       "epoch": 0.2787430337657472,
+       "grad_norm": 0.6498438715934753,
+       "learning_rate": 0.00029163316215714477,
+       "loss": 0.3168,
+       "step": 93
+     },
+     {
+       "epoch": 0.28174027068795954,
+       "grad_norm": 3.8744821548461914,
+       "learning_rate": 0.00029140484407365807,
+       "loss": 0.3098,
+       "step": 94
+     },
+     {
+       "epoch": 0.28473750761017186,
+       "grad_norm": 5.238924026489258,
+       "learning_rate": 0.00029117355492211345,
+       "loss": 0.3747,
+       "step": 95
+     },
+     {
+       "epoch": 0.28773474453238423,
+       "grad_norm": 4.835148334503174,
+       "learning_rate": 0.0002909393001388201,
+       "loss": 0.311,
+       "step": 96
+     },
+     {
+       "epoch": 0.29073198145459656,
+       "grad_norm": 3.1126749515533447,
+       "learning_rate": 0.00029070208522979246,
+       "loss": 0.1933,
+       "step": 97
+     },
+     {
+       "epoch": 0.2937292183768089,
+       "grad_norm": 0.31741341948509216,
+       "learning_rate": 0.000290461915770621,
+       "loss": 0.0311,
+       "step": 98
+     },
+     {
+       "epoch": 0.2967264552990212,
+       "grad_norm": 0.13816803693771362,
+       "learning_rate": 0.00029021879740634106,
+       "loss": 0.0489,
+       "step": 99
+     },
+     {
+       "epoch": 0.2997236922212336,
+       "grad_norm": 0.16050726175308228,
+       "learning_rate": 0.0002899727358513002,
+       "loss": 0.0421,
+       "step": 100
+     },
+     {
+       "epoch": 0.2997236922212336,
+       "eval_loss": 0.14205443859100342,
+       "eval_runtime": 0.2881,
+       "eval_samples_per_second": 173.58,
+       "eval_steps_per_second": 45.131,
+       "step": 100
+     },
+     {
+       "epoch": 0.3027209291434459,
+       "grad_norm": 4.475709438323975,
+       "learning_rate": 0.0002897237368890237,
+       "loss": 0.7248,
+       "step": 101
+     },
+     {
+       "epoch": 0.3057181660656582,
+       "grad_norm": 1.3565483093261719,
+       "learning_rate": 0.00028947180637207894,
+       "loss": 0.2832,
+       "step": 102
+     },
+     {
+       "epoch": 0.30871540298787054,
+       "grad_norm": 1.5489473342895508,
+       "learning_rate": 0.0002892169502219377,
+       "loss": 0.3576,
+       "step": 103
+     },
+     {
+       "epoch": 0.3117126399100829,
+       "grad_norm": 1.2731544971466064,
+       "learning_rate": 0.00028895917442883697,
+       "loss": 0.2847,
+       "step": 104
+     },
+     {
+       "epoch": 0.31470987683229523,
+       "grad_norm": 0.5488539934158325,
+       "learning_rate": 0.000288698485051638,
+       "loss": 0.2249,
+       "step": 105
+     },
+     {
+       "epoch": 0.31770711375450755,
+       "grad_norm": 0.6603264808654785,
+       "learning_rate": 0.0002884348882176842,
+       "loss": 0.2748,
+       "step": 106
+     },
+     {
+       "epoch": 0.3207043506767199,
+       "grad_norm": 1.190644383430481,
+       "learning_rate": 0.0002881683901226569,
+       "loss": 0.264,
+       "step": 107
+     },
+     {
+       "epoch": 0.32370158759893225,
+       "grad_norm": 3.849586248397827,
+       "learning_rate": 0.00028789899703042976,
+       "loss": 0.2899,
+       "step": 108
+     },
+     {
+       "epoch": 0.32669882452114457,
+       "grad_norm": 2.8036534786224365,
+       "learning_rate": 0.00028762671527292165,
+       "loss": 0.2266,
+       "step": 109
+     },
+     {
+       "epoch": 0.3296960614433569,
+       "grad_norm": 1.1855659484863281,
+       "learning_rate": 0.00028735155124994774,
+       "loss": 0.1432,
+       "step": 110
+     },
+     {
+       "epoch": 0.33269329836556927,
+       "grad_norm": 0.14694784581661224,
+       "learning_rate": 0.0002870735114290689,
+       "loss": 0.0225,
+       "step": 111
+     },
+     {
+       "epoch": 0.3356905352877816,
+       "grad_norm": 0.08722022920846939,
+       "learning_rate": 0.0002867926023454401,
+       "loss": 0.0243,
+       "step": 112
+     },
+     {
+       "epoch": 0.3386877722099939,
+       "grad_norm": 1.4064624309539795,
+       "learning_rate": 0.00028650883060165634,
+       "loss": 0.4202,
+       "step": 113
+     },
+     {
+       "epoch": 0.34168500913220623,
+       "grad_norm": 1.0942851305007935,
+       "learning_rate": 0.00028622220286759787,
+       "loss": 0.2193,
+       "step": 114
+     },
+     {
+       "epoch": 0.3446822460544186,
+       "grad_norm": 1.4071762561798096,
+       "learning_rate": 0.0002859327258802732,
+       "loss": 0.2772,
+       "step": 115
+     },
+     {
+       "epoch": 0.3476794829766309,
+       "grad_norm": 1.451495885848999,
+       "learning_rate": 0.0002856404064436606,
+       "loss": 0.3247,
+       "step": 116
+     },
+     {
+       "epoch": 0.35067671989884325,
+       "grad_norm": 0.8838515281677246,
+       "learning_rate": 0.0002853452514285487,
+       "loss": 0.2307,
+       "step": 117
+     },
+     {
+       "epoch": 0.35367395682105557,
+       "grad_norm": 0.43812039494514465,
+       "learning_rate": 0.0002850472677723743,
+       "loss": 0.2225,
+       "step": 118
+     },
+     {
+       "epoch": 0.35667119374326794,
+       "grad_norm": 1.919777274131775,
+       "learning_rate": 0.0002847464624790599,
+       "loss": 0.2529,
+       "step": 119
+     },
+     {
+       "epoch": 0.35966843066548027,
+       "grad_norm": 3.364579916000366,
+       "learning_rate": 0.00028444284261884876,
+       "loss": 0.2957,
+       "step": 120
+     },
+     {
+       "epoch": 0.3626656675876926,
+       "grad_norm": 3.1780946254730225,
+       "learning_rate": 0.0002841364153281389,
+       "loss": 0.2983,
+       "step": 121
+     },
+     {
+       "epoch": 0.3656629045099049,
+       "grad_norm": 1.9313762187957764,
+       "learning_rate": 0.000283827187809315,
+       "loss": 0.1741,
+       "step": 122
+     },
+     {
+       "epoch": 0.3686601414321173,
+       "grad_norm": 0.2901758849620819,
+       "learning_rate": 0.0002835151673305797,
+       "loss": 0.0191,
+       "step": 123
+     },
+     {
+       "epoch": 0.3716573783543296,
+       "grad_norm": 0.18410703539848328,
+       "learning_rate": 0.00028320036122578225,
+       "loss": 0.0579,
+       "step": 124
+     },
+     {
+       "epoch": 0.3746546152765419,
+       "grad_norm": 0.06442587822675705,
+       "learning_rate": 0.0002828827768942464,
+       "loss": 0.0401,
+       "step": 125
+     },
+     {
+       "epoch": 0.3746546152765419,
+       "eval_loss": 0.12974713742733002,
+       "eval_runtime": 0.2912,
+       "eval_samples_per_second": 171.727,
+       "eval_steps_per_second": 44.649,
+       "step": 125
+     },
+     {
+       "epoch": 0.3776518521987543,
+       "grad_norm": 5.953274726867676,
+       "learning_rate": 0.00028256242180059644,
+       "loss": 0.5857,
+       "step": 126
+     },
+     {
+       "epoch": 0.3806490891209666,
+       "grad_norm": 1.0578787326812744,
+       "learning_rate": 0.0002822393034745815,
+       "loss": 0.1907,
+       "step": 127
+     },
+     {
+       "epoch": 0.38364632604317894,
+       "grad_norm": 1.2669620513916016,
+       "learning_rate": 0.0002819134295108992,
+       "loss": 0.2632,
+       "step": 128
+     },
+     {
+       "epoch": 0.38664356296539126,
+       "grad_norm": 1.2083393335342407,
+       "learning_rate": 0.0002815848075690163,
+       "loss": 0.2581,
+       "step": 129
+     },
+     {
+       "epoch": 0.38964079988760364,
+       "grad_norm": 0.6875675916671753,
+       "learning_rate": 0.00028125344537298933,
+       "loss": 0.2141,
+       "step": 130
+     },
+     {
+       "epoch": 0.39263803680981596,
+       "grad_norm": 0.6001489162445068,
+       "learning_rate": 0.00028091935071128274,
+       "loss": 0.2624,
+       "step": 131
+     },
+     {
+       "epoch": 0.3956352737320283,
+       "grad_norm": 0.28877830505371094,
+       "learning_rate": 0.00028058253143658596,
+       "loss": 0.2115,
+       "step": 132
+     },
+     {
+       "epoch": 0.3986325106542406,
+       "grad_norm": 0.8344032168388367,
+       "learning_rate": 0.0002802429954656287,
+       "loss": 0.234,
+       "step": 133
+     },
+     {
+       "epoch": 0.401629747576453,
+       "grad_norm": 2.311697244644165,
+       "learning_rate": 0.00027990075077899494,
+       "loss": 0.173,
+       "step": 134
+     },
+     {
+       "epoch": 0.4046269844986653,
+       "grad_norm": 0.9839431643486023,
+       "learning_rate": 0.0002795558054209354,
+       "loss": 0.0744,
+       "step": 135
+     },
+     {
+       "epoch": 0.4076242214208776,
+       "grad_norm": 0.23247428238391876,
+       "learning_rate": 0.0002792081674991785,
+       "loss": 0.0324,
+       "step": 136
+     },
+     {
+       "epoch": 0.41062145834308994,
+       "grad_norm": 0.11418312788009644,
+       "learning_rate": 0.00027885784518473955,
+       "loss": 0.0338,
+       "step": 137
+     },
+     {
+       "epoch": 0.4136186952653023,
+       "grad_norm": 2.2888922691345215,
+       "learning_rate": 0.0002785048467117289,
+       "loss": 0.3951,
+       "step": 138
+     },
+     {
+       "epoch": 0.41661593218751464,
+       "grad_norm": 1.3474727869033813,
+       "learning_rate": 0.00027814918037715846,
+       "loss": 0.2827,
+       "step": 139
+     },
+     {
+       "epoch": 0.41961316910972696,
+       "grad_norm": 0.958093523979187,
+       "learning_rate": 0.0002777908545407464,
+       "loss": 0.2105,
+       "step": 140
+     },
+     {
+       "epoch": 0.4226104060319393,
+       "grad_norm": 1.1106938123703003,
+       "learning_rate": 0.00027742987762472104,
+       "loss": 0.2451,
+       "step": 141
+     },
+     {
+       "epoch": 0.42560764295415165,
+       "grad_norm": 0.991568386554718,
+       "learning_rate": 0.0002770662581136226,
+       "loss": 0.2107,
+       "step": 142
+     },
+     {
+       "epoch": 0.428604879876364,
+       "grad_norm": 0.7932851314544678,
+       "learning_rate": 0.0002767000045541039,
+       "loss": 0.2097,
+       "step": 143
+     },
+     {
+       "epoch": 0.4316021167985763,
+       "grad_norm": 0.4382745623588562,
+       "learning_rate": 0.0002763311255547294,
+       "loss": 0.2098,
+       "step": 144
+     },
+     {
+       "epoch": 0.4345993537207887,
+       "grad_norm": 0.4467020034790039,
+       "learning_rate": 0.0002759596297857729,
+       "loss": 0.2249,
+       "step": 145
+     },
+     {
+       "epoch": 0.437596590643001,
+       "grad_norm": 0.7452898025512695,
+       "learning_rate": 0.0002755855259790139,
+       "loss": 0.2022,
+       "step": 146
+     },
+     {
+       "epoch": 0.4405938275652133,
+       "grad_norm": 0.8051543831825256,
+       "learning_rate": 0.000275208822927532,
+       "loss": 0.2018,
+       "step": 147
+     },
+     {
+       "epoch": 0.44359106448742563,
+       "grad_norm": 0.4602643847465515,
+       "learning_rate": 0.00027482952948550056,
+       "loss": 0.0265,
+       "step": 148
+     },
+     {
+       "epoch": 0.446588301409638,
+       "grad_norm": 0.16789375245571136,
+       "learning_rate": 0.00027444765456797863,
+       "loss": 0.0358,
+       "step": 149
+     },
+     {
+       "epoch": 0.44958553833185033,
+       "grad_norm": 0.2375824749469757,
+       "learning_rate": 0.000274063207150701,
+       "loss": 0.0459,
+       "step": 150
+     },
+     {
+       "epoch": 0.44958553833185033,
+       "eval_loss": 0.26043474674224854,
+       "eval_runtime": 0.2878,
+       "eval_samples_per_second": 173.721,
+       "eval_steps_per_second": 45.167,
+       "step": 150
+     }
+   ],
+   "logging_steps": 1,
+   "max_steps": 668,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 3,
+   "save_steps": 50,
+   "stateful_callbacks": {
+     "EarlyStoppingCallback": {
+       "args": {
+         "early_stopping_patience": 1,
+         "early_stopping_threshold": 0.0
+       },
+       "attributes": {
+         "early_stopping_patience_counter": 1
+       }
+     },
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 6.90057513051095e+16,
+   "train_batch_size": 1,
+   "trial_name": null,
+   "trial_params": null
+ }
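The state above also shows why training stopped at step 150 of 668: eval_loss bottomed out at 0.1421 at step 100 (the recorded best_model_checkpoint), the step-150 eval came in worse at 0.2604, and with early_stopping_patience: 1 the EarlyStoppingCallback set should_training_stop. The eval curve can be pulled back out of the file directly (a sketch; assumes the checkpoint directory layout above):

```python
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Eval entries are the log records that carry an "eval_loss" key.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"step {entry['step']:>3}: eval_loss {entry['eval_loss']:.4f}")
```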
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf86ea560d2053a5b479f98b88d008830c6e22c0c722396b6e3ab6eff3733725
+ size 6840
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "</s>",
+   "padding_side": "right",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
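One quirk worth noting: the `chat_template` above is the llama3-style header format (set via `chat_template: llama3` in the training YAML), while the tokenizer itself is a SentencePiece `LlamaTokenizer` whose special-token map only defines `<s>`/`</s>`/`<unk>`, so markers such as `<|start_header_id|>` and `<|eot_id|>` are tokenized as plain text rather than as single special tokens. Rendering a conversation through the template (a sketch; assumes the tokenizer is loaded from this repository):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sn56b1/f9eb2fff-bae0-49b0-89c5-624fc71c75d0")
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(text)  # <s><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|>...
```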
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf86ea560d2053a5b479f98b88d008830c6e22c0c722396b6e3ab6eff3733725
+ size 6840