llm-wizard committed on
Commit 8d46188
1 Parent(s): 5fccbd9

llm-wizard/llama381binstruct_summarize_short_merged

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [NousResearch/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/NousResearch/Meta-Llama-3.1-8B-Instruct) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 2.4530
+- Loss: 1.4158
 
 ## Model description
 
@@ -43,41 +43,41 @@ The following hyperparameters were used during training:
 - train_batch_size: 1
 - eval_batch_size: 8
 - seed: 42
-- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: linear
 - lr_scheduler_warmup_steps: 30
 - training_steps: 500
 
 ### Training results
 
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 1.6176        | 1.25  | 25   | 1.4646          |
-| 0.7045        | 2.5   | 50   | 1.5828          |
-| 0.3188        | 3.75  | 75   | 1.7073          |
-| 0.1796        | 5.0   | 100  | 1.8367          |
-| 0.0882        | 6.25  | 125  | 2.0182          |
-| 0.038         | 7.5   | 150  | 2.3088          |
-| 0.0196        | 8.75  | 175  | 2.2476          |
-| 0.0256        | 10.0  | 200  | 2.1310          |
-| 0.0117        | 11.25 | 225  | 2.1974          |
-| 0.0066        | 12.5  | 250  | 2.2862          |
-| 0.0058        | 13.75 | 275  | 2.3694          |
-| 0.0071        | 15.0  | 300  | 2.2468          |
-| 0.0063        | 16.25 | 325  | 2.2986          |
-| 0.0031        | 17.5  | 350  | 2.3789          |
-| 0.0024        | 18.75 | 375  | 2.4145          |
-| 0.003         | 20.0  | 400  | 2.4298          |
-| 0.0025        | 21.25 | 425  | 2.4402          |
-| 0.0021        | 22.5  | 450  | 2.4474          |
-| 0.0022        | 23.75 | 475  | 2.4513          |
-| 0.0019        | 25.0  | 500  | 2.4530          |
+| Training Loss | Epoch   | Step | Validation Loss |
+|:-------------:|:-------:|:----:|:---------------:|
+| 1.6861        | 1.1905  | 25   | 0.9223          |
+| 0.7859        | 2.3810  | 50   | 0.8779          |
+| 0.3887        | 3.5714  | 75   | 0.9867          |
+| 0.1412        | 4.7619  | 100  | 1.0822          |
+| 0.0911        | 5.9524  | 125  | 1.2118          |
+| 0.0391        | 7.1429  | 150  | 1.3553          |
+| 0.0309        | 8.3333  | 175  | 1.2825          |
+| 0.0188        | 9.5238  | 200  | 1.2512          |
+| 0.0145        | 10.7143 | 225  | 1.2936          |
+| 0.0091        | 11.9048 | 250  | 1.3109          |
+| 0.0058        | 13.0952 | 275  | 1.2768          |
+| 0.0042        | 14.2857 | 300  | 1.2963          |
+| 0.0032        | 15.4762 | 325  | 1.3539          |
+| 0.0021        | 16.6667 | 350  | 1.3810          |
+| 0.0024        | 17.8571 | 375  | 1.3974          |
+| 0.0021        | 19.0476 | 400  | 1.4047          |
+| 0.002         | 20.2381 | 425  | 1.4103          |
+| 0.0018        | 21.4286 | 450  | 1.4133          |
+| 0.0017        | 22.6190 | 475  | 1.4152          |
+| 0.0015        | 23.8095 | 500  | 1.4158          |
 
 
 ### Framework versions
 
-- PEFT 0.12.0
-- Transformers 4.44.2
-- Pytorch 2.4.0+cu121
-- Datasets 3.0.0
-- Tokenizers 0.19.1
+- PEFT 0.13.2
+- Transformers 4.46.1
+- Pytorch 2.5.0+cu121
+- Datasets 3.0.2
+- Tokenizers 0.20.1
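For reference, a minimal sketch of how the hyperparameters listed in the updated README hunk (train_batch_size 1, eval_batch_size 8, seed 42, adamw_torch, linear scheduler with 30 warmup steps, 500 training steps) map onto `transformers.TrainingArguments`. The learning rate and output directory are not part of this hunk, so the values below are placeholders.

```python
# Sketch only: maps the README's hyperparameter list onto transformers.TrainingArguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="llama381binstruct_summarize_short",  # placeholder: not shown in this diff
    per_device_train_batch_size=1,   # train_batch_size: 1
    per_device_eval_batch_size=8,    # eval_batch_size: 8
    seed=42,                         # seed: 42
    optim="adamw_torch",             # optimizer: adamw_torch, betas=(0.9,0.999), epsilon=1e-08
    lr_scheduler_type="linear",      # lr_scheduler_type: linear
    warmup_steps=30,                 # lr_scheduler_warmup_steps: 30
    max_steps=500,                   # training_steps: 500
    learning_rate=2e-4,              # placeholder: not recorded in this hunk
)
```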
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "o_proj",
     "q_proj",
+    "o_proj",
+    "k_proj",
     "v_proj",
     "up_proj",
-    "k_proj",
-    "down_proj",
-    "gate_proj"
+    "gate_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b3ef1a808206a62c89f13526d8271b95ba4bf9a6dd4577fd2280dc7b4572e2bf
+oid sha256:56440b90fcc80bb3c42505579c82b09bbb9dbe7463cd7ceab80119cbc07f45dd
 size 167832240
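Since the repository ships `adapter_config.json` and `adapter_model.safetensors`, the updated weights should load as a PEFT adapter on top of the base model named in the README. A minimal sketch, assuming the repo id from the page header resolves to this commit:

```python
# Sketch only: load the base model from the README and attach this adapter with PEFT.
# revision="8d46188" pins the commit shown in the header; drop it to use the latest revision.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "NousResearch/Meta-Llama-3.1-8B-Instruct"
adapter_id = "llm-wizard/llama381binstruct_summarize_short_merged"

tokenizer = AutoTokenizer.from_pretrained(adapter_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")
model = PeftModel.from_pretrained(base_model, adapter_id, revision="8d46188")
```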
runs/Oct29_22-47-53_271f9d0a4177/events.out.tfevents.1730242079.271f9d0a4177.3118.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30a6b6535b997cd13a3ae437b4ca989810f0c3c02d6223f0b25a45db3c9e60cf
+size 22459
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ecea9784aaf6b2bf5e16248cd6ff2f1781625e879e42c147341a9829970f6e50
-size 5496
+oid sha256:b76220d30a88df6674f89af91dc9ce7dd13e737f81e8b5ae2ff6cb45cdffe9c1
+size 5560
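The `adapter_model.safetensors`, `training_args.bin`, and tfevents diffs above show Git LFS pointer files rather than the binaries themselves: each pointer is just the `version`, `oid sha256:...`, and `size` lines. A small sketch of parsing that format, assuming a local checkout where the LFS objects have not been pulled:

```python
# Sketch only: parse the three-line Git LFS pointer format shown in the diffs above.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    # e.g. {"version": "https://git-lfs.github.com/spec/v1",
    #       "oid": "sha256:56440b90...", "size": "167832240"}
    return fields

print(parse_lfs_pointer("adapter_model.safetensors"))
```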