Minbyul committed on
Commit
38a911d
·
verified ·
1 Parent(s): 0887a15

Upload 15 files

Browse files
README.md CHANGED
@@ -2,15 +2,11 @@
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-v0.1
4
  tags:
5
- - alignment-handbook
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
12
  datasets:
13
- - HuggingFaceH4/deita-10k-v0-sft
14
  model-index:
15
  - name: mistral-7b-wo-live_qa-iter-sft-step1
16
  results: []
@@ -21,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
21
 
22
  # mistral-7b-wo-live_qa-iter-sft-step1
23
 
24
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the HuggingFaceH4/deita-10k-v0-sft dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 1.8004
27
 
28
  ## Model description
29
 
@@ -60,14 +56,14 @@ The following hyperparameters were used during training:
60
 
61
  | Training Loss | Epoch | Step | Validation Loss |
62
  |:-------------:|:-----:|:----:|:---------------:|
63
- | 2.2107 | 0.96 | 16 | 1.5384 |
64
- | 1.4572 | 1.97 | 33 | 1.6461 |
65
- | 0.9565 | 2.87 | 48 | 1.8004 |
66
 
67
 
68
  ### Framework versions
69
 
70
- - Transformers 4.39.0.dev0
71
- - Pytorch 2.1.2
72
  - Datasets 2.14.6
73
  - Tokenizers 0.15.2
 
2
  license: apache-2.0
3
  base_model: mistralai/Mistral-7B-v0.1
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  model-index:
11
  - name: mistral-7b-wo-live_qa-iter-sft-step1
12
  results: []
 
17
 
18
  # mistral-7b-wo-live_qa-iter-sft-step1
19
 
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 1.3446
23
 
24
  ## Model description
25
 
 
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
+ | 1.0022 | 0.97 | 17 | 1.3495 |
60
+ | 0.7519 | 2.0 | 35 | 1.3250 |
61
+ | 0.5685 | 2.91 | 51 | 1.3446 |
62
 
63
 
64
  ### Framework versions
65
 
66
+ - Transformers 4.38.2
67
+ - Pytorch 2.1.2+cu121
68
  - Datasets 2.14.6
69
  - Tokenizers 0.15.2
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
- "epoch": 2.87,
3
- "eval_loss": 1.8004332780838013,
4
- "eval_runtime": 1.8174,
5
- "eval_samples": 100,
6
- "eval_samples_per_second": 3.852,
7
- "eval_steps_per_second": 0.55,
8
- "train_loss": 1.592680846651395,
9
- "train_runtime": 972.6889,
10
  "train_samples": 4848,
11
- "train_samples_per_second": 3.3,
12
- "train_steps_per_second": 0.049
13
  }
 
1
  {
2
+ "epoch": 2.91,
3
+ "train_loss": 0.807617746147455,
4
+ "train_runtime": 901.1485,
 
 
 
 
 
5
  "train_samples": 4848,
6
+ "train_samples_per_second": 3.685,
7
+ "train_steps_per_second": 0.057
8
  }
config.json CHANGED
@@ -20,7 +20,7 @@
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
- "transformers_version": "4.39.0.dev0",
24
- "use_cache": true,
25
  "vocab_size": 32000
26
  }
 
20
  "sliding_window": 4096,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
23
+ "transformers_version": "4.38.2",
24
+ "use_cache": false,
25
  "vocab_size": 32000
26
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.39.0.dev0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.38.2"
6
  }
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed42368472ceff8e046b5df6a192e73b5833219c86863f16b7258a93ecc9d6a8
3
  size 4943162336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a82d850e450056ae8b1963991c10eee8a3e644103163b7176bb8f307e82a80d
3
  size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45b0caea5d046e61cfc3ad8651e4b9d85f8454bbe22fb65b71729f0bec9e8aab
3
  size 4999819336
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b120958fbdea318f7a96b8f13a322a5d815a87b3b372598e5558f1ff979054d
3
  size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd6a9ac5306af2d4d20adbac7a4ce8f84bd95b0ef63efb80d256d6fd25630121
3
  size 4540516344
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eabe1b6021d809fff031c25c2e6afe1ef64b7b910c280df2004fa7fcd3f228ef
3
  size 4540516344
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.87,
3
- "train_loss": 1.592680846651395,
4
- "train_runtime": 972.6889,
5
  "train_samples": 4848,
6
- "train_samples_per_second": 3.3,
7
- "train_steps_per_second": 0.049
8
  }
 
1
  {
2
+ "epoch": 2.91,
3
+ "train_loss": 0.807617746147455,
4
+ "train_runtime": 901.1485,
5
  "train_samples": 4848,
6
+ "train_samples_per_second": 3.685,
7
+ "train_steps_per_second": 0.057
8
  }
trainer_state.json CHANGED
@@ -1,123 +1,130 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.8656716417910446,
5
  "eval_steps": 500,
6
- "global_step": 48,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
- "grad_norm": 7.837143285917327,
14
- "learning_rate": 4.000000000000001e-06,
15
- "loss": 2.2354,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.3,
20
- "grad_norm": 9.311491863709907,
21
- "learning_rate": 2e-05,
22
- "loss": 2.1965,
23
  "step": 5
24
  },
25
  {
26
- "epoch": 0.6,
27
- "grad_norm": 4.970375942777289,
28
- "learning_rate": 1.9340161087325483e-05,
29
- "loss": 2.1045,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.9,
34
- "grad_norm": 3.5227839335545066,
35
- "learning_rate": 1.744772182743782e-05,
36
- "loss": 2.2107,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.96,
41
- "eval_loss": 1.5384026765823364,
42
- "eval_runtime": 1.7145,
43
- "eval_samples_per_second": 4.083,
44
- "eval_steps_per_second": 0.583,
45
- "step": 16
46
  },
47
  {
48
- "epoch": 1.19,
49
- "grad_norm": 4.57009081553093,
50
- "learning_rate": 1.4572423233046386e-05,
51
- "loss": 1.8611,
52
  "step": 20
53
  },
54
  {
55
- "epoch": 1.49,
56
- "grad_norm": 5.062546542837419,
57
- "learning_rate": 1.1093712083778748e-05,
58
- "loss": 1.533,
59
  "step": 25
60
  },
61
  {
62
- "epoch": 1.79,
63
- "grad_norm": 3.8164674548597004,
64
- "learning_rate": 7.470666176083193e-06,
65
- "loss": 1.4572,
66
  "step": 30
67
  },
68
  {
69
- "epoch": 1.97,
70
- "eval_loss": 1.646084189414978,
71
- "eval_runtime": 1.7409,
72
- "eval_samples_per_second": 4.021,
73
- "eval_steps_per_second": 0.574,
74
- "step": 33
75
  },
76
  {
77
- "epoch": 2.09,
78
- "grad_norm": 9.370778150439074,
79
- "learning_rate": 4.181410844420473e-06,
80
- "loss": 1.342,
 
81
  "step": 35
82
  },
83
  {
84
- "epoch": 2.39,
85
- "grad_norm": 4.2248384357705895,
86
- "learning_rate": 1.660021821101222e-06,
87
- "loss": 1.0214,
88
  "step": 40
89
  },
90
  {
91
- "epoch": 2.69,
92
- "grad_norm": 4.735590938981405,
93
- "learning_rate": 2.392412244407294e-07,
94
- "loss": 0.9565,
95
  "step": 45
96
  },
97
  {
98
- "epoch": 2.87,
99
- "eval_loss": 1.8004332780838013,
100
- "eval_runtime": 1.6722,
101
- "eval_samples_per_second": 4.186,
102
- "eval_steps_per_second": 0.598,
103
- "step": 48
 
 
 
 
 
 
 
104
  },
105
  {
106
- "epoch": 2.87,
107
- "step": 48,
108
- "total_flos": 9997878558720.0,
109
- "train_loss": 1.592680846651395,
110
- "train_runtime": 972.6889,
111
- "train_samples_per_second": 3.3,
112
- "train_steps_per_second": 0.049
113
  }
114
  ],
115
  "logging_steps": 5,
116
- "max_steps": 48,
117
  "num_input_tokens_seen": 0,
118
  "num_train_epochs": 3,
119
  "save_steps": 500,
120
- "total_flos": 9997878558720.0,
121
  "train_batch_size": 4,
122
  "trial_name": null,
123
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.914285714285714,
5
  "eval_steps": 500,
6
+ "global_step": 51,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06,
13
+ "grad_norm": 7.817991325483718,
14
+ "learning_rate": 3.3333333333333333e-06,
15
+ "loss": 0.965,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.29,
20
+ "grad_norm": 49.73347603245408,
21
+ "learning_rate": 1.6666666666666667e-05,
22
+ "loss": 1.1198,
23
  "step": 5
24
  },
25
  {
26
+ "epoch": 0.57,
27
+ "grad_norm": 4.523926056457663,
28
+ "learning_rate": 1.961261695938319e-05,
29
+ "loss": 1.0209,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.86,
34
+ "grad_norm": 3.8093568875972696,
35
+ "learning_rate": 1.8090169943749477e-05,
36
+ "loss": 1.0022,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.97,
41
+ "eval_loss": 1.349517822265625,
42
+ "eval_runtime": 2.9563,
43
+ "eval_samples_per_second": 7.78,
44
+ "eval_steps_per_second": 0.677,
45
+ "step": 17
46
  },
47
  {
48
+ "epoch": 1.14,
49
+ "grad_norm": 2.9739767130682138,
50
+ "learning_rate": 1.5591929034707468e-05,
51
+ "loss": 0.8913,
52
  "step": 20
53
  },
54
  {
55
+ "epoch": 1.43,
56
+ "grad_norm": 2.5183506153336306,
57
+ "learning_rate": 1.2419218955996677e-05,
58
+ "loss": 0.8043,
59
  "step": 25
60
  },
61
  {
62
+ "epoch": 1.71,
63
+ "grad_norm": 2.193683108302545,
64
+ "learning_rate": 8.954715367323468e-06,
65
+ "loss": 0.7194,
66
  "step": 30
67
  },
68
  {
69
+ "epoch": 2.0,
70
+ "grad_norm": 2.3903108778369,
71
+ "learning_rate": 5.616288532109225e-06,
72
+ "loss": 0.7519,
73
+ "step": 35
 
74
  },
75
  {
76
+ "epoch": 2.0,
77
+ "eval_loss": 1.3249517679214478,
78
+ "eval_runtime": 2.7504,
79
+ "eval_samples_per_second": 8.363,
80
+ "eval_steps_per_second": 0.727,
81
  "step": 35
82
  },
83
  {
84
+ "epoch": 2.29,
85
+ "grad_norm": 3.1793272954159764,
86
+ "learning_rate": 2.8066019966134907e-06,
87
+ "loss": 0.6556,
88
  "step": 40
89
  },
90
  {
91
+ "epoch": 2.57,
92
+ "grad_norm": 3.802719468935368,
93
+ "learning_rate": 8.645454235739903e-07,
94
+ "loss": 0.6099,
95
  "step": 45
96
  },
97
  {
98
+ "epoch": 2.86,
99
+ "grad_norm": 2.3574393295768283,
100
+ "learning_rate": 2.4359497401758026e-08,
101
+ "loss": 0.5685,
102
+ "step": 50
103
+ },
104
+ {
105
+ "epoch": 2.91,
106
+ "eval_loss": 1.3446100950241089,
107
+ "eval_runtime": 2.6623,
108
+ "eval_samples_per_second": 8.639,
109
+ "eval_steps_per_second": 0.751,
110
+ "step": 51
111
  },
112
  {
113
+ "epoch": 2.91,
114
+ "step": 51,
115
+ "total_flos": 10626017525760.0,
116
+ "train_loss": 0.807617746147455,
117
+ "train_runtime": 901.1485,
118
+ "train_samples_per_second": 3.685,
119
+ "train_steps_per_second": 0.057
120
  }
121
  ],
122
  "logging_steps": 5,
123
+ "max_steps": 51,
124
  "num_input_tokens_seen": 0,
125
  "num_train_epochs": 3,
126
  "save_steps": 500,
127
+ "total_flos": 10626017525760.0,
128
  "train_batch_size": 4,
129
  "trial_name": null,
130
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b697b879fe9b55b1274599bb1b46f70520cdd827378acc80bc6c66c6049a47a3
3
  size 6200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c50b82c9463e352609d63e82a468a5a19e8523f8a053072ef0e24d85b76eb28
3
  size 6200