chansung committed
Commit e8f608c
1 Parent(s): 2acd92c

Model save

README.md CHANGED
@@ -20,7 +20,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
 It achieves the following results on the evaluation set:
-- Loss: 149.4047
+- Loss: 1.6581
 
 ## Model description
 
@@ -39,7 +39,7 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate: 0.03
+- learning_rate: 0.0002
 - train_batch_size: 8
 - eval_batch_size: 8
 - seed: 42
@@ -49,15 +49,15 @@ The following hyperparameters were used during training:
 - total_train_batch_size: 128
 - total_eval_batch_size: 64
 - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
-- lr_scheduler_type: linear
-- lr_scheduler_warmup_ratio: 0.06
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
 - num_epochs: 1
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 149.2635 | 1.0 | 140 | 149.4047 |
+| 1.6561 | 1.0 | 140 | 1.6581 |
 
 
 ### Framework versions
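
The hyperparameter changes above are the substance of this commit: the learning rate drops from 0.03 to 2e-4 and the schedule moves from linear (6% warmup) to cosine (10% warmup), which coincides with the eval loss falling from 149.4047 to 1.6581. A minimal sketch of the updated configuration as `transformers.TrainingArguments`; the `output_dir` name is hypothetical, and `gradient_accumulation_steps=2` on an 8-process launch is an assumption inferred from total_train_batch_size = 128 (8 per device × 8 processes × 2) and total_eval_batch_size = 64 (8 × 8):

```python
from transformers import TrainingArguments

# Sketch of the training configuration after this commit.
# Assumption: 8 processes, so 8 per-device * 8 * 2 accumulation steps = 128
# total train batch; 8 per-device * 8 = 64 total eval batch.
args = TrainingArguments(
    output_dir="gemma7b-alpaca-lora",   # hypothetical name
    learning_rate=2e-4,                 # was 0.03 before this commit
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,      # inferred, not shown in the diff
    seed=42,
    optim="adamw_torch",                # AdamW, betas=(0.9, 0.999), eps=1e-8
    lr_scheduler_type="cosine",         # was "linear"
    warmup_ratio=0.1,                   # was 0.06
    num_train_epochs=1,
)
```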
all_results.json CHANGED
@@ -1,14 +1,9 @@
 {
     "epoch": 1.0,
-    "eval_loss": 149.4047088623047,
-    "eval_runtime": 2.8669,
-    "eval_samples": 518,
-    "eval_samples_per_second": 62.087,
-    "eval_steps_per_second": 1.046,
-    "total_flos": 4.2812236905630925e+17,
-    "train_loss": 249.82138957977295,
-    "train_runtime": 1603.7748,
+    "total_flos": 4.268849030789857e+17,
+    "train_loss": 5.955788305827549,
+    "train_runtime": 1733.7968,
     "train_samples": 51241,
-    "train_samples_per_second": 11.164,
-    "train_steps_per_second": 0.087
+    "train_samples_per_second": 10.326,
+    "train_steps_per_second": 0.081
 }
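
As a quick consistency check on the new numbers (which train_results.json below repeats), the throughput fields line up with 140 optimizer steps at the total batch size of 128 from the README diff. A small sketch, assuming the generator dataset is packed so the effective sample count is roughly steps × batch rather than the raw 51241:

```python
# Consistency check for the new all_results.json / train_results.json values.
steps, total_batch, runtime = 140, 128, 1733.7968

print(round(steps / runtime, 3))  # 0.081 -> matches train_steps_per_second
# samples/s is computed from the packed dataset length, so steps * batch is
# only an upper bound (the last batch is not full): ~10.34 vs logged 10.326.
print(steps * total_batch / runtime)
```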
runs/Nov16_09-08-18_main-lora-gemma7b-alpaca-0-0/events.out.tfevents.1731766677.main-lora-gemma7b-alpaca-0-0.459.0 CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6204c5fe75a8ba866469038f47376750537d637d555fdceb72f7df4437762ff1
-size 12689
+oid sha256:81d80ff5a57352be8e43e423954633ba553c054dca1139c35195a0bd83c46080
+size 13314
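
The event-log file is stored via Git LFS, so only the three-line pointer (spec version, object hash, byte size) lives in the repo and changes here. A minimal sketch parsing the updated pointer shown above:

```python
# Parse the key-value Git LFS pointer format (values from the diff above).
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:81d80ff5a57352be8e43e423954633ba553c054dca1139c35195a0bd83c46080
size 13314"""

fields = dict(line.split(" ", 1) for line in pointer.splitlines())
algo, digest = fields["oid"].split(":", 1)
print(algo, digest[:12], int(fields["size"]))  # sha256 81d80ff5a573 13314
```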
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 1.0,
-    "total_flos": 4.2812236905630925e+17,
-    "train_loss": 249.82138957977295,
-    "train_runtime": 1603.7748,
+    "total_flos": 4.268849030789857e+17,
+    "train_loss": 5.955788305827549,
+    "train_runtime": 1733.7968,
     "train_samples": 51241,
-    "train_samples_per_second": 11.164,
-    "train_steps_per_second": 0.087
+    "train_samples_per_second": 10.326,
+    "train_steps_per_second": 0.081
 }
trainer_state.json CHANGED
@@ -10,223 +10,223 @@
   "log_history": [
     {
       "epoch": 0.007142857142857143,
-      "grad_norm": 286.327392578125,
-      "learning_rate": 0.003333333333333333,
+      "grad_norm": 106.10701751708984,
+      "learning_rate": 1.4285714285714285e-05,
       "loss": 27.4831,
       "step": 1
     },
     {
       "epoch": 0.03571428571428571,
-      "grad_norm": 564.7811889648438,
-      "learning_rate": 0.016666666666666666,
-      "loss": 180.9505,
+      "grad_norm": 46.149723052978516,
+      "learning_rate": 7.142857142857143e-05,
+      "loss": 26.2992,
       "step": 5
     },
     {
       "epoch": 0.07142857142857142,
-      "grad_norm": 45641.8671875,
-      "learning_rate": 0.029770992366412213,
-      "loss": 193.5063,
+      "grad_norm": 16.458189010620117,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 20.4929,
       "step": 10
     },
     {
       "epoch": 0.10714285714285714,
-      "grad_norm": 1771723.5,
-      "learning_rate": 0.02862595419847328,
-      "loss": 376.766,
+      "grad_norm": 9.764805793762207,
+      "learning_rate": 0.00019996891820008164,
+      "loss": 16.0513,
       "step": 15
     },
     {
       "epoch": 0.14285714285714285,
-      "grad_norm": 286214.5,
-      "learning_rate": 0.02748091603053435,
-      "loss": 569.9451,
+      "grad_norm": 3.3296561241149902,
+      "learning_rate": 0.00019888308262251285,
+      "loss": 13.4526,
       "step": 20
     },
     {
       "epoch": 0.17857142857142858,
-      "grad_norm": 1100.7747802734375,
-      "learning_rate": 0.02633587786259542,
-      "loss": 615.8839,
+      "grad_norm": 3.235478162765503,
+      "learning_rate": 0.0001962624246950012,
+      "loss": 12.6172,
       "step": 25
     },
     {
       "epoch": 0.21428571428571427,
-      "grad_norm": 169022.0,
-      "learning_rate": 0.025190839694656485,
-      "loss": 481.0353,
+      "grad_norm": 5.432589530944824,
+      "learning_rate": 0.00019214762118704076,
+      "loss": 11.9808,
       "step": 30
     },
     {
       "epoch": 0.25,
-      "grad_norm": 333.40972900390625,
-      "learning_rate": 0.024045801526717557,
-      "loss": 305.1498,
+      "grad_norm": 10.644349098205566,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 11.0065,
       "step": 35
     },
     {
       "epoch": 0.2857142857142857,
-      "grad_norm": 139948.15625,
-      "learning_rate": 0.022900763358778626,
-      "loss": 241.1332,
+      "grad_norm": 14.282907485961914,
+      "learning_rate": 0.00017971325072229226,
+      "loss": 9.3131,
       "step": 40
     },
     {
       "epoch": 0.32142857142857145,
-      "grad_norm": 1393742.875,
-      "learning_rate": 0.021755725190839695,
-      "loss": 207.3882,
+      "grad_norm": 21.60712432861328,
+      "learning_rate": 0.00017158668492597186,
+      "loss": 7.1723,
       "step": 45
     },
     {
       "epoch": 0.35714285714285715,
-      "grad_norm": 12597.32421875,
-      "learning_rate": 0.020610687022900764,
-      "loss": 201.0974,
+      "grad_norm": 15.418853759765625,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 4.6599,
       "step": 50
     },
     {
       "epoch": 0.39285714285714285,
-      "grad_norm": 1081976.875,
-      "learning_rate": 0.01946564885496183,
-      "loss": 222.2186,
+      "grad_norm": 5.335650444030762,
+      "learning_rate": 0.0001521435203379498,
+      "loss": 2.855,
       "step": 55
     },
     {
       "epoch": 0.42857142857142855,
-      "grad_norm": 18104.56640625,
-      "learning_rate": 0.0183206106870229,
-      "loss": 254.4615,
+      "grad_norm": 4.913125038146973,
+      "learning_rate": 0.00014112871031306119,
+      "loss": 2.3938,
       "step": 60
     },
     {
       "epoch": 0.4642857142857143,
-      "grad_norm": 53967.2265625,
-      "learning_rate": 0.017175572519083967,
-      "loss": 256.7784,
+      "grad_norm": 3.951244592666626,
+      "learning_rate": 0.00012947551744109043,
+      "loss": 2.1646,
       "step": 65
     },
     {
       "epoch": 0.5,
-      "grad_norm": 36778.24609375,
-      "learning_rate": 0.01603053435114504,
-      "loss": 189.0749,
+      "grad_norm": 1.6933783292770386,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 2.0061,
       "step": 70
     },
     {
       "epoch": 0.5357142857142857,
-      "grad_norm": 1759.3746337890625,
-      "learning_rate": 0.014885496183206106,
-      "loss": 198.0302,
+      "grad_norm": 0.9344208836555481,
+      "learning_rate": 0.00010498458856606972,
+      "loss": 1.9092,
       "step": 75
     },
     {
       "epoch": 0.5714285714285714,
-      "grad_norm": 2888.434326171875,
-      "learning_rate": 0.013740458015267175,
-      "loss": 208.9426,
+      "grad_norm": 0.9437263011932373,
+      "learning_rate": 9.252699064135758e-05,
+      "loss": 1.8086,
       "step": 80
     },
     {
       "epoch": 0.6071428571428571,
-      "grad_norm": 3183.7255859375,
-      "learning_rate": 0.012595419847328242,
-      "loss": 237.6604,
+      "grad_norm": 1.5205684900283813,
+      "learning_rate": 8.018538568006027e-05,
+      "loss": 1.7838,
       "step": 85
     },
     {
       "epoch": 0.6428571428571429,
-      "grad_norm": 295.5591735839844,
-      "learning_rate": 0.011450381679389313,
-      "loss": 259.345,
+      "grad_norm": 0.8009554743766785,
+      "learning_rate": 6.815133497483157e-05,
+      "loss": 1.7433,
       "step": 90
     },
     {
       "epoch": 0.6785714285714286,
-      "grad_norm": 2817.36083984375,
-      "learning_rate": 0.010305343511450382,
-      "loss": 229.8393,
+      "grad_norm": 0.9393540620803833,
+      "learning_rate": 5.6611626088244194e-05,
+      "loss": 1.736,
       "step": 95
     },
     {
       "epoch": 0.7142857142857143,
-      "grad_norm": 1439.1002197265625,
-      "learning_rate": 0.00916030534351145,
-      "loss": 206.7085,
+      "grad_norm": 1.6479978561401367,
+      "learning_rate": 4.574537361342407e-05,
+      "loss": 1.6976,
       "step": 100
     },
     {
       "epoch": 0.75,
-      "grad_norm": 2063.854248046875,
-      "learning_rate": 0.00801526717557252,
-      "loss": 200.2123,
+      "grad_norm": 0.7829992771148682,
+      "learning_rate": 3.5721239031346066e-05,
+      "loss": 1.6793,
       "step": 105
     },
     {
       "epoch": 0.7857142857142857,
-      "grad_norm": 3193.01953125,
-      "learning_rate": 0.006870229007633588,
-      "loss": 195.9004,
+      "grad_norm": 0.7236832976341248,
+      "learning_rate": 2.669481281701739e-05,
+      "loss": 1.7,
       "step": 110
     },
     {
       "epoch": 0.8214285714285714,
-      "grad_norm": 1513.8978271484375,
-      "learning_rate": 0.0057251908396946565,
-      "loss": 188.8404,
+      "grad_norm": 0.7881686687469482,
+      "learning_rate": 1.880619942841435e-05,
+      "loss": 1.6778,
       "step": 115
     },
     {
       "epoch": 0.8571428571428571,
-      "grad_norm": 5677.81494140625,
-      "learning_rate": 0.004580152671755725,
-      "loss": 176.7594,
+      "grad_norm": 1.7755261659622192,
+      "learning_rate": 1.2177842662977135e-05,
+      "loss": 1.6757,
       "step": 120
     },
     {
       "epoch": 0.8928571428571429,
-      "grad_norm": 726.0751953125,
-      "learning_rate": 0.003435114503816794,
-      "loss": 167.6159,
+      "grad_norm": 1.0512515306472778,
+      "learning_rate": 6.9126251355795864e-06,
+      "loss": 1.6766,
       "step": 125
     },
     {
       "epoch": 0.9285714285714286,
-      "grad_norm": 1151.048095703125,
-      "learning_rate": 0.0022900763358778627,
-      "loss": 155.8075,
+      "grad_norm": 0.7070327997207642,
+      "learning_rate": 3.092271377092215e-06,
+      "loss": 1.67,
       "step": 130
     },
     {
       "epoch": 0.9642857142857143,
-      "grad_norm": 3488.1396484375,
-      "learning_rate": 0.0011450381679389313,
-      "loss": 155.3781,
+      "grad_norm": 0.7091158628463745,
+      "learning_rate": 7.760793399827937e-07,
+      "loss": 1.6459,
       "step": 135
     },
     {
       "epoch": 1.0,
-      "grad_norm": 718.7391357421875,
+      "grad_norm": 0.7374704480171204,
       "learning_rate": 0.0,
-      "loss": 149.2635,
+      "loss": 1.6561,
       "step": 140
     },
     {
       "epoch": 1.0,
-      "eval_loss": 149.4047088623047,
-      "eval_runtime": 2.8984,
-      "eval_samples_per_second": 61.413,
-      "eval_steps_per_second": 1.035,
+      "eval_loss": 1.6581236124038696,
+      "eval_runtime": 3.8242,
+      "eval_samples_per_second": 46.545,
+      "eval_steps_per_second": 0.784,
       "step": 140
     },
     {
       "epoch": 1.0,
       "step": 140,
-      "total_flos": 4.2812236905630925e+17,
-      "train_loss": 249.82138957977295,
-      "train_runtime": 1603.7748,
-      "train_samples_per_second": 11.164,
-      "train_steps_per_second": 0.087
+      "total_flos": 4.268849030789857e+17,
+      "train_loss": 5.955788305827549,
+      "train_runtime": 1733.7968,
+      "train_samples_per_second": 10.326,
+      "train_steps_per_second": 0.081
     }
   ],
   "logging_steps": 5,
@@ -246,7 +246,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.2812236905630925e+17,
+  "total_flos": 4.268849030789857e+17,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null