itsLeen committed on
Commit
d56c4dc
1 Parent(s): 7e26f12

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -16,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # finetuned-fake-food
18
 
19
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.6577
22
- - Accuracy: 0.6096
23
 
24
  ## Model description
25
 
 
3
  license: apache-2.0
4
  base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  metrics:
9
  - accuracy
 
17
 
18
  # finetuned-fake-food
19
 
20
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the indian_food_images dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.6574
23
+ - Accuracy: 0.6164
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.9523809523809523,
4
- "eval_loss": 0.1445566862821579,
5
- "eval_runtime": 1.5904,
6
- "eval_samples_per_second": 13.204,
7
- "eval_steps_per_second": 1.886,
8
- "total_flos": 1.8288109549043712e+17,
9
- "train_loss": 0.18390971183776855,
10
- "train_runtime": 151.0028,
11
- "train_samples_per_second": 15.629,
12
- "train_steps_per_second": 1.987
13
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6164383561643836,
4
+ "eval_loss": 0.6574238538742065,
5
+ "eval_runtime": 6.2004,
6
+ "eval_samples_per_second": 23.547,
7
+ "eval_steps_per_second": 3.064,
8
+ "total_flos": 6.400838342165299e+17,
9
+ "train_loss": 0.6831480086282725,
10
+ "train_runtime": 1157.4221,
11
+ "train_samples_per_second": 7.137,
12
+ "train_steps_per_second": 3.568
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "eval_accuracy": 0.9523809523809523,
4
- "eval_loss": 0.1445566862821579,
5
- "eval_runtime": 1.5904,
6
- "eval_samples_per_second": 13.204,
7
- "eval_steps_per_second": 1.886
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_accuracy": 0.6164383561643836,
4
+ "eval_loss": 0.6574238538742065,
5
+ "eval_runtime": 6.2004,
6
+ "eval_samples_per_second": 23.547,
7
+ "eval_steps_per_second": 3.064
8
  }
runs/Oct01_10-59-17_a59caee1d103/events.out.tfevents.1727781816.a59caee1d103.4704.8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a80a22aad7ade61395653bb941d35ce87de78d580c4adbd6d2feb81c7353e4e
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 1.8288109549043712e+17,
4
- "train_loss": 0.18390971183776855,
5
- "train_runtime": 151.0028,
6
- "train_samples_per_second": 15.629,
7
- "train_steps_per_second": 1.987
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "total_flos": 6.400838342165299e+17,
4
+ "train_loss": 0.6831480086282725,
5
+ "train_runtime": 1157.4221,
6
+ "train_samples_per_second": 7.137,
7
+ "train_steps_per_second": 3.568
8
  }
trainer_state.json CHANGED
@@ -1,75 +1,683 @@
1
  {
2
- "best_metric": 0.1445566862821579,
3
- "best_model_checkpoint": "finetuned-fake-food/checkpoint-200",
4
- "epoch": 20.0,
5
  "eval_steps": 100,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 6.666666666666667,
13
- "grad_norm": 0.19071011245250702,
14
- "learning_rate": 0.00013333333333333334,
15
- "loss": 0.328,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 6.666666666666667,
20
- "eval_accuracy": 0.8571428571428571,
21
- "eval_loss": 0.38541921973228455,
22
- "eval_runtime": 0.6081,
23
- "eval_samples_per_second": 34.535,
24
- "eval_steps_per_second": 4.934,
25
  "step": 100
26
  },
27
  {
28
- "epoch": 13.333333333333334,
29
- "grad_norm": 4.4858717918396,
30
- "learning_rate": 6.666666666666667e-05,
31
- "loss": 0.1729,
32
  "step": 200
33
  },
34
  {
35
- "epoch": 13.333333333333334,
36
- "eval_accuracy": 0.9523809523809523,
37
- "eval_loss": 0.1445566862821579,
38
- "eval_runtime": 0.6148,
39
- "eval_samples_per_second": 34.158,
40
- "eval_steps_per_second": 4.88,
41
  "step": 200
42
  },
43
  {
44
- "epoch": 20.0,
45
- "grad_norm": 0.0665096864104271,
46
- "learning_rate": 0.0,
47
- "loss": 0.0508,
48
  "step": 300
49
  },
50
  {
51
- "epoch": 20.0,
52
- "eval_accuracy": 0.9523809523809523,
53
- "eval_loss": 0.2076147198677063,
54
- "eval_runtime": 0.6275,
55
- "eval_samples_per_second": 33.468,
56
- "eval_steps_per_second": 4.781,
57
  "step": 300
58
  },
59
  {
60
- "epoch": 20.0,
61
- "step": 300,
62
- "total_flos": 1.8288109549043712e+17,
63
- "train_loss": 0.18390971183776855,
64
- "train_runtime": 151.0028,
65
- "train_samples_per_second": 15.629,
66
- "train_steps_per_second": 1.987
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  }
68
  ],
69
  "logging_steps": 100,
70
- "max_steps": 300,
71
  "num_input_tokens_seen": 0,
72
- "num_train_epochs": 20,
73
  "save_steps": 100,
74
  "stateful_callbacks": {
75
  "TrainerControl": {
@@ -83,8 +691,8 @@
83
  "attributes": {}
84
  }
85
  },
86
- "total_flos": 1.8288109549043712e+17,
87
- "train_batch_size": 8,
88
  "trial_name": null,
89
  "trial_params": null
90
  }
 
1
  {
2
+ "best_metric": 0.6574238538742065,
3
+ "best_model_checkpoint": "finetuned-fake-food/checkpoint-4000",
4
+ "epoch": 10.0,
5
  "eval_steps": 100,
6
+ "global_step": 4130,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.24213075060532688,
13
+ "grad_norm": 0.15737777948379517,
14
+ "learning_rate": 0.00019515738498789345,
15
+ "loss": 0.6977,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.24213075060532688,
20
+ "eval_accuracy": 0.5821917808219178,
21
+ "eval_loss": 0.6804767847061157,
22
+ "eval_runtime": 5.525,
23
+ "eval_samples_per_second": 26.425,
24
+ "eval_steps_per_second": 3.439,
25
  "step": 100
26
  },
27
  {
28
+ "epoch": 0.48426150121065376,
29
+ "grad_norm": 0.03558634594082832,
30
+ "learning_rate": 0.00019031476997578695,
31
+ "loss": 0.6956,
32
  "step": 200
33
  },
34
  {
35
+ "epoch": 0.48426150121065376,
36
+ "eval_accuracy": 0.4178082191780822,
37
+ "eval_loss": 0.6935968399047852,
38
+ "eval_runtime": 5.2465,
39
+ "eval_samples_per_second": 27.828,
40
+ "eval_steps_per_second": 3.621,
41
  "step": 200
42
  },
43
  {
44
+ "epoch": 0.7263922518159807,
45
+ "grad_norm": 1.5009288787841797,
46
+ "learning_rate": 0.0001854721549636804,
47
+ "loss": 0.6795,
48
  "step": 300
49
  },
50
  {
51
+ "epoch": 0.7263922518159807,
52
+ "eval_accuracy": 0.6506849315068494,
53
+ "eval_loss": 0.6734184622764587,
54
+ "eval_runtime": 5.9372,
55
+ "eval_samples_per_second": 24.591,
56
+ "eval_steps_per_second": 3.2,
57
  "step": 300
58
  },
59
  {
60
+ "epoch": 0.9685230024213075,
61
+ "grad_norm": 0.23967677354812622,
62
+ "learning_rate": 0.00018062953995157384,
63
+ "loss": 0.7061,
64
+ "step": 400
65
+ },
66
+ {
67
+ "epoch": 0.9685230024213075,
68
+ "eval_accuracy": 0.5821917808219178,
69
+ "eval_loss": 0.6760488152503967,
70
+ "eval_runtime": 5.7531,
71
+ "eval_samples_per_second": 25.378,
72
+ "eval_steps_per_second": 3.303,
73
+ "step": 400
74
+ },
75
+ {
76
+ "epoch": 1.2106537530266344,
77
+ "grad_norm": 2.081388235092163,
78
+ "learning_rate": 0.00017578692493946732,
79
+ "loss": 0.6941,
80
+ "step": 500
81
+ },
82
+ {
83
+ "epoch": 1.2106537530266344,
84
+ "eval_accuracy": 0.5821917808219178,
85
+ "eval_loss": 0.6746240854263306,
86
+ "eval_runtime": 5.812,
87
+ "eval_samples_per_second": 25.121,
88
+ "eval_steps_per_second": 3.269,
89
+ "step": 500
90
+ },
91
+ {
92
+ "epoch": 1.4527845036319613,
93
+ "grad_norm": 0.4285804331302643,
94
+ "learning_rate": 0.0001709443099273608,
95
+ "loss": 0.6898,
96
+ "step": 600
97
+ },
98
+ {
99
+ "epoch": 1.4527845036319613,
100
+ "eval_accuracy": 0.6027397260273972,
101
+ "eval_loss": 0.6674954891204834,
102
+ "eval_runtime": 6.1547,
103
+ "eval_samples_per_second": 23.722,
104
+ "eval_steps_per_second": 3.087,
105
+ "step": 600
106
+ },
107
+ {
108
+ "epoch": 1.694915254237288,
109
+ "grad_norm": 0.12052281200885773,
110
+ "learning_rate": 0.00016610169491525423,
111
+ "loss": 0.6956,
112
+ "step": 700
113
+ },
114
+ {
115
+ "epoch": 1.694915254237288,
116
+ "eval_accuracy": 0.5753424657534246,
117
+ "eval_loss": 0.684603750705719,
118
+ "eval_runtime": 6.0144,
119
+ "eval_samples_per_second": 24.275,
120
+ "eval_steps_per_second": 3.159,
121
+ "step": 700
122
+ },
123
+ {
124
+ "epoch": 1.937046004842615,
125
+ "grad_norm": 0.3585425913333893,
126
+ "learning_rate": 0.0001612590799031477,
127
+ "loss": 0.6847,
128
+ "step": 800
129
+ },
130
+ {
131
+ "epoch": 1.937046004842615,
132
+ "eval_accuracy": 0.5821917808219178,
133
+ "eval_loss": 0.6745873093605042,
134
+ "eval_runtime": 5.8427,
135
+ "eval_samples_per_second": 24.988,
136
+ "eval_steps_per_second": 3.252,
137
+ "step": 800
138
+ },
139
+ {
140
+ "epoch": 2.179176755447942,
141
+ "grad_norm": 1.2445541620254517,
142
+ "learning_rate": 0.00015641646489104115,
143
+ "loss": 0.6949,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 2.179176755447942,
148
+ "eval_accuracy": 0.589041095890411,
149
+ "eval_loss": 0.6779718399047852,
150
+ "eval_runtime": 4.8307,
151
+ "eval_samples_per_second": 30.223,
152
+ "eval_steps_per_second": 3.933,
153
+ "step": 900
154
+ },
155
+ {
156
+ "epoch": 2.4213075060532687,
157
+ "grad_norm": 1.429865837097168,
158
+ "learning_rate": 0.00015157384987893465,
159
+ "loss": 0.703,
160
+ "step": 1000
161
+ },
162
+ {
163
+ "epoch": 2.4213075060532687,
164
+ "eval_accuracy": 0.5753424657534246,
165
+ "eval_loss": 0.6894732117652893,
166
+ "eval_runtime": 5.0834,
167
+ "eval_samples_per_second": 28.721,
168
+ "eval_steps_per_second": 3.738,
169
+ "step": 1000
170
+ },
171
+ {
172
+ "epoch": 2.663438256658596,
173
+ "grad_norm": 1.2485073804855347,
174
+ "learning_rate": 0.0001467312348668281,
175
+ "loss": 0.6851,
176
+ "step": 1100
177
+ },
178
+ {
179
+ "epoch": 2.663438256658596,
180
+ "eval_accuracy": 0.5821917808219178,
181
+ "eval_loss": 0.6741558909416199,
182
+ "eval_runtime": 6.0858,
183
+ "eval_samples_per_second": 23.99,
184
+ "eval_steps_per_second": 3.122,
185
+ "step": 1100
186
+ },
187
+ {
188
+ "epoch": 2.9055690072639226,
189
+ "grad_norm": 0.1997382789850235,
190
+ "learning_rate": 0.00014188861985472154,
191
+ "loss": 0.6878,
192
+ "step": 1200
193
+ },
194
+ {
195
+ "epoch": 2.9055690072639226,
196
+ "eval_accuracy": 0.6301369863013698,
197
+ "eval_loss": 0.674239456653595,
198
+ "eval_runtime": 6.1499,
199
+ "eval_samples_per_second": 23.74,
200
+ "eval_steps_per_second": 3.089,
201
+ "step": 1200
202
+ },
203
+ {
204
+ "epoch": 3.1476997578692494,
205
+ "grad_norm": 0.4324168860912323,
206
+ "learning_rate": 0.00013704600484261504,
207
+ "loss": 0.68,
208
+ "step": 1300
209
+ },
210
+ {
211
+ "epoch": 3.1476997578692494,
212
+ "eval_accuracy": 0.5821917808219178,
213
+ "eval_loss": 0.6712663173675537,
214
+ "eval_runtime": 6.0266,
215
+ "eval_samples_per_second": 24.226,
216
+ "eval_steps_per_second": 3.153,
217
+ "step": 1300
218
+ },
219
+ {
220
+ "epoch": 3.389830508474576,
221
+ "grad_norm": 0.9948041439056396,
222
+ "learning_rate": 0.00013220338983050849,
223
+ "loss": 0.6728,
224
+ "step": 1400
225
+ },
226
+ {
227
+ "epoch": 3.389830508474576,
228
+ "eval_accuracy": 0.5958904109589042,
229
+ "eval_loss": 0.6838211417198181,
230
+ "eval_runtime": 6.1631,
231
+ "eval_samples_per_second": 23.689,
232
+ "eval_steps_per_second": 3.083,
233
+ "step": 1400
234
+ },
235
+ {
236
+ "epoch": 3.6319612590799033,
237
+ "grad_norm": 1.2490299940109253,
238
+ "learning_rate": 0.00012736077481840193,
239
+ "loss": 0.698,
240
+ "step": 1500
241
+ },
242
+ {
243
+ "epoch": 3.6319612590799033,
244
+ "eval_accuracy": 0.5821917808219178,
245
+ "eval_loss": 0.6774668097496033,
246
+ "eval_runtime": 6.0857,
247
+ "eval_samples_per_second": 23.991,
248
+ "eval_steps_per_second": 3.122,
249
+ "step": 1500
250
+ },
251
+ {
252
+ "epoch": 3.87409200968523,
253
+ "grad_norm": 0.2908919155597687,
254
+ "learning_rate": 0.0001225181598062954,
255
+ "loss": 0.7033,
256
+ "step": 1600
257
+ },
258
+ {
259
+ "epoch": 3.87409200968523,
260
+ "eval_accuracy": 0.5821917808219178,
261
+ "eval_loss": 0.6734635829925537,
262
+ "eval_runtime": 5.783,
263
+ "eval_samples_per_second": 25.247,
264
+ "eval_steps_per_second": 3.286,
265
+ "step": 1600
266
+ },
267
+ {
268
+ "epoch": 4.116222760290557,
269
+ "grad_norm": 0.21098549664020538,
270
+ "learning_rate": 0.00011767554479418887,
271
+ "loss": 0.6973,
272
+ "step": 1700
273
+ },
274
+ {
275
+ "epoch": 4.116222760290557,
276
+ "eval_accuracy": 0.6232876712328768,
277
+ "eval_loss": 0.6803831458091736,
278
+ "eval_runtime": 4.6265,
279
+ "eval_samples_per_second": 31.557,
280
+ "eval_steps_per_second": 4.107,
281
+ "step": 1700
282
+ },
283
+ {
284
+ "epoch": 4.358353510895884,
285
+ "grad_norm": 0.03869936615228653,
286
+ "learning_rate": 0.00011283292978208233,
287
+ "loss": 0.6822,
288
+ "step": 1800
289
+ },
290
+ {
291
+ "epoch": 4.358353510895884,
292
+ "eval_accuracy": 0.6027397260273972,
293
+ "eval_loss": 0.6847726702690125,
294
+ "eval_runtime": 4.6717,
295
+ "eval_samples_per_second": 31.252,
296
+ "eval_steps_per_second": 4.067,
297
+ "step": 1800
298
+ },
299
+ {
300
+ "epoch": 4.600484261501211,
301
+ "grad_norm": 0.13196176290512085,
302
+ "learning_rate": 0.00010799031476997579,
303
+ "loss": 0.6896,
304
+ "step": 1900
305
+ },
306
+ {
307
+ "epoch": 4.600484261501211,
308
+ "eval_accuracy": 0.541095890410959,
309
+ "eval_loss": 0.6835151314735413,
310
+ "eval_runtime": 5.191,
311
+ "eval_samples_per_second": 28.126,
312
+ "eval_steps_per_second": 3.66,
313
+ "step": 1900
314
+ },
315
+ {
316
+ "epoch": 4.842615012106537,
317
+ "grad_norm": 0.5055987238883972,
318
+ "learning_rate": 0.00010314769975786926,
319
+ "loss": 0.6772,
320
+ "step": 2000
321
+ },
322
+ {
323
+ "epoch": 4.842615012106537,
324
+ "eval_accuracy": 0.6095890410958904,
325
+ "eval_loss": 0.6753013134002686,
326
+ "eval_runtime": 6.1231,
327
+ "eval_samples_per_second": 23.844,
328
+ "eval_steps_per_second": 3.103,
329
+ "step": 2000
330
+ },
331
+ {
332
+ "epoch": 5.084745762711864,
333
+ "grad_norm": 0.4209335148334503,
334
+ "learning_rate": 9.835351089588378e-05,
335
+ "loss": 0.6843,
336
+ "step": 2100
337
+ },
338
+ {
339
+ "epoch": 5.084745762711864,
340
+ "eval_accuracy": 0.589041095890411,
341
+ "eval_loss": 0.6667279601097107,
342
+ "eval_runtime": 4.6413,
343
+ "eval_samples_per_second": 31.457,
344
+ "eval_steps_per_second": 4.094,
345
+ "step": 2100
346
+ },
347
+ {
348
+ "epoch": 5.326876513317191,
349
+ "grad_norm": 1.4965670108795166,
350
+ "learning_rate": 9.351089588377724e-05,
351
+ "loss": 0.6898,
352
+ "step": 2200
353
+ },
354
+ {
355
+ "epoch": 5.326876513317191,
356
+ "eval_accuracy": 0.5821917808219178,
357
+ "eval_loss": 0.6725605726242065,
358
+ "eval_runtime": 6.034,
359
+ "eval_samples_per_second": 24.196,
360
+ "eval_steps_per_second": 3.149,
361
+ "step": 2200
362
+ },
363
+ {
364
+ "epoch": 5.5690072639225185,
365
+ "grad_norm": 0.4163062572479248,
366
+ "learning_rate": 8.86682808716707e-05,
367
+ "loss": 0.6868,
368
+ "step": 2300
369
+ },
370
+ {
371
+ "epoch": 5.5690072639225185,
372
+ "eval_accuracy": 0.5616438356164384,
373
+ "eval_loss": 0.6784049272537231,
374
+ "eval_runtime": 6.0533,
375
+ "eval_samples_per_second": 24.119,
376
+ "eval_steps_per_second": 3.139,
377
+ "step": 2300
378
+ },
379
+ {
380
+ "epoch": 5.811138014527845,
381
+ "grad_norm": 1.2287280559539795,
382
+ "learning_rate": 8.382566585956417e-05,
383
+ "loss": 0.6636,
384
+ "step": 2400
385
+ },
386
+ {
387
+ "epoch": 5.811138014527845,
388
+ "eval_accuracy": 0.6301369863013698,
389
+ "eval_loss": 0.6639688611030579,
390
+ "eval_runtime": 5.3104,
391
+ "eval_samples_per_second": 27.493,
392
+ "eval_steps_per_second": 3.578,
393
+ "step": 2400
394
+ },
395
+ {
396
+ "epoch": 6.053268765133172,
397
+ "grad_norm": 0.8932170867919922,
398
+ "learning_rate": 7.898305084745763e-05,
399
+ "loss": 0.6833,
400
+ "step": 2500
401
+ },
402
+ {
403
+ "epoch": 6.053268765133172,
404
+ "eval_accuracy": 0.5136986301369864,
405
+ "eval_loss": 0.676824688911438,
406
+ "eval_runtime": 4.6631,
407
+ "eval_samples_per_second": 31.309,
408
+ "eval_steps_per_second": 4.074,
409
+ "step": 2500
410
+ },
411
+ {
412
+ "epoch": 6.295399515738499,
413
+ "grad_norm": 1.1837154626846313,
414
+ "learning_rate": 7.414043583535109e-05,
415
+ "loss": 0.678,
416
+ "step": 2600
417
+ },
418
+ {
419
+ "epoch": 6.295399515738499,
420
+ "eval_accuracy": 0.6232876712328768,
421
+ "eval_loss": 0.6652230024337769,
422
+ "eval_runtime": 4.933,
423
+ "eval_samples_per_second": 29.597,
424
+ "eval_steps_per_second": 3.852,
425
+ "step": 2600
426
+ },
427
+ {
428
+ "epoch": 6.5375302663438255,
429
+ "grad_norm": 1.4030615091323853,
430
+ "learning_rate": 6.929782082324455e-05,
431
+ "loss": 0.6672,
432
+ "step": 2700
433
+ },
434
+ {
435
+ "epoch": 6.5375302663438255,
436
+ "eval_accuracy": 0.547945205479452,
437
+ "eval_loss": 0.6735221147537231,
438
+ "eval_runtime": 6.0106,
439
+ "eval_samples_per_second": 24.29,
440
+ "eval_steps_per_second": 3.161,
441
+ "step": 2700
442
+ },
443
+ {
444
+ "epoch": 6.779661016949152,
445
+ "grad_norm": 0.6782599687576294,
446
+ "learning_rate": 6.445520581113802e-05,
447
+ "loss": 0.6975,
448
+ "step": 2800
449
+ },
450
+ {
451
+ "epoch": 6.779661016949152,
452
+ "eval_accuracy": 0.589041095890411,
453
+ "eval_loss": 0.6686810851097107,
454
+ "eval_runtime": 4.7582,
455
+ "eval_samples_per_second": 30.684,
456
+ "eval_steps_per_second": 3.993,
457
+ "step": 2800
458
+ },
459
+ {
460
+ "epoch": 7.021791767554479,
461
+ "grad_norm": 0.4288092255592346,
462
+ "learning_rate": 5.961259079903147e-05,
463
+ "loss": 0.6858,
464
+ "step": 2900
465
+ },
466
+ {
467
+ "epoch": 7.021791767554479,
468
+ "eval_accuracy": 0.6027397260273972,
469
+ "eval_loss": 0.6672346591949463,
470
+ "eval_runtime": 4.7612,
471
+ "eval_samples_per_second": 30.665,
472
+ "eval_steps_per_second": 3.991,
473
+ "step": 2900
474
+ },
475
+ {
476
+ "epoch": 7.263922518159807,
477
+ "grad_norm": 1.373633861541748,
478
+ "learning_rate": 5.4769975786924946e-05,
479
+ "loss": 0.6687,
480
+ "step": 3000
481
+ },
482
+ {
483
+ "epoch": 7.263922518159807,
484
+ "eval_accuracy": 0.5753424657534246,
485
+ "eval_loss": 0.6647915840148926,
486
+ "eval_runtime": 4.681,
487
+ "eval_samples_per_second": 31.19,
488
+ "eval_steps_per_second": 4.059,
489
+ "step": 3000
490
+ },
491
+ {
492
+ "epoch": 7.506053268765133,
493
+ "grad_norm": 0.4883480668067932,
494
+ "learning_rate": 4.9927360774818404e-05,
495
+ "loss": 0.6636,
496
+ "step": 3100
497
+ },
498
+ {
499
+ "epoch": 7.506053268765133,
500
+ "eval_accuracy": 0.5684931506849316,
501
+ "eval_loss": 0.6673935055732727,
502
+ "eval_runtime": 4.6832,
503
+ "eval_samples_per_second": 31.175,
504
+ "eval_steps_per_second": 4.057,
505
+ "step": 3100
506
+ },
507
+ {
508
+ "epoch": 7.74818401937046,
509
+ "grad_norm": 0.2553524672985077,
510
+ "learning_rate": 4.508474576271187e-05,
511
+ "loss": 0.6904,
512
+ "step": 3200
513
+ },
514
+ {
515
+ "epoch": 7.74818401937046,
516
+ "eval_accuracy": 0.5342465753424658,
517
+ "eval_loss": 0.6751775741577148,
518
+ "eval_runtime": 4.7128,
519
+ "eval_samples_per_second": 30.979,
520
+ "eval_steps_per_second": 4.032,
521
+ "step": 3200
522
+ },
523
+ {
524
+ "epoch": 7.990314769975787,
525
+ "grad_norm": 0.5203524827957153,
526
+ "learning_rate": 4.024213075060533e-05,
527
+ "loss": 0.6585,
528
+ "step": 3300
529
+ },
530
+ {
531
+ "epoch": 7.990314769975787,
532
+ "eval_accuracy": 0.5958904109589042,
533
+ "eval_loss": 0.7023173570632935,
534
+ "eval_runtime": 5.9291,
535
+ "eval_samples_per_second": 24.624,
536
+ "eval_steps_per_second": 3.205,
537
+ "step": 3300
538
+ },
539
+ {
540
+ "epoch": 8.232445520581114,
541
+ "grad_norm": 1.3221914768218994,
542
+ "learning_rate": 3.539951573849879e-05,
543
+ "loss": 0.6874,
544
+ "step": 3400
545
+ },
546
+ {
547
+ "epoch": 8.232445520581114,
548
+ "eval_accuracy": 0.5753424657534246,
549
+ "eval_loss": 0.6615224480628967,
550
+ "eval_runtime": 5.9877,
551
+ "eval_samples_per_second": 24.383,
552
+ "eval_steps_per_second": 3.173,
553
+ "step": 3400
554
+ },
555
+ {
556
+ "epoch": 8.474576271186441,
557
+ "grad_norm": 0.6332941651344299,
558
+ "learning_rate": 3.055690072639225e-05,
559
+ "loss": 0.6444,
560
+ "step": 3500
561
+ },
562
+ {
563
+ "epoch": 8.474576271186441,
564
+ "eval_accuracy": 0.5205479452054794,
565
+ "eval_loss": 0.772119402885437,
566
+ "eval_runtime": 5.8976,
567
+ "eval_samples_per_second": 24.756,
568
+ "eval_steps_per_second": 3.222,
569
+ "step": 3500
570
+ },
571
+ {
572
+ "epoch": 8.716707021791768,
573
+ "grad_norm": 1.8709771633148193,
574
+ "learning_rate": 2.5714285714285714e-05,
575
+ "loss": 0.6803,
576
+ "step": 3600
577
+ },
578
+ {
579
+ "epoch": 8.716707021791768,
580
+ "eval_accuracy": 0.5821917808219178,
581
+ "eval_loss": 0.6808822751045227,
582
+ "eval_runtime": 4.743,
583
+ "eval_samples_per_second": 30.782,
584
+ "eval_steps_per_second": 4.006,
585
+ "step": 3600
586
+ },
587
+ {
588
+ "epoch": 8.958837772397095,
589
+ "grad_norm": 1.1251460313796997,
590
+ "learning_rate": 2.0871670702179177e-05,
591
+ "loss": 0.6782,
592
+ "step": 3700
593
+ },
594
+ {
595
+ "epoch": 8.958837772397095,
596
+ "eval_accuracy": 0.5821917808219178,
597
+ "eval_loss": 0.6638409495353699,
598
+ "eval_runtime": 5.3184,
599
+ "eval_samples_per_second": 27.452,
600
+ "eval_steps_per_second": 3.573,
601
+ "step": 3700
602
+ },
603
+ {
604
+ "epoch": 9.200968523002421,
605
+ "grad_norm": 0.5918538570404053,
606
+ "learning_rate": 1.602905569007264e-05,
607
+ "loss": 0.6536,
608
+ "step": 3800
609
+ },
610
+ {
611
+ "epoch": 9.200968523002421,
612
+ "eval_accuracy": 0.6232876712328768,
613
+ "eval_loss": 0.6606671214103699,
614
+ "eval_runtime": 4.6821,
615
+ "eval_samples_per_second": 31.183,
616
+ "eval_steps_per_second": 4.058,
617
+ "step": 3800
618
+ },
619
+ {
620
+ "epoch": 9.443099273607748,
621
+ "grad_norm": 1.1931524276733398,
622
+ "learning_rate": 1.1186440677966102e-05,
623
+ "loss": 0.6188,
624
+ "step": 3900
625
+ },
626
+ {
627
+ "epoch": 9.443099273607748,
628
+ "eval_accuracy": 0.5684931506849316,
629
+ "eval_loss": 0.7090215682983398,
630
+ "eval_runtime": 4.6915,
631
+ "eval_samples_per_second": 31.12,
632
+ "eval_steps_per_second": 4.05,
633
+ "step": 3900
634
+ },
635
+ {
636
+ "epoch": 9.685230024213075,
637
+ "grad_norm": 2.177264928817749,
638
+ "learning_rate": 6.3438256658595635e-06,
639
+ "loss": 0.7026,
640
+ "step": 4000
641
+ },
642
+ {
643
+ "epoch": 9.685230024213075,
644
+ "eval_accuracy": 0.6164383561643836,
645
+ "eval_loss": 0.6574238538742065,
646
+ "eval_runtime": 4.7502,
647
+ "eval_samples_per_second": 30.736,
648
+ "eval_steps_per_second": 4.0,
649
+ "step": 4000
650
+ },
651
+ {
652
+ "epoch": 9.927360774818402,
653
+ "grad_norm": 0.5541784167289734,
654
+ "learning_rate": 1.549636803874092e-06,
655
+ "loss": 0.7008,
656
+ "step": 4100
657
+ },
658
+ {
659
+ "epoch": 9.927360774818402,
660
+ "eval_accuracy": 0.6095890410958904,
661
+ "eval_loss": 0.6576805710792542,
662
+ "eval_runtime": 4.7337,
663
+ "eval_samples_per_second": 30.843,
664
+ "eval_steps_per_second": 4.014,
665
+ "step": 4100
666
+ },
667
+ {
668
+ "epoch": 10.0,
669
+ "step": 4130,
670
+ "total_flos": 6.400838342165299e+17,
671
+ "train_loss": 0.6831480086282725,
672
+ "train_runtime": 1157.4221,
673
+ "train_samples_per_second": 7.137,
674
+ "train_steps_per_second": 3.568
675
  }
676
  ],
677
  "logging_steps": 100,
678
+ "max_steps": 4130,
679
  "num_input_tokens_seen": 0,
680
+ "num_train_epochs": 10,
681
  "save_steps": 100,
682
  "stateful_callbacks": {
683
  "TrainerControl": {
 
691
  "attributes": {}
692
  }
693
  },
694
+ "total_flos": 6.400838342165299e+17,
695
+ "train_batch_size": 2,
696
  "trial_name": null,
697
  "trial_params": null
698
  }