josesantorcuato commited on
Commit
6556e73
1 Parent(s): f029b5d

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.184905660377359,
3
- "eval_accuracy": 0.7978142076502732,
4
- "eval_loss": 0.5372045040130615,
5
- "eval_runtime": 17.466,
6
- "eval_samples_per_second": 10.478,
7
- "eval_steps_per_second": 1.317
8
  }
 
1
  {
2
+ "epoch": 9.080434782608696,
3
+ "eval_accuracy": 0.9617834394904459,
4
+ "eval_loss": 0.1620456427335739,
5
+ "eval_runtime": 14.8873,
6
+ "eval_samples_per_second": 10.546,
7
+ "eval_steps_per_second": 1.343
8
  }
runs/Oct29_20-22-21_7c56bb07786b/events.out.tfevents.1730234240.7c56bb07786b.46717.1 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fb4f6437df2b77386f3957338c87e1e8b3ba01150cccd948a8369e14e4689d2
3
- size 411
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06b3bd51f784cc2ef1692ab398c23c7cf1ec89f936afa92ce70bc1ffcdcf7474
3
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.184905660377359,
3
- "eval_accuracy": 0.7978142076502732,
4
- "eval_loss": 0.5372045040130615,
5
- "eval_runtime": 17.466,
6
- "eval_samples_per_second": 10.478,
7
- "eval_steps_per_second": 1.317
8
  }
 
1
  {
2
+ "epoch": 9.080434782608696,
3
+ "eval_accuracy": 0.9617834394904459,
4
+ "eval_loss": 0.1620456427335739,
5
+ "eval_runtime": 14.8873,
6
+ "eval_samples_per_second": 10.546,
7
+ "eval_steps_per_second": 1.343
8
  }
trainer_state.json CHANGED
@@ -1,270 +1,455 @@
1
  {
2
- "best_metric": 0.8452380952380952,
3
- "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-265",
4
- "epoch": 4.184905660377359,
5
  "eval_steps": 500,
6
- "global_step": 265,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03773584905660377,
13
- "grad_norm": 8.033324241638184,
14
- "learning_rate": 1.8518518518518518e-05,
15
- "loss": 2.7609,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.07547169811320754,
20
- "grad_norm": 7.882028102874756,
21
- "learning_rate": 3.7037037037037037e-05,
22
- "loss": 2.6513,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.11320754716981132,
27
- "grad_norm": 8.582988739013672,
28
- "learning_rate": 4.936974789915967e-05,
29
- "loss": 2.5533,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.1509433962264151,
34
- "grad_norm": 9.038335800170898,
35
- "learning_rate": 4.726890756302521e-05,
36
- "loss": 2.3765,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.18867924528301888,
41
- "grad_norm": 10.162276268005371,
42
- "learning_rate": 4.516806722689076e-05,
43
- "loss": 2.2682,
44
- "step": 50
 
 
45
  },
46
  {
47
- "epoch": 0.2037735849056604,
48
- "eval_accuracy": 0.32142857142857145,
49
- "eval_loss": 2.0374372005462646,
50
- "eval_runtime": 21.9106,
51
- "eval_samples_per_second": 3.834,
52
- "eval_steps_per_second": 0.502,
53
- "step": 54
54
  },
55
  {
56
- "epoch": 1.0226415094339623,
57
- "grad_norm": 8.538351058959961,
58
- "learning_rate": 4.3067226890756305e-05,
59
- "loss": 1.9532,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.060377358490566,
64
- "grad_norm": 8.961767196655273,
65
- "learning_rate": 4.096638655462185e-05,
66
- "loss": 1.5809,
67
  "step": 70
68
  },
69
  {
70
- "epoch": 1.0981132075471698,
71
- "grad_norm": 8.71930980682373,
72
- "learning_rate": 3.88655462184874e-05,
73
- "loss": 1.4653,
74
  "step": 80
75
  },
76
  {
77
- "epoch": 1.1358490566037736,
78
- "grad_norm": 11.393306732177734,
79
- "learning_rate": 3.6764705882352945e-05,
80
- "loss": 1.3126,
81
  "step": 90
82
  },
83
  {
84
- "epoch": 1.1735849056603773,
85
- "grad_norm": 8.893975257873535,
86
- "learning_rate": 3.466386554621849e-05,
87
- "loss": 1.0997,
88
- "step": 100
 
 
89
  },
90
  {
91
- "epoch": 1.2037735849056603,
92
- "eval_accuracy": 0.6666666666666666,
93
- "eval_loss": 1.0029432773590088,
94
- "eval_runtime": 7.8357,
95
- "eval_samples_per_second": 10.72,
96
- "eval_steps_per_second": 1.404,
97
- "step": 108
98
  },
99
  {
100
- "epoch": 2.0075471698113208,
101
- "grad_norm": 8.756331443786621,
102
- "learning_rate": 3.2563025210084034e-05,
103
- "loss": 1.0144,
104
  "step": 110
105
  },
106
  {
107
- "epoch": 2.0452830188679245,
108
- "grad_norm": 15.8713960647583,
109
- "learning_rate": 3.0462184873949578e-05,
110
- "loss": 0.9754,
111
  "step": 120
112
  },
113
  {
114
- "epoch": 2.0830188679245283,
115
- "grad_norm": 25.470157623291016,
116
- "learning_rate": 2.8361344537815126e-05,
117
- "loss": 1.1946,
118
  "step": 130
119
  },
120
  {
121
- "epoch": 2.120754716981132,
122
- "grad_norm": 6.735860347747803,
123
- "learning_rate": 2.6260504201680674e-05,
124
- "loss": 0.782,
125
  "step": 140
126
  },
127
  {
128
- "epoch": 2.158490566037736,
129
- "grad_norm": 8.800786018371582,
130
- "learning_rate": 2.415966386554622e-05,
131
- "loss": 0.8976,
132
- "step": 150
 
 
133
  },
134
  {
135
- "epoch": 2.1962264150943396,
136
- "grad_norm": 8.588472366333008,
137
- "learning_rate": 2.2058823529411766e-05,
138
- "loss": 0.6996,
139
- "step": 160
140
  },
141
  {
142
- "epoch": 2.2037735849056603,
143
- "eval_accuracy": 0.7857142857142857,
144
- "eval_loss": 0.7632536292076111,
145
- "eval_runtime": 8.3664,
146
- "eval_samples_per_second": 10.04,
147
- "eval_steps_per_second": 1.315,
148
- "step": 162
149
  },
150
  {
151
- "epoch": 3.030188679245283,
152
- "grad_norm": 7.108087539672852,
153
- "learning_rate": 1.9957983193277314e-05,
154
- "loss": 0.8053,
155
  "step": 170
156
  },
157
  {
158
- "epoch": 3.0679245283018868,
159
- "grad_norm": 8.610198974609375,
160
- "learning_rate": 1.785714285714286e-05,
161
- "loss": 0.6667,
162
  "step": 180
163
  },
164
  {
165
- "epoch": 3.1056603773584905,
166
- "grad_norm": 11.434289932250977,
167
- "learning_rate": 1.5756302521008403e-05,
168
- "loss": 0.6006,
 
 
 
 
 
 
 
 
 
169
  "step": 190
170
  },
171
  {
172
- "epoch": 3.1433962264150943,
173
- "grad_norm": 4.931293487548828,
174
- "learning_rate": 1.3655462184873949e-05,
175
- "loss": 0.635,
176
  "step": 200
177
  },
178
  {
179
- "epoch": 3.181132075471698,
180
- "grad_norm": 6.236601829528809,
181
- "learning_rate": 1.1554621848739497e-05,
182
- "loss": 0.7031,
183
  "step": 210
184
  },
185
  {
186
- "epoch": 3.2037735849056603,
187
- "eval_accuracy": 0.7857142857142857,
188
- "eval_loss": 0.5939908027648926,
189
- "eval_runtime": 7.8516,
190
- "eval_samples_per_second": 10.698,
191
- "eval_steps_per_second": 1.401,
192
- "step": 216
193
- },
194
- {
195
- "epoch": 4.0150943396226415,
196
- "grad_norm": 5.458017349243164,
197
- "learning_rate": 9.453781512605041e-06,
198
- "loss": 0.5998,
199
  "step": 220
200
  },
201
  {
202
- "epoch": 4.052830188679246,
203
- "grad_norm": 4.542972087860107,
204
- "learning_rate": 7.3529411764705884e-06,
205
- "loss": 0.5022,
206
  "step": 230
207
  },
208
  {
209
- "epoch": 4.090566037735849,
210
- "grad_norm": 17.187719345092773,
211
- "learning_rate": 5.252100840336135e-06,
212
- "loss": 0.4509,
 
 
 
 
 
 
 
 
 
213
  "step": 240
214
  },
215
  {
216
- "epoch": 4.128301886792453,
217
- "grad_norm": 18.115018844604492,
218
- "learning_rate": 3.1512605042016808e-06,
219
- "loss": 0.5752,
220
  "step": 250
221
  },
222
  {
223
- "epoch": 4.166037735849057,
224
- "grad_norm": 6.622856616973877,
225
- "learning_rate": 1.0504201680672271e-06,
226
- "loss": 0.4078,
227
  "step": 260
228
  },
229
  {
230
- "epoch": 4.184905660377359,
231
- "eval_accuracy": 0.8452380952380952,
232
- "eval_loss": 0.5297083854675293,
233
- "eval_runtime": 8.7988,
234
- "eval_samples_per_second": 9.547,
235
- "eval_steps_per_second": 1.25,
236
- "step": 265
237
- },
238
- {
239
- "epoch": 4.184905660377359,
240
- "step": 265,
241
- "total_flos": 2.631970050168324e+18,
242
- "train_loss": 1.195451885799192,
243
- "train_runtime": 648.8571,
244
- "train_samples_per_second": 3.267,
245
- "train_steps_per_second": 0.408
246
- },
247
- {
248
- "epoch": 4.184905660377359,
249
- "eval_accuracy": 0.7978142076502732,
250
- "eval_loss": 0.5372046828269958,
251
- "eval_runtime": 49.961,
252
- "eval_samples_per_second": 3.663,
253
- "eval_steps_per_second": 0.46,
254
- "step": 265
255
- },
256
- {
257
- "epoch": 4.184905660377359,
258
- "eval_accuracy": 0.7978142076502732,
259
- "eval_loss": 0.5372045040130615,
260
- "eval_runtime": 17.466,
261
- "eval_samples_per_second": 10.478,
262
- "eval_steps_per_second": 1.317,
263
- "step": 265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  }
265
  ],
266
  "logging_steps": 10,
267
- "max_steps": 265,
268
  "num_input_tokens_seen": 0,
269
  "num_train_epochs": 9223372036854775807,
270
  "save_steps": 500,
@@ -280,7 +465,7 @@
280
  "attributes": {}
281
  }
282
  },
283
- "total_flos": 2.631970050168324e+18,
284
  "train_batch_size": 8,
285
  "trial_name": null,
286
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9848484848484849,
3
+ "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset/checkpoint-329",
4
+ "epoch": 9.080434782608696,
5
  "eval_steps": 500,
6
+ "global_step": 460,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.021739130434782608,
13
+ "grad_norm": 9.379189491271973,
14
+ "learning_rate": 1.0869565217391305e-05,
15
+ "loss": 2.5308,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.043478260869565216,
20
+ "grad_norm": 10.402758598327637,
21
+ "learning_rate": 2.173913043478261e-05,
22
+ "loss": 2.4704,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.06521739130434782,
27
+ "grad_norm": 8.703824043273926,
28
+ "learning_rate": 3.260869565217392e-05,
29
+ "loss": 2.4447,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.08695652173913043,
34
+ "grad_norm": 10.318586349487305,
35
+ "learning_rate": 4.347826086956522e-05,
36
+ "loss": 2.2403,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.10217391304347827,
41
+ "eval_accuracy": 0.13636363636363635,
42
+ "eval_loss": 2.2386703491210938,
43
+ "eval_runtime": 6.4992,
44
+ "eval_samples_per_second": 10.155,
45
+ "eval_steps_per_second": 1.385,
46
+ "step": 47
47
  },
48
  {
49
+ "epoch": 1.0065217391304349,
50
+ "grad_norm": 8.518580436706543,
51
+ "learning_rate": 4.9516908212560386e-05,
52
+ "loss": 2.215,
53
+ "step": 50
 
 
54
  },
55
  {
56
+ "epoch": 1.0282608695652173,
57
+ "grad_norm": 11.493647575378418,
58
+ "learning_rate": 4.830917874396135e-05,
59
+ "loss": 1.8724,
60
  "step": 60
61
  },
62
  {
63
+ "epoch": 1.05,
64
+ "grad_norm": 9.734604835510254,
65
+ "learning_rate": 4.710144927536232e-05,
66
+ "loss": 1.7272,
67
  "step": 70
68
  },
69
  {
70
+ "epoch": 1.0717391304347825,
71
+ "grad_norm": 9.319704055786133,
72
+ "learning_rate": 4.589371980676328e-05,
73
+ "loss": 1.5902,
74
  "step": 80
75
  },
76
  {
77
+ "epoch": 1.0934782608695652,
78
+ "grad_norm": 7.253905773162842,
79
+ "learning_rate": 4.4685990338164255e-05,
80
+ "loss": 1.2918,
81
  "step": 90
82
  },
83
  {
84
+ "epoch": 1.1021739130434782,
85
+ "eval_accuracy": 0.5151515151515151,
86
+ "eval_loss": 1.3612327575683594,
87
+ "eval_runtime": 6.203,
88
+ "eval_samples_per_second": 10.64,
89
+ "eval_steps_per_second": 1.451,
90
+ "step": 94
91
  },
92
  {
93
+ "epoch": 2.0130434782608697,
94
+ "grad_norm": 9.277091026306152,
95
+ "learning_rate": 4.347826086956522e-05,
96
+ "loss": 1.1276,
97
+ "step": 100
 
 
98
  },
99
  {
100
+ "epoch": 2.034782608695652,
101
+ "grad_norm": 6.776515960693359,
102
+ "learning_rate": 4.2270531400966186e-05,
103
+ "loss": 0.8852,
104
  "step": 110
105
  },
106
  {
107
+ "epoch": 2.0565217391304347,
108
+ "grad_norm": 7.559288501739502,
109
+ "learning_rate": 4.106280193236715e-05,
110
+ "loss": 0.8805,
111
  "step": 120
112
  },
113
  {
114
+ "epoch": 2.0782608695652174,
115
+ "grad_norm": 14.196601867675781,
116
+ "learning_rate": 3.985507246376812e-05,
117
+ "loss": 0.7797,
118
  "step": 130
119
  },
120
  {
121
+ "epoch": 2.1,
122
+ "grad_norm": 14.56787109375,
123
+ "learning_rate": 3.864734299516908e-05,
124
+ "loss": 0.8662,
125
  "step": 140
126
  },
127
  {
128
+ "epoch": 2.1021739130434782,
129
+ "eval_accuracy": 0.7424242424242424,
130
+ "eval_loss": 0.8151518702507019,
131
+ "eval_runtime": 6.4913,
132
+ "eval_samples_per_second": 10.167,
133
+ "eval_steps_per_second": 1.386,
134
+ "step": 141
135
  },
136
  {
137
+ "epoch": 3.0195652173913046,
138
+ "grad_norm": 6.646560192108154,
139
+ "learning_rate": 3.743961352657005e-05,
140
+ "loss": 0.6663,
141
+ "step": 150
142
  },
143
  {
144
+ "epoch": 3.041304347826087,
145
+ "grad_norm": 6.8809356689453125,
146
+ "learning_rate": 3.6231884057971014e-05,
147
+ "loss": 0.549,
148
+ "step": 160
 
 
149
  },
150
  {
151
+ "epoch": 3.0630434782608695,
152
+ "grad_norm": 12.483465194702148,
153
+ "learning_rate": 3.502415458937198e-05,
154
+ "loss": 0.5251,
155
  "step": 170
156
  },
157
  {
158
+ "epoch": 3.0847826086956522,
159
+ "grad_norm": 7.2777276039123535,
160
+ "learning_rate": 3.381642512077295e-05,
161
+ "loss": 0.6072,
162
  "step": 180
163
  },
164
  {
165
+ "epoch": 3.1021739130434782,
166
+ "eval_accuracy": 0.8939393939393939,
167
+ "eval_loss": 0.39679834246635437,
168
+ "eval_runtime": 5.8658,
169
+ "eval_samples_per_second": 11.252,
170
+ "eval_steps_per_second": 1.534,
171
+ "step": 188
172
+ },
173
+ {
174
+ "epoch": 4.004347826086956,
175
+ "grad_norm": 6.928377628326416,
176
+ "learning_rate": 3.260869565217392e-05,
177
+ "loss": 0.4876,
178
  "step": 190
179
  },
180
  {
181
+ "epoch": 4.026086956521739,
182
+ "grad_norm": 19.643159866333008,
183
+ "learning_rate": 3.140096618357488e-05,
184
+ "loss": 0.3014,
185
  "step": 200
186
  },
187
  {
188
+ "epoch": 4.047826086956522,
189
+ "grad_norm": 4.118589401245117,
190
+ "learning_rate": 3.0193236714975848e-05,
191
+ "loss": 0.2793,
192
  "step": 210
193
  },
194
  {
195
+ "epoch": 4.069565217391304,
196
+ "grad_norm": 1.8621646165847778,
197
+ "learning_rate": 2.8985507246376814e-05,
198
+ "loss": 0.2754,
 
 
 
 
 
 
 
 
 
199
  "step": 220
200
  },
201
  {
202
+ "epoch": 4.091304347826087,
203
+ "grad_norm": 18.98653793334961,
204
+ "learning_rate": 2.777777777777778e-05,
205
+ "loss": 0.2958,
206
  "step": 230
207
  },
208
  {
209
+ "epoch": 4.102173913043479,
210
+ "eval_accuracy": 0.8787878787878788,
211
+ "eval_loss": 0.3365328013896942,
212
+ "eval_runtime": 5.9112,
213
+ "eval_samples_per_second": 11.165,
214
+ "eval_steps_per_second": 1.523,
215
+ "step": 235
216
+ },
217
+ {
218
+ "epoch": 5.010869565217392,
219
+ "grad_norm": 11.831360816955566,
220
+ "learning_rate": 2.6570048309178748e-05,
221
+ "loss": 0.2189,
222
  "step": 240
223
  },
224
  {
225
+ "epoch": 5.032608695652174,
226
+ "grad_norm": 5.401520252227783,
227
+ "learning_rate": 2.5362318840579714e-05,
228
+ "loss": 0.1791,
229
  "step": 250
230
  },
231
  {
232
+ "epoch": 5.054347826086956,
233
+ "grad_norm": 9.114124298095703,
234
+ "learning_rate": 2.4154589371980676e-05,
235
+ "loss": 0.2604,
236
  "step": 260
237
  },
238
  {
239
+ "epoch": 5.076086956521739,
240
+ "grad_norm": 1.7160027027130127,
241
+ "learning_rate": 2.294685990338164e-05,
242
+ "loss": 0.1396,
243
+ "step": 270
244
+ },
245
+ {
246
+ "epoch": 5.0978260869565215,
247
+ "grad_norm": 1.6239838600158691,
248
+ "learning_rate": 2.173913043478261e-05,
249
+ "loss": 0.1534,
250
+ "step": 280
251
+ },
252
+ {
253
+ "epoch": 5.102173913043479,
254
+ "eval_accuracy": 0.9242424242424242,
255
+ "eval_loss": 0.25064730644226074,
256
+ "eval_runtime": 5.9732,
257
+ "eval_samples_per_second": 11.049,
258
+ "eval_steps_per_second": 1.507,
259
+ "step": 282
260
+ },
261
+ {
262
+ "epoch": 6.017391304347826,
263
+ "grad_norm": 11.243247032165527,
264
+ "learning_rate": 2.0531400966183576e-05,
265
+ "loss": 0.1272,
266
+ "step": 290
267
+ },
268
+ {
269
+ "epoch": 6.039130434782609,
270
+ "grad_norm": 11.26307487487793,
271
+ "learning_rate": 1.932367149758454e-05,
272
+ "loss": 0.0651,
273
+ "step": 300
274
+ },
275
+ {
276
+ "epoch": 6.060869565217391,
277
+ "grad_norm": 18.92414665222168,
278
+ "learning_rate": 1.8115942028985507e-05,
279
+ "loss": 0.167,
280
+ "step": 310
281
+ },
282
+ {
283
+ "epoch": 6.082608695652174,
284
+ "grad_norm": 0.3199400305747986,
285
+ "learning_rate": 1.6908212560386476e-05,
286
+ "loss": 0.0907,
287
+ "step": 320
288
+ },
289
+ {
290
+ "epoch": 6.102173913043479,
291
+ "eval_accuracy": 0.9848484848484849,
292
+ "eval_loss": 0.1101275309920311,
293
+ "eval_runtime": 6.2213,
294
+ "eval_samples_per_second": 10.609,
295
+ "eval_steps_per_second": 1.447,
296
+ "step": 329
297
+ },
298
+ {
299
+ "epoch": 7.002173913043478,
300
+ "grad_norm": 5.541162490844727,
301
+ "learning_rate": 1.570048309178744e-05,
302
+ "loss": 0.1261,
303
+ "step": 330
304
+ },
305
+ {
306
+ "epoch": 7.023913043478261,
307
+ "grad_norm": 9.781050682067871,
308
+ "learning_rate": 1.4492753623188407e-05,
309
+ "loss": 0.0858,
310
+ "step": 340
311
+ },
312
+ {
313
+ "epoch": 7.0456521739130435,
314
+ "grad_norm": 15.221212387084961,
315
+ "learning_rate": 1.3285024154589374e-05,
316
+ "loss": 0.0677,
317
+ "step": 350
318
+ },
319
+ {
320
+ "epoch": 7.067391304347826,
321
+ "grad_norm": 0.271314412355423,
322
+ "learning_rate": 1.2077294685990338e-05,
323
+ "loss": 0.0875,
324
+ "step": 360
325
+ },
326
+ {
327
+ "epoch": 7.089130434782609,
328
+ "grad_norm": 7.625803470611572,
329
+ "learning_rate": 1.0869565217391305e-05,
330
+ "loss": 0.1085,
331
+ "step": 370
332
+ },
333
+ {
334
+ "epoch": 7.102173913043479,
335
+ "eval_accuracy": 0.9545454545454546,
336
+ "eval_loss": 0.10326449573040009,
337
+ "eval_runtime": 6.6343,
338
+ "eval_samples_per_second": 9.948,
339
+ "eval_steps_per_second": 1.357,
340
+ "step": 376
341
+ },
342
+ {
343
+ "epoch": 8.008695652173913,
344
+ "grad_norm": 0.3747415244579315,
345
+ "learning_rate": 9.66183574879227e-06,
346
+ "loss": 0.1309,
347
+ "step": 380
348
+ },
349
+ {
350
+ "epoch": 8.030434782608696,
351
+ "grad_norm": 0.6200582981109619,
352
+ "learning_rate": 8.454106280193238e-06,
353
+ "loss": 0.0571,
354
+ "step": 390
355
+ },
356
+ {
357
+ "epoch": 8.052173913043479,
358
+ "grad_norm": 0.3507235646247864,
359
+ "learning_rate": 7.246376811594203e-06,
360
+ "loss": 0.0293,
361
+ "step": 400
362
+ },
363
+ {
364
+ "epoch": 8.07391304347826,
365
+ "grad_norm": 0.3272978961467743,
366
+ "learning_rate": 6.038647342995169e-06,
367
+ "loss": 0.0481,
368
+ "step": 410
369
+ },
370
+ {
371
+ "epoch": 8.095652173913043,
372
+ "grad_norm": 0.4806969165802002,
373
+ "learning_rate": 4.830917874396135e-06,
374
+ "loss": 0.0666,
375
+ "step": 420
376
+ },
377
+ {
378
+ "epoch": 8.102173913043478,
379
+ "eval_accuracy": 0.9696969696969697,
380
+ "eval_loss": 0.10309642553329468,
381
+ "eval_runtime": 6.078,
382
+ "eval_samples_per_second": 10.859,
383
+ "eval_steps_per_second": 1.481,
384
+ "step": 423
385
+ },
386
+ {
387
+ "epoch": 9.015217391304347,
388
+ "grad_norm": 0.6005312204360962,
389
+ "learning_rate": 3.6231884057971017e-06,
390
+ "loss": 0.0985,
391
+ "step": 430
392
+ },
393
+ {
394
+ "epoch": 9.03695652173913,
395
+ "grad_norm": 1.0368554592132568,
396
+ "learning_rate": 2.4154589371980677e-06,
397
+ "loss": 0.0249,
398
+ "step": 440
399
+ },
400
+ {
401
+ "epoch": 9.058695652173913,
402
+ "grad_norm": 0.20510777831077576,
403
+ "learning_rate": 1.2077294685990338e-06,
404
+ "loss": 0.0141,
405
+ "step": 450
406
+ },
407
+ {
408
+ "epoch": 9.080434782608696,
409
+ "grad_norm": 0.4739467203617096,
410
+ "learning_rate": 0.0,
411
+ "loss": 0.0251,
412
+ "step": 460
413
+ },
414
+ {
415
+ "epoch": 9.080434782608696,
416
+ "eval_accuracy": 0.9696969696969697,
417
+ "eval_loss": 0.11189308762550354,
418
+ "eval_runtime": 8.098,
419
+ "eval_samples_per_second": 8.15,
420
+ "eval_steps_per_second": 1.111,
421
+ "step": 460
422
+ },
423
+ {
424
+ "epoch": 9.080434782608696,
425
+ "step": 460,
426
+ "total_flos": 4.5186331435416945e+18,
427
+ "train_loss": 0.6365434888264407,
428
+ "train_runtime": 731.5291,
429
+ "train_samples_per_second": 5.031,
430
+ "train_steps_per_second": 0.629
431
+ },
432
+ {
433
+ "epoch": 9.080434782608696,
434
+ "eval_accuracy": 0.9617834394904459,
435
+ "eval_loss": 0.1620456427335739,
436
+ "eval_runtime": 14.6193,
437
+ "eval_samples_per_second": 10.739,
438
+ "eval_steps_per_second": 1.368,
439
+ "step": 460
440
+ },
441
+ {
442
+ "epoch": 9.080434782608696,
443
+ "eval_accuracy": 0.9617834394904459,
444
+ "eval_loss": 0.1620456427335739,
445
+ "eval_runtime": 14.8873,
446
+ "eval_samples_per_second": 10.546,
447
+ "eval_steps_per_second": 1.343,
448
+ "step": 460
449
  }
450
  ],
451
  "logging_steps": 10,
452
+ "max_steps": 460,
453
  "num_input_tokens_seen": 0,
454
  "num_train_epochs": 9223372036854775807,
455
  "save_steps": 500,
 
465
  "attributes": {}
466
  }
467
  },
468
+ "total_flos": 4.5186331435416945e+18,
469
  "train_batch_size": 8,
470
  "trial_name": null,
471
  "trial_params": null