Zawarudoooo commited on
Commit
956af46
·
verified ·
1 Parent(s): 8baed71

ocr version 1

Browse files
Files changed (1) hide show
  1. trainer_state.json +387 -0
trainer_state.json ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1560,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 134.19052124023438,
14
+ "learning_rate": 4.996794871794872e-05,
15
+ "loss": 9.1177,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.1,
20
+ "grad_norm": 46.553192138671875,
21
+ "learning_rate": 4.9006410256410256e-05,
22
+ "loss": 2.6116,
23
+ "step": 31
24
+ },
25
+ {
26
+ "epoch": 0.2,
27
+ "grad_norm": 17.68166732788086,
28
+ "learning_rate": 4.8012820512820516e-05,
29
+ "loss": 0.8879,
30
+ "step": 62
31
+ },
32
+ {
33
+ "epoch": 0.3,
34
+ "grad_norm": 17.172128677368164,
35
+ "learning_rate": 4.701923076923077e-05,
36
+ "loss": 1.0567,
37
+ "step": 93
38
+ },
39
+ {
40
+ "epoch": 0.4,
41
+ "grad_norm": 104.65974426269531,
42
+ "learning_rate": 4.602564102564102e-05,
43
+ "loss": 0.9872,
44
+ "step": 124
45
+ },
46
+ {
47
+ "epoch": 0.5,
48
+ "grad_norm": 16.036008834838867,
49
+ "learning_rate": 4.503205128205128e-05,
50
+ "loss": 0.9717,
51
+ "step": 155
52
+ },
53
+ {
54
+ "epoch": 0.6,
55
+ "grad_norm": 27.394895553588867,
56
+ "learning_rate": 4.403846153846154e-05,
57
+ "loss": 0.8679,
58
+ "step": 186
59
+ },
60
+ {
61
+ "epoch": 0.7,
62
+ "grad_norm": 37.157169342041016,
63
+ "learning_rate": 4.30448717948718e-05,
64
+ "loss": 0.8257,
65
+ "step": 217
66
+ },
67
+ {
68
+ "epoch": 0.79,
69
+ "grad_norm": 8.90817928314209,
70
+ "learning_rate": 4.205128205128206e-05,
71
+ "loss": 0.8604,
72
+ "step": 248
73
+ },
74
+ {
75
+ "epoch": 0.89,
76
+ "grad_norm": 13.38546085357666,
77
+ "learning_rate": 4.105769230769231e-05,
78
+ "loss": 0.8587,
79
+ "step": 279
80
+ },
81
+ {
82
+ "epoch": 0.99,
83
+ "grad_norm": 32.03287124633789,
84
+ "learning_rate": 4.006410256410257e-05,
85
+ "loss": 0.8512,
86
+ "step": 310
87
+ },
88
+ {
89
+ "epoch": 1.09,
90
+ "grad_norm": 20.29245376586914,
91
+ "learning_rate": 3.9070512820512824e-05,
92
+ "loss": 0.6728,
93
+ "step": 341
94
+ },
95
+ {
96
+ "epoch": 1.19,
97
+ "grad_norm": 10.910124778747559,
98
+ "learning_rate": 3.807692307692308e-05,
99
+ "loss": 0.611,
100
+ "step": 372
101
+ },
102
+ {
103
+ "epoch": 1.29,
104
+ "grad_norm": 14.703102111816406,
105
+ "learning_rate": 3.708333333333334e-05,
106
+ "loss": 0.6468,
107
+ "step": 403
108
+ },
109
+ {
110
+ "epoch": 1.39,
111
+ "grad_norm": 9.841747283935547,
112
+ "learning_rate": 3.608974358974359e-05,
113
+ "loss": 0.5327,
114
+ "step": 434
115
+ },
116
+ {
117
+ "epoch": 1.49,
118
+ "grad_norm": 15.09598445892334,
119
+ "learning_rate": 3.5096153846153845e-05,
120
+ "loss": 0.5669,
121
+ "step": 465
122
+ },
123
+ {
124
+ "epoch": 1.59,
125
+ "grad_norm": 18.652040481567383,
126
+ "learning_rate": 3.4102564102564105e-05,
127
+ "loss": 0.4854,
128
+ "step": 496
129
+ },
130
+ {
131
+ "epoch": 1.69,
132
+ "grad_norm": 8.733405113220215,
133
+ "learning_rate": 3.310897435897436e-05,
134
+ "loss": 0.5627,
135
+ "step": 527
136
+ },
137
+ {
138
+ "epoch": 1.79,
139
+ "grad_norm": 21.200965881347656,
140
+ "learning_rate": 3.211538461538462e-05,
141
+ "loss": 0.5001,
142
+ "step": 558
143
+ },
144
+ {
145
+ "epoch": 1.89,
146
+ "grad_norm": 7.368778228759766,
147
+ "learning_rate": 3.112179487179487e-05,
148
+ "loss": 0.4897,
149
+ "step": 589
150
+ },
151
+ {
152
+ "epoch": 1.99,
153
+ "grad_norm": 16.03817367553711,
154
+ "learning_rate": 3.012820512820513e-05,
155
+ "loss": 0.5294,
156
+ "step": 620
157
+ },
158
+ {
159
+ "epoch": 2.09,
160
+ "grad_norm": 35.705631256103516,
161
+ "learning_rate": 2.913461538461539e-05,
162
+ "loss": 0.4197,
163
+ "step": 651
164
+ },
165
+ {
166
+ "epoch": 2.19,
167
+ "grad_norm": 10.462698936462402,
168
+ "learning_rate": 2.8141025641025643e-05,
169
+ "loss": 0.3589,
170
+ "step": 682
171
+ },
172
+ {
173
+ "epoch": 2.29,
174
+ "grad_norm": 7.011005878448486,
175
+ "learning_rate": 2.7147435897435896e-05,
176
+ "loss": 0.4396,
177
+ "step": 713
178
+ },
179
+ {
180
+ "epoch": 2.38,
181
+ "grad_norm": 4.32899284362793,
182
+ "learning_rate": 2.6153846153846157e-05,
183
+ "loss": 0.4604,
184
+ "step": 744
185
+ },
186
+ {
187
+ "epoch": 2.48,
188
+ "grad_norm": 5.010735988616943,
189
+ "learning_rate": 2.516025641025641e-05,
190
+ "loss": 0.3764,
191
+ "step": 775
192
+ },
193
+ {
194
+ "epoch": 2.58,
195
+ "grad_norm": 6.4428582191467285,
196
+ "learning_rate": 2.4166666666666667e-05,
197
+ "loss": 0.3035,
198
+ "step": 806
199
+ },
200
+ {
201
+ "epoch": 2.68,
202
+ "grad_norm": 11.917937278747559,
203
+ "learning_rate": 2.3173076923076924e-05,
204
+ "loss": 0.3287,
205
+ "step": 837
206
+ },
207
+ {
208
+ "epoch": 2.78,
209
+ "grad_norm": 18.710346221923828,
210
+ "learning_rate": 2.217948717948718e-05,
211
+ "loss": 0.3209,
212
+ "step": 868
213
+ },
214
+ {
215
+ "epoch": 2.88,
216
+ "grad_norm": 5.8296990394592285,
217
+ "learning_rate": 2.1185897435897437e-05,
218
+ "loss": 0.329,
219
+ "step": 899
220
+ },
221
+ {
222
+ "epoch": 2.98,
223
+ "grad_norm": 11.532828330993652,
224
+ "learning_rate": 2.0192307692307694e-05,
225
+ "loss": 0.2934,
226
+ "step": 930
227
+ },
228
+ {
229
+ "epoch": 3.08,
230
+ "grad_norm": 15.149397850036621,
231
+ "learning_rate": 1.919871794871795e-05,
232
+ "loss": 0.2682,
233
+ "step": 961
234
+ },
235
+ {
236
+ "epoch": 3.18,
237
+ "grad_norm": 3.612698793411255,
238
+ "learning_rate": 1.8205128205128204e-05,
239
+ "loss": 0.264,
240
+ "step": 992
241
+ },
242
+ {
243
+ "epoch": 3.28,
244
+ "grad_norm": 1.7986979484558105,
245
+ "learning_rate": 1.721153846153846e-05,
246
+ "loss": 0.2085,
247
+ "step": 1023
248
+ },
249
+ {
250
+ "epoch": 3.38,
251
+ "grad_norm": 3.6060192584991455,
252
+ "learning_rate": 1.6217948717948718e-05,
253
+ "loss": 0.247,
254
+ "step": 1054
255
+ },
256
+ {
257
+ "epoch": 3.48,
258
+ "grad_norm": 2.7116451263427734,
259
+ "learning_rate": 1.5224358974358973e-05,
260
+ "loss": 0.2104,
261
+ "step": 1085
262
+ },
263
+ {
264
+ "epoch": 3.58,
265
+ "grad_norm": 4.838766574859619,
266
+ "learning_rate": 1.423076923076923e-05,
267
+ "loss": 0.234,
268
+ "step": 1116
269
+ },
270
+ {
271
+ "epoch": 3.68,
272
+ "grad_norm": 2.237657070159912,
273
+ "learning_rate": 1.3237179487179489e-05,
274
+ "loss": 0.2033,
275
+ "step": 1147
276
+ },
277
+ {
278
+ "epoch": 3.78,
279
+ "grad_norm": 1.6461944580078125,
280
+ "learning_rate": 1.2243589743589744e-05,
281
+ "loss": 0.2098,
282
+ "step": 1178
283
+ },
284
+ {
285
+ "epoch": 3.88,
286
+ "grad_norm": 6.327276229858398,
287
+ "learning_rate": 1.125e-05,
288
+ "loss": 0.2073,
289
+ "step": 1209
290
+ },
291
+ {
292
+ "epoch": 3.97,
293
+ "grad_norm": 2.9778146743774414,
294
+ "learning_rate": 1.0256410256410256e-05,
295
+ "loss": 0.1988,
296
+ "step": 1240
297
+ },
298
+ {
299
+ "epoch": 4.07,
300
+ "grad_norm": 1.4347281455993652,
301
+ "learning_rate": 9.262820512820514e-06,
302
+ "loss": 0.1664,
303
+ "step": 1271
304
+ },
305
+ {
306
+ "epoch": 4.17,
307
+ "grad_norm": 2.844505786895752,
308
+ "learning_rate": 8.26923076923077e-06,
309
+ "loss": 0.1529,
310
+ "step": 1302
311
+ },
312
+ {
313
+ "epoch": 4.27,
314
+ "grad_norm": 1.985013723373413,
315
+ "learning_rate": 7.275641025641026e-06,
316
+ "loss": 0.1447,
317
+ "step": 1333
318
+ },
319
+ {
320
+ "epoch": 4.37,
321
+ "grad_norm": 2.9127843379974365,
322
+ "learning_rate": 6.282051282051282e-06,
323
+ "loss": 0.1375,
324
+ "step": 1364
325
+ },
326
+ {
327
+ "epoch": 4.47,
328
+ "grad_norm": 2.6174566745758057,
329
+ "learning_rate": 5.288461538461538e-06,
330
+ "loss": 0.1515,
331
+ "step": 1395
332
+ },
333
+ {
334
+ "epoch": 4.57,
335
+ "grad_norm": 1.2411088943481445,
336
+ "learning_rate": 4.294871794871795e-06,
337
+ "loss": 0.1408,
338
+ "step": 1426
339
+ },
340
+ {
341
+ "epoch": 4.67,
342
+ "grad_norm": 1.8333454132080078,
343
+ "learning_rate": 3.3012820512820517e-06,
344
+ "loss": 0.1372,
345
+ "step": 1457
346
+ },
347
+ {
348
+ "epoch": 4.77,
349
+ "grad_norm": 1.785672903060913,
350
+ "learning_rate": 2.307692307692308e-06,
351
+ "loss": 0.1409,
352
+ "step": 1488
353
+ },
354
+ {
355
+ "epoch": 4.87,
356
+ "grad_norm": 3.533236026763916,
357
+ "learning_rate": 1.3141025641025643e-06,
358
+ "loss": 0.1276,
359
+ "step": 1519
360
+ },
361
+ {
362
+ "epoch": 4.97,
363
+ "grad_norm": 1.3145009279251099,
364
+ "learning_rate": 3.205128205128205e-07,
365
+ "loss": 0.1329,
366
+ "step": 1550
367
+ },
368
+ {
369
+ "epoch": 5.0,
370
+ "step": 1560,
371
+ "total_flos": 1.845867535870722e+19,
372
+ "train_loss": 0.4759287901413746,
373
+ "train_runtime": 3868.1419,
374
+ "train_samples_per_second": 3.224,
375
+ "train_steps_per_second": 0.403
376
+ }
377
+ ],
378
+ "logging_steps": 31,
379
+ "max_steps": 1560,
380
+ "num_input_tokens_seen": 0,
381
+ "num_train_epochs": 5,
382
+ "save_steps": 500,
383
+ "total_flos": 1.845867535870722e+19,
384
+ "train_batch_size": 8,
385
+ "trial_name": null,
386
+ "trial_params": null
387
+ }